diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in index 284dec30f..7c9943d4f 100644 --- a/heartbeat/storage-mon.in +++ b/heartbeat/storage-mon.in @@ -1,399 +1,410 @@ #!@BASH_SHELL@ # # Copyright (C) 2021 Red Hat, Inc. All rights reserved. # # Authors: Christine Caulfield # Fabio M. Di Nitto # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # # # Checks storage I/O status of all given drives and writes the #health-storage # status into the CIB # Implementation is heavily based on ocf:pacemaker:HealthSMART # # It sends a single block of IO to a random location on the device and reports any errors returned. # If the IO hangs, that will also be returned. (bear in mind that may also hang the C app in some # instances). # # It's worth making a note in the RA description that the smartmon RA is also recommended (this # does not replace it), and that Pacemaker health checking should be configured. 
# # https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Explained/singlehtml/index.html#tracking-node-health ####################################################################### ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # STORAGEMON=${HA_BIN}/storage_mon ATTRDUP=${HA_SBIN_DIR}/attrd_updater PIDFILE=${HA_VARRUN}/storage-mon-${OCF_RESOURCE_INSTANCE}.pid ATTRNAME="#health-${OCF_RESOURCE_INSTANCE}" OCF_RESKEY_CRM_meta_interval_default="0" OCF_RESKEY_io_timeout_default="10" OCF_RESKEY_check_interval_default="30" OCF_RESKEY_inject_errors_default="" OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state" OCF_RESKEY_daemonize_default="false" # Explicitly list all environment variables used, to make static analysis happy : ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}} : ${OCF_RESKEY_drives:=""} : ${OCF_RESKEY_io_timeout:=${OCF_RESKEY_io_timeout_default}} : ${OCF_RESKEY_check_interval:=${OCF_RESKEY_check_interval_default}} : ${OCF_RESKEY_inject_errors:=${OCF_RESKEY_inject_errors_default}} : ${OCF_RESKEY_state_file:=${OCF_RESKEY_state_file_default}} : ${OCF_RESKEY_daemonize:=${OCF_RESKEY_daemonize_default}} ####################################################################### meta_data() { cat < 1.0 System health agent that checks the storage I/O status of the given drives and updates the #health-storage attribute. Usage is highly recommended in combination with the HealthSMART monitoring agent. The agent currently support a maximum of 25 devices per instance. storage I/O health status Location to store the resource state in. State file The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda". Drives to check Specify disk I/O timeout in seconds. Minimum 1, recommended 10 (default). 
Disk I/O timeout Specify interval between I/O checks in seconds.(Only supported with the damonize option.) I/O check interval Used only for testing! Specify % of I/O errors to simulate drives failures. Specify % of I/O errors to simulate drives failures Specifies to start storage-mon as a daemon and check for devices. start storage-mon with daemon END return $OCF_SUCCESS } ####################################################################### storage-mon_usage() { cat < /dev/null 2>&1 case "$?" in 0) rc=$OCF_SUCCESS;; 1|2) rc=$OCF_NOT_RUNNING;; *) rc=$OCF_ERR_GENERIC;; esac if [ $rc -ne $OCF_SUCCESS ]; then return "$rc" fi if [ "$1" = "pid_check_only" ]; then return "$rc" fi # generate client command line cmdline="" cmdline="$cmdline --client --attrname ${ATTRNAME}" while : do # 0 : Normal. # greater than 0 : monitoring error. # 255(-1) : communication system error. # 254(-2) : Not all checks completed for first device in daemon mode. $STORAGEMON $cmdline rc=$? case "$rc" in 254|255) # If there is a communication error or the initial check of all devices has not been completed, # it will loop and try to reconnect. # When everything ends with a communication error during monitor, a monitor timeout occurs. ocf_log debug "client monitor error : $rc" ;; 0) status="green" break ;; *) status="red" break ;; esac done storage-mon_update_attribute $status return "$?" fi } storage-mon_start() { if ! ocf_is_true "$OCF_RESKEY_daemonize"; then storage-mon_monitor if [ $? 
-eq $OCF_SUCCESS ]; then return $OCF_SUCCESS fi touch "${OCF_RESKEY_state_file}" else storage-mon_init # generate command line cmdline="" for DRIVE in ${OCF_RESKEY_drives}; do cmdline="$cmdline --device $DRIVE --score 1" done cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME}" if [ -n "${OCF_RESKEY_inject_errors}" ]; then cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}" fi $STORAGEMON $cmdline if [ "$?" -ne 0 ]; then return $OCF_ERR_GENERIC fi + + #Wait until monitor confirms the startup pid according to the ocf resource specification. + while true; do + storage-mon_monitor pid_check_only + rc="$?" + if [ $rc -eq $OCF_SUCCESS ]; then + break + fi + sleep 1 + ocf_log debug "storage-mon daemon still hasn't started yet. Waiting..." + done fi } storage-mon_stop() { storage-mon_monitor rc=$? if ! ocf_is_true "$OCF_RESKEY_daemonize"; then if [ $rc -eq $OCF_SUCCESS ]; then rm "${OCF_RESKEY_state_file}" fi else case "$rc" in $OCF_SUCCESS) ;; $OCF_NOT_RUNNING) return "$OCF_SUCCESS";; *) return "$rc";; esac kill -TERM $(cat "${PIDFILE}") if [ "$?" -ne 0 ]; then return $OCF_ERR_GENERIC fi while true; do storage-mon_monitor pid_check_only rc="$?" case "$rc" in $OCF_SUCCESS) ;; $OCF_NOT_RUNNING) return "$OCF_SUCCESS";; *) return "$rc";; esac sleep 1 done fi return $OCF_SUCCESS } storage-mon_validate() { storage-mon_init if ! ocf_is_true "$OCF_RESKEY_daemonize"; then # Is the state directory writable? state_dir=$(dirname "${OCF_RESKEY_state_file}") touch "$state_dir/$$" if [ $? -ne 0 ]; then return $OCF_ERR_CONFIGURED fi rm "$state_dir/$$" fi return $OCF_SUCCESS } case "$__OCF_ACTION" in start) storage-mon_start;; stop) storage-mon_stop;; monitor) storage-mon_monitor;; validate-all) storage-mon_validate;; meta-data) meta_data;; usage|help) storage-mon_usage $OCF_SUCCESS;; *) storage-mon_usage $OCF_ERR_UNIMPLEMENTED;; esac rc=$? 
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" exit $rc # vim: set filetype=sh: