diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in index 5b289fe55..875095670 100644 --- a/heartbeat/storage-mon.in +++ b/heartbeat/storage-mon.in @@ -1,263 +1,263 @@ #!@BASH_SHELL@ # # Copyright (C) 2021 Red Hat, Inc. All rights reserved. # # Authors: Christine Caulfield <ccaulfie@redhat.com> # Fabio M. Di Nitto <fdinitto@redhat.com> # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # # # Checks storage I/O status of all given drives and writes the #health-storage # status into the CIB # Implementation is heavily based on ocf:pacemaker:HealtSMART # # It sends a single block on IO to a radom location on the device and reports any errors returned. # If the IO hangs, that will also be returned. (bear in mind tha tmay also hang the C app in some # instances). # # It's worth making a note in the RA description that the smartmon RA is also recommended (this # does not replace it), and that Pacemaker health checking should be configued. # # https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Explained/singlehtml/index.html#tracking-node-health ####################################################################### ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # STORAGEMON=$HA_BIN/storage_mon ATTRDUP=/usr/sbin/attrd_updater OCF_RESKEY_CRM_meta_interval_default="0" OCF_RESKEY_io_timeout_default="10" OCF_RESKEY_inject_errors_default="" OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state" # Explicitly list all environment variables used, to make static analysis happy : ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}} : ${OCF_RESKEY_drives:=""} : ${OCF_RESKEY_io_timeout:=${OCF_RESKEY_io_timeout_default}} : ${OCF_RESKEY_inject_errors:=${OCF_RESKEY_inject_errors_default}} : ${OCF_RESKEY_state_file:=${OCF_RESKEY_state_file_default}} ####################################################################### meta_data() { cat <<END <?xml version="1.0"?> <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> <resource-agent name="storage-mon"> <version>1.0</version> <longdesc lang="en"> System health agent that checks the storage I/O status of the given drives and updates the #health-storage attribute. Usage is highly recommended in combination -with storage-mon monitoring agent. The agent currently support a maximum of 25 +with the HealthSMART monitoring agent. The agent currently support a maximum of 25 devices per instance. </longdesc> <shortdesc lang="en">storage I/O health status</shortdesc> <parameters> <parameter name="state_file" unique="1"> <longdesc lang="en"> Location to store the resource state in. </longdesc> <shortdesc lang="en">State file</shortdesc> <content type="string" default="${OCF_RESKEY_state_file_default}" /> </parameter> <parameter name="drives" unique="1" required="1"> <longdesc lang="en"> The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda". </longdesc> <shortdesc lang="en">Drives to check</shortdesc> <content type="string" default="" /> </parameter> <parameter name="io_timeout" unique="0"> <longdesc lang="en"> Specify disk I/O timeout in seconds. Minimum 1, recommeded 10 (default). </longdesc> <shortdesc lang="en">Disk I/O timeout</shortdesc> <content type="integer" default="${OCF_RESKEY_io_timeout_default}" /> </parameter> <parameter name="inject_errors" unique="0"> <longdesc lang="en"> Used only for testing! Specify % of I/O errors to simulate drives failures. </longdesc> <shortdesc lang="en">Specify % of I/O errors to simulate drives failures</shortdesc> <content type="integer" default="${OCF_RESKEY_inject_errors_default}" /> </parameter> </parameters> <actions> <action name="start" timeout="10s" /> <action name="stop" timeout="120s" /> <action name="monitor" timeout="120s" interval="30s" start-delay="0s" /> <action name="meta-data" timeout="5s" /> <action name="validate-all" timeout="10s" /> </actions> </resource-agent> END return $OCF_SUCCESS } ####################################################################### storage-mon_usage() { cat <<END usage: $0 {start|stop|monitor|validate-all|meta-data} Expects to have a fully populated OCF RA-compliant environment set. END return $1 } storage-mon_init() { #Test for presence of storage_mon helper if [ ! -x "$STORAGEMON" ] ; then ocf_log err "${STORAGEMON} not installed." exit $OCF_ERR_INSTALLED fi i=0 for DRIVE in ${OCF_RESKEY_drives}; do if [ ! -e "$DRIVE" ] ; then ocf_log err "${DRIVE} not found on the system" exit $OCF_ERR_INSTALLED fi i=$((i + 1)) done if [ "$i" -gt "25" ]; then ocf_log err "Too many drives ($i) configured for this agent. Max 25." exit $OCF_ERR_CONFIGURED fi if [ "${OCF_RESKEY_io_timeout}" -lt "1" ]; then ocf_log err "Minimum timeout is 1. Recommended 10 (default)." exit $OCF_ERR_CONFIGURED fi if [ -n "${OCF_RESKEY_inject_errors}" ]; then if [ "${OCF_RESKEY_inject_errors}" -lt "1" ] || [ "${OCF_RESKEY_inject_errors}" -gt "100" ]; then ocf_log err "Inject errors % has to be a value between 1 and 100." exit $OCF_ERR_CONFIGURED fi fi } storage-mon_validate() { storage-mon_init # Is the state directory writable? state_dir=$(dirname "$OCF_RESKEY_state_file") touch "$state_dir/$$" if [ $? -ne 0 ]; then return $OCF_ERR_CONFIGURED fi rm "$state_dir/$$" return $OCF_SUCCESS } storage-mon_monitor() { storage-mon_init # Monitor _MUST!_ differentiate correctly between running # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING). # That is THREE states, not just yes/no. if [ ! -f "${OCF_RESKEY_state_file}" ]; then return $OCF_NOT_RUNNING fi # generate command line cmdline="" for DRIVE in ${OCF_RESKEY_drives}; do cmdline="$cmdline --device $DRIVE --score 1" done cmdline="$cmdline --timeout ${OCF_RESKEY_io_timeout}" if [ -n "${OCF_RESKEY_inject_errors}" ]; then cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}" fi $STORAGEMON $cmdline if [ $? -ne 0 ]; then status="red" else status="green" fi "$ATTRDUP" -n "#health-${OCF_RESOURCE_INSTANCE}" -U "$status" -d "5s" return $OCF_SUCCESS } storage-mon_start() { storage-mon_monitor if [ $? -eq $OCF_SUCCESS ]; then return $OCF_SUCCESS fi touch "${OCF_RESKEY_state_file}" } storage-mon_stop() { storage-mon_monitor if [ $? -eq $OCF_SUCCESS ]; then rm "${OCF_RESKEY_state_file}" fi return $OCF_SUCCESS } storage-mon_validate() { storage-mon_init # Is the state directory writable? state_dir=$(dirname "${OCF_RESKEY_state_file}") touch "$state_dir/$$" if [ $? -ne 0 ]; then return $OCF_ERR_CONFIGURED fi rm "$state_dir/$$" return $OCF_SUCCESS } case "$__OCF_ACTION" in start) storage-mon_start;; stop) storage-mon_stop;; monitor) storage-mon_monitor;; validate-all) storage-mon_validate;; meta-data) meta_data;; usage|help) storage-mon_usage $OCF_SUCCESS;; *) storage-mon_usage $OCF_ERR_UNIMPLEMENTED;; esac rc=$? ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" exit $rc # vim: set filetype=sh: