Page MenuHomeClusterLabs Projects

No OneTemporary

diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
index 7c9943d4f..5edb96979 100644
--- a/heartbeat/storage-mon.in
+++ b/heartbeat/storage-mon.in
@@ -1,410 +1,410 @@
#!@BASH_SHELL@
#
# Copyright (C) 2021 Red Hat, Inc. All rights reserved.
#
# Authors: Christine Caulfield <ccaulfie@redhat.com>
# Fabio M. Di Nitto <fdinitto@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
#
# Checks storage I/O status of all given drives and writes the #health-storage
# status into the CIB
# Implementation is heavily based on ocf:pacemaker:HealthSMART
#
# It sends a single block of I/O to a random location on the device and reports any errors returned.
# If the I/O hangs, that will also be returned (bear in mind that this may also hang the C app in some
# instances).
#
# It's worth making a note in the RA description that the smartmon RA is also recommended (this
# does not replace it), and that Pacemaker health checking should be configured.
#
# https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Explained/singlehtml/index.html#tracking-node-health
#######################################################################
#######################################################################
# Initialization:
# Use ${OCF_ROOT}/lib/heartbeat as the functions directory unless the caller
# already set OCF_FUNCTIONS_DIR, then source the shared OCF shell helpers.
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
#
# Paths of the two external programs this agent runs.
# HA_BIN / HA_SBIN_DIR / HA_VARRUN / HA_RSCTMP are not set in this file --
# presumably provided by ocf-shellfuncs; verify.
STORAGEMON=${HA_BIN}/storage_mon
ATTRDUP=${HA_SBIN_DIR}/attrd_updater
# Per-instance pidfile, used by the daemonize code paths.
PIDFILE=${HA_VARRUN}/storage-mon-${OCF_RESOURCE_INSTANCE}.pid
# Name of the attribute that receives the "red"/"green" status.
ATTRNAME="#health-${OCF_RESOURCE_INSTANCE}"
# Built-in defaults for the agent parameters (also echoed in the meta-data).
OCF_RESKEY_CRM_meta_interval_default="0"
OCF_RESKEY_io_timeout_default="10"
OCF_RESKEY_check_interval_default="30"
OCF_RESKEY_inject_errors_default=""
# "%%/" strips one trailing slash from HA_RSCTMP before the file name is appended.
OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state"
OCF_RESKEY_daemonize_default="false"
# Explicitly list all environment variables used, to make static analysis happy
# (":=" assigns the default only when the variable is unset or empty).
: ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}}
: ${OCF_RESKEY_drives:=""}
: ${OCF_RESKEY_io_timeout:=${OCF_RESKEY_io_timeout_default}}
: ${OCF_RESKEY_check_interval:=${OCF_RESKEY_check_interval_default}}
: ${OCF_RESKEY_inject_errors:=${OCF_RESKEY_inject_errors_default}}
: ${OCF_RESKEY_state_file:=${OCF_RESKEY_state_file_default}}
: ${OCF_RESKEY_daemonize:=${OCF_RESKEY_daemonize_default}}
#######################################################################
# Print the resource agent's OCF meta-data XML on stdout.
# Globals read: OCF_RESKEY_*_default (expanded into the default="" attributes
#               when the function is called), OCF_SUCCESS.
# Arguments:    none.
# Returns:      OCF_SUCCESS.
# The here-document delimiter is unquoted so the ${...} defaults are expanded.
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="storage-mon" version="1.0">
<version>1.0</version>
<longdesc lang="en">
System health agent that checks the storage I/O status of the given drives and
updates the #health-storage attribute. Usage is highly recommended in combination
with the HealthSMART monitoring agent. The agent currently supports a maximum of 25
devices per instance.
</longdesc>
<shortdesc lang="en">storage I/O health status</shortdesc>
<parameters>
<parameter name="state_file" unique="1">
<longdesc lang="en">
Location to store the resource state in.
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string" default="${OCF_RESKEY_state_file_default}" />
</parameter>
<parameter name="drives" unique="1" required="1">
<longdesc lang="en">
The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda".
</longdesc>
<shortdesc lang="en">Drives to check</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="io_timeout" unique="0">
<longdesc lang="en">
Specify disk I/O timeout in seconds. Minimum 1, recommended 10 (default).
</longdesc>
<shortdesc lang="en">Disk I/O timeout</shortdesc>
<content type="integer" default="${OCF_RESKEY_io_timeout_default}" />
</parameter>
<parameter name="check_interval" unique="0">
<longdesc lang="en">
Specify interval between I/O checks in seconds. (Only supported with the daemonize option.)
</longdesc>
<shortdesc lang="en">I/O check interval</shortdesc>
<content type="integer" default="${OCF_RESKEY_check_interval_default}" />
</parameter>
<parameter name="inject_errors" unique="0">
<longdesc lang="en">
Used only for testing! Specify % of I/O errors to simulate drives failures.
</longdesc>
<shortdesc lang="en">Specify % of I/O errors to simulate drives failures</shortdesc>
<content type="integer" default="${OCF_RESKEY_inject_errors_default}" />
</parameter>
<parameter name="daemonize" unique="0">
<longdesc lang="en">
Specifies to start storage-mon as a daemon and check for devices.
</longdesc>
<shortdesc lang="en">start storage-mon with daemon</shortdesc>
<content type="boolean" default="${OCF_RESKEY_daemonize_default}" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="10s" />
<action name="stop" timeout="120s" />
<action name="monitor" timeout="120s" interval="30s" start-delay="0s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="10s" />
</actions>
</resource-agent>
END
return $OCF_SUCCESS
}
#######################################################################
# Print a one-line usage summary (the supported actions) on stdout.
# Arguments: $1 - status code handed back to the caller.
# Returns:   $1 unchanged.
-storage-mon_usage() {
+storage_mon_usage() {
cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set.
END
return $1
}
# Validate the installation and the configured parameters.
# This function does not return an error status: every failed check logs an
# error and terminates the whole script with "exit".
#   OCF_ERR_INSTALLED  - storage_mon helper or attrd_updater not executable,
#                        or a configured drive path does not exist
#   OCF_ERR_CONFIGURED - more than 25 drives, io_timeout < 1, check_interval < 1,
#                        or inject_errors set but outside 1..100
# Globals read: STORAGEMON, ATTRDUP, OCF_RESKEY_drives, OCF_RESKEY_io_timeout,
#               OCF_RESKEY_check_interval, OCF_RESKEY_inject_errors.
# Globals written: i, DRIVE (not declared local).
-storage-mon_init() {
+storage_mon_init() {
#Test for presence of storage_mon helper
if [ ! -x "$STORAGEMON" ] ; then
ocf_log err "${STORAGEMON} not installed."
exit $OCF_ERR_INSTALLED
fi
if [ ! -x "$ATTRDUP" ] ; then
ocf_log err "${ATTRDUP} not installed."
exit $OCF_ERR_INSTALLED
fi
# Count the drives while checking that each path exists. The expansion is
# unquoted so the space-separated list is split into one word per drive.
i=0
for DRIVE in ${OCF_RESKEY_drives}; do
if [ ! -e "$DRIVE" ] ; then
ocf_log err "${DRIVE} not found on the system"
exit $OCF_ERR_INSTALLED
fi
i=$((i + 1))
done
if [ "$i" -gt "25" ]; then
ocf_log err "Too many drives ($i) configured for this agent. Max 25."
exit $OCF_ERR_CONFIGURED
fi
# NOTE(review): for a non-integer value "[ ... -lt ... ]" fails with an error
# status, which "if" treats as false, so such a value is not rejected by the
# numeric checks below -- confirm whether that is acceptable.
if [ "${OCF_RESKEY_io_timeout}" -lt "1" ]; then
ocf_log err "Minimum timeout is 1. Recommended ${OCF_RESKEY_io_timeout_default} (default)."
exit $OCF_ERR_CONFIGURED
fi
if [ "${OCF_RESKEY_check_interval}" -lt "1" ]; then
ocf_log err "Minimum interval to check is 1. default ${OCF_RESKEY_check_interval_default}."
exit $OCF_ERR_CONFIGURED
fi
# inject_errors is optional; it is range-checked only when non-empty.
if [ -n "${OCF_RESKEY_inject_errors}" ]; then
if [ "${OCF_RESKEY_inject_errors}" -lt "1" ] || [ "${OCF_RESKEY_inject_errors}" -gt "100" ]; then
ocf_log err "Inject errors % has to be a value between 1 and 100."
exit $OCF_ERR_CONFIGURED
fi
fi
}
# Publish a health status through attrd_updater, retrying until it succeeds.
# Arguments: $1 - value to set (callers in this file pass "red" or "green").
# Returns:   OCF_SUCCESS once attrd_updater exits 0;
#            OCF_ERR_GENERIC if attrd_updater fails while $1 is "red".
# Globals read: ATTRDUP, ATTRNAME. Globals written: rc (not declared local).
-storage-mon_update_attribute() {
+storage_mon_update_attribute() {
while :
do
# Options as passed: -n <attribute name>, -U <value>, -d "5s"
# (presumably a dampening delay -- confirm against the attrd_updater man page).
"$ATTRDUP" -n ${ATTRNAME} -U "$1" -d "5s"
rc=$?
if [ $rc -eq 0 ]; then
break
fi
ocf_log debug "${1} attribute by attrd_updater failed"
if [ "$1" = "red" ]; then
# If the attrd_updater fails with the red attribute, return an error to let pacemaker handle the failure immediately.
return $OCF_ERR_GENERIC
fi
# Any other value is retried right away; there is no pause between attempts.
done
return $OCF_SUCCESS
}
# Monitor action. The behavior depends on OCF_RESKEY_daemonize:
#  - not daemonized: run the init checks, return OCF_NOT_RUNNING when the state
#    file is absent, otherwise run the helper once over all configured drives
#    and publish "red" (helper exit status non-zero) or "green" (zero).
#  - daemonized: check the pidfile first; unless $1 is "pid_check_only", query
#    the running daemon through the helper's --client mode and publish the
#    resulting "red"/"green" status.
# Arguments: $1 - optional; "pid_check_only" stops the daemonized branch after
#                 the pidfile check.
# Returns:   the status of the attribute update, or the status of one of the
#            early returns (OCF_NOT_RUNNING, OCF_ERR_GENERIC, OCF_SUCCESS).
# Globals written: cmdline, DRIVE, status, rc (not declared local).
-storage-mon_monitor() {
+storage_mon_monitor() {
if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
- storage-mon_init
+ storage_mon_init
# Monitor _MUST!_ differentiate correctly between running
# (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
# That is THREE states, not just yes/no.
if [ ! -f "${OCF_RESKEY_state_file}" ]; then
return $OCF_NOT_RUNNING
fi
# generate command line
cmdline=""
for DRIVE in ${OCF_RESKEY_drives}; do
cmdline="$cmdline --device $DRIVE --score 1"
done
cmdline="$cmdline --timeout ${OCF_RESKEY_io_timeout}"
if [ -n "${OCF_RESKEY_inject_errors}" ]; then
cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
fi
# $cmdline is expanded unquoted, so the shell splits it into separate arguments.
$STORAGEMON $cmdline
if [ $? -ne 0 ]; then
status="red"
else
status="green"
fi
- storage-mon_update_attribute $status
+ storage_mon_update_attribute $status
return "$?"
else
# Map the pidfile check result: 0 -> running, 1 or 2 -> not running,
# anything else -> generic error.
ocf_pidfile_status "${PIDFILE}" > /dev/null 2>&1
case "$?" in
0) rc=$OCF_SUCCESS;;
1|2) rc=$OCF_NOT_RUNNING;;
*) rc=$OCF_ERR_GENERIC;;
esac
if [ $rc -ne $OCF_SUCCESS ]; then
return "$rc"
fi
# Callers that only need to know whether the daemon process exists stop here.
if [ "$1" = "pid_check_only" ]; then
return "$rc"
fi
# generate client command line
cmdline=""
cmdline="$cmdline --client --attrname ${ATTRNAME}"
# Keep querying until the helper returns something other than 254/255;
# there is no pause between attempts inside this loop.
while :
do
# 0 : Normal.
# greater than 0 : monitoring error.
# 255(-1) : communication system error.
# 254(-2) : Not all checks completed for first device in daemon mode.
$STORAGEMON $cmdline
rc=$?
case "$rc" in
254|255)
# If there is a communication error or the initial check of all devices has not been completed,
# it will loop and try to reconnect.
# When everything ends with a communication error during monitor, a monitor timeout occurs.
ocf_log debug "client monitor error : $rc"
;;
0)
status="green"
break
;;
*)
status="red"
break
;;
esac
done
- storage-mon_update_attribute $status
+ storage_mon_update_attribute $status
return "$?"
fi
}
# Start action. The behavior depends on OCF_RESKEY_daemonize:
#  - not daemonized: return OCF_SUCCESS if monitor already succeeds, otherwise
#    create the state file. There is no explicit return after "touch", so the
#    function's status in that path is the status of "touch".
#  - daemonized: run the init checks, launch the helper with --daemonize, then
#    poll the pidfile once per second until the daemon is seen running.
# Returns: OCF_ERR_GENERIC if the helper fails to launch in daemonized mode.
# Globals written: cmdline, DRIVE, rc (not declared local).
-storage-mon_start() {
+storage_mon_start() {
if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
- storage-mon_monitor
+ storage_mon_monitor
if [ $? -eq $OCF_SUCCESS ]; then
return $OCF_SUCCESS
fi
touch "${OCF_RESKEY_state_file}"
else
- storage-mon_init
+ storage_mon_init
# generate command line
cmdline=""
for DRIVE in ${OCF_RESKEY_drives}; do
cmdline="$cmdline --device $DRIVE --score 1"
done
cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME}"
if [ -n "${OCF_RESKEY_inject_errors}" ]; then
cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
fi
# $cmdline is expanded unquoted, so the shell splits it into separate arguments.
$STORAGEMON $cmdline
if [ "$?" -ne 0 ]; then
return $OCF_ERR_GENERIC
fi
#Wait until monitor confirms the startup pid according to the ocf resource specification.
# The loop has no upper bound of its own.
while true; do
- storage-mon_monitor pid_check_only
+ storage_mon_monitor pid_check_only
rc="$?"
if [ $rc -eq $OCF_SUCCESS ]; then
break
fi
sleep 1
ocf_log debug "storage-mon daemon still hasn't started yet. Waiting..."
done
fi
}
# Stop action. A full monitor runs first (no "pid_check_only" argument).
#  - not daemonized: remove the state file when monitor reported success.
#  - daemonized: when monitor reports the daemon running, send SIGTERM to the
#    PID read from the pidfile and poll once per second until the pidfile check
#    reports it gone.
# Returns: OCF_SUCCESS when stopped or already not running; OCF_ERR_GENERIC if
#          "kill" fails; otherwise the unexpected monitor status as-is.
# Globals written: rc (not declared local).
-storage-mon_stop() {
- storage-mon_monitor
+storage_mon_stop() {
+ storage_mon_monitor
rc=$?
if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
if [ $rc -eq $OCF_SUCCESS ]; then
rm "${OCF_RESKEY_state_file}"
fi
else
# Only OCF_SUCCESS falls through to the kill below. Because the monitor above
# is the full one, a failed attribute update is returned as the stop result.
case "$rc" in
$OCF_SUCCESS)
;;
$OCF_NOT_RUNNING)
return "$OCF_SUCCESS";;
*)
return "$rc";;
esac
kill -TERM $(cat "${PIDFILE}")
if [ "$?" -ne 0 ]; then
return $OCF_ERR_GENERIC
fi
# Wait for the process to disappear; the loop has no upper bound of its own.
while true; do
- storage-mon_monitor pid_check_only
+ storage_mon_monitor pid_check_only
rc="$?"
case "$rc" in
$OCF_SUCCESS)
;;
$OCF_NOT_RUNNING)
return "$OCF_SUCCESS";;
*)
return "$rc";;
esac
sleep 1
done
fi
return $OCF_SUCCESS
}
# validate-all action: run the init checks (which exit the script on failure)
# and, when not daemonized, confirm that the directory holding the state file
# is writable by creating and removing a probe file named after this shell's PID.
# Returns: OCF_ERR_CONFIGURED if the probe file cannot be created, else OCF_SUCCESS.
# Globals written: state_dir (not declared local).
-storage-mon_validate() {
- storage-mon_init
+storage_mon_validate() {
+ storage_mon_init
if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
# Is the state directory writable?
state_dir=$(dirname "${OCF_RESKEY_state_file}")
touch "$state_dir/$$"
if [ $? -ne 0 ]; then
return $OCF_ERR_CONFIGURED
fi
rm "$state_dir/$$"
fi
return $OCF_SUCCESS
}
# Entry point: dispatch on the requested action.
# __OCF_ACTION is not set in this file -- presumably provided by the sourced
# ocf-shellfuncs; verify. "usage"/"help" print the usage text with OCF_SUCCESS;
# any unrecognized action prints it with OCF_ERR_UNIMPLEMENTED.
case "$__OCF_ACTION" in
- start) storage-mon_start;;
- stop) storage-mon_stop;;
- monitor) storage-mon_monitor;;
- validate-all) storage-mon_validate;;
+ start) storage_mon_start;;
+ stop) storage_mon_stop;;
+ monitor) storage_mon_monitor;;
+ validate-all) storage_mon_validate;;
meta-data) meta_data;;
- usage|help) storage-mon_usage $OCF_SUCCESS;;
- *) storage-mon_usage $OCF_ERR_UNIMPLEMENTED;;
+ usage|help) storage_mon_usage $OCF_SUCCESS;;
+ *) storage_mon_usage $OCF_ERR_UNIMPLEMENTED;;
esac
# The handler's status becomes the script's exit status; it is logged first.
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc
# vim: set filetype=sh:

File Metadata

Mime Type
text/x-diff
Expires
Tue, Feb 25, 6:15 AM (1 d, 14 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1464704
Default Alt Text
(12 KB)

Event Timeline