diff --git a/extra/resources/HealthSMART.in b/extra/resources/HealthSMART.in
index 881bedf09b..f3294c6453 100755
--- a/extra/resources/HealthSMART.in
+++ b/extra/resources/HealthSMART.in
@@ -1,359 +1,374 @@
 #!@BASH_PATH@
 #
 # ocf:pacemaker:HealthSMART resource agent
 #
 # Copyright 2009-2022 the Pacemaker project contributors
 #
 # The version control history for this file may have further details.
 #
 # This source code is licensed under the GNU General Public License version 2
 # (GPLv2) WITHOUT ANY WARRANTY.
 #
 #
 # Checks the S.M.A.R.T. status of all given drives and writes the #health-smart
 # status into the CIB
 #
 #######################################################################
 
 #######################################################################
 # Initialization:
 
 : ${OCF_FUNCTIONS:="${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs"}
 . "${OCF_FUNCTIONS}"
 : ${__OCF_ACTION:="$1"}
 
 # Explicitly list all environment variables used, to make static analysis happy
 : ${OCF_RESKEY_CRM_meta_interval:=0}
 : ${OCF_RESKEY_CRM_meta_globally_unique:="true"}
 : ${OCF_RESKEY_temp_warning:=""}
 : ${OCF_RESKEY_temp_lower_limit:=""}
 : ${OCF_RESKEY_temp_upper_limit:=""}
 : ${OCF_RESKEY_drives:="/dev/sda"}
 : ${OCF_RESKEY_devices:=""}
 : ${OCF_RESKEY_state:=""}
 : ${OCF_RESKEY_smartctl:="/usr/sbin/smartctl"}
 : ${OCF_RESKEY_dampen:="5s"}
 
 # Turn these into arrays so we can iterate them later.
 DRIVES=(${OCF_RESKEY_drives})
 DEVICES=(${OCF_RESKEY_devices})
 
 #######################################################################
 
 meta_data() {
     cat <<END
 <?xml version="1.0"?>
 <resource-agent name="HealthSMART" version="1.0">
 <version>1.1</version>
 
 <longdesc lang="en">
 System health agent that checks the S.M.A.R.T. status of the given drives and
 updates the #health-smart attribute.
 </longdesc>
 <shortdesc lang="en">SMART health status</shortdesc>
 
 <parameters>
 <parameter name="state" unique-group="state">
 <longdesc lang="en">
 Location to store the resource state in.
 </longdesc>
 <shortdesc lang="en">State file</shortdesc>
 <content type="string" default="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state" />
 </parameter>
 
 <parameter name="drives" reloadable="1">
 <longdesc lang="en">
 The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda".
 </longdesc>
 <shortdesc lang="en">Drives to check</shortdesc>
 <content type="string" default="/dev/sda" />
 </parameter>
 
 <parameter name="devices" reloadable="1">
 <longdesc lang="en">
 The device type(s) to assume for the drive(s) being tested as a SPACE separated list.
 </longdesc>
 <shortdesc lang="en">Device types</shortdesc>
 <content type="string" />
 </parameter>
 
 <parameter name="temp_lower_limit" reloadable="1">
 <longdesc lang="en">
 Lower limit of the temperature in deg C of the drive(s). Below this limit the status will be red.
 </longdesc>
 <shortdesc lang="en">Lower limit for the red smart attribute</shortdesc>
 <content type="string" default="0"/>
 </parameter>
 
 <parameter name="temp_upper_limit" reloadable="1">
 <longdesc lang="en">
 Upper limit of the temperature if deg C of the drives(s). If the drive reports
 a temperature higher than this value the status of #health-smart will be red.
 </longdesc>
 <shortdesc lang="en">Upper limit for red smart attribute</shortdesc>
 <content type="string" default="60"/>
 </parameter>
 
 <parameter name="temp_warning" reloadable="1">
 <longdesc lang="en">
 Number of deg C below/above the upper/lower temp limits at which point the status of #health-smart will change to yellow.
 </longdesc>
 <shortdesc lang="en">Deg C below/above the upper limits for yellow smart attribute</shortdesc>
 <content type="string" default="5"/>
 </parameter>
 
 <parameter name="smartctl" reloadable="1">
 <longdesc lang="en">
 The path to the smartctl program, used for querying device health.
 </longdesc>
 <shortdesc lang="en">The path to the smartctl program</shortdesc>
-<contest type="string" default="/usr/sbin/smartctl"/>
+<content type="string" default="/usr/sbin/smartctl"/>
 </parameter>
 
 <parameter name="dampen" reloadable="1">
 <longdesc lang="en">
 The time to wait (dampening) for further changes to occur
 </longdesc>
 <shortdesc lang="en">Dampening interval</shortdesc>
 <content type="integer" default="5s"/>
 </parameter>
 
 </parameters>
 
 <actions>
 <action name="start"        timeout="10s" />
 <action name="stop"         timeout="10s" />
 <action name="monitor"      timeout="10s" interval="10s" start-delay="0s" />
 <action name="meta-data"    timeout="5s" />
 <action name="validate-all" timeout="10s" />
 <action name="reload-agent" timeout="20s" />
 </actions>
 
 </resource-agent>
 END
 }
 
 #######################################################################
 
 check_temperature() {
 
     if [ $1 -lt ${lower_red_limit} ] ; then
         ocf_log info "Drive ${DRIVE} ${DEVICE} too cold: ${1} C"
         attrd_updater -n "#health-smart" -U "red" -d "${OCF_RESKEY_dampen}"
         return 1
     fi
 
     if [ $1 -gt ${upper_red_limit} ] ; then
         ocf_log info "Drive ${DRIVE} ${DEVICE} too hot: ${1} C"
         attrd_updater -n "#health-smart" -U "red" -d "${OCF_RESKEY_dampen}"
         return 1
     fi
 
     if [ $1 -lt ${lower_yellow_limit} ] ; then
         ocf_log info "Drive ${DRIVE} ${DEVICE} quite cold: ${1} C"
         attrd_updater -n "#health-smart" -U "yellow" -d "${OCF_RESKEY_dampen}"
         return 1
     fi
 
     if [ $1 -gt ${upper_yellow_limit} ] ; then
         ocf_log info "Drive ${DRIVE} ${DEVICE} quite hot: ${1} C"
         attrd_updater -n "#health-smart" -U "yellow" -d "${OCF_RESKEY_dampen}"
         return 1
     fi
 }
 
+# Validate the drives/devices parameters without touching any hardware.
+# Exits with $OCF_ERR_ARGS on the first problem found.
+common_checks() {
+    # Each item in $OCF_RESKEY_drives must have a corresponding item in
+    # $OCF_RESKEY_devices with the device type.  Alternately,
+    # $OCF_RESKEY_devices can be empty.
+    drives_len=${#DRIVES[@]}
+    devices_len=${#DEVICES[@]}
+
+    if [ "${drives_len}" -ne "${devices_len}" ] && [ "${devices_len}" -gt 0 ]; then
+        ocf_log err "OCF_RESKEY_devices must be empty or the same length as OCF_RESKEY_drives."
+        exit $OCF_ERR_ARGS
+    fi
+
+    # Each item in $OCF_RESKEY_drives must look like a device node.
+    for d in "${DRIVES[@]}"; do
+        if [[ "$d" != /dev/* ]]; then
+            ocf_log err "Device in OCF_RESKEY_drives does not look like a device node: $d"
+            exit $OCF_ERR_ARGS
+        fi
+    done
+}
+
 init_smart() {
     #Set temperature defaults
     if [ -z "${OCF_RESKEY_temp_warning}" ]; then
         yellow_threshold=5
     else
         yellow_threshold=${OCF_RESKEY_temp_warning}
     fi
 
     if [ -z "${OCF_RESKEY_temp_lower_limit}" ] ; then
         lower_red_limit=0
     else
         lower_red_limit=${OCF_RESKEY_temp_lower_limit}
     fi
 
     lower_yellow_limit=$((${lower_red_limit}+${yellow_threshold}))
 
     if [ -z "${OCF_RESKEY_temp_upper_limit}" ] ; then
         upper_red_limit=60
     else
         upper_red_limit=${OCF_RESKEY_temp_upper_limit}
     fi
 
     upper_yellow_limit=$((${upper_red_limit}-${yellow_threshold}))
 
-    #Test for presence of smartctl
-    if [ ! -x "${OCF_RESKEY_smartctl}" ] ; then
-        ocf_log err "${OCF_RESKEY_smartctl} not installed."
-        exit $OCF_ERR_INSTALLED
-    fi
-
-    # Each item in $OCF_RESKEY_drives must have a corresponding item in
-    # $OCF_RESKEY_devices with the device type.  Alternately,
-    # $OCF_RESKEY_devices can be empty.  First, check that this is the case.
-    drives_len=${#DRIVES[@]}
-    devices_len=${#DEVICES[@]}
-
-    if [ "${drives_len}" -ne "${devices_len}" ] && [ "${devices_len}" -gt 0 ]; then
-        ocf_log err "OCF_RESKEY_devices must be empty or the same length as OCF_RESKEY_drives."
-        exit $OCF_ERR_ARGS
-    fi
-
     for ndx in ${!DRIVES[*]}; do
         DRIVE=${DRIVES[$ndx]}
 
         if [ -n "${OCF_RESKEY_devices}" ]; then
             DEVICE=${DEVICES[$ndx]}
 
             "${OCF_RESKEY_smartctl}" -d "${DEVICE}" -i "${DRIVE}" | grep -q "SMART support is: Enabled"
             if [ $? -ne 0 ] ; then
                 ocf_log err "S.M.A.R.T. not enabled for drive "${DRIVE}
                 exit $OCF_ERR_INSTALLED
             fi
         else
             "${OCF_RESKEY_smartctl}" -i "${DRIVE}" | grep -q "SMART support is: Enabled"
             if [ $? -ne 0 ] ; then
                 ocf_log err "S.M.A.R.T. not enabled for drive "${DRIVE}
                 exit $OCF_ERR_INSTALLED
             fi
         fi
     done
 }
 
 HealthSMART_usage() {
     cat <<END
 usage: $0 {start|stop|monitor|validate-all|meta-data|reload-agent}
 
 Expects to have a fully populated OCF RA-compliant environment set.
 END
 }
 
 HealthSMART_start() {
     HealthSMART_monitor
     if [ $? -eq $OCF_SUCCESS ]; then
         return $OCF_SUCCESS
     fi
 
     touch "${OCF_RESKEY_state}"
 }
 
 HealthSMART_stop() {
     attrd_updater -D -n "#health-smart" -d "${OCF_RESKEY_dampen}"
 
     rm "${OCF_RESKEY_state}"
 
     if [ $? -eq 0 ]; then
         return $OCF_SUCCESS
     else
         return $OCF_ERR_GENERIC
     fi
 }
 
 HealthSMART_monitor() {
+    common_checks
+
+    # Test for presence of the configured smartctl binary (OCF_RESKEY_smartctl)
+    check_binary "${OCF_RESKEY_smartctl}"
 
     init_smart
 
     # Monitor _MUST!_ differentiate correctly between running
     # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
     # That is THREE states, not just yes/no.
 
     if [ -f "${OCF_RESKEY_state}" ]; then
 
         for ndx in ${!DRIVES[*]}; do
             DRIVE=${DRIVES[$ndx]}
 
             if [ -n "${OCF_RESKEY_devices}" ]; then
                 DEVICE=${DEVICES[$ndx]}
 
                 # Check overall S.M.A.R.T. status
                 "${OCF_RESKEY_smartctl}" -d "${DEVICE}" -H ${DRIVE} | grep -q "SMART overall-health self-assessment test result: PASSED"
                 if [ $? -ne 0 ]; then
                     attrd_updater -n "#health-smart" -U "red" -d "${OCF_RESKEY_dampen}"
                     return $OCF_SUCCESS
                 fi
 
                 # Check drive temperature(s)
                 check_temperature "$("${OCF_RESKEY_smartctl}" -d "${DEVICE}" -A "${DRIVE}" | awk '/^194/ { print $10 }')"
                 if [ $? -ne 0 ]; then
                     return $OCF_SUCCESS
                 fi
             else
                 "${OCF_RESKEY_smartctl}" -H "${DRIVE}" | grep -q "SMART overall-health self-assessment test result: PASSED"
                 if [ $? -ne 0 ]; then
                     attrd_updater -n "#health-smart" -U "red" -d "${OCF_RESKEY_dampen}"
                     return $OCF_SUCCESS
                 fi
 
                 check_temperature "$("${OCF_RESKEY_smartctl}" -A "${DRIVE}" | awk '/^194/ { print $10 }')"
                 if [ $? -ne 0 ]; then
                     return $OCF_SUCCESS
                 fi
             fi
         done
 
         attrd_updater -n "#health-smart" -U "green" -d "${OCF_RESKEY_dampen}"
         return $OCF_SUCCESS
     fi
 
     return $OCF_NOT_RUNNING
 }
 
 HealthSMART_validate() {
+    common_checks
+
     # Host-specific checks
     if [ "$OCF_CHECK_LEVEL" = "10" ]; then
+        # Test for presence of the configured smartctl binary (OCF_RESKEY_smartctl)
+        check_binary "${OCF_RESKEY_smartctl}"
+
         init_smart
 
         # Is the state directory writable?
         state_dir=$(dirname "$OCF_RESKEY_state")
         touch "$state_dir/$$"
         if [ $? -ne 0 ]; then
             return $OCF_ERR_ARGS
         fi
 
         rm "$state_dir/$$"
     fi
 
     return $OCF_SUCCESS
 }
 
 HealthSMART_reload_agent() {
     return $OCF_SUCCESS
 }
 
 if [ -z "$OCF_RESKEY_state" ]; then
     if [ "${OCF_RESKEY_CRM_meta_globally_unique}" = "false" ]; then
         state="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"
 
         # Strip off the trailing clone marker
         OCF_RESKEY_state=$(echo $state | sed s/:[0-9][0-9]*\.state/.state/)
     else
         OCF_RESKEY_state="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"
     fi
 fi
 
 case "$__OCF_ACTION" in
     start)        HealthSMART_start;;
     stop)         HealthSMART_stop;;
     monitor)      HealthSMART_monitor;;
     validate-all) HealthSMART_validate;;
     reload-agent) HealthSMART_reload_agent;;
     meta-data)    meta_data
                   exit $OCF_SUCCESS
                   ;;
     usage|help)   HealthSMART_usage
                   exit $OCF_SUCCESS
                   ;;
     *)            HealthSMART_usage
                   exit $OCF_ERR_UNIMPLEMENTED
                   ;;
 esac
 
 rc=$?
 ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
 exit $rc
 
 # vim: set filetype=sh expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80: