diff --git a/agents/ocf/HealthCPU.in b/agents/ocf/HealthCPU.in index 4bd6e3672a..14e4b0741f 100755 --- a/agents/ocf/HealthCPU.in +++ b/agents/ocf/HealthCPU.in @@ -1,224 +1,223 @@ #!/bin/sh # # ocf:pacemaker:HealthCPU resource agent # # Copyright 2004-2023 the Pacemaker project contributors # # The version control history for this file may have further details. # # This source code is licensed under the GNU General Public License version 2 # (GPLv2) WITHOUT ANY WARRANTY. # # # Measures CPUs idling and writes #health-cpu status into the CIB # ################################ # # TODO: Enter default values # Error handling in getting uptime # ################################## ####################################################################### # Initialization: : ${OCF_FUNCTIONS:="${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs"} . "${OCF_FUNCTIONS}" : ${__OCF_ACTION:="$1"} ####################################################################### meta_data() { cat < 1.1 System health agent that measures the CPU idling and updates the #health-cpu attribute. System health CPU usage Location to store the resource state in. State file Lower (!) limit of idle percentage to switch the health attribute to yellow. I.e. the #health-cpu will go yellow if the %idle of the CPU falls below 50%. Lower limit for yellow health attribute Lower (!) limit of idle percentage to switch the health attribute to red. I.e. the #health-cpu will go red if the %idle of the CPU falls below 10%. Lower limit for red health attribute The time to wait (dampening) in seconds for further changes before writing The time to wait (dampening) in seconds for further changes before writing END } ####################################################################### healthcpu_usage() { cat < 1.1 System health agent that checks the S.M.A.R.T. status of the given drives and updates the #health-smart attribute. SMART health status Location to store the resource state in. State file The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda". Drives to check The device type(s) to assume for the drive(s) being tested as a SPACE separated list. Device types Lower limit of the temperature in deg C of the drive(s). Below this limit the status will be red. Lower limit for the red smart attribute Upper limit of the temperature if deg C of the drives(s). If the drive reports a temperature higher than this value the status of #health-smart will be red. Upper limit for red smart attribute Number of deg C below/above the upper/lower temp limits at which point the status of #health-smart will change to yellow. Deg C below/above the upper limits for yellow smart attribute The path to the smartctl program, used for querying device health. The path to the smartctl program The time to wait (dampening) for further changes to occur Dampening interval END } ####################################################################### check_temperature() { if [ $1 -lt ${lower_red_limit} ] ; then ocf_log info "Drive ${DRIVE} ${DEVICE} too cold: ${1} C" - attrd_updater -n "#health-smart" -U "red" -d "${OCF_RESKEY_dampen}" + attrd_updater -n "#health-smart" -B "red" -d "${OCF_RESKEY_dampen}" return 1 fi if [ $1 -gt ${upper_red_limit} ] ; then ocf_log info "Drive ${DRIVE} ${DEVICE} too hot: ${1} C" - attrd_updater -n "#health-smart" -U "red" -d "${OCF_RESKEY_dampen}" + attrd_updater -n "#health-smart" -B "red" -d "${OCF_RESKEY_dampen}" return 1 fi if [ $1 -lt ${lower_yellow_limit} ] ; then ocf_log info "Drive ${DRIVE} ${DEVICE} quite cold: ${1} C" - attrd_updater -n "#health-smart" -U "yellow" -d "${OCF_RESKEY_dampen}" + attrd_updater -n "#health-smart" -B "yellow" -d "${OCF_RESKEY_dampen}" return 1 fi if [ $1 -gt ${upper_yellow_limit} ] ; then ocf_log info "Drive ${DRIVE} ${DEVICE} quite hot: ${1} C" - attrd_updater -n "#health-smart" -U "yellow" -d "${OCF_RESKEY_dampen}" + attrd_updater -n "#health-smart" -B "yellow" -d "${OCF_RESKEY_dampen}" return 1 fi } common_checks() { # Each item in $OCF_RESKEY_drives must have a corresponding item in # $OCF_RESKEY_devices with the device type. Alternately, # $OCF_RESKEY_devices can be empty. drives_len=${#DRIVES[@]} devices_len=${#DEVICES[@]} if [ "${drives_len}" -ne "${devices_len}" ] && [ "${devices_len}" -gt 0 ]; then ocf_log err "OCF_RESKEY_devices must be empty or the same length as OCF_RESKEY_drives." exit $OCF_ERR_ARGS fi # Each item in $OCF_RESKEY_drives must look like a device node. for d in "${DRIVES[@]}"; do if [[ "$d" != /dev/* ]]; then ocf_log err "Device in OCF_RESKEY_devices does not look like a device node: $d" exit $OCF_ERR_ARGS fi done } init_smart() { #Set temperature defaults if [ -z "${OCF_RESKEY_temp_warning}" ]; then yellow_threshold=5 else yellow_threshold=${OCF_RESKEY_temp_warning} fi if [ -z "${OCF_RESKEY_temp_lower_limit}" ] ; then lower_red_limit=0 else lower_red_limit=${OCF_RESKEY_temp_lower_limit} fi lower_yellow_limit=$((${lower_red_limit}+${yellow_threshold})) if [ -z "${OCF_RESKEY_temp_upper_limit}" ] ; then upper_red_limit=60 else upper_red_limit=${OCF_RESKEY_temp_upper_limit} fi upper_yellow_limit=$((${upper_red_limit}-${yellow_threshold})) for ndx in ${!DRIVES[*]}; do DRIVE=${DRIVES[$ndx]} if [ -n "${OCF_RESKEY_devices}" ]; then DEVICE=${DEVICES[$ndx]} "${OCF_RESKEY_smartctl}" -d "${DEVICE}" -i "${DRIVE}" | grep -q "SMART support is: Enabled" if [ $? -ne 0 ] ; then ocf_log err "S.M.A.R.T. not enabled for drive "${DRIVE} exit $OCF_ERR_INSTALLED fi else "${OCF_RESKEY_smartctl}" -i "${DRIVE}" | grep -q "SMART support is: Enabled" if [ $? -ne 0 ] ; then ocf_log err "S.M.A.R.T. not enabled for drive "${DRIVE} exit $OCF_ERR_INSTALLED fi fi done } HealthSMART_usage() { cat < 1.1 This agent's monitor action records the speed of a specified network interface as a node attribute. The attribute can be used in rules to prefer nodes based on network speeds. This agent can monitor physical interfaces, bonded interfaces, bridges, VLANs, or any combination thereof. For example: *) Bridge on top of one 10Gbps interface (eth2) and 802.3ad bonding (bond0) built on two 1Gbps interfaces (eth0 and eth1). *) Active-backup bonding built on top of one physical interface and one VLAN on another interface. For STP-enabled bridges, this agent tries to determine the network topology, and by default looks only on ports which are connected to an upstream switch. This can be overridden by 'bridge_ports' parameter. Active interfaces in this case are those in "forwarding" state. For balancing bonded interfaces, this agent uses 80% of the sum of the speeds of underlying "up" ports. For non-balancing bonded interfaces ("active-backup" and probably "broadcast"), only the speed of the currently active port is considered. Network interface speed monitor Name of the node attribute to set Attribute name If this is set, monitor this network interface. One of iface or ip must be set. Network interface If this is set instead of iface, monitor the interface that holds this IP address. The address may be specified in dotted-quad notation for IPv4 (for example, 192.168.1.1) or hexadecimal notation for IPv6 (for example, 2001:db8:DC28:0:0:FC57:D4C8:1FFF). One of iface or ip must be set. IPv4 or IPv6 address If set and iface is a bridge, consider these bridge ports (by default, all ports which have designated_bridge=root_id) Bridge ports Relative weight of 1Gbps in interface speed. Can be used to tune how big attribute value will be. Weight of 1Gbps - + The time to wait (dampening) for further changes to occur. Dampening interval Log more verbosely. Verbose logging + END } usage() { cat < 1.1 Every time the monitor action is run, this resource agent records (in the CIB) the current number of nodes the host can connect to using the system fping (preferred) or ping tool. node connectivity PID file PID file The time to wait (dampening) further changes occur Dampening interval The name of the attributes to set. This is the name to be used in the constraints. Attribute name The number by which to multiply the number of connected ping nodes by Value multiplier A space separated list of ping nodes to count. Host list Number of ping attempts, per host, before declaring it dead no. of ping attempts How long, in seconds, to wait before declaring a ping lost ping timeout in seconds A catch all for any other options that need to be passed to ping. Extra Options Resource is failed if the score is less than failure_score. Default never fails. failure_score Use fping rather than ping, if found. If set to 0, fping will not be used even if present. Use fping if available Enables to use default attrd_updater verbose logging on every call. Verbose logging END } ####################################################################### ping_conditional_log() { level="$1"; shift if [ $OCF_RESKEY_debug -gt 0 ]; then ocf_log "$level" "$*" fi } ping_usage() { cat <&1); rc=$? active=$(echo "$fping_output" | grep "is alive" | wc -l) case $rc in 0) if [ $OCF_RESKEY_debug -gt 1 ]; then ping_conditional_log info "$fping_output" fi ;; 1) for h in $(echo "$fping_output" | grep "is unreachable" | awk '{print $1}'); do ping_conditional_log warn "$h is inactive: $fping_output" done ;; *) ocf_log err "Unexpected result for '$cmd' $rc: $(echo "$fping_output" | tr '\n' ';')" ;; esac return $active } ping_check() { active=0 for host in $OCF_RESKEY_host_list; do p_exe=ping case $(uname) in Linux) p_args="-n -q -W $OCF_RESKEY_timeout -c $OCF_RESKEY_attempts";; Darwin) p_args="-n -q -t $OCF_RESKEY_timeout -c $OCF_RESKEY_attempts -o";; FreeBSD) p_args="-n -q -t $OCF_RESKEY_timeout -c $OCF_RESKEY_attempts -o";; *) ocf_log err "Unknown host type: $(uname)"; exit $OCF_ERR_INSTALLED;; esac case "$host" in *:*) p_exe=ping6 esac ping_output=$($p_exe $p_args $OCF_RESKEY_options $host 2>&1); rc=$? case $rc in 0) active=$(expr $active + 1) if [ $OCF_RESKEY_debug -gt 1 ]; then ping_conditional_log info "$ping_output" fi ;; 1) ping_conditional_log warn "$host is inactive: $ping_output";; *) ocf_log err "Unexpected result for '$p_exe $p_args $OCF_RESKEY_options $host' $rc: $ping_output";; esac done return $active } ping_update() { if use_fping; then fping_check active=$? else ping_check active=$? fi score=$(expr $active \* $OCF_RESKEY_multiplier) - if [ "$__OCF_ACTION" = "start" ] ; then - attrd_updater -n "$OCF_RESKEY_name" -B "$score" -d "$OCF_RESKEY_dampen" - else - attrd_updater -n "$OCF_RESKEY_name" -U "$score" -d "$OCF_RESKEY_dampen" - fi + attrd_updater -n "$OCF_RESKEY_name" -B "$score" -d "$OCF_RESKEY_dampen" rc=$? case $rc in 0) ping_conditional_log debug "Updated $OCF_RESKEY_name = $score" ;; *) ocf_log warn "Could not update $OCF_RESKEY_name = $score: rc=$rc";; esac if [ $rc -ne 0 ]; then return $rc fi if [ -n "$OCF_RESKEY_failure_score" ] && [ "$score" -lt "$OCF_RESKEY_failure_score" ]; then ocf_log warn "$OCF_RESKEY_name is less than failure_score($OCF_RESKEY_failure_score)" return 1 fi return 0 } use_fping() { ocf_is_true "$OCF_RESKEY_use_fping" && have_binary fping; } # return values: # 4 IPv4 # 6 IPv6 # 0 indefinite (i.e. hostname) host_family() { case $1 in *[0-9].*[0-9].*[0-9].*[0-9]) return 4 ;; *:*) return 6 ;; *) return 0 ;; esac } # return values same as host_family plus # 99 ambiguous families hosts_family() { # For fping allow only same IP versions or hostnames family=0 for host in $OCF_RESKEY_host_list; do host_family "$host" f=$? if [ $family -ne 0 ] && [ $f -ne 0 ] && [ $f -ne $family ] ; then family=99 break fi [ $f -ne 0 ] && family=$f done return $family } integer=$(echo ${OCF_RESKEY_timeout} | egrep -o '[0-9]*') case "${OCF_RESKEY_timeout}" in *[0-9]ms|*[0-9]msec) OCF_RESKEY_timeout=$(expr $integer / 1000);; *[0-9]m|*[0-9]min) OCF_RESKEY_timeout=$(expr $integer \* 60);; *[0-9]h|*[0-9]hr) OCF_RESKEY_timeout=$(expr $integer \* 60 \* 60);; *) OCF_RESKEY_timeout=$integer;; esac if [ -z "${OCF_RESKEY_timeout}" ]; then if [ -n "$OCF_RESKEY_host_list" ]; then host_count=$(echo $OCF_RESKEY_host_list | awk '{print NF}') OCF_RESKEY_timeout=$(expr $OCF_RESKEY_CRM_meta_timeout / $host_count / $OCF_RESKEY_attempts) OCF_RESKEY_timeout=$(expr $OCF_RESKEY_timeout / 1100) # Convert to seconds and finish 10% early else OCF_RESKEY_timeout=5 fi fi if [ ${OCF_RESKEY_timeout} -lt 1 ]; then OCF_RESKEY_timeout=5 elif [ ${OCF_RESKEY_timeout} -gt 1000 ]; then # ping actually complains if this value is too high, 5 minutes is plenty OCF_RESKEY_timeout=300 fi if [ "${OCF_RESKEY_CRM_meta_globally_unique}" = "false" ]; then : ${OCF_RESKEY_pidfile:="${HA_VARRUN%%/}/ping-${OCF_RESKEY_name}"} else : ${OCF_RESKEY_pidfile:="${HA_VARRUN%%/}/ping-${OCF_RESOURCE_INSTANCE}"} fi # Check the debug option case "${OCF_RESKEY_debug}" in true|True|TRUE|1) OCF_RESKEY_debug=1;; false|False|FALSE|0) OCF_RESKEY_debug=0;; verbose|Verbose|VERBOSE|2) OCF_RESKEY_debug=2;; *) ocf_log warn "Value for 'debug' is incorrect. Please specify 'true', 'false', or 'verbose', not: ${OCF_RESKEY_debug}" OCF_RESKEY_debug=false ;; esac case "$__OCF_ACTION" in meta-data) meta_data exit $OCF_SUCCESS ;; start) ping_start;; stop) ping_stop;; monitor) ping_monitor;; validate-all) ping_validate;; -reload-agent) ping_reload_agent;; +reload-agent) ping_reload_agent;; usage|help) ping_usage exit $OCF_SUCCESS ;; *) ping_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? # vim: set filetype=sh expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80: