#!/bin/bash
#
# This STONITH script drives the shared-storage stonith plugin.
#
# Copyright (C) 2013 Lars Marowsky-Bree
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This software is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Usage: invoked with the action as $1 (gethosts, off, reset, status, on,
# getconfignames, getinfo-*) and, for off/reset, the target node as $2.
# Reads the variables sbd_device, crashdump and timeout_bypass.

# Main code

# Fall back to the packaged configuration file when no device was handed in.
if [ -z "$sbd_device" ]; then
  if [ -f @CONFIGDIR@/sbd ]; then
    source @CONFIGDIR@/sbd
    sbd_device=$SBD_DEVICE
  fi
fi

# SBD_DEVS keeps the semicolon-separated list (trailing ';' dropped);
# sbd_device becomes "dev1 -d dev2 ..." so that an UNQUOTED expansion after
# "sbd -d" yields one -d option per device.
SBD_DEVS=${sbd_device%;}
sbd_device=${SBD_DEVS//;/ -d }

#######################################
# Relay the output of one background sbd process and work out its result.
# Globals:
#   running          (written) 1 if the process is still alive on return
#   unknown_hanging  (written) 1 if it is still alive and has not printed
#                    "sbd failed" so far
# Arguments:
#   $1 - pid of the process
#   $2 - file descriptor its output can be read from
#   $3 - non-zero: obtain the exit status with `wait`; zero: infer it from
#        the output that was seen
# Outputs:
#   Every line read from the descriptor, verbatim, on stdout.
# Returns:
#   Process exited, $3 != 0: the status reported by `wait`.
#   Process exited, $3 == 0: 0 if it printed something and none of it
#                            contained "sbd failed", otherwise 1.
#   Process still running:   1.
#######################################
sbd_cmd_output() {
  local pid=$1
  local fd=$2
  local call_wait=$3
  local line
  local any_output=0
  local failed=0

  running=0
  unknown_hanging=0

  # Async IO timeout defaults to 3 seconds
  # IFS= and -r keep whitespace and backslashes of the relayed line intact.
  while IFS= read -r -t 5 line; do
    printf '%s\n' "$line"
    any_output=1
    # Indicator of failure in case that stderr is retrieved
    if [[ "$line" == *"sbd failed"* ]]; then
      failed=1
    fi
  done <&"$fd"

  # Command exited
  if ! kill -0 "$pid" >/dev/null 2>&1; then
    # Safe now to retrieve any remaining output without specifying timeout
    while IFS= read -r line; do
      printf '%s\n' "$line"
      any_output=1
      if [[ "$line" == *"sbd failed"* ]]; then
        failed=1
      fi
    done <&"$fd"

    # Determine the exit status
    # bash's wait command only recognizes the latest child even if the pids
    # of the previous children were saved.
    if [[ "$call_wait" -ne 0 ]]; then
      wait "$pid"
      return $?
    fi

    # Let's assume one that printed anything other than explicit failure to
    # stdout has succeeded.
    if [[ "$any_output" -ne 0 && "$failed" -eq 0 ]]; then
      return 0
    fi
    return 1
  fi

  # Command still existing
  running=1
  # Failed but hanging: don't wait for it any more. Otherwise flag it so the
  # caller may come back to it later.
  if [[ "$failed" -eq 0 ]]; then
    unknown_hanging=1
  fi
  return 1
}

#######################################
# Run "sbd -d <device> <cmd>" once per configured device and relay the output.
# Globals:
#   SBD_DEVS (read), running / unknown_hanging (read, set by sbd_cmd_output)
# Arguments:
#   $1 - sbd sub-command (the callers in this script pass "dump" or "list")
# Outputs:
#   The combined output of the sbd invocations on stdout.
# Returns:
#   0 if no invocation reported a failure, otherwise the most recent
#   non-zero status that was seen.
#######################################
sbd_cmd_get_stdout() {
  local devices=${SBD_DEVS//;/ }
  local cmd="$1"
  local rc=0
  local success_count=0
  local unknown_hanging_procs=""
  local device fd pid cmd_rc proc running_count

  # $devices is split on whitespace on purpose: one word per device.
  for device in $devices; do
    # $cmd is left unquoted exactly as before, so it is word-split.
    exec {fd}< <(sbd -d "$device" $cmd)
    pid=$!

    sbd_cmd_output "$pid" "$fd" 1
    cmd_rc=$?
    if [[ "$cmd_rc" -eq 0 ]]; then
      success_count=$((success_count + 1))
    else
      rc=$cmd_rc
    fi

    if [[ "$unknown_hanging" -ne 0 ]]; then
      unknown_hanging_procs+="$pid:$fd "
    fi
  done

  if [[ -z "$unknown_hanging_procs" || "$success_count" -gt 0 ]]; then
    return "$rc"
  fi

  # We didn't get any successful output
  # Desperately wait for the ones hanging in unknown state
  # The loop is left only through the return below.
  while true; do
    running_count=0

    for proc in $unknown_hanging_procs; do
      pid=${proc%:*}
      fd=${proc#*:}

      sbd_cmd_output "$pid" "$fd" 0
      cmd_rc=$?
      if [[ "$cmd_rc" -eq 0 ]]; then
        success_count=$((success_count + 1))
      else
        rc=$cmd_rc
      fi

      if [[ "$running" -ne 0 ]]; then
        running_count=$((running_count + 1))
      fi
    done

    if [[ "$success_count" -gt 0 || "$running_count" -eq 0 ]]; then
      return "$rc"
    fi
  done
}

#######################################
# Abort the agent (exit 1) with a logged error when no sbd device is known.
# Globals: sbd_device (read)
#######################################
sbd_check_device() {
  if [ -z "$sbd_device" ]; then
    ha_log.sh err "No sbd device(s) found in the configuration."
    exit 1
  fi
}

#######################################
# Compare the CIB stonith-timeout with the msgwait value reported by
# "sbd dump". When stonith-timeout is below max(20, msgwait * 1.2), log an
# error, write a suggested value (that minimum * 1.2) into the CIB and
# exit 1. Skipped entirely when timeout_bypass is set to a true value, and
# left without action when no non-zero msgwait value can be obtained.
# Globals: timeout_bypass (read)
#######################################
sbd_validate_timeout() {
  local crm_timeout sbd_timeout sbd_timeout_min sbd_timeout_suggested

  case "$timeout_bypass" in
    yes|true|1|YES|TRUE|ja|on|ON) return ;;
  esac

  # sed strips a trailing "s" and rewrites "<n>m" into "<n>*60"; the
  # arithmetic expansion then evaluates the result to plain seconds.
  crm_timeout=$(( $(crm_attribute -t crm_config -G -n stonith-timeout -d 20s -q \
    | sed -e 's/\(.*\)s/\1/' -e 's/\(.*\)m/\1*60/') ))

  sbd_timeout=$(sbd_cmd_get_stdout dump \
    | perl -ne 'if (/msgwait.*: (\d+)/) { print "$1\n"; }' \
    | head -n 1)
  if [[ -z "$sbd_timeout" || "$sbd_timeout" = "0" ]]; then
    return
  fi

  sbd_timeout_min=$(( sbd_timeout * 12 / 10 ))
  if [[ "$sbd_timeout_min" -lt 20 ]]; then
    sbd_timeout_min=20
  fi
  sbd_timeout_suggested=$(( sbd_timeout_min * 12 / 10 ))

  if [[ "$crm_timeout" -lt "$sbd_timeout_min" ]]; then
    ha_log.sh err "The CIB property stonith-timeout is set too low for sbd to ever succeed"
    ha_log.sh err "Recommended value is $sbd_timeout_suggested, updating configuration."
    crm_attribute -t crm_config -n stonith-timeout -v "$sbd_timeout_suggested"
    exit 1
  fi
}

# Action dispatch.
case "$1" in
gethosts)
  sbd_check_device
  # Second tab-separated field of "sbd list", de-duplicated. The command
  # substitution is deliberately unquoted so the names end up on ONE line,
  # separated by single spaces.
  # shellcheck disable=SC2046
  echo $(sbd_cmd_get_stdout list | cut -f2 | sort -u)
  exit 0
  ;;
off|reset)
  sbd_check_device
  sbd_validate_timeout
  message=$1
  case "$crashdump" in
    yes|true|1|YES|TRUE|ja|on|ON) message="crashdump" ;;
  esac
  # sbd runs inside a process substitution: its stdout goes to a descriptor
  # that is never read here, and only the status collected by wait is used.
  # $sbd_device must stay unquoted (it carries the extra "-d" options).
  # NOTE(review): waiting on $! of a process substitution presumably needs
  # a reasonably recent bash (4.4+?) - confirm for the target platforms.
  exec {fd}< <(sbd -d $sbd_device message "$2" "$message")
  wait $!
  exit $?
  ;;
status)
  sbd_check_device
  sbd_validate_timeout
  # Capture stderr only; stdout of "sbd list" is discarded.
  if ! error_output=$(sbd -d $sbd_device list 2>&1 >/dev/null); then
    error_message=$(grep -v "please check the logs" <<<"$error_output")
    ha_log.sh err "sbd list failed: $error_message"
    exit 1
  fi
  exit 0
  ;;
on)
  exit 1
  ;;
getconfignames)
  echo "sbd_device crashdump timeout_bypass"
  exit 0
  ;;
getinfo-devid)
  echo "Shared storage STONITH device"
  exit 0
  ;;
getinfo-devname)
  echo "Shared storage STONITH device"
  exit 0
  ;;
getinfo-devdescr)
  cat << DESC
sbd uses a shared storage device as a medium to communicate
fencing requests. This allows clusters without network power
switches; the downside is that access to the shared storage
device becomes a Single Point of Failure.

It requires sbd to be configured on all nodes.

Please read http://linux-ha.org/wiki/SBD_Fencing!

DESC
  exit 0
  ;;
getinfo-devurl)
  echo "http://linux-ha.org/wiki/SBD_Fencing"
  exit 0
  ;;
getinfo-xml)
  # NOTE(review): the parameter descriptions below appear to have lost
  # their XML element markup (and original line breaks); restore it from
  # the upstream file before relying on this output.
  cat << SSHXML
Crashdump instead of regular fence

If SBD is given a fence command, this option will instead perform a
kernel crash of a reboot or power-off, which on a properly configured
system can lead to a crashdump for analysis.

This is less safe for production environments. Please use with caution
and for debugging purposes only.

SBD device(s)

The block device used for the SBD partition. Up to three
can be specified if separated by a semicolon. (Please check
the documentation if specifying two.)

If not specified, will default to the value from @CONFIGDIR@/sbd.

Permit a seemingly too short stonith-timeout

The sbd agent will try to detect a too short stonith-timeout (relative to
msgwait) in the Pacemaker configuration and automatically correct it.

Should that logic fail in your environment or you have legitimate need to
use a shorter timeout, you can disable it via this parameter.
SSHXML
  exit 0
  ;;
*)
  exit 1
  ;;
esac