diff --git a/script/lsb/booth-arbitrator b/script/lsb/booth-arbitrator index d90fe4b..3cdafc0 100755 --- a/script/lsb/booth-arbitrator +++ b/script/lsb/booth-arbitrator @@ -1,157 +1,169 @@ #!/bin/bash # -# BOOTH daemon init script for LSB-compliant Linux distributions. +# BOOTH daemon init script for SUSE Linux based distributions +# (almost LSB-compliant, except for s/startproc/start_daemon/ etc.) # # booth-arbitrator BOOTH arbitrator daemon # # chkconfig: - 20 20 # processname: boothd # pidfile: /var/run/booth.pid # description: Cluster Ticket Registry ### BEGIN INIT INFO # Provides: booth # Required-Start: $network $syslog # Required-Stop: $network $syslog # Should-Start: # Should-Stop: # Default-Start: 3 5 # Default-Stop: 0 6 # Short-Description: start and stop BOOTH arbitrator daemon ### END INIT INFO prog="boothd" exec="/usr/sbin/$prog" CONF_DIR=/etc/booth BOOTH_DAEMON_STARTED=0 BOOTH_DAEMON_STARTING=1 BOOTH_DAEMON_EXIST=2 BOOTH_DAEMON_NOT_RUNNING=3 BOOTH_ERROR_GENERIC=4 OCF_ERR_GENERIC=1 OCF_NOT_RUNNING=7 . /etc/rc.status check_status() { local rc rc=$BOOTH_ERROR_GENERIC eval `"$exec" status "${cnf:+-c$cnf}" ; echo rc=$?` - case $rc in + case $rc in 0) - case "$booth_state" in + # shellcheck disable=SC2154 + case "$booth_state" in started) return $BOOTH_DAEMON_STARTED;; starting) return $BOOTH_DAEMON_STARTING;; - *) return $BOOTH_ERROR_GENERIC;; + *) return $BOOTH_ERROR_GENERIC;; esac ;; $OCF_NOT_RUNNING) return $BOOTH_DAEMON_NOT_RUNNING;; $OCF_ERR_GENERIC) return $BOOTH_ERROR_GENERIC;; *) return $BOOTH_ERROR_GENERIC;; esac } status() { echo -n "BOOTH daemon is " if check_status; then + # shellcheck disable=SC2154 echo "running - PID $booth_lockpid for $booth_cfg_name, $booth_addr_string:$booth_port" return 0 else echo "stopped" return 3 fi } start() { local rc [ -x $exec ] || exit 5 check_status; rc=$? case "$rc" in $BOOTH_DAEMON_STARTED|$BOOTH_DAEMON_STARTING|$BOOTH_DAEMON_EXIST) echo "BOOTH daemon is running - PID $booth_lockpid for $booth_cfg_name, $booth_addr_string:$booth_port" return 0 ;; $BOOTH_ERROR_GENERIC|$BOOTH_DAEMON_NOT_RUNNING) echo -n $"Starting BOOTH arbitrator daemon: " startproc $exec start "${cnf:+-c$cnf}" rc_status -v ;; *) return 1;; esac } stop() { local rc wait_time wait_time=5 check_status; rc=$? case $rc in - $BOOTH_DAEMON_STARTED);; - $BOOTH_DAEMON_STARTING);; - $BOOTH_DAEMON_EXIST);; + $BOOTH_DAEMON_STARTED|$BOOTH_DAEMON_STARTING|$BOOTH_DAEMON_EXIST) + ;; $BOOTH_DAEMON_NOT_RUNNING) echo "BOOTH arbitrator daemon is not running." return 0 - ;; + ;; *) return 1;; esac - + echo -n $"Stopping BOOTH arbitrator daemon: " # $exec stop "${cnf:+-c$cnf}" # sleep 1 pkill -TERM -s $booth_lockpid boothd sleep 0.1 check_status; rc=$? while [ $rc -ne $BOOTH_DAEMON_NOT_RUNNING -a $wait_time -gt 0 ] do wait_time=$((wait_time-1)) sleep 1 check_status; rc=$? done if [ $rc -ne $BOOTH_DAEMON_NOT_RUNNING ]; then pkill -KILL -s $booth_lockpid boothd sleep 1 check_status; rc=$? fi test $rc -eq $BOOTH_DAEMON_NOT_RUNNING rc_status -v } foreach() { - local cnf cnf_base + local cnf local rc=0 for cnf in ${BOOTH_CONF_FILE:-$CONF_DIR/*.conf} ; do - cnf_base=`basename $cnf` "$@" rc=$((rc|$?)) done return $rc } restart() { stop start } +condrestart() { + local rc + + check_status; rc=$? + + case "$rc" in + $BOOTH_DAEMON_STARTED|$BOOTH_DAEMON_STARTING|$BOOTH_DAEMON_EXIST) + # shellcheck disable=SC2154 + [ ! -f "$booth_lockfile" ] || restart + ;; + esac +} + case "$1" in - start|stop|restart) - foreach $1 - ;; - reload|force-reload) - foreach restart - ;; - condrestart|try-restart) - [ ! -f "$booth_lockfile" ] || restart - ;; - status) - foreach status - ;; - *) - echo $"Usage: $0 {start|stop|restart|try-restart|condrestart|reload|force-reload|status}" - exit 2 +start|stop|restart|condrestart|status) + foreach $1 + ;; +reload|force-reload) + foreach restart + ;; +try-restart) + foreach condrestart + ;; +*) + echo $"Usage: $0 {start|stop|restart|try-restart|condrestart|reload|force-reload|status}" + exit 2 + ;; esac diff --git a/script/ocf/booth-site b/script/ocf/booth-site index 809928c..8178e35 100755 --- a/script/ocf/booth-site +++ b/script/ocf/booth-site @@ -1,203 +1,210 @@ #!/bin/bash # vim: set sw=4 : # # Resource Agent for BOOTH site daemon. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # ####################################################################### # Initialization: DEFAULT_BIN="boothd" DEFAULT_CONF="/etc/booth/booth.conf" : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +# shellcheck source=/usr/lib/ocf/lib/heartbeat/ocf-shellfuncs . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### booth_site_meta_data() { cat < 1.0 This Resource Agent can control the BOOTH site daemon. It assumes that the binary boothd is in your default PATH. In most cases, it should be run as a primitive resource. BOOTH site daemon The configuration name (or configuration filename) to use. BOOTH Options Any additional options to start the BOOTH daemon with BOOTH Options The daemon to start The daemon to start END } ####################################################################### booth_site_usage() { cat < /dev/null } booth_site_start() { local rc booth_site_status rc=$? case $rc in 0) ocf_log info "boothd already running" return $OCF_SUCCESS ;; $OCF_NOT_RUNNING) ;; esac + # shellcheck disable=SC2154 + # (OCF_RESKEY_args: injected by CRM) $OCF_RESKEY_daemon daemon -c $OCF_RESKEY_config $OCF_RESKEY_args || return $OCF_ERR_GENERIC sleep 1 while ! booth_monitor_basic; do sleep 1 done return $OCF_SUCCESS } booth_site_stop() { local pid pid=`get_booth_pid` if [ -z "$pid" ]; then ocf_log info "boothd already stopped" return $OCF_SUCCESS fi ocf_stop_processes TERM 5 $pid while is_booth_running; do sleep 1 done return $OCF_SUCCESS } booth_site_restart() { booth_site_stop booth_site_start } booth_site_reload() { booth_site_restart } booth_site_monitor() { booth_site_status case $? in 0) return $OCF_SUCCESS ;; $OCF_NOT_RUNNING) return $OCF_NOT_RUNNING ;; esac } booth_site_validate_all() { if ! test -f $OCF_RESKEY_config; then ocf_log err "$OCF_RESKEY_config does not exist" return $OCF_ERR_INSTALLED fi + # shellcheck disable=SC2154 + # (OCF_RESKEY_CRM_meta_globally_unique: injected by CRM) if ocf_is_true $OCF_RESKEY_CRM_meta_globally_unique; then ocf_log err "$OCF_RESOURCE_INSTANCE must be configured with the globally_unique=false meta attribute" return $OCF_ERR_CONFIGURED fi return $OCF_SUCCESS } : ${OCF_RESKEY_daemon:=$DEFAULT_BIN} : ${OCF_RESKEY_config:=$DEFAULT_CONF} +# shellcheck disable=SC2034 +# (OCF_REQUIRED_BINARIES consumed by ocf_rarun) OCF_REQUIRED_BINARIES=${OCF_RESKEY_daemon} ocf_rarun $* diff --git a/script/ocf/geostore b/script/ocf/geostore index 85842a8..c180418 100755 --- a/script/ocf/geostore +++ b/script/ocf/geostore @@ -1,112 +1,114 @@ #!/bin/sh # # # geostore OCF RA. Just an example on how to use # geo-attr.sh # # Copyright (c) 2015 Dejan Muhamedagic # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +# shellcheck source=/usr/lib/ocf/lib/heartbeat/ocf-shellfuncs . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs +# shellcheck source=script/ocf/geo_attr.sh . ${OCF_ROOT}/lib/booth/geo_attr.sh ####################################################################### geostore_meta_data() { cat < 1.0 This is the geostore Resource Agent. It's a sample for how to use geo_attr.sh. Sample GEO attribute RA `geo_attr_meta_data` END } ####################################################################### geostore_usage() { cat < 1.0 This is sharedrsc Resource Agent. It just keeps some resource while running and releases it when stopped. The resource is a directory on a shared filesystem or on a filesystem which is remotely accessible over ssh. Used for booth testing, i.e. to make sure that no two sites keep the same ticket. shared resource (booth testing) Location of the shared directory. If it's of the form "[user@]host:path" then it is assumed that the directory is to be accessed over ssh on that host. Otherwise, it must be a directory on a shared filesystem (such as nfs or ocfs2). shared directory location END } ####################################################################### sharedrsc_usage() { cat < $1/owner" } removecmd() { echo "test -d $1 && test \"\`cat $1/owner\`\" = `uname -n` && rm $1/owner && rmdir $1" } testdir() { runcmd testcmd $1 } makedir() { runcmd makecmd $1 } removedir() { runcmd removecmd $1 } sharedrsc_monitor() { if testdir $DIR; then return $OCF_SUCCESS else return $OCF_NOT_RUNNING fi } sharedrsc_start() { if sharedrsc_monitor; then return $OCF_SUCCESS fi makedir $DIR && return $OCF_SUCCESS local owner if ! owner=`runcmd getowner $DIR`; then owner="... nobody, it's only half-claimed" fi + # shellcheck disable=SC2154 + # (OCF_RESKEY_dir: injected by CRM) ocf_log err "eek, $OCF_RESKEY_dir already owned by $owner" return $OCF_ERR_GENERIC } sharedrsc_stop() { sharedrsc_monitor if [ $? -eq $OCF_NOT_RUNNING ]; then return $OCF_SUCCESS fi removedir $DIR } sharedrsc_getconfig() { local colon_pos colon_pos=`expr index "$OCF_RESKEY_dir" ":"` if [ $colon_pos -gt 0 ]; then SSH_HOST=`echo $OCF_RESKEY_dir | cut -d: -f 1` DIR=`echo $OCF_RESKEY_dir | cut -d: -f 2` else SSH_HOST= DIR=$OCF_RESKEY_dir fi } sharedrsc_validate_all() { if [ `expr index $DIR /` -ne 1 ]; then ocf_log err "dir must be an absolute path" return $OCF_ERR_INSTALLED fi return $OCF_SUCCESS } +# shellcheck disable=SC2034 +# (OCF_REQUIRED_PARAMS consumed by ocf_rarun) OCF_REQUIRED_PARAMS="dir" ocf_rarun $* # # vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0 diff --git a/script/service-runnable.in b/script/service-runnable.in index 9ea33d4..2f58641 100755 --- a/script/service-runnable.in +++ b/script/service-runnable.in @@ -1,61 +1,61 @@ #!/bin/bash # This script is part of Booth. # It checks whether the given resource (service) still has a chance # to run on the local cluster, so that booth knows whether to # acquire the ticket here. service="${1:?Need a resource name as first argument.}" if [ -z "$service" ]; then @LOGGER@ "$0: bad usage: no resource name" exit 1 fi tmpshadow=`mktemp booth-check.XXXXXX` if [ $? -ne 0 -o ! -f "$tmpshadow" ]; then @LOGGER@ "$0: mktemp failed" exit 1 fi trap "rm -f $tmpshadow" EXIT # We expect an output like # p_dummy (ocf::pacemaker:Dummy): Started geo-rz2-a status=`crm_simulate -O $tmpshadow --ticket-grant "$BOOTH_TICKET" --simulate --live-check 2>&1` if [ $? -ne 0 ]; then @LOGGER@ "$0: crm_simulate failed" @LOGGER@ "$0: crm_simulate: $status" exit 1 fi if echo "$status" | sed -n '/^Revised cluster status:/,$p' | - egrep "^[[:space:]]+$service[[:space:]]+\(.*\):[[:space:]]+Started ([^[:space:]]+) *$" >/dev/null + egrep "^[[:space:]]+${service}[[:space:]]+\(.*\):[[:space:]]+Started ([^[:space:]]+) *$" >/dev/null then # can be started - we're done. exit 0 fi # If target-role is Stopped, it judges with being stopped explicitly. output=$(crm_resource --meta --get-parameter="target-role" --resource=$service 2>/dev/null) rc=$? if [ $rc -eq 0 -a "$output" = "Stopped" ]; then exit 0 fi # is ticket in standby? output=$(crm_ticket --ticket "$BOOTH_TICKET" --get-attr standby) rc=$? if [ $rc -eq 0 -a "$output" = true ]; then exit 0 fi # Some error occured. # Try to help the admin with a bit of diagnostic. # # disallow ms-resources, ie. only primitives wanted here if ! crm_resource -l | grep -v ":" | grep "$service" ; then @LOGGER@ "Defined resource '$service' in $BOOTH_CONF_PATH is not a primitive??" fi exit 1 diff --git a/test/live_test.sh b/test/live_test.sh index f8644a2..c131f8c 100755 --- a/test/live_test.sh +++ b/test/live_test.sh @@ -1,1351 +1,1353 @@ #!/bin/sh # # see README-testing for more information # do some basic booth operation tests for the given config # PROG=`basename $0` usage() { cat<[:]] $PROG [ ...] EOF if [ $1 -eq 0 ]; then list_all examples fi exit } list_all() { echo "Tests:" grep "^test_.*{$" $0 | sed 's/test_//;s/(.*//;s/^/ /' echo echo "Netem functions:" grep "^NETEM_ENV_.*{$" $0 | sed 's/NETEM_ENV_//;s/(.*//;s/^/ /' } examples() { cat< /dev/null } stop_site() { manage_site $1 stop } stop_arbitrator() { manage_arbitrator $1 stop } restart_site() { manage_site $1 restart } cleanup_site() { manage_site $1 cleanup } reload_site() { runcmd $1 OCF_ROOT=/usr/lib/ocf /usr/lib/ocf/resource.d/pacemaker/booth-site reload } restart_arbitrator() { manage_arbitrator $1 restart } booth_status() { test "`runcmd $1 booth status | get_stat_fld booth_state`" = "started" } cleanup_booth() { local h procs for h in $sites; do cleanup_site $h & procs="$! $procs" done >/dev/null 2>&1 wait $procs wait_timeout } cleanup_dep_rsc() { local dep_rsc=`get_rsc` test -z "$dep_rsc" && return local h procs for h in $sites; do runcmd $h crm -w resource cleanup $dep_rsc & procs="$! $procs" done >/dev/null 2>&1 wait $procs } check_dep_rsc() { local dep_rsc=`get_rsc` test -z "$dep_rsc" && return 0 local h for h in $sites; do runcmd $h BOOTH_TICKET=$tkt /usr/share/booth/service-runnable $dep_rsc || return 1 done return 0 } stop_booth() { local h rc for h in $sites; do stop_site $h rc=$((rc|$?)) done >/dev/null 2>&1 for h in $arbitrators; do stop_arbitrator $h rc=$((rc|$?)) done >/dev/null 2>&1 wait_timeout return $rc } start_booth() { local h rc for h in $sites; do start_site $h rc=$((rc|$?)) done >/dev/null 2>&1 for h in $arbitrators; do start_arbitrator $h rc=$((rc|$?)) done >/dev/null 2>&1 wait_timeout return $rc } restart_booth() { local h procs for h in $sites; do restart_site $h & procs="$! $procs" done >/dev/null 2>&1 for h in $arbitrators; do restart_arbitrator $h done >/dev/null 2>&1 wait $procs wait_timeout } reboot_test() { cleanup_booth restart_booth cleanup_dep_rsc } is_we_server() { local h for h in $sites $arbitrators; do ip a l | fgrep -wq $h && return done return 1 } is_pacemaker_running() { local h for h in $sites; do runcmd $h crmadmin -D >/dev/null || return 1 done return 0 } sync_conf() { local h rc=0 local tmpf for h in $sites $arbitrators; do rsync -q -e "ssh $SSH_OPTS" $1 root@$h:$run_cnf rc=$((rc|$?)) if [ -n "$authfile" ]; then tmpf=`mktemp` scp -q $(get_site 1):$authfile $tmpf && rsync -q -e "ssh $SSH_OPTS" $tmpf root@$h:$authfile rc=$((rc|$?)) rm -f $tmpf fi done return $rc } dump_conf() { echo "test configuration file $cnf:" grep -v '^#' $cnf | grep -v '^[[:space:]]*$' | sed "s/^/$cnf: /" } forall() { local h rc=0 for h in $sites $arbitrators; do - runcmd $h $@ + runcmd $h "$@" rc=$((rc|$?)) done return $rc } forall_withname() { local h rc=0 output for h in $sites $arbitrators; do - output=`runcmd $h $@` + output=`runcmd $h "$@"` rc=$((rc|$?)) echo $h: $output done return $rc } forall_sites() { local h rc=0 for h in $sites; do - runcmd $h $@ + runcmd $h "$@" rc=$((rc|$?)) done return $rc } forall_fun() { local h rc=0 f=$1 for h in $sites $arbitrators; do $f $h rc=$((rc|$?)) [ $rc -ne 0 ] && break done return $rc } # run on all hosts whatever function produced on stdout forall_fun2() { local h rc=0 f f=$1 shift 1 for h in $sites $arbitrators; do - $f $@ | ssh $SSH_OPTS $h + $f "$@" | ssh $SSH_OPTS $h rc=$((rc|$?)) [ $rc -ne 0 ] && break done return $rc } run_site() { local n=$1 h shift 1 h=`echo $sites | awk '{print $'$n'}'` - runcmd $h $@ + runcmd $h "$@" } run_arbitrator() { local n=$1 h shift 1 h=`echo $arbitrators | awk '{print $'$n'}'` - runcmd $h $@ + runcmd $h "$@" } # need to get logs from _all_ clusters' nodes get_all_nodes() { for h in $sites; do runcmd $h crm_node -l | awk '{print $2}' done } extract_value() { sed 's/ *#.*//;s/.*=//;s/"//g;s/^ *//;s/ *$//' } get_extern_ip() { grep "^$1" | awk ' { if(/# *external[_-]ip=/) print $NF; else print; } ' | extract_value } get_value() { grep "^$1" | extract_value } # get internal IP for the external address internal_ip() { fgrep "$1" $cnf | extract_value } get_rsc() { awk ' n && /^[[:space:]]*before-acquire-handler/ {print $NF; exit} n && (/^$/ || /^ticket.*/) {exit} /^ticket.*'$tkt'/ {n=1} ' $cnf } get_attr() { awk ' n && /^[[:space:]]*attr-prereq = auto .* eq / {print $4,$6; exit} n && (/^$/ || /^ticket.*/) {exit} /^ticket.*'$tkt'/ {n=1} ' $cnf } get_mode() { awk ' n && /^[[:space:]]*mode/ {print $NF; exit} n && (/^$/ || /^ticket.*/) {exit} /^ticket.*'$tkt'/ {n=1} ' $cnf } set_site_attr() { local site site=$1 set -- `get_attr` run_site $site geostore set $1 $2 } del_site_attr() { local site site=$1 set -- `get_attr` run_site $site geostore delete $1 } break_external_prog() { run_site $1 crm configure "location $PREFNAME `get_rsc` rule -inf: defined \#uname" } show_pref() { run_site $1 crm configure show $PREFNAME > /dev/null } repair_external_prog() { run_site $1 crm configure delete __pref_booth_live_test } get_tkt() { grep "^ticket=" | head -1 | sed 's/ticket=//;s/"//g' } get_tkt_settings() { awk ' n && /^[[:space:]]*(expire|timeout|renewal-freq)/ { sub(" = ", "=", $0); gsub("-", "_", $0); sub("^[[:space:]]*", "T_", $0); if ($0 ~ /ms$/) { sub("ms$", "", $0); eq = match($0, "="); print substr($0, 1, eq)""substr($0, eq+1)/1000; } else { print; } next } n && (/^$/ || /^ticket.*/) {exit} /^ticket.*'$tkt'/ {n=1} ' $1 } wait_exp() { + # shellcheck disable=SC2154 + # (T_expire: defined with get_tkt_settings) sleep $T_expire } wait_renewal() { sleep $T_renewal_freq } wait_timeout() { sleep $MIN_TIMEOUT } set_netem_env() { local modfun args modfun=`echo $1 | sed 's/:.*//'` args=`echo $1 | sed 's/[^:]*//;s/:/ /g'` if ! is_function NETEM_ENV_$modfun; then echo "NETEM_ENV_$modfun: doesn't exist" exit 1 fi NETEM_ENV_$modfun $args } reset_netem_env() { [ -z "$NETEM_ENV" ] && return [ -n "$__NETEM_RESET" ] && return __NETEM_RESET=1 forall $ABSPATH $run_cnf __netem__ netem_reset } setup_netem() { [ -z "$NETEM_ENV" ] && return __NETEM_RESET= echo "-------------------------------------------------- (netem)" | logmsg for env in $NETEM_ENV; do set_netem_env $env done trap "reset_netem_env" EXIT } cib_status() { local h=$1 stat stat=`runcmd $h crm_ticket -L | grep "^$tkt" | awk '{print $2}'` test "$stat" != "-1" } is_cib_granted() { local stat h=$1 stat=`runcmd $h crm_ticket -L | grep "^$tkt" | awk '{print $2}'` [ "$stat" = "granted" ] } check_cib_consistency() { local h gh="" rc=0 for h in $sites; do if is_cib_granted $h; then [ -n "$gh" ] && rc=1 # granted twice gh="$gh `internal_ip $h`" fi done [ -z "$gh" ] && gh="none" if [ $rc -eq 0 ]; then echo $gh return $rc fi cat<= 0 ? x : -x; } } ' | sort -n | tail -1 } booth_leader_consistency() { test `booth_list_fld 2 | sort -u | wc -l` -eq 1 } # are there two leaders or is it just that some booths are outdated booth_leader_consistency_2() { test `booth_list_fld 2 | sort -u | grep -iv none | wc -l` -le 1 } # do all booths have the same info? # possible differences: # a) more than one leader # b) some booths not uptodate (have no leader for the ticket) # c) ticket expiry times differ check_booth_consistency() { - local tlist tlist_validate rc rc_lead maxdiff + local tlist rc rc_lead maxdiff tlist=`forall_withname booth list 2>/dev/null | grep $tkt` # Check time consistency ticket_times=$(echo "$tlist" | booth_list_fld 3) if [[ $ticket_times == *"INF"* ]]; then rc=0 else maxdiff=`echo "$tlist" | max_booth_time_diff` test "$maxdiff" -eq 0 rc=$? fi # Check leader consistency echo "$tlist" | booth_leader_consistency rc_lead=$? if [ $rc_lead -ne 0 ]; then echo "$tlist" | booth_leader_consistency_2 rc_lead=$(($rc_lead + $?)) # rc_lead=2 if the prev test failed fi rc=$(($rc | $rc_lead<<1)) test $rc -eq 0 && return cat</dev/null wait_timeout } run_report() { local start_ts=$1 end_ts=$2 name=$3 local hb_report_opts="" local quick_opt="" logmsg "running hb_report" hb_report -Q 2>&1 | grep -sq "illegal.option" || quick_opt="-Q" if [ `id -u` != 0 ]; then hb_report_opts="-u root" fi hb_report $hb_report_opts $quick_opt -f "`date -d @$((start_ts-5))`" \ -t "`date -d @$((end_ts+60))`" \ -n "$all_nodes $arbitrators" $name 2>&1 | logmsg } runtest() { local start_ts end_ts local rc booth_status dep_rsc_status - local start_time end_time local usrmsg rc=0 TEST=$1 - start_time=`date` + start_ts=`date` # to have the expanded form in the logfile start_ts=`date +%s` echo -n "Testing: $1 (ticket: $tkt)... " can_run_test $1 || return 0 echo "==================================================" | logmsg echo "starting booth test $1 ..." | logmsg if is_function setup_$1; then echo "-------------------------------------------------- (setup)" | logmsg setup_$1 rc=$? [ "$rc" -ne 0 ] && rc=$ERR_SETUP_FAILED fi if [ "$rc" -eq 0 ]; then setup_netem echo "-------------------------------------------------- (test)" | logmsg test_$1 rc=$? fi case $rc in 0) # wait a bit more if we're losing packets [ -n "$PKT_LOSS" ] && wait_timeout echo "-------------------------------------------------- (check)" | logmsg check_$1 rc=$? if [ $rc -eq 0 ]; then usrmsg="SUCCESS" else usrmsg="check FAIL: $rc" fi ;; $ERR_SETUP_FAILED) usrmsg="setup FAIL" ;; *) usrmsg="test FAIL: $rc" ;; esac - end_time=`date` + end_ts=`date` # to have the expanded form in the logfile end_ts=`date +%s` echo "finished booth test $1 ($tkt): $usrmsg" | logmsg echo "==================================================" | logmsg is_function recover_$1 && recover_$1 reset_netem_env #sleep 3 all_booth_status booth_status=$? check_dep_rsc dep_rsc_status=$? if [ $((rc|booth_status|dep_rsc_status)) -eq 0 ]; then echo OK [ "$GET_REPORT" ] && run_report $start_ts $end_ts $TEST else echo "$usrmsg (running hb_report ... $1.tar.bz2; see also $logf)" [ $booth_status -ne 0 ] && echo "unexpected: some booth daemons not running" [ $dep_rsc_status -ne 0 ] && echo "unexpected: dependent resource failure" run_report $start_ts $end_ts $TEST reboot_test master_rc=1 fi revoke_ticket } # # the tests # # most tests start by granting ticket grant_ticket() { run_site $1 booth grant -w $tkt >/dev/null } grant_ticket_cib() { run_site $1 booth grant -C $tkt >/dev/null } ## TEST: grant ## # just a grant test_grant() { grant_ticket 1 } check_grant() { check_consistency `get_internal_site 1` } ## TEST: longgrant ## # just a grant followed by three expire times setup_longgrant() { grant_ticket 1 } test_longgrant() { wait_exp wait_exp wait_exp } check_longgrant() { check_consistency `get_internal_site 1` } ## TEST: longgrant2 ## # just a grant followed by 10 expire times setup_longgrant2() { grant_ticket_cib 1 } test_longgrant2() { local i + # shellcheck disable=SC2034 + # (variable exists merely out of necessity) for i in `seq 10`; do wait_exp done } check_longgrant2() { check_consistency `get_internal_site 1` } ## TEST: grant_noarb ## # just a grant with no arbitrators setup_grant_noarb() { local h for h in $arbitrators; do stop_arbitrator $h || return 1 done >/dev/null 2>&1 #sleep 1 } test_grant_noarb() { grant_ticket 1 } check_grant_noarb() { check_consistency `get_internal_site 1` } recover_grant_noarb() { local h for h in $arbitrators; do start_arbitrator $h done >/dev/null 2>&1 } applicable_grant_noarb() { [ -n "$arbitrators" ] } ## TEST: revoke ## # just a revoke setup_revoke() { grant_ticket 1 } test_revoke() { revoke_ticket } check_revoke() { check_consistency } ## TEST: grant_elsewhere ## # just a grant to another site test_grant_elsewhere() { run_site 1 booth grant -w -s `get_internal_site 2` $tkt >/dev/null } check_grant_elsewhere() { check_consistency `get_internal_site 2` } ## TEST: grant_site_lost ## # grant with one site lost setup_grant_site_lost() { stop_site `get_site 2` booth_status `get_site 2` && return 1 return 0 } test_grant_site_lost() { grant_ticket 1 wait_exp } check_grant_site_lost() { check_consistency `get_internal_site 1` } recover_grant_site_lost() { start_site `get_site 2` } ## TEST: grant_site_reappear ## # grant with one site lost then reappearing setup_grant_site_reappear() { stop_site `get_site 2` booth_status `get_site 2` && return 1 return 0 #sleep 1 } test_grant_site_reappear() { grant_ticket 1 || return $ERR_SETUP_FAILED check_cib `get_internal_site 1` || return $ERR_SETUP_FAILED wait_timeout start_site `get_site 2` || return $ERR_SETUP_FAILED wait_timeout wait_timeout } check_grant_site_reappear() { check_consistency `get_internal_site 1` && is_cib_granted `get_site 1` } recover_grant_site_reappear() { start_site `get_site 2` } ## TEST: simultaneous_start_even ## # simultaneous start of even number of members setup_simultaneous_start_even() { grant_ticket_cib 2 || return 1 stop_booth || return 1 #wait_timeout } test_simultaneous_start_even() { local serv for serv in $(echo $sites | sed "s/`get_site 1` //"); do start_site $serv & done for serv in $arbitrators; do start_arbitrator $serv & done wait_renewal start_site `get_site 1` wait_timeout wait_timeout } check_simultaneous_start_even() { check_consistency `get_internal_site 2` } ## TEST: slow_start_granted ## # slow start setup_slow_start_granted() { grant_ticket_cib 1 || return 1 stop_booth || return 1 #wait_timeout } test_slow_start_granted() { for serv in $sites; do start_site $serv wait_timeout done for serv in $arbitrators; do start_arbitrator $serv wait_timeout done } check_slow_start_granted() { check_consistency `get_internal_site 1` } ## TEST: restart_granted ## # restart with ticket granted setup_restart_granted() { grant_ticket_cib 1 } test_restart_granted() { restart_site `get_site 1` || return 1 wait_timeout } check_restart_granted() { check_consistency `get_internal_site 1` } ## TEST: reload_granted ## # reload with ticket granted setup_reload_granted() { grant_ticket_cib 1 } test_reload_granted() { reload_site `get_site 1` || return 1 wait_timeout } check_reload_granted() { check_consistency `get_internal_site 1` } ## TEST: restart_granted_nocib ## # restart with ticket granted (but cib empty) setup_restart_granted_nocib() { grant_ticket_cib 1 } test_restart_granted_nocib() { stop_site_clean `get_site 1` || return 1 #wait_timeout start_site `get_site 1` || return 1 wait_timeout wait_timeout wait_timeout } check_restart_granted_nocib() { check_consistency `get_internal_site 1` } ## TEST: restart_notgranted ## # restart with ticket not granted setup_restart_notgranted() { grant_ticket_cib 1 } test_restart_notgranted() { stop_site `get_site 2` || return 1 #sleep 1 start_site `get_site 2` || return 1 wait_timeout } check_restart_notgranted() { check_consistency `get_internal_site 1` } ## TEST: failover ## # ticket failover setup_failover() { grant_ticket 1 [ -n "`get_attr`" ] && set_site_attr 2 return 0 } test_failover() { stop_site_clean `get_site 1` || return 1 booth_status `get_site 1` && return 1 wait_exp wait_timeout wait_timeout wait_timeout } check_failover() { check_consistency any } recover_failover() { start_site `get_site 1` } ## TEST: split_leader ## # split brain (leader alone) setup_split_leader() { grant_ticket_cib 1 [ -n "`get_attr`" ] && set_site_attr 2 return 0 } test_split_leader() { - run_site 1 $iprules stop $port >/dev/null + run_site 1 $iprules stop $port >/dev/null wait_exp wait_timeout wait_timeout wait_timeout wait_timeout check_cib any || return 1 - run_site 1 $iprules start $port >/dev/null + run_site 1 $iprules start $port >/dev/null wait_timeout wait_timeout wait_timeout } check_split_leader() { check_consistency any } recover_split_leader() { - run_site 1 $iprules start $port >/dev/null + run_site 1 $iprules start $port >/dev/null } ## TEST: split_follower ## # split brain (follower alone) setup_split_follower() { grant_ticket_cib 1 } test_split_follower() { - run_site 2 $iprules stop $port >/dev/null + run_site 2 $iprules stop $port >/dev/null wait_exp wait_timeout - run_site 2 $iprules start $port >/dev/null + run_site 2 $iprules start $port >/dev/null wait_timeout } check_split_follower() { check_consistency `get_internal_site 1` } ## TEST: split_edge ## # split brain (leader alone) setup_split_edge() { grant_ticket_cib 1 } test_split_edge() { - run_site 1 $iprules stop $port >/dev/null + run_site 1 $iprules stop $port >/dev/null wait_exp - run_site 1 $iprules start $port >/dev/null + run_site 1 $iprules start $port >/dev/null wait_timeout wait_timeout } check_split_edge() { check_consistency any } ## TEST: external_prog_failed ## # external test prog failed setup_external_prog_failed() { grant_ticket 1 || return 1 [ -n "`get_attr`" ] && set_site_attr 2 break_external_prog 1 show_pref 1 || return 1 } test_external_prog_failed() { wait_renewal wait_timeout } check_external_prog_failed() { check_consistency any && [ `booth_where_granted` != `get_internal_site 1` ] } recover_external_prog_failed() { repair_external_prog 1 } applicable_external_prog_failed() { [ -n "`get_rsc`" ] } ## TEST: attr_prereq_ok ## # failover with attribute prerequisite setup_attr_prereq_ok() { grant_ticket 1 || return 1 set_site_attr 2 stop_site_clean `get_site 1` booth_status `get_site 1` && return 1 return 0 } test_attr_prereq_ok() { wait_exp wait_timeout } check_attr_prereq_ok() { check_consistency `get_internal_site 2` } recover_attr_prereq_ok() { start_site `get_site 1` del_site_attr 2 } applicable_attr_prereq_ok() { [ -n "`get_attr`" ] } ## TEST: attr_prereq_fail ## # failover with failed attribute prerequisite setup_attr_prereq_fail() { grant_ticket 1 || return 1 del_site_attr 2 >/dev/null 2>&1 stop_site_clean `get_site 1` booth_status `get_site 1` && return 1 return 0 } test_attr_prereq_fail() { wait_exp wait_exp wait_exp } check_attr_prereq_fail() { check_consistency && booth_where_granted | grep -qwi none } recover_attr_prereq_fail() { start_site `get_site 1` } applicable_attr_prereq_fail() { [ -n "`get_attr`" ] } # # environment modifications # # packet loss at one site 30% NETEM_ENV_single_loss() { run_site 1 $ABSPATH $run_cnf __netem__ netem_loss ${1:-30} PKT_LOSS=${1:-30} } # packet loss everywhere 30% NETEM_ENV_loss() { forall $ABSPATH $run_cnf __netem__ netem_loss ${1:-30} PKT_LOSS=${1:-30} } # network delay 100ms NETEM_ENV_net_delay() { forall $ABSPATH $run_cnf __netem__ netem_delay ${1:-100} } # duplicate packets NETEM_ENV_duplicate() { forall $ABSPATH $run_cnf __netem__ netem_duplicate ${1:-10} } # reorder packets NETEM_ENV_reorder() { forall $ABSPATH $run_cnf __netem__ netem_reorder ${1:-25} ${2:-50} } # need this if we're run from a local directory or such get_prog_abspath() { local p p=`run_site 1 rpm -ql booth-test | fgrep -w $PROG` echo ${p:-/usr/share/booth/tests/test/live_test.sh} } [ -f "$cnf" ] || { echo "ERROR: configuration file $cnf doesn't exist" usage 1 } is_pacemaker_running || { echo "ERROR: sites must run pacemaker" exit 1 } sites=`get_extern_ip site < $cnf` arbitrators=`get_extern_ip arbitrator < $cnf` internal_sites=`get_value site < $cnf` internal_arbitrators=`get_value arbitrator < $cnf` all_nodes=`get_all_nodes` port=`get_value port < $cnf` : ${port:=9929} -site_cnt=`echo $internal_sites | wc -w` -arbitrator_cnt=`echo $internal_arbitrators | wc -w` if [ "$1" = "__netem__" ]; then shift 1 _JUST_NETEM=1 - local_netem_env $@ + local_netem_env "$@" exit fi [ -z "$internal_sites" ] && { echo no sites in $cnf usage 1 } exec 2>$logf BASH_XTRACEFD=2 PS4='+ `date +"%T"`: ' set -x WE_SERVER="" is_we_server && WE_SERVER=1 PREFNAME=__pref_booth_live_test authfile=`get_value authfile < $cnf` run_site 1 'test -f '"$authfile"' || booth-keygen '"$authfile" TESTS="$@" MANUAL_TESTS="$@" : ${TESTS:="grant longgrant grant_noarb grant_elsewhere grant_site_lost grant_site_reappear revoke simultaneous_start_even slow_start_granted restart_granted reload_granted restart_granted_nocib restart_notgranted failover split_leader split_follower split_edge external_prog_failed attr_prereq_ok attr_prereq_fail"} : ${MANUAL_TESTS:="grant longgrant grant_noarb grant_elsewhere grant_site_lost restart_granted reload_granted split_leader split_follower split_edge "} #get total number od lines in the file conf_file_size=$(grep -c $ $cnf) #get line numbers for all tickets ticket_line_numbers=$(grep -n ticket $cnf | cut -d: -f1) read -a TICKET_LINES<<< $ticket_line_numbers #save the part of config located before ticket definitions sed -n "1,$((${TICKET_LINES[0]}-1))p" $cnf > ${cnf}_main.config #create a separate file for every ticket data number_of_tickets=0 for i in $(seq 0 1 $((${#TICKET_LINES[@]}-1))); do ticket_line_start=${TICKET_LINES[i]} ticket_line_end=$((${TICKET_LINES[i+1]}-1)) if [ ${ticket_line_end} -lt 0 ]; then # for the last ticket ticket_line_end=${conf_file_size} fi sed -n "${ticket_line_start},${ticket_line_end}p" $cnf > ${cnf}_${number_of_tickets}.ticket number_of_tickets=$((number_of_tickets+1)) done master_rc=0 # updated in runtest for i in `seq 0 $(($number_of_tickets-1))` do cat ${cnf}_main.config > booth_${i}.conf cat ${cnf}_${i}.ticket >> booth_${i}.conf tkt=`get_tkt < booth_${i}.conf` if [ -z "$tkt" ]; then echo "Skipping empty ticket.." continue fi sync_conf booth_${i}.conf || exit reboot_test all_booth_status || { start_booth all_booth_status || { echo "some booth servers couldn't be started" exit 1 } } ABSPATH=`get_prog_abspath` dump_conf | logmsg eval `get_tkt_settings booth_${i}.conf` + # shellcheck disable=SC2154 + # (T_timeout: defined with get_tkt_settings) MIN_TIMEOUT=`awk -v tm=$T_timeout 'BEGIN{ if (tm >= 2) print tm; else print 2*tm; }'` [ -z "$T_expire" ] && { echo set $tkt expire time in $cnf usage 1 } if [ -z "$T_renewal_freq" ]; then T_renewal_freq=$((T_expire/2)) fi revoke_ticket T_mode=`get_mode` T_mode_lowercase=$(echo "$T_mode" | tr '[:upper:]' '[:lower:]') if [[ $T_mode_lowercase == *"manual"* ]]; then echo "Running tests for manual tickets.." for t in $MANUAL_TESTS; do runtest $t done else echo "Running tests for automatic Raft tickets.." for t in $TESTS; do runtest $t done fi done exit $master_rc