Page Menu
Home
ClusterLabs Projects
Search
Configure Global Search
Log In
Files
F2823021
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
16 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/test/live_test.sh b/test/live_test.sh
index 992ad22..d2a1718 100755
--- a/test/live_test.sh
+++ b/test/live_test.sh
@@ -1,852 +1,865 @@
#!/bin/sh
#
# see README-testing for more information
# do some basic booth operation tests for the given config
#
PROG=`basename $0`
usage() {
cat<<EOF
usage:
[NETEM_ENV=<envfun>[:<val>]] $PROG <booth.conf> [<test> ...]
EOF
if [ $1 -eq 0 ]; then
list_all
examples
fi
exit
}
# List the test names and the netem functions defined in this script,
# found by grepping the script itself for their definitions.
list_all() {
	local prefix title
	for prefix in test_ NETEM_ENV_; do
		case $prefix in
		test_)
			title="Tests:"
			;;
		*)
			echo
			title="Netem functions:"
			;;
		esac
		echo "$title"
		grep "^$prefix.*{$" $0 | sed "s/$prefix//;s/(.*//;s/^/ /"
	done
}
# Print a few sample invocations of this script.
examples() {
	printf '%s\n' \
		"Examples:" \
		"$0 booth.conf" \
		"$0 booth-5node.conf grant revoke" \
		"NETEM_ENV=net_delay:150 $0 mybooth.conf"
}
# no arguments at all: print the full help (usage exits the script)
[ $# -eq 0 ] && usage 0
# the first argument is the booth configuration file; whatever remains
# after the shift is the list of tests to run
cnf=$1
shift 1
# stderr of the test run is redirected to this file further down
logf=test_booth.log
# helper program run on a site with "stop"/"start" and the booth port
# by the split_* tests
iprules=/usr/share/booth/tests/test/booth_path
# facility used by logmsg unless already set in the environment
: ${HA_LOGFACILITY:="syslog"}
# Log a message through logger(1), tagged BOOTHTEST, at priority
# $HA_LOGFACILITY.info. The message is taken from the arguments;
# several callers pass no arguments and pipe the text in instead.
logmsg() {
	# quoted, so the message is neither re-split nor glob-expanded;
	# "$@" still expands to nothing when there are no arguments
	logger -t "BOOTHTEST" -p "$HA_LOGFACILITY.info" -- "$@"
}
# Log a command line via logmsg, then run it.
# $@ - the command and its arguments
# Returns the exit status of the command.
ext_prog_log() {
	echo "run: $*" | logmsg
	# run the original argument vector; going through a flattened
	# string would re-split arguments that contain whitespace
	"$@"
}
# Filter: extract the value of field $1 from "name=value" status
# output on stdin, dropping single quotes around the value.
get_stat_fld() {
	sed -e "s/.* $1=//" -e "s/ .*//" -e "s/'//g"
}
# tc netem, simulate packet loss, wan, etc
# Add a netem delay of $2 ms with a variation of $2/10 ms to
# interface $1.
netem_delay() {
	local dev=$1 jitter
	jitter=$(($2/10))
	ext_prog_log tc qdisc add dev $dev root netem delay ${2}ms ${jitter}ms
}
# Add a netem packet loss of $2 percent to interface $1.
netem_loss() {
	local dev=$1 pct=$2
	ext_prog_log tc qdisc add dev $dev root netem loss ${pct}%
}
# Remove the netem qdisc from interface $1.
netem_reset() {
	local dev=$1
	ext_prog_log tc qdisc del dev $dev root netem
}
# Apply a netem function to the local interface which carries the
# address booth reports in "booth status".
# $1 - function to call as "<fun> <netif> [<arg>]"
# $2 - optional argument for that function
# Returns 1 if the booth address cannot be determined.
local_netem_env() {
	local fun=$1 arg=$2
	local t netif=""
	local my_addr
	my_addr=$(booth status | get_stat_fld booth_addr_string)
	if [ -z "$my_addr" ]; then
		logmsg "cannot find my address, booth running?"
		return 1
	fi
	# take the name from every "<index>: <name>[@<peer>]: ..." line;
	# accept multi-digit indexes and drop an "@<peer>" suffix, which
	# is not part of the device name
	for t in $(ip link | sed -n 's/^[0-9][0-9]*: \([^:@]*\).*/\1/p'); do
		if ip a l "$t" | grep -Fwq -- "$my_addr"; then
			netif=$t
			break
		fi
	done
	if [ -n "$netif" ]; then
		# $arg stays unquoted so that an empty argument is dropped
		$fun "$netif" $arg
	else
		logmsg "cannot find netif for $my_addr, netem not set"
	fi
}
# internal entry point: "$0 <conf> __netem__ <fun> [<arg>]" (see
# reset_netem_env and the NETEM_ENV_* functions) applies a netem
# function to the local booth interface and exits with its status
if [ "$1" = "__netem__" ]; then
shift 1
local_netem_env $@
exit
fi
# Succeed if "command -v" resolves $1 to the bare name itself, as it
# does for shell functions.
is_function() {
	local resolved
	resolved=$(command -v $1)
	[ "z$resolved" = "z$1" ]
}
# Run a command on a host.
# $1 - host; if it shows up as a word in the output of "ip a l" the
#      command is run locally, otherwise through ssh
# $2.. - the command and its arguments
# Returns the exit status of the command; failures are logged.
runcmd() {
	local h=$1 rc
	shift 1
	logmsg "$h: running '$*'"
	if ip a l | grep -Fwq -- "$h"; then
		# quoted: keep the argument vector intact and unglobbed
		"$@"
	else
		ssh "$h" "$@"
	fi
	rc=$?
	if [ $rc -ne 0 ]; then
		echo "$h: '$*' failed (exit code $rc)" | logmsg
	fi
	return $rc
}
# Run "crm resource <action> booth" on a site.
# $1 - host, $2 - action
manage_site() {
	local host=$1 action=$2
	runcmd $host crm resource $action booth
}
# Run "systemctl <action> booth@booth.service" on an arbitrator.
# $1 - host, $2 - action
manage_arbitrator() {
	local host=$1 action=$2
	runcmd $host systemctl $action booth@booth.service
}
# Start booth on site $1.
start_site() {
	local host=$1
	manage_site $host start
}
# Start booth on arbitrator $1.
start_arbitrator() {
	local host=$1
	manage_arbitrator $host start
}
# Stop booth on site $1 and, one second later, force-revoke ticket
# $tkt there with crm. Stops at the first failing step and returns
# its status.
stop_site_clean() {
	manage_site $1 stop || return
	sleep 1 || return
	runcmd $1 crm --force site ticket revoke $tkt
}
# Stop booth on site $1.
stop_site() {
	local host=$1
	manage_site $host stop
}
# Stop booth on arbitrator $1.
stop_arbitrator() {
	local host=$1
	manage_arbitrator $host stop
}
# Restart booth on site $1.
restart_site() {
	local host=$1
	manage_site $host restart
}
# Restart booth on arbitrator $1.
restart_arbitrator() {
	local host=$1
	manage_arbitrator $host restart
}
# Succeed if "booth status" on host $1 reports booth_state "started".
booth_status() {
	local state
	state=$(runcmd $1 booth status | get_stat_fld booth_state)
	[ "$state" = "started" ]
}
# Stop booth on all sites and arbitrators (output discarded), then
# give the change time to settle.
stop_booth() {
	local h
	{
		for h in $sites; do
			stop_site $h
		done
		for h in $arbitrators; do
			stop_arbitrator $h
		done
	} >/dev/null 2>&1
	wait_timeout
}
# Start booth on all sites and arbitrators (output discarded), then
# give the change time to settle.
start_booth() {
	local h
	{
		for h in $sites; do
			start_site $h
		done
		for h in $arbitrators; do
			start_arbitrator $h
		done
	} >/dev/null 2>&1
	wait_timeout
}
# Restart booth everywhere: the sites in parallel in the background,
# the arbitrators one after another. Waits for the background
# restarts before letting things settle.
restart_booth() {
	local h procs
	for h in $sites; do
		restart_site $h >/dev/null 2>&1 &
		procs="$! $procs"
	done
	for h in $arbitrators; do
		restart_arbitrator $h >/dev/null 2>&1
	done
	wait $procs
	wait_timeout
}
# Copy $cnf to /etc/booth/booth.conf on every site and arbitrator.
# Returns the bitwise OR of all rsync exit codes.
sync_conf() {
	local h rc=0
	for h in $sites $arbitrators; do
		rsync -q $cnf $h:/etc/booth/booth.conf || rc=$((rc|$?))
	done
	return $rc
}
# Print the effective content of $cnf (no comment lines, no blank
# lines), every line prefixed with "<file>: ".
dump_conf() {
	local line
	echo "test configuration file $cnf:"
	# the prefix is added with printf rather than sed, because a path
	# containing "/" would terminate a sed s/// expression early
	grep -v '^#' "$cnf" | grep -v '^[[:space:]]*$' |
		while IFS= read -r line; do
			printf '%s: %s\n' "$cnf" "$line"
		done
}
# Run a command on every site and arbitrator.
# Returns the bitwise OR of all exit codes.
forall() {
	local h rc=0
	for h in $sites $arbitrators; do
		runcmd $h $@ || rc=$((rc|$?))
	done
	return $rc
}
# Run a command on every site (arbitrators excluded).
# Returns the bitwise OR of all exit codes.
forall_sites() {
	local h rc=0
	for h in $sites; do
		runcmd $h $@ || rc=$((rc|$?))
	done
	return $rc
}
# Call function $1 with each site and arbitrator as its argument,
# stopping at the first failure.
# Returns 0, or the exit code of the failed call.
forall_fun() {
	local h rc=0 f=$1
	for h in $sites $arbitrators; do
		$f $h || {
			rc=$?
			break
		}
	done
	return $rc
}
# run on all hosts whatever the given function writes to stdout
# Pipe the output of "<fun> <args>" into ssh on each site and
# arbitrator, stopping at the first ssh failure.
# $1 - function, $2.. - its arguments
# Returns 0, or the exit code of the failed ssh.
forall_fun2() {
	local h rc=0 f=$1
	shift 1
	for h in $sites $arbitrators; do
		$f $@ | ssh $h || {
			rc=$?
			break
		}
	done
	return $rc
}
# Run a command on the n-th site of $sites.
# $1 - site number (1-based), $2.. - the command
run_site() {
	local n=$1 h
	shift 1
	h=$(echo $sites | awk "{print \$$n}")
	runcmd $h $@
}
# Run a command on the n-th arbitrator of $arbitrators.
# $1 - arbitrator number (1-based), $2.. - the command
run_arbitrator() {
	local n=$1 h
	shift 1
	h=$(echo $arbitrators | awk "{print \$$n}")
	runcmd $h $@
}
# Print the n-th (1-based) site of $sites.
get_site() {
	local n=$1
	echo $sites | awk "{print \$$n}"
}
# Filter: print the quoted value of the port="..." line(s) of a
# booth configuration read from stdin.
get_port() {
	grep "^port" | sed -n -e 's/.*="//' -e 's/"//p'
}
# Filter: print the quoted values of all configuration lines on
# stdin which start with $1 (e.g. site or arbitrator).
get_servers() {
	local kind=$1
	grep "^$kind" | sed -n -e 's/.*="//' -e 's/"//p'
}
# Print the last word of the before-acquire-handler line(s) in $cnf.
get_rsc() {
	awk '$0 ~ /before-acquire-handler/ { print $NF }' $cnf
}
# Make the external test program fail: feed "crm configure" on a site
# a -inf location constraint (__pref_booth_live_test) for the
# resource named by get_rsc.
# $1 - site number (defaults to 1); honoured so that the signature
#      matches repair_external_prog
break_external_prog() {
	echo "location __pref_booth_live_test `get_rsc` rule -inf: defined #uname" | run_site ${1:-1} crm configure
}
# Delete the __pref_booth_live_test constraint again through
# "crm configure" on site number $1.
repair_external_prog() {
	local site_no=$1
	run_site $site_no crm configure delete __pref_booth_live_test
}
# Filter: print the name of the first ticket="..." line of a booth
# configuration read from stdin, without the double quotes.
get_tkt() {
	awk '/^ticket=/ { sub(/ticket=/, ""); gsub(/"/, ""); print; exit }'
}
# Print the expire/timeout settings of ticket $tkt found in $cnf as
# shell assignments with a T_ prefix; the caller eval's the output.
# Scanning starts after the line matching "ticket...$tkt" and ends at
# the first empty line.
# NOTE(review): the patterns rely on the exact leading whitespace of
# the setting lines and on " = " as separator -- confirm against the
# booth.conf format in use.
get_tkt_settings() {
awk '
n && /^ / && /expire|timeout/ {
sub(" = ", "=", $0);
sub("^ ", "T_", $0);
print
next
}
n && /^$/ {exit}
/^ticket.*'$tkt'/ {n=1}
' $cnf
}
# Poll "booth list" on site 1 once a second until the test command
# succeeds on field 2 of the $tkt line.
# $@ - test command reading that field on stdin (see wait_revoke and
#      wait_grant)
# Prints "booth wait failed" and returns 1 after $T_expire attempts.
wait_booth() {
local maxwait=$T_expire testcmd="$@"
while sleep 1; do
run_site 1 booth list | grep $tkt | booth_list_fld 2 |
$testcmd && break
maxwait=$((maxwait-1))
# out of attempts
[ $maxwait -eq 0 ] && {
echo "booth wait failed"
return 1
}
done
}
# Wait until the polled ticket field reads "none" (case ignored).
wait_revoke() {
	wait_booth grep -i -q none
}
# Wait until the polled ticket field no longer reads "none" (case
# ignored).
wait_grant() {
	wait_booth grep -i -q -v none
}
# Sleep for one ticket expire time ($T_expire seconds).
wait_exp() {
	local secs=$T_expire
	sleep $secs
}
# Sleep for half of the ticket expire time (integer division).
wait_half_exp() {
	local secs
	secs=$((T_expire/2))
	sleep $secs
}
# Sleep long enough for booth to act: at least 2 seconds, $T_timeout
# if that is larger, plus PKT_LOSS/6 seconds when packet loss is
# being simulated.
wait_timeout() {
	local t=2
	# T_timeout is only set when the ticket configures a timeout; the
	# default keeps the numeric test from failing on an empty string
	[ "${T_timeout:-0}" -gt $t ] && t=$T_timeout
	[ "$PKT_LOSS" ] && t=$((t+PKT_LOSS/6))
	sleep $t
}
# Activate one network environment modification.
# $1 - "<envfun>[:<val>[:<val>...]]"; NETEM_ENV_<envfun> is called
#      with the values as separate arguments
# Exits the script with status 1 if NETEM_ENV_<envfun> is missing.
set_netem_env() {
	local modfun args=""
	modfun=${1%%:*}
	# only text after the first colon is an argument; without a colon
	# the function has to fall back to its own default
	case $1 in
	*:*) args=$(echo "${1#*:}" | sed 's/:/ /g') ;;
	esac
	if ! is_function NETEM_ENV_$modfun; then
		echo "NETEM_ENV_$modfun: doesn't exist"
		exit 1
	fi
	NETEM_ENV_$modfun $args
}
# Undo the netem settings on all hosts; does nothing when NETEM_ENV
# is empty.
reset_netem_env() {
	if [ -z "$NETEM_ENV" ]; then
		return
	fi
	forall $0 $cnf __netem__ netem_reset
}
# Activate every modification listed in $NETEM_ENV and make sure it
# is undone when the script exits. Does nothing when NETEM_ENV is
# empty.
setup_netem() {
	# keep the loop variable out of the global scope
	local env
	[ -z "$NETEM_ENV" ] && return
	for env in $NETEM_ENV; do
		set_netem_env $env
	done
	trap "reset_netem_env" EXIT
}
# Succeed unless the second column of the $tkt line printed by
# "crm_ticket -L" on host $1 is "-1".
cib_status() {
	local h=$1 stat
	stat=$(runcmd $h crm_ticket -L | grep "^$tkt" | awk '{print $2}')
	[ "$stat" != "-1" ]
}
# Succeed if the second column of the $tkt line printed by
# "crm_ticket -L" on host $1 is "granted".
is_cib_granted() {
	local h=$1 stat
	stat=$(runcmd $h crm_ticket -L | grep "^$tkt" | awk '{print $2}')
	test "$stat" = "granted"
}
# Find out which site has the ticket granted in its CIB.
# Prints that site, or "none", and returns 0. If more than one site
# has it, the failure is logged, nothing is printed and 1 is returned.
check_cib_consistency() {
	local h gh="" rc=0
	for h in $sites; do
		is_cib_granted $h || continue
		# a second grantee means the ticket is granted twice
		[ -n "$gh" ] && rc=1
		gh="$gh $h"
	done
	if [ $rc -ne 0 ]; then
		logmsg <<EOF
CIB consistency test failed
ticket granted to $gh
EOF
		return $rc
	fi
	echo ${gh:-none}
	return $rc
}
# Compare the ticket state in the CIB with booth's view and with the
# expected grantee.
# $1 - expected grantee: a site, "any" (some site holds the ticket)
#      or empty (ticket not granted anywhere)
# Returns 0 if everything agrees; a mismatch is reported via logmsg.
check_cib() {
local exp_grantee=$1 cib_grantee booth_grantee
local rc=0 pending
cib_grantee=`check_cib_consistency`
booth_grantee=`booth_where_granted`
# exit code of booth_where_granted: non-zero means the grantee still
# lists the ticket as pending
pending=$?
if [ $pending -eq 0 ]; then
[ "$cib_grantee" = "$booth_grantee" ]
rc=$?
else
# ticket is not committed to cib yet
[ "$exp_grantee" = "$booth_grantee" ]
rc=$?
# with the expectation cleared the case below requires the CIB to
# report "none"
exp_grantee="" # cheat a bit
fi
case "$exp_grantee" in
"any") [ "$cib_grantee" != "none" ] ;;
"") [ "$cib_grantee" = "none" ] ;;
*) [ "$cib_grantee" = "$exp_grantee" ] ;;
esac
rc=$((rc|$?))
if [ $rc -ne 0 ]; then
cat<<EOF | logmsg
CIB check failed
CIB grantee: $cib_grantee
booth grantee: $booth_grantee
expected grantee: $exp_grantee
EOF
fi
return $rc
}
# Print the ticket leader according to "booth list".
# Returns non-zero if the leader itself still lists the ticket as
# pending, 0 otherwise (also when the leader is "none").
booth_where_granted() {
	local grantee ticket_line
	# any site may be stopped, so ask all of them; at least one
	# should have booth running
	ticket_line=$(forall_sites booth list | grep $tkt | sort -u | head -1)
	grantee=$(echo "$ticket_line" | sed 's/.*leader: //;s/,.*//')
	echo $grantee
	if [ "$grantee" = "none" ]; then
		return
	fi
	! runcmd $grantee booth list | grep -q "$tkt.*pending"
}
# Filter: cut comma separated field number $1 out of "booth list"
# lines and strip its "name:" prefix.
booth_list_fld() {
	local fldno=$1
	cut -d, -f $fldno | sed -e 's/[^:]*://'
}
max_booth_time_diff() {
local l
booth_list_fld 3 |
while read l; do
date -d "$l" "+%s"
done |
awk '
{t[n++]=$0}
END{
for (i=0; i<n; i++)
for (j=i+1; j<n; j++) {
x=t[i]-t[j];
print x >= 0 ? x : -x;
}
}
' | sort -n | tail -1
}
# Filter: succeed if field 2 of all "booth list" lines on stdin has
# one and the same value.
booth_leader_consistency() {
	local cnt
	cnt=$(booth_list_fld 2 | sort -u | wc -l)
	[ $cnt -eq 1 ]
}
# Verify that all booth daemons report the same state for $tkt.
# Collects the $tkt line of "booth list" from every site and
# arbitrator, then checks the spread of the timestamps in field 3 and
# that field 2 (the leader) is the same everywhere.
# Returns 0 when consistent and also when only the time check failed
# (that is merely logged); non-zero when the leader check failed.
check_booth_consistency() {
local tlist rc maxdiff
# the commit part is cut off and NONE lowercased so that the lines
# from different hosts compare equal
tlist=`forall booth list 2>/dev/null | grep $tkt |
sed 's/commit:.*//;s/NONE/none/'`
maxdiff=`echo "$tlist" | max_booth_time_diff`
test "$maxdiff" -eq 0
rc=$?
echo "$tlist" | booth_leader_consistency
# the leader check result is shifted left one bit and OR-ed in
rc=$(($rc | $?<<1))
test $rc -eq 0 && return
cat<<EOF | logmsg
`if [ $rc -gt 1 ]; then
echo "booth list consistency failed (more than one leader!):"
else
echo "booth list consistency failed (max valid time diff: $maxdiff):"
fi`
===========
"$tlist"
===========
EOF
# a time difference alone does not fail the check
test $rc -le 1
}
# Run the booth consistency check and, if it passes, the CIB check.
# $1 - expected grantee, handed on to check_cib
check_consistency() {
	check_booth_consistency || return
	check_cib $1
}
# Succeed if booth_status succeeds on every site and arbitrator.
all_booth_status() {
	local probe=booth_status
	forall_fun $probe
}
# Decide whether test $1 can be run: an optional applicable_$1 has to
# succeed and both test_$1 and check_$1 have to exist. Prints the
# reason and returns 1 if not.
can_run_test() {
	local name=$1
	if is_function applicable_$name && ! applicable_$name; then
		echo "(not applicable, skipping)"
		return 1
	fi
	if is_function test_$name && is_function check_$name; then
		return 0
	fi
	echo "(test missing)"
	return 1
}
# Revoke ticket $tkt through site 1, wait for the revoke to show up
# and let things settle.
revoke_ticket() {
	local first=1
	run_site $first booth revoke $tkt >/dev/null
	wait_revoke
	wait_timeout
}
# Run hb_report on all sites and arbitrators for the period of a
# test, sending its output to logmsg.
# $1 - start timestamp, $2 - end timestamp (seconds since the epoch)
# $3 - report name
run_report() {
	local start_ts=$1 end_ts=$2 name=$3
	logmsg "running hb_report"
	# the window starts 5 seconds early and ends 60 seconds late
	hb_report \
		-f "$(date -d @$((start_ts-5)))" \
		-t "$(date -d @$((end_ts+60)))" \
		-n "$sites $arbitrators" $name 2>&1 |
		logmsg
}
# Run one test and report the result on stdout.
# $1 - test name; test_$1 and check_$1 are run, recover_$1 too if it
#      exists
# A test which cannot be run (see can_run_test) is skipped. The
# ticket is revoked again at the end.
runtest() {
local start_ts end_ts rc booth_status
local start_time end_time
TEST=$1
start_time=`date`
start_ts=`date +%s`
echo -n "Testing: $1... "
can_run_test $1 || return 0
logmsg "starting booth test $1 ..."
setup_netem
test_$1 && check_$1
rc=$?
end_time=`date`
end_ts=`date +%s`
reset_netem_env
logmsg "finished booth test $1 (exit code $rc)"
is_function recover_$1 && recover_$1
sleep 3
# the test passes only if all booth daemons are still running
all_booth_status
booth_status=$?
if [ $rc -eq 0 -a $booth_status -eq 0 ]; then
echo OK
# on success a report is collected only on request
[ "$GET_REPORT" ] && run_report $start_ts $end_ts $TEST
else
echo "FAIL (running hb_report ... $1.tar.bz2; see also $logf)"
[ $booth_status -ne 0 ] &&
echo "unexpected: some booth daemons not running"
run_report $start_ts $end_ts $TEST
fi
revoke_ticket
}
# the configuration has to be a regular file; ls is run only for its
# error message
[ -f "$cnf" ] || {
ls $cnf
usage 1
}
# pull the members, the port and the first ticket out of the
# configuration
sites=`get_servers site < $cnf`
arbitrators=`get_servers arbitrator < $cnf`
port=`get_port < $cnf`
: ${port:=9929}
site_cnt=`echo $sites | wc -w`
arbitrator_cnt=`echo $arbitrators | wc -w`
tkt=`get_tkt < $cnf`
# defines the T_* variables printed by get_tkt_settings
eval `get_tkt_settings`
[ -z "$sites" ] && {
echo no sites in $cnf
usage 1
}
[ -z "$T_expire" ] && {
echo set $tkt expire time in $cnf
usage 1
}
# from here on stderr, including the time-stamped set -x trace, goes
# to $logf
exec 2>$logf
BASH_XTRACEFD=2
PS4='+ `date +"%T"`: '
set -x
#
# the tests
#
# most tests start like this
# Grant ticket $tkt through site 1, wait for the grant to show up
# and let things settle.
grant2site_one() {
	local first=1
	run_site $first booth grant $tkt >/dev/null
	wait_grant
	wait_timeout
}
## TEST: grant ##
# just a grant
# grant the ticket at site 1 ...
test_grant() {
	grant2site_one
}
# ... and expect site 1 to hold it
check_grant() {
	local site
	site=$(get_site 1)
	check_consistency $site
}
+## TEST: longgrant ##
+
+# just a grant followed by three expire times
+test_longgrant() {
+ grant2site_one
+ wait_exp
+ wait_exp
+ wait_exp
+}
+check_longgrant() {
+ check_consistency `get_site 1`
+}
+
## TEST: grant_noarb ##
# just a grant with no arbitrators
# stop all arbitrators, then grant at site 1
test_grant_noarb() {
	local arb
	{
		for arb in $arbitrators; do
			stop_arbitrator $arb
		done
	} >/dev/null 2>&1
	sleep 1
	run_site 1 booth grant $tkt >/dev/null
	wait_grant
}
# site 1 has to hold the ticket
check_grant_noarb() {
	local site
	site=$(get_site 1)
	check_consistency $site
}
# bring the arbitrators back
recover_grant_noarb() {
	local arb
	{
		for arb in $arbitrators; do
			start_arbitrator $arb
		done
	} >/dev/null 2>&1
}
# makes sense only if there are arbitrators
applicable_grant_noarb() {
	test -n "$arbitrators"
}
## TEST: revoke ##
# just a revoke
# grant first, so that there is something to revoke
test_revoke() {
	grant2site_one
	revoke_ticket
}
# nobody is expected to hold the ticket afterwards
check_revoke() {
	check_consistency
}
## TEST: grant_elsewhere ##
# just a grant to another site
# ask site 1 to grant the ticket to site 2
test_grant_elsewhere() {
	local target
	target=$(get_site 2)
	run_site 1 booth grant -s $target $tkt >/dev/null
	wait_grant
}
# site 2 has to hold the ticket
check_grant_elsewhere() {
	local site
	site=$(get_site 2)
	check_consistency $site
}
## TEST: grant_site_lost ##
# grant with one site lost
# stop site 2, grant at site 1 and keep the ticket for one expire
# time
test_grant_site_lost() {
	stop_site $(get_site 2)
	wait_timeout
	run_site 1 booth grant $tkt >/dev/null
	wait_grant
	if ! check_cib $(get_site 1); then
		return 1
	fi
	wait_exp
}
# site 1 still has to hold the ticket
check_grant_site_lost() {
	local site
	site=$(get_site 1)
	check_consistency $site
}
# bring site 2 back
recover_grant_site_lost() {
	start_site $(get_site 2)
}
## TEST: simultaneous_start_even ##
# simultaneous start of even number of members
# grant at site 2, stop everything, start all members but site 1 in
# parallel and site 1 half an expire time later
test_simultaneous_start_even() {
	local serv first others
	run_site 2 booth grant $tkt >/dev/null
	wait_grant
	stop_booth
	wait_timeout
	# the list of sites without the first one
	first=$(get_site 1)
	others=$(echo $sites | sed "s/$first //")
	for serv in $others; do
		start_site $serv &
	done
	for serv in $arbitrators; do
		start_arbitrator $serv &
	done
	wait_half_exp
	start_site $first
	wait_timeout
	wait_timeout
}
# site 2 has to hold the ticket
check_simultaneous_start_even() {
	local site
	site=$(get_site 2)
	check_consistency $site
}
## TEST: slow_start_granted ##
# slow start
# Grant at site 1, stop everything, then start the sites and the
# arbitrators one at a time with a pause after each.
test_slow_start_granted() {
	# local, like in test_simultaneous_start_even, so that the loop
	# variable does not leak into the global scope
	local serv
	grant2site_one
	stop_booth
	wait_timeout
	for serv in $sites; do
		start_site $serv
		wait_timeout
	done
	for serv in $arbitrators; do
		start_arbitrator $serv
		wait_timeout
	done
}
# Site 1 has to hold the ticket after the slow start.
check_slow_start_granted() {
	check_consistency `get_site 1`
}
## TEST: restart_granted ##
# restart with ticket granted
# grant at site 1, then restart booth there
test_restart_granted() {
	local holder
	grant2site_one
	holder=$(get_site 1)
	restart_site $holder
	wait_timeout
}
# site 1 has to hold the ticket
check_restart_granted() {
	local site
	site=$(get_site 1)
	check_consistency $site
}
## TEST: restart_granted_nocib ##
# restart with ticket granted (but cib empty)
# grant at site 1, stop booth there cleanly (the ticket is revoked
# from the cib), then start it again
test_restart_granted_nocib() {
	grant2site_one
	if ! stop_site_clean $(get_site 1); then
		return 1
	fi
	wait_timeout
	start_site $(get_site 1)
	wait_timeout
	wait_timeout
	wait_timeout
}
# site 1 has to hold the ticket
check_restart_granted_nocib() {
	local site
	site=$(get_site 1)
	check_consistency $site
}
## TEST: restart_notgranted ##
# restart with ticket not granted
# grant at site 1, then stop and start booth at site 2
test_restart_notgranted() {
	local other
	grant2site_one
	other=$(get_site 2)
	stop_site $other
	sleep 1
	start_site $other
	wait_timeout
}
# site 1 has to hold the ticket
check_restart_notgranted() {
	local site
	site=$(get_site 1)
	check_consistency $site
}
## TEST: failover ##
# ticket failover
# grant at site 1, stop booth there cleanly and wait for the ticket
# to expire and move
test_failover() {
	grant2site_one
	if ! stop_site_clean $(get_site 1); then
		return 1
	fi
	# booth must really be gone at site 1
	if booth_status $(get_site 1); then
		return 1
	fi
	wait_exp
	wait_timeout
	wait_timeout
	wait_timeout
}
# some site has to hold the ticket
check_failover() {
	check_consistency any
}
# bring site 1 back
recover_failover() {
	start_site $(get_site 1)
}
## TEST: split_leader ##
# split brain (leader alone)
# grant at site 1, cut site 1 off for more than one expire time,
# then reconnect it
test_split_leader() {
	grant2site_one
	run_site 1 $iprules stop $port >/dev/null
	wait_exp
	wait_timeout
	if ! check_cib any; then
		return 1
	fi
	run_site 1 $iprules start $port >/dev/null
	wait_timeout
	wait_timeout
	wait_timeout
}
# some site has to hold the ticket
check_split_leader() {
	check_consistency any
}
# make sure that site 1 is reconnected
recover_split_leader() {
	run_site 1 $iprules start $port >/dev/null
}
## TEST: split_follower ##
# split brain (follower alone)
# grant at site 1, cut site 2 off for more than one expire time,
# then reconnect it
test_split_follower() {
	local follower=2
	grant2site_one
	run_site $follower $iprules stop $port >/dev/null
	wait_exp
	wait_timeout
	run_site $follower $iprules start $port >/dev/null
	wait_timeout
}
# site 1 has to hold the ticket
check_split_follower() {
	local site
	site=$(get_site 1)
	check_consistency $site
}
## TEST: split_edge ##
# split brain (leader alone)
# grant at site 1, cut site 1 off for exactly one expire time, then
# reconnect it
test_split_edge() {
	local leader=1
	grant2site_one
	run_site $leader $iprules stop $port >/dev/null
	wait_exp
	run_site $leader $iprules start $port >/dev/null
	wait_timeout
}
# some site has to hold the ticket
check_split_edge() {
	check_consistency any
}
## TEST: external_prog_failed ##
# external test prog failed
# grant at site 1, then make the external program fail and wait for
# half an expire time
test_external_prog_failed() {
	local site_no=1
	grant2site_one
	break_external_prog $site_no
	wait_half_exp
	wait_timeout
}
# Some site has to hold the ticket, and according to booth it must
# not be site 1 (where the external program was broken).
check_external_prog_failed() {
	# quoted: an empty or multi-word answer must not turn the
	# comparison into a syntax error of [
	check_consistency any &&
		[ "`booth_where_granted`" != "`get_site 1`" ]
}
# remove the constraint which broke the external program
recover_external_prog_failed() {
	local site_no=1
	repair_external_prog $site_no
}
# The test applies only if get_rsc finds a resource in the
# configuration.
applicable_external_prog_failed() {
	# quoted: with an empty unquoted substitution the test collapses
	# to "[ -n ]", which is always true
	[ -n "`get_rsc`" ]
}
#
# environment modifications
#
# packet loss at one site (default 30%)
# $1 - loss in percent, 30 if not given; the value is remembered in
# PKT_LOSS
NETEM_ENV_single_loss() {
	local pct=${1:-30}
	run_site 1 $0 $cnf __netem__ netem_loss $pct
	PKT_LOSS=$pct
}
# packet loss everywhere (default 30%)
# $1 - loss in percent, 30 if not given; the value is remembered in
# PKT_LOSS
NETEM_ENV_loss() {
	local pct=${1:-30}
	forall $0 $cnf __netem__ netem_loss $pct
	PKT_LOSS=$pct
}
# network delay everywhere (default 100ms)
# $1 - delay in milliseconds, 100 if not given
NETEM_ENV_net_delay() {
	local ms=${1:-100}
	forall $0 $cnf __netem__ netem_delay $ms
}
# distribute the configuration and restart booth everywhere; give up
# if the configuration cannot be copied
sync_conf || exit
restart_booth
# not everything running after the restart: try a plain start once,
# then give up
all_booth_status || {
start_booth
all_booth_status || exit
}
# begin with the ticket not granted anywhere
revoke_ticket
dump_conf | logmsg
# tests named on the command line (the configuration file was
# shifted away earlier)
TESTS="$@"
-: ${TESTS:="grant grant_noarb grant_elsewhere grant_site_lost revoke
+: ${TESTS:="grant longgrant grant_noarb grant_elsewhere grant_site_lost revoke
simultaneous_start_even slow_start_granted
restart_granted restart_granted_nocib restart_notgranted
failover split_leader split_follower split_edge
external_prog_failed"}
# run the selected tests one after another
for t in $TESTS; do
runtest $t
done
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Sat, Jan 25, 7:03 AM (1 d, 19 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1320998
Default Alt Text
(16 KB)
Attached To
Mode
rB Booth
Attached
Detach File
Event Timeline
Log In to Comment