No OneTemporary
Actions

Size

14 KB

Referenced Files

None

Subscribers

None

View Options

	diff --git a/test/live_test.sh b/test/live_test.sh
	index 60b22e3..a96af34 100755
	--- a/test/live_test.sh
	+++ b/test/live_test.sh
	@@ -1,736 +1,738 @@
	#!/bin/sh
	#
	# see README-testing for more information
	# do some basic booth operation tests for the given config
	#

	usage() {
	echo "$0: {booth.conf}"
	exit
	}

	[ $# -eq 0 ] && usage

	cnf=$1
	shift 1
	logf=test_booth.log
	iprules=/usr/share/booth/tests/test/booth_path
	netif=eth0
	: ${HA_LOGFACILITY:="syslog"}

	is_function() {
	test z"`command -v $1`" = z"$1"
	}
	manage_site() {
	ssh $1 crm resource $2 booth
	}
	manage_arbitrator() {
	ssh $1 systemctl $2 booth@booth.service
	}
	start_site() {
	manage_site $1 start
	}
	start_arbitrator() {
	manage_arbitrator $1 start
	}
	stop_site_clean() {
	manage_site $1 stop &&
	sleep 1 &&
	ssh $1 crm --force site ticket revoke $tkt
	}
	stop_site() {
	manage_site $1 stop
	}
	stop_arbitrator() {
	manage_arbitrator $1 stop
	}
	restart_site() {
	manage_site $1 restart
	}
	restart_arbitrator() {
	manage_arbitrator $1 restart
	}
	get_stat_fld() {
	local h=$1 fld=$2
	ssh $h booth status \| sed "s/.* $fld=//;s/ .*//;s/'//g"
	}
	booth_status() {
	test "`get_stat_fld $1 booth_state`" = "started"
	}
	stop_booth() {
	local h
	for h in $sites; do
	stop_site $h
	done >/dev/null 2>&1
	for h in $arbitrators; do
	stop_arbitrator $h
	done >/dev/null 2>&1
	wait_timeout
	}
	start_booth() {
	local h
	for h in $sites; do
	start_site $h
	done >/dev/null 2>&1
	for h in $arbitrators; do
	start_arbitrator $h
	done >/dev/null 2>&1
	wait_timeout
	}
	restart_booth() {
	local h procs
	for h in $sites; do
	restart_site $h & procs="$! $procs"
	done >/dev/null 2>&1
	for h in $arbitrators; do
	restart_arbitrator $h
	done >/dev/null 2>&1
	wait $procs
	wait_timeout
	}
	sync_conf() {
	local h rc=0
	for h in $sites $arbitrators; do
	rsync -q $cnf $h:/etc/booth/booth.conf
	rc=$((rc\|$?))
	done
	return $rc
	}
	dump_conf() {
	echo "test configuration file $cnf:"
	grep -v '^#' $cnf \| grep -v '^[[:space:]]*$' \| sed "s/^/$cnf: /"
	}
	forall() {
	local h rc=0
	for h in $sites $arbitrators; do
	ssh $h $@
	rc=$((rc\|$?))
	done
	return $rc
	}
	forall_sites() {
	local h rc=0
	for h in $sites; do
	ssh $h $@
	rc=$((rc\|$?))
	done
	return $rc
	}
	forall_fun() {
	local h rc=0 f=$1
	for h in $sites $arbitrators; do
	$f $h
	rc=$((rc\|$?))
	[ $rc -ne 0 ] && break
	done
	return $rc
	}
	# run on all hosts whatever function produced on stdout
	forall_fun2() {
	local h rc=0 f
	f=$1
	shift 1
	for h in $sites $arbitrators; do
	$f $@ \| ssh $h
	rc=$((rc\|$?))
	[ $rc -ne 0 ] && break
	done
	return $rc
	}
	run_site() {
	local n=$1 h
	shift 1
	h=`echo $sites \| awk '{print $'$n'}'`
	ssh $h $@ \|\| {
	echo "$h: '$@' failed (exit code $?)" >&2
	}
	}
	run_arbitrator() {
	local n=$1 h
	shift 1
	h=`echo $arbitrators \| awk '{print $'$n'}'`
	ssh $h $@
	}
	get_site() {
	local n=$1
	echo $sites \| awk '{print $'$n'}'
	}

	get_servers() {
	grep "^$1" \|
	sed -n 's/.*="//;s/"//p'
	}
	get_rsc() {
	awk '/before-acquire-handler/{print $NF}' $cnf
	}

	break_external_prog() {
	echo "location __pref_booth_live_test `get_rsc` rule -inf: defined #uname" \| run_site 1 crm configure
	}
	repair_external_prog() {
	run_site $1 crm configure delete __pref_booth_live_test
	}
	get_tkt() {
	grep "^ticket=" \| head -1 \| sed 's/ticket=//;s/"//g'
	}
	get_tkt_settings() {
	awk '
	n && /^ / && /expire\|timeout/ {
	sub(" = ", "=", $0);
	sub("^ ", "T_", $0);
	print
	next
	}
	n && /^$/ {exit}
	/^ticket.*'$tkt'/ {n=1}
	' $cnf
	}
	wait_exp() {
	sleep $T_expire
	}
	wait_half_exp() {
	sleep $((T_expire/2))
	}
	wait_timeout() {
	- sleep $T_timeout
	+ local t=2
	+ [ "$T_timeout" -gt $t ] && t=$T_timeout
	+ sleep $t
	}

	ext_prog_log() {
	local cmd="$@"
	echo "run: $cmd" >&2
	logger -p $HA_LOGFACILITY.info "$cmd"
	$cmd
	}

	# tc netem, simulate packet loss, wan, etc
	netem_delay() {
	echo "tc qdisc add dev $netif root netem delay $1ms $(($1/10))ms"
	}
	netem_loss() {
	echo "tc qdisc add dev $netif root netem loss $1%"
	}
	netem_reset() {
	echo "tc qdisc del dev $netif root netem"
	}

	cib_status() {
	local h=$1 stat
	stat=`ssh $h crm_ticket -L \|
	grep "^$tkt" \| awk '{print $2}'`
	test "$stat" != "-1"
	}
	is_cib_granted() {
	local stat h=$1
	stat=`ssh $h crm_ticket -L \|
	grep "^$tkt" \| awk '{print $2}'`
	[ "$stat" = "granted" ]
	}
	check_cib_consistency() {
	local h gh="" rc=0
	for h in $sites; do
	if is_cib_granted $h; then
	[ -n "$gh" ] && rc=1 # granted twice
	gh="$gh $h"
	fi
	done
	[ -z "$gh" ] && gh="none"
	if [ $rc -eq 0 ]; then
	echo $gh
	return $rc
	fi
	cat<<EOF >&2
	CIB consistency test failed
	ticket granted to $gh
	EOF
	return $rc
	}
	check_cib() {
	local exp_grantee=$1 cib_grantee booth_grantee
	local rc=0 pending
	cib_grantee=`check_cib_consistency`
	booth_grantee=`booth_where_granted`
	pending=$?
	if [ $pending -eq 0 ]; then
	[ "$cib_grantee" = "$booth_grantee" ]
	rc=$?
	else
	# ticket is not committed to cib yet
	[ "$exp_grantee" = "$booth_grantee" ]
	rc=$?
	exp_grantee="" # cheat a bit
	fi
	case "$exp_grantee" in
	"any") [ "$cib_grantee" != "none" ] ;;
	"") [ "$cib_grantee" = "none" ] ;;
	*) [ "$cib_grantee" = "$exp_grantee" ] ;;
	esac
	rc=$((rc\|$?))
	if [ $rc -ne 0 ]; then
	cat<<EOF >&2
	CIB check failed
	CIB grantee: $cib_grantee
	booth grantee: $booth_grantee
	expected grantee: $exp_grantee
	EOF
	fi
	return $rc
	}

	booth_where_granted() {
	local grantee ticket_line
	# we don't know which sites could be stopped, so run booth
	# list on all of them (at least one should have booth
	# running)
	ticket_line=`forall_sites booth list \| grep $tkt \| sort -u \| head -1`
	grantee=`echo "$ticket_line" \| sed 's/.leader: //;s/,.//'`
	echo $grantee
	[ "$grantee" = "none" ] && return
	! ssh $grantee booth list \| grep -q "$tkt.*pending"
	}
	check_booth_consistency() {
	local cnt tlist
	tlist=`forall booth list 2>/dev/null \| grep $tkt \|
	sed 's/commit:.*//;s/NONE/none/'`
	cnt=`echo "$tlist" \| sort -u \| wc -l`
	test $cnt -eq 1 && return
	cat<<EOF >&2
	booth list consistency test failed:
	===========
	"$tlist"
	===========
	EOF
	return 1
	}

	check_consistency() {
	local exp_grantee=$1
	check_booth_consistency &&
	check_cib $exp_grantee
	}

	reset_booth() {
	start_booth
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	}
	all_booth_status() {
	forall_fun booth_status
	}

	can_run_test() {
	if is_function applicable_$1; then
	if ! applicable_$1; then
	echo "(not applicable, skipping)"
	return 1
	fi
	fi
	if ! is_function test_$1 \|\| ! is_function check_$1; then
	echo "(test missing)"
	return 1
	fi
	}
	runtest() {
	local start_ts end_ts rc booth_status
	local start_time end_time
	TEST=$1
	start_time=`date`
	start_ts=`date +%s`
	echo -n "Testing: $1... "
	can_run_test $1 \|\| return 0
	logger -p $HA_LOGFACILITY.info "starting booth test $1 ..."
	test_$1 && check_$1
	rc=$?
	end_time=`date`
	end_ts=`date +%s`
	logger -p $HA_LOGFACILITY.info "finished booth test $1 (exit code $rc)"
	is_function recover_$1 && recover_$1
	all_booth_status
	booth_status=$?
	if [ $rc -eq 0 -a $booth_status -eq 0 ]; then
	echo OK
	else
	echo "FAIL (running hb_report ... $1.tar.bz2; see also $logf)"
	[ $booth_status -ne 0 ] &&
	echo "unexpected: some booth daemons not running"
	echo "running hb_report" >&2
	hb_report -f "`date -d @$((start_ts-5))`" \
	-t "`date -d @$((end_ts+60))`" \
	-n "$sites $arbitrators" $1 >&2
	fi
	}

	[ -f "$cnf" ] \|\| {
	ls $cnf
	usage
	}

	sites=`get_servers site < $cnf`
	arbitrators=`get_servers arbitrator < $cnf`
	site_cnt=`echo $sites \| wc -w`
	arbitrator_cnt=`echo $arbitrators \| wc -w`
	tkt=`get_tkt < $cnf`
	eval `get_tkt_settings`

	[ -z "$sites" ] && {
	echo no sites in $cnf
	usage
	}

	[ -z "$T_expire" ] && {
	echo set $tkt expire time in $cnf
	usage
	}

	exec 2>$logf
	BASH_XTRACEFD=2
	PS4='+ `date +"%T"`: '
	set -x

	#
	# the tests
	#

	## TEST: grant ##

	# just a grant
	test_grant() {
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	run_site 1 booth grant $tkt >/dev/null
	wait_timeout
	}
	check_grant() {
	check_consistency `get_site 1`
	}

	## TEST: grant_noarb ##

	# just a grant with no arbitrators
	test_grant_noarb() {
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	local h
	for h in $arbitrators; do
	stop_arbitrator $h
	done >/dev/null 2>&1
	sleep 1
	run_site 1 booth grant $tkt >/dev/null
	wait_timeout
	}
	check_grant_noarb() {
	check_consistency `get_site 1`
	}
	recover_grant_noarb() {
	local h
	for h in $arbitrators; do
	start_arbitrator $h
	done >/dev/null 2>&1
	}

	## TEST: revoke ##

	# just a revoke
	test_revoke() {
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	run_site 1 booth grant $tkt >/dev/null
	wait_timeout
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	}
	check_revoke() {
	check_consistency
	}

	## TEST: grant_elsewhere ##

	# just a grant to another site
	test_grant_elsewhere() {
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	run_site 1 booth grant -s `get_site 2` $tkt >/dev/null
	wait_timeout
	}
	check_grant_elsewhere() {
	check_consistency `get_site 2`
	}

	## TEST: grant_site_lost ##

	# grant with one site lost
	test_grant_site_lost() {
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	stop_site `get_site 2`
	wait_timeout
	run_site 1 booth grant $tkt >/dev/null
	wait_timeout
	check_cib `get_site 1` \|\| return 1
	wait_exp
	}
	check_grant_site_lost() {
	check_consistency `get_site 1`
	}
	recover_grant_site_lost() {
	start_site `get_site 2`
	}

	## TEST: simultaneous_start_even ##

	# simultaneous start of even number of members
	test_simultaneous_start_even() {
	local serv
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	run_site 2 booth grant $tkt >/dev/null
	wait_timeout
	stop_booth
	wait_timeout
	for serv in $(echo $sites \| sed "s/`get_site 1` //"); do
	start_site $serv &
	done
	for serv in $arbitrators; do
	start_arbitrator $serv &
	done
	wait_half_exp
	start_site `get_site 1`
	wait_timeout
	wait_timeout
	}
	check_simultaneous_start_even() {
	check_consistency `get_site 2`
	}

	## TEST: slow_start_granted ##

	# slow start
	test_slow_start_granted() {
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	run_site 1 booth grant $tkt >/dev/null
	wait_timeout
	stop_booth
	wait_timeout
	for serv in $sites; do
	start_site $serv
	wait_timeout
	done
	for serv in $arbitrators; do
	start_arbitrator $serv
	wait_timeout
	done
	}
	check_slow_start_granted() {
	check_consistency `get_site 1`
	}

	## TEST: restart_granted ##

	# restart with ticket granted
	test_restart_granted() {
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	run_site 1 booth grant $tkt >/dev/null
	wait_timeout
	restart_site `get_site 1`
	wait_timeout
	}
	check_restart_granted() {
	check_consistency `get_site 1`
	}

	## TEST: restart_granted_nocib ##

	# restart with ticket granted (but cib empty)
	test_restart_granted_nocib() {
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	run_site 1 booth grant $tkt >/dev/null
	wait_timeout
	stop_site_clean `get_site 1` \|\| return 1
	wait_timeout
	start_site `get_site 1`
	wait_timeout
	}
	check_restart_granted_nocib() {
	check_consistency `get_site 1`
	}

	## TEST: notgranted ##

	# restart with ticket not granted
	test_restart_notgranted() {
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	run_site 1 booth grant $tkt >/dev/null
	wait_timeout
	stop_site `get_site 2`
	sleep 1
	start_site `get_site 2`
	wait_timeout
	}
	check_restart_notgranted() {
	check_consistency `get_site 1`
	}

	## TEST: failover ##

	# ticket failover
	test_failover() {
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	run_site 1 booth grant $tkt >/dev/null
	wait_timeout
	stop_site_clean `get_site 1` \|\| return 1
	booth_status `get_site 1` && return 1
	wait_exp
	wait_timeout
	}
	check_failover() {
	check_consistency any
	}
	recover_failover() {
	start_site `get_site 1`
	}

	## TEST: split_leader ##

	# split brain (leader alone)
	test_split_leader() {
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	run_site 1 booth grant $tkt >/dev/null
	wait_timeout
	run_site 1 $iprules stop >/dev/null
	wait_exp
	wait_timeout
	check_cib any \|\| return 1
	run_site 1 $iprules start >/dev/null
	wait_timeout
	}
	check_split_leader() {
	check_consistency any
	}
	recover_split_leader() {
	run_site 1 $iprules start >/dev/null
	}

	## TEST: split_follower ##

	# split brain (follower alone)
	test_split_follower() {
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	run_site 1 booth grant $tkt >/dev/null
	wait_timeout
	run_site 2 $iprules stop >/dev/null
	wait_exp
	wait_timeout
	run_site 2 $iprules start >/dev/null
	wait_timeout
	}
	check_split_follower() {
	check_consistency `get_site 1`
	}

	## TEST: split_edge ##

	# split brain (leader alone)
	test_split_edge() {
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	run_site 1 booth grant $tkt >/dev/null
	wait_timeout
	run_site 1 $iprules stop >/dev/null
	wait_exp
	run_site 1 $iprules start >/dev/null
	wait_timeout
	}
	check_split_edge() {
	check_consistency any
	}

	## TEST: external_prog_failed ##

	# external test prog failed
	test_external_prog_failed() {
	run_site 1 booth revoke $tkt >/dev/null
	wait_timeout
	run_site 1 booth grant $tkt >/dev/null
	sleep 1
	break_external_prog 1
	wait_half_exp
	wait_timeout
	}
	check_external_prog_failed() {
	check_consistency any &&
	[ `booth_where_granted` != `get_site 1` ]
	}
	recover_external_prog_failed() {
	repair_external_prog 1
	}
	applicable_external_prog_failed() {
	[ -n `get_rsc` ]
	}

	#
	# environment modifications
	#

	# packet loss at one site 30%
	ENV_single_loss() {
	run_site 1 netem_loss ${1:-30}
	}

	# packet loss everywhere 30%
	ENV_loss() {
	forall_fun2 netem_loss ${1:-30}
	}

	# network delay 100ms
	ENV_net_delay() {
	forall_fun2 netem_delay ${1:-100}
	}

	set_env() {
	local modfun args
	modfun=`echo $1 \| sed 's/:.*//'`
	args=`echo $1 \| sed 's/[^:]*://;s/:/ /g'`
	if ! is_function ENV_$modfun; then
	echo "ENV_$modfun: doesn't exist"
	exit 1
	fi
	echo running $modfun $args
	ENV_$modfun $args
	}
	reset_env() {
	trap "forall_fun2 netem_reset" EXIT
	}

	sync_conf \|\| exit
	restart_booth
	all_booth_status \|\| {
	reset_booth
	all_booth_status \|\| exit
	}

	dump_conf >&2

	TESTS="$@"

	: ${TESTS:="grant grant_noarb grant_elsewhere grant_site_lost revoke
	simultaneous_start_even slow_start_granted
	restart_granted restart_granted_nocib restart_notgranted
	failover split_leader split_follower split_edge
	external_prog_failed"}

	if [ -n "$NETEM_ENV" ]; then
	for env in $NETEM_ENV; do
	set_env $env
	done
	reset_env
	fi

	for t in $TESTS; do
	runtest $t
	done

File Metadata

Mime Type: text/x-diff
Expires: Sat, Jan 25, 6:56 AM (1 d, 15 h)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 1321406
Default Alt Text: (14 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions