No OneTemporary
Actions

Size

166 KB

Referenced Files

None

Subscribers

None

View Options

	diff --git a/cts/lab/CTStests.py b/cts/lab/CTStests.py
	index 7a85be55e6..61766ce188 100644
	--- a/cts/lab/CTStests.py
	+++ b/cts/lab/CTStests.py
	@@ -1,3178 +1,3178 @@
	""" Test-specific classes for Pacemaker's Cluster Test Suite (CTS)
	"""

	__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors"
	__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"

	#
	# SPECIAL NOTE:
	#
	# Tests may NOT implement any cluster-manager-specific code in them.
	# EXTEND the ClusterManager object to provide the base capabilities
	# the test needs if you need to do something that the current CM classes
	# do not. Otherwise you screw up the whole point of the object structure
	# in CTS.
	#
	# Thank you.
	#

	import os
	import re
	import time
	import subprocess
	import tempfile

	from stat import *
	from cts.CTSaudits import *

	from pacemaker import BuildOptions
	from pacemaker._cts.CTS import NodeStatus
	from pacemaker._cts.environment import EnvFactory
	from pacemaker._cts.logging import LogFactory
	from pacemaker._cts.patterns import PatternSelector
	from pacemaker._cts.remote import RemoteFactory
	from pacemaker._cts.watcher import LogWatcher

	AllTestClasses = [ ]


	class CTSTest(object):
	'''
	A Cluster test.
	We implement the basic set of properties and behaviors for a generic
	cluster test.

	Cluster tests track their own statistics.
	We keep each of the kinds of counts we track as separate {name,value}
	pairs.
	'''

	def __init__(self, cm):
	#self.name="the unnamed test"
	self.Stats = {"calls":0
	, "success":0
	, "failure":0
	, "skipped":0
	, "auditfail":0}

	# if not issubclass(cm.__class__, ClusterManager):
	# raise ValueError("Must be a ClusterManager object")
	self.CM = cm
	self.Env = EnvFactory().getInstance()
	self.rsh = RemoteFactory().getInstance()
	self.logger = LogFactory()
	self.templates = PatternSelector(cm["Name"])
	self.Audits = []
	self.timeout = 120
	self.passed = 1
	self.is_loop = 0
	self.is_unsafe = 0
	self.is_experimental = 0
	self.is_container = 0
	self.is_valgrind = 0
	self.benchmark = 0 # which tests to benchmark
	self.timer = {} # timers

	def log(self, args):
	self.logger.log(args)

	def debug(self, args):
	self.logger.debug(args)

	def has_key(self, key):
	return key in self.Stats

	def __setitem__(self, key, value):
	self.Stats[key] = value

	def __getitem__(self, key):
	if str(key) == "0":
	raise ValueError("Bad call to 'foo in X', should reference 'foo in X.Stats' instead")

	if key in self.Stats:
	return self.Stats[key]
	return None

	def log_mark(self, msg):
	self.debug("MARK: test %s %s %d" % (self.name,msg,time.time()))
	return

	def get_timer(self,key = "test"):
	try: return self.timer[key]
	except: return 0

	def set_timer(self,key = "test"):
	self.timer[key] = time.time()
	return self.timer[key]

	def log_timer(self,key = "test"):
	elapsed = 0
	if key in self.timer:
	elapsed = time.time() - self.timer[key]
	s = key == "test" and self.name or "%s:%s" % (self.name,key)
	self.debug("%s runtime: %.2f" % (s, elapsed))
	del self.timer[key]
	return elapsed

	def incr(self, name):
	'''Increment (or initialize) the value associated with the given name'''
	if not name in self.Stats:
	self.Stats[name] = 0
	self.Stats[name] = self.Stats[name]+1

	# Reset the test passed boolean
	if name == "calls":
	self.passed = 1

	def failure(self, reason="none"):
	'''Increment the failure count'''
	self.passed = 0
	self.incr("failure")
	self.logger.log(("Test %s" % self.name).ljust(35) + " FAILED: %s" % reason)
	return None

	def success(self):
	'''Increment the success count'''
	self.incr("success")
	return 1

	def skipped(self):
	'''Increment the skipped count'''
	self.incr("skipped")
	return 1

	def __call__(self, node):
	'''Perform the given test'''
	raise ValueError("Abstract Class member (__call__)")
	self.incr("calls")
	return self.failure()

	def audit(self):
	passed = 1
	if len(self.Audits) > 0:
	for audit in self.Audits:
	if not audit():
	self.logger.log("Internal %s Audit %s FAILED." % (self.name, audit.name()))
	self.incr("auditfail")
	passed = 0
	return passed

	def setup(self, node):
	'''Setup the given test'''
	return self.success()

	def teardown(self, node):
	'''Tear down the given test'''
	return self.success()

	def create_watch(self, patterns, timeout, name=None):
	if not name:
	name = self.name
	return LogWatcher(self.Env["LogFileName"], patterns, self.Env["nodes"], self.Env["LogWatcher"], name, timeout)

	def local_badnews(self, prefix, watch, local_ignore=[]):
	errcount = 0
	if not prefix:
	prefix = "LocalBadNews:"

	ignorelist = []
	ignorelist.append(" CTS: ")
	ignorelist.append(prefix)
	ignorelist.extend(local_ignore)

	while errcount < 100:
	match = watch.look(0)
	if match:
	add_err = 1
	for ignore in ignorelist:
	if add_err == 1 and re.search(ignore, match):
	add_err = 0
	if add_err == 1:
	self.logger.log(prefix + " " + match)
	errcount = errcount + 1
	else:
	break
	else:
	self.logger.log("Too many errors!")

	watch.end()
	return errcount

	def is_applicable(self):
	return self.is_applicable_common()

	def is_applicable_common(self):
	'''Return True if we are applicable in the current test configuration'''
	#raise ValueError("Abstract Class member (is_applicable)")

	if self.is_loop and not self.Env["loop-tests"]:
	return False
	elif self.is_unsafe and not self.Env["unsafe-tests"]:
	return False
	elif self.is_valgrind and not self.Env["valgrind-tests"]:
	return False
	elif self.is_experimental and not self.Env["experimental-tests"]:
	return False
	elif self.is_container and not self.Env["container-tests"]:
	return False
	elif self.Env["benchmark"] and self.benchmark == 0:
	return False

	return True

	def find_ocfs2_resources(self, node):
	self.r_o2cb = None
	self.r_ocfs2 = []

	(_, lines) = self.rsh(node, "crm_resource -c", verbose=1)
	for line in lines:
	if re.search("^Resource", line):
	r = AuditResource(self.CM, line)
	if r.rtype == "o2cb" and r.parent != "NA":
	self.debug("Found o2cb: %s" % self.r_o2cb)
	self.r_o2cb = r.parent
	if re.search("^Constraint", line):
	c = AuditConstraint(self.CM, line)
	if c.type == "rsc_colocation" and c.target == self.r_o2cb:
	self.r_ocfs2.append(c.rsc)

	self.debug("Found ocfs2 filesystems: %s" % repr(self.r_ocfs2))
	return len(self.r_ocfs2)

	def canrunnow(self, node):
	'''Return TRUE if we can meaningfully run right now'''
	return 1

	def errorstoignore(self):
	'''Return list of errors which are 'normal' and should be ignored'''
	return []


	class StopTest(CTSTest):
	'''Stop (deactivate) the cluster manager on a node'''
	def __init__(self, cm):
	CTSTest.__init__(self, cm)
	self.name = "Stop"

	def __call__(self, node):
	'''Perform the 'stop' test. '''
	self.incr("calls")
	if self.CM.ShouldBeStatus[node] != "up":
	return self.skipped()

	patterns = []
	# Technically we should always be able to notice ourselves stopping
	patterns.append(self.templates["Pat:We_stopped"] % node)

	# Any active node needs to notice this one left
	# (note that this won't work if we have multiple partitions)
	for other in self.Env["nodes"]:
	if self.CM.ShouldBeStatus[other] == "up" and other != node:
	patterns.append(self.templates["Pat:They_stopped"] %(other, self.CM.key_for_node(node)))
	#self.debug("Checking %s will notice %s left"%(other, node))

	watch = self.create_watch(patterns, self.Env["DeadTime"])
	watch.set_watch()

	if node == self.CM.OurNode:
	self.incr("us")
	else:
	if self.CM.upcount() <= 1:
	self.incr("all")
	else:
	self.incr("them")

	self.CM.StopaCM(node)
	watch_result = watch.look_for_all()

	failreason = None
	UnmatchedList = "\|\|"
	if watch.unmatched:
	(_, output) = self.rsh(node, "/bin/ps axf", verbose=1)
	for line in output:
	self.debug(line)

	(_, output) = self.rsh(node, "/usr/sbin/dlm_tool dump 2>/dev/null", verbose=1)
	for line in output:
	self.debug(line)

	for regex in watch.unmatched:
	self.logger.log ("ERROR: Shutdown pattern not found: %s" % (regex))
	UnmatchedList += regex + "\|\|";
	failreason = "Missing shutdown pattern"

	self.CM.cluster_stable(self.Env["DeadTime"])

	if not watch.unmatched or self.CM.upcount() == 0:
	return self.success()

	if len(watch.unmatched) >= self.CM.upcount():
	return self.failure("no match against (%s)" % UnmatchedList)

	if failreason == None:
	return self.success()
	else:
	return self.failure(failreason)
	#
	# We don't register StopTest because it's better when called by
	# another test...
	#


	class StartTest(CTSTest):
	'''Start (activate) the cluster manager on a node'''
	def __init__(self, cm, debug=None):
	CTSTest.__init__(self,cm)
	self.name = "start"
	self.debug = debug

	def __call__(self, node):
	'''Perform the 'start' test. '''
	self.incr("calls")

	if self.CM.upcount() == 0:
	self.incr("us")
	else:
	self.incr("them")

	if self.CM.ShouldBeStatus[node] != "down":
	return self.skipped()
	elif self.CM.StartaCM(node):
	return self.success()
	else:
	return self.failure("Startup %s on node %s failed"
	% (self.Env["Name"], node))

	#
	# We don't register StartTest because it's better when called by
	# another test...
	#


	class FlipTest(CTSTest):
	'''If it's running, stop it. If it's stopped start it.
	Overthrow the status quo...
	'''
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "Flip"
	self.start = StartTest(cm)
	self.stop = StopTest(cm)

	def __call__(self, node):
	'''Perform the 'Flip' test. '''
	self.incr("calls")
	if self.CM.ShouldBeStatus[node] == "up":
	self.incr("stopped")
	ret = self.stop(node)
	type = "up->down"
	# Give the cluster time to recognize it's gone...
	time.sleep(self.Env["StableTime"])
	elif self.CM.ShouldBeStatus[node] == "down":
	self.incr("started")
	ret = self.start(node)
	type = "down->up"
	else:
	return self.skipped()

	self.incr(type)
	if ret:
	return self.success()
	else:
	return self.failure("%s failure" % type)

	# Register FlipTest as a good test to run
	AllTestClasses.append(FlipTest)


	class RestartTest(CTSTest):
	'''Stop and restart a node'''
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "Restart"
	self.start = StartTest(cm)
	self.stop = StopTest(cm)
	self.benchmark = 1

	def __call__(self, node):
	'''Perform the 'restart' test. '''
	self.incr("calls")

	self.incr("node:" + node)

	ret1 = 1
	if self.CM.StataCM(node):
	self.incr("WasStopped")
	if not self.start(node):
	return self.failure("start (setup) failure: "+node)

	self.set_timer()
	if not self.stop(node):
	return self.failure("stop failure: "+node)
	if not self.start(node):
	return self.failure("start failure: "+node)
	return self.success()

	# Register RestartTest as a good test to run
	AllTestClasses.append(RestartTest)


	class StonithdTest(CTSTest):
	def __init__(self, cm):
	CTSTest.__init__(self, cm)
	self.name = "Stonithd"
	self.startall = SimulStartLite(cm)
	self.benchmark = 1

	def __call__(self, node):
	self.incr("calls")
	if len(self.Env["nodes"]) < 2:
	return self.skipped()

	ret = self.startall(None)
	if not ret:
	return self.failure("Setup failed")

	is_dc = self.CM.is_node_dc(node)

	watchpats = []
	watchpats.append(self.templates["Pat:Fencing_ok"] % node)
	watchpats.append(self.templates["Pat:NodeFenced"] % node)

	if not self.Env["at-boot"]:
	self.debug("Expecting %s to stay down" % node)
	self.CM.ShouldBeStatus[node] = "down"
	else:
	self.debug("Expecting %s to come up again %d" % (node, self.Env["at-boot"]))
	watchpats.append("%s.* S_STARTING -> S_PENDING" % node)
	watchpats.append("%s.* S_PENDING -> S_NOT_DC" % node)

	watch = self.create_watch(watchpats, 30 + self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"])
	watch.set_watch()

	origin = self.Env.random_gen.choice(self.Env["nodes"])

	(rc, _) = self.rsh(origin, "stonith_admin --reboot %s -VVVVVV" % node)

	if rc == 124: # CRM_EX_TIMEOUT
	# Look for the patterns, usually this means the required
	# device was running on the node to be fenced - or that
	# the required devices were in the process of being loaded
	# and/or moved
	#
	# Effectively the node committed suicide so there will be
	# no confirmation, but pacemaker should be watching and
	# fence the node again

	self.logger.log("Fencing command on %s to fence %s timed out" % (origin, node))

	elif origin != node and rc != 0:
	self.debug("Waiting for the cluster to recover")
	self.CM.cluster_stable()

	self.debug("Waiting for fenced node to come back up")
	self.CM.ns.wait_for_all_nodes(self.Env["nodes"], 600)

	self.logger.log("Fencing command on %s failed to fence %s (rc=%d)" % (origin, node, rc))

	elif origin == node and rc != 255:
	# 255 == broken pipe, ie. the node was fenced as expected
	self.logger.log("Locally originated fencing returned %d" % rc)

	self.set_timer("fence")
	matched = watch.look_for_all()
	self.log_timer("fence")
	self.set_timer("reform")
	if watch.unmatched:
	self.logger.log("Patterns not found: " + repr(watch.unmatched))

	self.debug("Waiting for the cluster to recover")
	self.CM.cluster_stable()

	self.debug("Waiting for fenced node to come back up")
	self.CM.ns.wait_for_all_nodes(self.Env["nodes"], 600)

	self.debug("Waiting for the cluster to re-stabilize with all nodes")
	is_stable = self.CM.cluster_stable(self.Env["StartTime"])

	if not matched:
	return self.failure("Didn't find all expected patterns")
	elif not is_stable:
	return self.failure("Cluster did not become stable")

	self.log_timer("reform")
	return self.success()

	def errorstoignore(self):
	return [
	self.templates["Pat:Fencing_start"] % ".*",
	self.templates["Pat:Fencing_ok"] % ".*",
	self.templates["Pat:Fencing_active"],
	r"error.: Operation 'reboot' targeting . by .* for stonith_admin.*: Timer expired",
	]

	def is_applicable(self):
	if not self.is_applicable_common():
	return False

	if "DoFencing" in list(self.Env.keys()):
	return self.Env["DoFencing"]

	return True

	AllTestClasses.append(StonithdTest)


	class StartOnebyOne(CTSTest):
	'''Start all the nodes ~ one by one'''
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "StartOnebyOne"
	self.stopall = SimulStopLite(cm)
	self.start = StartTest(cm)
	self.ns = NodeStatus(cm.Env)

	def __call__(self, dummy):
	'''Perform the 'StartOnebyOne' test. '''
	self.incr("calls")

	# We ignore the "node" parameter...

	# Shut down all the nodes...
	ret = self.stopall(None)
	if not ret:
	return self.failure("Test setup failed")

	failed = []
	self.set_timer()
	for node in self.Env["nodes"]:
	if not self.start(node):
	failed.append(node)

	if len(failed) > 0:
	return self.failure("Some node failed to start: " + repr(failed))

	return self.success()

	# Register StartOnebyOne as a good test to run
	AllTestClasses.append(StartOnebyOne)


	class SimulStart(CTSTest):
	'''Start all the nodes ~ simultaneously'''
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "SimulStart"
	self.stopall = SimulStopLite(cm)
	self.startall = SimulStartLite(cm)

	def __call__(self, dummy):
	'''Perform the 'SimulStart' test. '''
	self.incr("calls")

	# We ignore the "node" parameter...

	# Shut down all the nodes...
	ret = self.stopall(None)
	if not ret:
	return self.failure("Setup failed")

	if not self.startall(None):
	return self.failure("Startall failed")

	return self.success()

	# Register SimulStart as a good test to run
	AllTestClasses.append(SimulStart)


	class SimulStop(CTSTest):
	'''Stop all the nodes ~ simultaneously'''
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "SimulStop"
	self.startall = SimulStartLite(cm)
	self.stopall = SimulStopLite(cm)

	def __call__(self, dummy):
	'''Perform the 'SimulStop' test. '''
	self.incr("calls")

	# We ignore the "node" parameter...

	# Start up all the nodes...
	ret = self.startall(None)
	if not ret:
	return self.failure("Setup failed")

	if not self.stopall(None):
	return self.failure("Stopall failed")

	return self.success()

	# Register SimulStop as a good test to run
	AllTestClasses.append(SimulStop)


	class StopOnebyOne(CTSTest):
	'''Stop all the nodes in order'''
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "StopOnebyOne"
	self.startall = SimulStartLite(cm)
	self.stop = StopTest(cm)

	def __call__(self, dummy):
	'''Perform the 'StopOnebyOne' test. '''
	self.incr("calls")

	# We ignore the "node" parameter...

	# Start up all the nodes...
	ret = self.startall(None)
	if not ret:
	return self.failure("Setup failed")

	failed = []
	self.set_timer()
	for node in self.Env["nodes"]:
	if not self.stop(node):
	failed.append(node)

	if len(failed) > 0:
	return self.failure("Some node failed to stop: " + repr(failed))

	return self.success()

	# Register StopOnebyOne as a good test to run
	AllTestClasses.append(StopOnebyOne)


	class RestartOnebyOne(CTSTest):
	'''Restart all the nodes in order'''
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "RestartOnebyOne"
	self.startall = SimulStartLite(cm)

	def __call__(self, dummy):
	'''Perform the 'RestartOnebyOne' test. '''
	self.incr("calls")

	# We ignore the "node" parameter...

	# Start up all the nodes...
	ret = self.startall(None)
	if not ret:
	return self.failure("Setup failed")

	did_fail = []
	self.set_timer()
	self.restart = RestartTest(self.CM)
	for node in self.Env["nodes"]:
	if not self.restart(node):
	did_fail.append(node)

	if did_fail:
	return self.failure("Could not restart %d nodes: %s"
	% (len(did_fail), repr(did_fail)))
	return self.success()

	# Register StopOnebyOne as a good test to run
	AllTestClasses.append(RestartOnebyOne)


	class PartialStart(CTSTest):
	'''Start a node - but tell it to stop before it finishes starting up'''
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "PartialStart"
	self.startall = SimulStartLite(cm)
	self.stopall = SimulStopLite(cm)
	self.stop = StopTest(cm)
	#self.is_unsafe = 1

	def __call__(self, node):
	'''Perform the 'PartialStart' test. '''
	self.incr("calls")

	ret = self.stopall(None)
	if not ret:
	return self.failure("Setup failed")

	watchpats = []
	watchpats.append("pacemaker-controld.Connecting to . cluster infrastructure")
	watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
	watch.set_watch()

	self.CM.StartaCMnoBlock(node)
	ret = watch.look_for_all()
	if not ret:
	self.logger.log("Patterns not found: " + repr(watch.unmatched))
	return self.failure("Setup of %s failed" % node)

	ret = self.stop(node)
	if not ret:
	return self.failure("%s did not stop in time" % node)

	return self.success()

	def errorstoignore(self):
	'''Return list of errors which should be ignored'''

	# We might do some fencing in the 2-node case if we make it up far enough
	return [
	r"Executing reboot fencing operation",
	- r"Requesting fencing $[^)]+$ of node ",
	+ r"Requesting fencing $[^)]+$ targeting node ",
	]

	# Register StopOnebyOne as a good test to run
	AllTestClasses.append(PartialStart)


	class StandbyTest(CTSTest):
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "Standby"
	self.benchmark = 1

	self.start = StartTest(cm)
	self.startall = SimulStartLite(cm)

	# make sure the node is active
	# set the node to standby mode
	# check resources, none resource should be running on the node
	# set the node to active mode
	# check resouces, resources should have been migrated back (SHOULD THEY?)

	def __call__(self, node):

	self.incr("calls")
	ret = self.startall(None)
	if not ret:
	return self.failure("Start all nodes failed")

	self.debug("Make sure node %s is active" % node)
	if self.CM.StandbyStatus(node) != "off":
	if not self.CM.SetStandbyMode(node, "off"):
	return self.failure("can't set node %s to active mode" % node)

	self.CM.cluster_stable()

	status = self.CM.StandbyStatus(node)
	if status != "off":
	return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status))

	self.debug("Getting resources running on node %s" % node)
	rsc_on_node = self.CM.active_resources(node)

	watchpats = []
	watchpats.append(r"State transition .* -> S_POLICY_ENGINE")
	watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
	watch.set_watch()

	self.debug("Setting node %s to standby mode" % node)
	if not self.CM.SetStandbyMode(node, "on"):
	return self.failure("can't set node %s to standby mode" % node)

	self.set_timer("on")

	ret = watch.look_for_all()
	if not ret:
	self.logger.log("Patterns not found: " + repr(watch.unmatched))
	self.CM.SetStandbyMode(node, "off")
	return self.failure("cluster didn't react to standby change on %s" % node)

	self.CM.cluster_stable()

	status = self.CM.StandbyStatus(node)
	if status != "on":
	return self.failure("standby status of %s is [%s] but we expect [on]" % (node, status))
	self.log_timer("on")

	self.debug("Checking resources")
	bad_run = self.CM.active_resources(node)
	if len(bad_run) > 0:
	rc = self.failure("%s set to standby, %s is still running on it" % (node, repr(bad_run)))
	self.debug("Setting node %s to active mode" % node)
	self.CM.SetStandbyMode(node, "off")
	return rc

	self.debug("Setting node %s to active mode" % node)
	if not self.CM.SetStandbyMode(node, "off"):
	return self.failure("can't set node %s to active mode" % node)

	self.set_timer("off")
	self.CM.cluster_stable()

	status = self.CM.StandbyStatus(node)
	if status != "off":
	return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status))
	self.log_timer("off")

	return self.success()

	AllTestClasses.append(StandbyTest)


	class ValgrindTest(CTSTest):
	'''Check for memory leaks'''
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "Valgrind"
	self.stopall = SimulStopLite(cm)
	self.startall = SimulStartLite(cm)
	self.is_valgrind = 1
	self.is_loop = 1

	def setup(self, node):
	self.incr("calls")

	ret = self.stopall(None)
	if not ret:
	return self.failure("Stop all nodes failed")

	# @TODO Edit /etc/sysconfig/pacemaker on all nodes to enable valgrind,
	# and clear any valgrind logs from previous runs. For now, we rely on
	# the user to do this manually.

	ret = self.startall(None)
	if not ret:
	return self.failure("Start all nodes failed")

	return self.success()

	def teardown(self, node):
	# Return all nodes to normal
	# @TODO Edit /etc/sysconfig/pacemaker on all nodes to disable valgrind
	ret = self.stopall(None)
	if not ret:
	return self.failure("Stop all nodes failed")

	return self.success()

	def find_leaks(self):
	# Check for leaks
	# (no longer used but kept in case feature is restored)
	leaked = []
	self.stop = StopTest(self.CM)

	for node in self.Env["nodes"]:
	rc = self.stop(node)
	if not rc:
	self.failure("Couldn't shut down %s" % node)

	(rc, _) = self.rsh(node, "grep -e indirectly.lost:.[1-9] -e definitely.lost:.[1-9] -e (ERROR\|error).SUMMARY:.[1-9].*errors %s" % self.logger.logPat)
	if rc != 1:
	leaked.append(node)
	self.failure("Valgrind errors detected on %s" % node)
	(_, output) = self.rsh(node, "grep -e lost: -e SUMMARY: %s" % self.logger.logPat, verbose=1)
	for line in output:
	self.logger.log(line)
	(_, output) = self.rsh(node, "cat %s" % self.logger.logPat, verbose=1)
	for line in output:
	self.debug(line)

	self.rsh(node, "rm -f %s" % self.logger.logPat, verbose=1)
	return leaked

	def __call__(self, node):
	#leaked = self.find_leaks()
	#if len(leaked) > 0:
	# return self.failure("Nodes %s leaked" % repr(leaked))

	return self.success()

	def errorstoignore(self):
	'''Return list of errors which should be ignored'''
	return [
	r"pacemaker-based.: \\\\\\\\\\\\\",
	r"pacemaker-based.: . avoid confusing Valgrind",
	r"HA_VALGRIND_ENABLED",
	]


	class StandbyLoopTest(ValgrindTest):
	'''Check for memory leaks by putting a node in and out of standby for an hour'''
	# @TODO This is not a useful test for memory leaks
	def __init__(self, cm):
	ValgrindTest.__init__(self,cm)
	self.name = "StandbyLoop"

	def __call__(self, node):

	lpc = 0
	delay = 2
	failed = 0
	done = time.time() + self.Env["loop-minutes"] * 60
	while time.time() <= done and not failed:
	lpc = lpc + 1

	time.sleep(delay)
	if not self.CM.SetStandbyMode(node, "on"):
	self.failure("can't set node %s to standby mode" % node)
	failed = lpc

	time.sleep(delay)
	if not self.CM.SetStandbyMode(node, "off"):
	self.failure("can't set node %s to active mode" % node)
	failed = lpc

	leaked = self.find_leaks()
	if failed:
	return self.failure("Iteration %d failed" % failed)
	elif len(leaked) > 0:
	return self.failure("Nodes %s leaked" % repr(leaked))

	return self.success()

	#AllTestClasses.append(StandbyLoopTest)


	class BandwidthTest(CTSTest):
	# Tests should not be cluster-manager-specific
	# If you need to find out cluster manager configuration to do this, then
	# it should be added to the generic cluster manager API.
	'''Test the bandwidth which the cluster uses'''
	def __init__(self, cm):
	CTSTest.__init__(self, cm)
	self.name = "Bandwidth"
	self.start = StartTest(cm)
	self.__setitem__("min",0)
	self.__setitem__("max",0)
	self.__setitem__("totalbandwidth",0)
	(handle, self.tempfile) = tempfile.mkstemp(".cts")
	os.close(handle)
	self.startall = SimulStartLite(cm)

	def __call__(self, node):
	'''Perform the Bandwidth test'''
	self.incr("calls")

	if self.CM.upcount() < 1:
	return self.skipped()

	Path = self.CM.InternalCommConfig()
	if "ip" not in Path["mediatype"]:
	return self.skipped()

	port = Path["port"][0]
	port = int(port)

	ret = self.startall(None)
	if not ret:
	return self.failure("Test setup failed")
	time.sleep(5) # We get extra messages right after startup.

	fstmpfile = "/var/run/band_estimate"
	dumpcmd = "tcpdump -p -n -c 102 -i any udp port %d > %s 2>&1" \
	% (port, fstmpfile)

	(rc, _) = self.rsh(node, dumpcmd)
	if rc == 0:
	farfile = "root@%s:%s" % (node, fstmpfile)
	self.rsh.copy(farfile, self.tempfile)
	Bandwidth = self.countbandwidth(self.tempfile)
	if not Bandwidth:
	self.logger.log("Could not compute bandwidth.")
	return self.success()
	intband = int(Bandwidth + 0.5)
	self.logger.log("...bandwidth: %d bits/sec" % intband)
	self.Stats["totalbandwidth"] = self.Stats["totalbandwidth"] + Bandwidth
	if self.Stats["min"] == 0:
	self.Stats["min"] = Bandwidth
	if Bandwidth > self.Stats["max"]:
	self.Stats["max"] = Bandwidth
	if Bandwidth < self.Stats["min"]:
	self.Stats["min"] = Bandwidth
	self.rsh(node, "rm -f %s" % fstmpfile)
	os.unlink(self.tempfile)
	return self.success()
	else:
	return self.failure("no response from tcpdump command [%d]!" % rc)

	def countbandwidth(self, file):
	fp = open(file, "r")
	fp.seek(0)
	count = 0
	sum = 0
	while 1:
	line = fp.readline()
	if not line:
	return None
	if re.search("udp",line) or re.search("UDP,", line):
	count = count + 1
	linesplit = line.split(" ")
	for j in range(len(linesplit)-1):
	if linesplit[j] == "udp": break
	if linesplit[j] == "length:": break

	try:
	sum = sum + int(linesplit[j+1])
	except ValueError:
	self.logger.log("Invalid tcpdump line: %s" % line)
	return None
	T1 = linesplit[0]
	timesplit = T1.split(":")
	time2split = timesplit[2].split(".")
	time1 = (int(timesplit[0])60+int(timesplit[1]))60+int(time2split[0])+int(time2split[1])*0.000001
	break

	while count < 100:
	line = fp.readline()
	if not line:
	return None
	if re.search("udp",line) or re.search("UDP,", line):
	count = count+1
	linessplit = line.split(" ")
	for j in range(len(linessplit)-1):
	if linessplit[j] == "udp": break
	if linessplit[j] == "length:": break
	try:
	sum = int(linessplit[j+1]) + sum
	except ValueError:
	self.logger.log("Invalid tcpdump line: %s" % line)
	return None

	T2 = linessplit[0]
	timesplit = T2.split(":")
	time2split = timesplit[2].split(".")
	time2 = (int(timesplit[0])60+int(timesplit[1]))60+int(time2split[0])+int(time2split[1])*0.000001
	time = time2-time1
	if (time <= 0):
	return 0
	return int((sum*8)/time)

	def is_applicable(self):
	'''BandwidthTest never applicable'''
	return False

	AllTestClasses.append(BandwidthTest)


	###################################################################
	class MaintenanceMode(CTSTest):
	###################################################################
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "MaintenanceMode"
	self.start = StartTest(cm)
	self.startall = SimulStartLite(cm)
	self.max = 30
	#self.is_unsafe = 1
	self.benchmark = 1
	self.action = "asyncmon"
	self.interval = 0
	self.rid = "maintenanceDummy"

	def toggleMaintenanceMode(self, node, action):
	pats = []
	pats.append(self.templates["Pat:DC_IDLE"])

	# fail the resource right after turning Maintenance mode on
	# verify it is not recovered until maintenance mode is turned off
	if action == "On":
	pats.append(self.templates["Pat:RscOpFail"] % (self.action, self.rid))
	else:
	pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
	pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid))

	watch = self.create_watch(pats, 60)
	watch.set_watch()

	self.debug("Turning maintenance mode %s" % action)
	self.rsh(node, self.templates["MaintenanceMode%s" % (action)])
	if (action == "On"):
	self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))

	self.set_timer("recover%s" % (action))
	watch.look_for_all()
	self.log_timer("recover%s" % (action))
	if watch.unmatched:
	self.debug("Failed to find patterns when turning maintenance mode %s" % action)
	return repr(watch.unmatched)

	return ""

	def insertMaintenanceDummy(self, node):
	pats = []
	pats.append(("%s.*" % node) + (self.templates["Pat:RscOpOK"] % ("start", self.rid)))

	watch = self.create_watch(pats, 60)
	watch.set_watch()

	self.CM.AddDummyRsc(node, self.rid)

	self.set_timer("addDummy")
	watch.look_for_all()
	self.log_timer("addDummy")

	if watch.unmatched:
	self.debug("Failed to find patterns when adding maintenance dummy resource")
	return repr(watch.unmatched)
	return ""

	def removeMaintenanceDummy(self, node):
	pats = []
	pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))

	watch = self.create_watch(pats, 60)
	watch.set_watch()
	self.CM.RemoveDummyRsc(node, self.rid)

	self.set_timer("removeDummy")
	watch.look_for_all()
	self.log_timer("removeDummy")

	if watch.unmatched:
	self.debug("Failed to find patterns when removing maintenance dummy resource")
	return repr(watch.unmatched)
	return ""

	def managedRscList(self, node):
	rscList = []
	(_, lines) = self.rsh(node, "crm_resource -c", verbose=1)
	for line in lines:
	if re.search("^Resource", line):
	tmp = AuditResource(self.CM, line)
	if tmp.managed():
	rscList.append(tmp.id)

	return rscList

	def verifyResources(self, node, rscList, managed):
	managedList = list(rscList)
	managed_str = "managed"
	if not managed:
	managed_str = "unmanaged"

	(_, lines) = self.rsh(node, "crm_resource -c", verbose=1)
	for line in lines:
	if re.search("^Resource", line):
	tmp = AuditResource(self.CM, line)
	if managed and not tmp.managed():
	continue
	elif not managed and tmp.managed():
	continue
	elif managedList.count(tmp.id):
	managedList.remove(tmp.id)

	if len(managedList) == 0:
	self.debug("Found all %s resources on %s" % (managed_str, node))
	return True

	self.logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, managedList))
	return False

	def __call__(self, node):
	'''Perform the 'MaintenanceMode' test. '''
	self.incr("calls")
	verify_managed = False
	verify_unmanaged = False
	failPat = ""

	ret = self.startall(None)
	if not ret:
	return self.failure("Setup failed")

	# get a list of all the managed resources. We use this list
	# after enabling maintenance mode to verify all managed resources
	# become un-managed. After maintenance mode is turned off, we use
	# this list to verify all the resources become managed again.
	managedResources = self.managedRscList(node)
	if len(managedResources) == 0:
	self.logger.log("No managed resources on %s" % node)
	return self.skipped()

	# insert a fake resource we can fail during maintenance mode
	# so we can verify recovery does not take place until after maintenance
	# mode is disabled.
	failPat = failPat + self.insertMaintenanceDummy(node)

	# toggle maintenance mode ON, then fail dummy resource.
	failPat = failPat + self.toggleMaintenanceMode(node, "On")

	# verify all the resources are now unmanaged
	if self.verifyResources(node, managedResources, False):
	verify_unmanaged = True

	# Toggle maintenance mode OFF, verify dummy is recovered.
	failPat = failPat + self.toggleMaintenanceMode(node, "Off")

	# verify all the resources are now managed again
	if self.verifyResources(node, managedResources, True):
	verify_managed = True

	# Remove our maintenance dummy resource.
	failPat = failPat + self.removeMaintenanceDummy(node)

	self.CM.cluster_stable()

	if failPat != "":
	return self.failure("Unmatched patterns: %s" % (failPat))
	elif verify_unmanaged is False:
	return self.failure("Failed to verify resources became unmanaged during maintenance mode")
	elif verify_managed is False:
	return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode")

	return self.success()

	def errorstoignore(self):
	'''Return list of errors which should be ignored'''
	return [
	r"Updating failcount for %s" % self.rid,
	r"schedulerd.: Recover\s+%s\s+$.$" % self.rid,
	r"Unknown operation: fail",
	self.templates["Pat:RscOpOK"] % (self.action, self.rid),
	r"(ERROR\|error).: Action %s_%s_%d . initiated outside of a transition" % (self.rid, self.action, self.interval),
	]

	AllTestClasses.append(MaintenanceMode)


	class ResourceRecover(CTSTest):
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "ResourceRecover"
	self.start = StartTest(cm)
	self.startall = SimulStartLite(cm)
	self.max = 30
	self.rid = None
	self.rid_alt = None
	#self.is_unsafe = 1
	self.benchmark = 1

	# these are the values used for the new LRM API call
	self.action = "asyncmon"
	self.interval = 0

	def __call__(self, node):
	'''Perform the 'ResourceRecover' test. '''
	self.incr("calls")

	ret = self.startall(None)
	if not ret:
	return self.failure("Setup failed")

	# List all resources active on the node (skip test if none)
	resourcelist = self.CM.active_resources(node)
	if len(resourcelist) == 0:
	self.logger.log("No active resources on %s" % node)
	return self.skipped()

	# Choose one resource at random
	rsc = self.choose_resource(node, resourcelist)
	if rsc is None:
	return self.failure("Could not get details of resource '%s'" % self.rid)
	if rsc.id == rsc.clone_id:
	self.debug("Failing " + rsc.id)
	else:
	self.debug("Failing " + rsc.id + " (also known as " + rsc.clone_id + ")")

	# Log patterns to watch for (failure, plus restart if managed)
	pats = []
	pats.append(self.templates["Pat:CloneOpFail"] % (self.action, rsc.id, rsc.clone_id))
	if rsc.managed():
	pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
	if rsc.unique():
	pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid))
	else:
	# Anonymous clones may get restarted with a different clone number
	pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))

	# Fail resource. (Ideally, we'd fail it twice, to ensure the fail count
	# is incrementing properly, but it might restart on a different node.
	# We'd have to temporarily ban it from all other nodes and ensure the
	# migration-threshold hasn't been reached.)
	if self.fail_resource(rsc, node, pats) is None:
	return None # self.failure() already called

	return self.success()

	def choose_resource(self, node, resourcelist):
	""" Choose a random resource to target """

	self.rid = self.Env.random_gen.choice(resourcelist)
	self.rid_alt = self.rid
	(_, lines) = self.rsh(node, "crm_resource -c", verbose=1)
	for line in lines:
	if line.startswith("Resource: "):
	rsc = AuditResource(self.CM, line)
	if rsc.id == self.rid:
	# Handle anonymous clones that get renamed
	self.rid = rsc.clone_id
	return rsc
	return None

	def get_failcount(self, node):
	""" Check the fail count of targeted resource on given node """

	(rc, lines) = self.rsh(node,
	"crm_failcount --quiet --query --resource %s "
	"--operation %s --interval %d "
	"--node %s" % (self.rid, self.action,
	self.interval, node), verbose=1)
	if rc != 0 or len(lines) != 1:
	self.logger.log("crm_failcount on %s failed (%d): %s" % (node, rc,
	" // ".join(map(str.strip, lines))))
	return -1
	try:
	failcount = int(lines[0])
	except (IndexError, ValueError):
	self.logger.log("crm_failcount output on %s unparseable: %s" % (node,
	' '.join(lines)))
	return -1
	return failcount

	def fail_resource(self, rsc, node, pats):
	""" Fail the targeted resource, and verify as expected """

	orig_failcount = self.get_failcount(node)

	watch = self.create_watch(pats, 60)
	watch.set_watch()

	self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))

	self.set_timer("recover")
	watch.look_for_all()
	self.log_timer("recover")

	self.CM.cluster_stable()
	recovered = self.CM.ResourceLocation(self.rid)

	if watch.unmatched:
	return self.failure("Patterns not found: %s" % repr(watch.unmatched))

	elif rsc.unique() and len(recovered) > 1:
	return self.failure("%s is now active on more than one node: %s"%(self.rid, repr(recovered)))

	elif len(recovered) > 0:
	self.debug("%s is running on: %s" % (self.rid, repr(recovered)))

	elif rsc.managed():
	return self.failure("%s was not recovered and is inactive" % self.rid)

	new_failcount = self.get_failcount(node)
	if new_failcount != (orig_failcount + 1):
	return self.failure("%s fail count is %d not %d" % (self.rid,
	new_failcount, orig_failcount + 1))

	return 0 # Anything but None is success

	def errorstoignore(self):
	'''Return list of errors which should be ignored'''
	return [
	r"Updating failcount for %s" % self.rid,
	r"schedulerd.: Recover\s+(%s\|%s)\s+$.$" % (self.rid, self.rid_alt),
	r"Unknown operation: fail",
	self.templates["Pat:RscOpOK"] % (self.action, self.rid),
	r"(ERROR\|error).: Action %s_%s_%d . initiated outside of a transition" % (self.rid, self.action, self.interval),
	]

	AllTestClasses.append(ResourceRecover)


	class ComponentFail(CTSTest):
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "ComponentFail"
	self.startall = SimulStartLite(cm)
	self.complist = cm.Components()
	self.patterns = []
	self.okerrpatterns = []
	self.is_unsafe = 1

	def __call__(self, node):
	'''Perform the 'ComponentFail' test. '''
	self.incr("calls")
	self.patterns = []
	self.okerrpatterns = []

	# start all nodes
	ret = self.startall(None)
	if not ret:
	return self.failure("Setup failed")

	if not self.CM.cluster_stable(self.Env["StableTime"]):
	return self.failure("Setup failed - unstable")

	node_is_dc = self.CM.is_node_dc(node, None)

	# select a component to kill
	chosen = self.Env.random_gen.choice(self.complist)
	while chosen.dc_only and node_is_dc == 0:
	chosen = self.Env.random_gen.choice(self.complist)

	self.debug("...component %s (dc=%d)" % (chosen.name, node_is_dc))
	self.incr(chosen.name)

	if chosen.name != "corosync":
	self.patterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name))
	self.patterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name))

	self.patterns.extend(chosen.pats)
	if node_is_dc:
	self.patterns.extend(chosen.dc_pats)

	# @TODO this should be a flag in the Component
	if chosen.name in [ "corosync", "pacemaker-based", "pacemaker-fenced" ]:
	# Ignore actions for fence devices if fencer will respawn
	# (their registration will be lost, and probes will fail)
	self.okerrpatterns = [ self.templates["Pat:Fencing_active"] ]
	(_, lines) = self.rsh(node, "crm_resource -c", verbose=1)
	for line in lines:
	if re.search("^Resource", line):
	r = AuditResource(self.CM, line)
	if r.rclass == "stonith":
	self.okerrpatterns.append(self.templates["Pat:Fencing_recover"] % r.id)
	self.okerrpatterns.append(self.templates["Pat:Fencing_probe"] % r.id)

	# supply a copy so self.patterns doesn't end up empty
	tmpPats = []
	tmpPats.extend(self.patterns)
	self.patterns.extend(chosen.badnews_ignore)

	# Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status
	stonithPats = []
	stonithPats.append(self.templates["Pat:Fencing_ok"] % node)
	stonith = self.create_watch(stonithPats, 0)
	stonith.set_watch()

	# set the watch for stable
	watch = self.create_watch(
	tmpPats, self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"])
	watch.set_watch()

	# kill the component
	chosen.kill(node)

	self.debug("Waiting for the cluster to recover")
	self.CM.cluster_stable()

	self.debug("Waiting for any fenced node to come back up")
	self.CM.ns.wait_for_all_nodes(self.Env["nodes"], 600)

	self.debug("Waiting for the cluster to re-stabilize with all nodes")
	self.CM.cluster_stable(self.Env["StartTime"])

	self.debug("Checking if %s was shot" % node)
	shot = stonith.look(60)
	if shot:
	self.debug("Found: " + repr(shot))
	self.okerrpatterns.append(self.templates["Pat:Fencing_start"] % node)

	if not self.Env["at-boot"]:
	self.CM.ShouldBeStatus[node] = "down"

	# If fencing occurred, chances are many (if not all) the expected logs
	# will not be sent - or will be lost when the node reboots
	return self.success()

	# check for logs indicating a graceful recovery
	matched = watch.look_for_all(allow_multiple_matches=True)
	if watch.unmatched:
	self.logger.log("Patterns not found: " + repr(watch.unmatched))

	self.debug("Waiting for the cluster to re-stabilize with all nodes")
	is_stable = self.CM.cluster_stable(self.Env["StartTime"])

	if not matched:
	return self.failure("Didn't find all expected %s patterns" % chosen.name)
	elif not is_stable:
	return self.failure("Cluster did not become stable after killing %s" % chosen.name)

	return self.success()

	def errorstoignore(self):
	'''Return list of errors which should be ignored'''
	# Note that okerrpatterns refers to the last time we ran this test
	# The good news is that this works fine for us...
	self.okerrpatterns.extend(self.patterns)
	return self.okerrpatterns

	AllTestClasses.append(ComponentFail)


	class SplitBrainTest(CTSTest):
	'''It is used to test split-brain. when the path between the two nodes break
	check the two nodes both take over the resource'''
	def __init__(self,cm):
	CTSTest.__init__(self,cm)
	self.name = "SplitBrain"
	self.start = StartTest(cm)
	self.startall = SimulStartLite(cm)
	self.is_experimental = 1

	def isolate_partition(self, partition):
	other_nodes = []
	other_nodes.extend(self.Env["nodes"])

	for node in partition:
	try:
	other_nodes.remove(node)
	except ValueError:
	self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]) + " from " +repr(partition))

	if len(other_nodes) == 0:
	return 1

	self.debug("Creating partition: " + repr(partition))
	self.debug("Everyone else: " + repr(other_nodes))

	for node in partition:
	if not self.CM.isolate_node(node, other_nodes):
	self.logger.log("Could not isolate %s" % node)
	return 0

	return 1

	def heal_partition(self, partition):
	other_nodes = []
	other_nodes.extend(self.Env["nodes"])

	for node in partition:
	try:
	other_nodes.remove(node)
	except ValueError:
	self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]))

	if len(other_nodes) == 0:
	return 1

	self.debug("Healing partition: " + repr(partition))
	self.debug("Everyone else: " + repr(other_nodes))

	for node in partition:
	self.CM.unisolate_node(node, other_nodes)

	def __call__(self, node):
	'''Perform split-brain test'''
	self.incr("calls")
	self.passed = 1
	partitions = {}

	ret = self.startall(None)
	if not ret:
	return self.failure("Setup failed")

	while 1:
	# Retry until we get multiple partitions
	partitions = {}
	p_max = len(self.Env["nodes"])
	for node in self.Env["nodes"]:
	p = self.Env.random_gen.randint(1, p_max)
	if not p in partitions:
	partitions[p] = []
	partitions[p].append(node)
	p_max = len(list(partitions.keys()))
	if p_max > 1:
	break
	# else, try again

	self.debug("Created %d partitions" % p_max)
	for key in list(partitions.keys()):
	self.debug("Partition["+str(key)+"]:\t"+repr(partitions[key]))

	# Disabling STONITH to reduce test complexity for now
	self.rsh(node, "crm_attribute -V -n stonith-enabled -v false")

	for key in list(partitions.keys()):
	self.isolate_partition(partitions[key])

	count = 30
	while count > 0:
	if len(self.CM.find_partitions()) != p_max:
	time.sleep(10)
	else:
	break
	else:
	self.failure("Expected partitions were not created")

	# Target number of partitions formed - wait for stability
	if not self.CM.cluster_stable():
	self.failure("Partitioned cluster not stable")

	# Now audit the cluster state
	self.CM.partitions_expected = p_max
	if not self.audit():
	self.failure("Audits failed")
	self.CM.partitions_expected = 1

	# And heal them again
	for key in list(partitions.keys()):
	self.heal_partition(partitions[key])

	# Wait for a single partition to form
	count = 30
	while count > 0:
	if len(self.CM.find_partitions()) != 1:
	time.sleep(10)
	count -= 1
	else:
	break
	else:
	self.failure("Cluster did not reform")

	# Wait for it to have the right number of members
	count = 30
	while count > 0:
	members = []

	partitions = self.CM.find_partitions()
	if len(partitions) > 0:
	members = partitions[0].split()

	if len(members) != len(self.Env["nodes"]):
	time.sleep(10)
	count -= 1
	else:
	break
	else:
	self.failure("Cluster did not completely reform")

	# Wait up to 20 minutes - the delay is more preferable than
	# trying to continue with in a messed up state
	if not self.CM.cluster_stable(1200):
	self.failure("Reformed cluster not stable")
	if self.Env["continue"]:
	answer = "Y"
	else:
	try:
	answer = input('Continue? [nY]')
	except EOFError as e:
	answer = "n"
	if answer and answer == "n":
	raise ValueError("Reformed cluster not stable")

	# Turn fencing back on
	if self.Env["DoFencing"]:
	self.rsh(node, "crm_attribute -V -D -n stonith-enabled")

	self.CM.cluster_stable()

	if self.passed:
	return self.success()
	return self.failure("See previous errors")

	def errorstoignore(self):
	'''Return list of errors which are 'normal' and should be ignored'''
	return [
	r"Another DC detected:",
	r"(ERROR\|error).: .Application of an update diff failed",
	r"pacemaker-controld.:.not in our membership list",
	r"CRIT:.node.returning after partition",
	]

	def is_applicable(self):
	if not self.is_applicable_common():
	return False
	return len(self.Env["nodes"]) > 2

	AllTestClasses.append(SplitBrainTest)


	class Reattach(CTSTest):
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "Reattach"
	self.startall = SimulStartLite(cm)
	self.restart1 = RestartTest(cm)
	self.stopall = SimulStopLite(cm)
	self.is_unsafe = 0 # Handled by canrunnow()

	def _is_managed(self, node):
	(_, is_managed) = self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", verbose=1)
	is_managed = is_managed[0].strip()
	return is_managed == "true"

	def _set_unmanaged(self, node):
	self.debug("Disable resource management")
	self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false")

	def _set_managed(self, node):
	self.debug("Re-enable resource management")
	self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D")

	def setup(self, node):
	attempt = 0
	if not self.startall(None):
	return None

	# Make sure we are really _really_ stable and that all
	# resources, including those that depend on transient node
	# attributes, are started
	while not self.CM.cluster_stable(double_check=True):
	if attempt < 5:
	attempt += 1
	self.debug("Not stable yet, re-testing")
	else:
	self.logger.log("Cluster is not stable")
	return None

	return 1

	def teardown(self, node):

	# Make sure 'node' is up
	start = StartTest(self.CM)
	start(node)

	if not self._is_managed(node):
	self.logger.log("Attempting to re-enable resource management on %s" % node)
	self._set_managed(node)
	self.CM.cluster_stable()
	if not self._is_managed(node):
	self.logger.log("Could not re-enable resource management")
	return 0

	return 1

	def canrunnow(self, node):
	'''Return TRUE if we can meaningfully run right now'''
	if self.find_ocfs2_resources(node):
	self.logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present")
	return 0
	return 1

	def __call__(self, node):
	self.incr("calls")

	pats = []
	# Conveniently, the scheduler will display this message when disabling
	# management, even if fencing is not enabled, so we can rely on it.
	managed = self.create_watch(["No fencing will be done"], 60)
	managed.set_watch()

	self._set_unmanaged(node)

	if not managed.look_for_all():
	self.logger.log("Patterns not found: " + repr(managed.unmatched))
	return self.failure("Resource management not disabled")

	pats = []
	pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))
	pats.append(self.templates["Pat:RscOpOK"] % ("stop", ".*"))
	pats.append(self.templates["Pat:RscOpOK"] % ("promote", ".*"))
	pats.append(self.templates["Pat:RscOpOK"] % ("demote", ".*"))
	pats.append(self.templates["Pat:RscOpOK"] % ("migrate", ".*"))

	watch = self.create_watch(pats, 60, "ShutdownActivity")
	watch.set_watch()

	self.debug("Shutting down the cluster")
	ret = self.stopall(None)
	if not ret:
	self._set_managed(node)
	return self.failure("Couldn't shut down the cluster")

	self.debug("Bringing the cluster back up")
	ret = self.startall(None)
	time.sleep(5) # allow ping to update the CIB
	if not ret:
	self._set_managed(node)
	return self.failure("Couldn't restart the cluster")

	if self.local_badnews("ResourceActivity:", watch):
	self._set_managed(node)
	return self.failure("Resources stopped or started during cluster restart")

	watch = self.create_watch(pats, 60, "StartupActivity")
	watch.set_watch()

	# Re-enable resource management (and verify it happened).
	self._set_managed(node)
	self.CM.cluster_stable()
	if not self._is_managed(node):
	return self.failure("Could not re-enable resource management")

	# Ignore actions for STONITH resources
	ignore = []
	(_, lines) = self.rsh(node, "crm_resource -c", verbose=1)
	for line in lines:
	if re.search("^Resource", line):
	r = AuditResource(self.CM, line)
	if r.rclass == "stonith":

	self.debug("Ignoring start actions for %s" % r.id)
	ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id))

	if self.local_badnews("ResourceActivity:", watch, ignore):
	return self.failure("Resources stopped or started after resource management was re-enabled")

	return ret

	def errorstoignore(self):
	'''Return list of errors which should be ignored'''
	return [
	r"resource( was\|s were) active at shutdown",
	]

	def is_applicable(self):
	return True

	AllTestClasses.append(Reattach)


	class SpecialTest1(CTSTest):
	'''Set up a custom test to cause quorum failure issues for Andrew'''
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "SpecialTest1"
	self.startall = SimulStartLite(cm)
	self.restart1 = RestartTest(cm)
	self.stopall = SimulStopLite(cm)

	def __call__(self, node):
	'''Perform the 'SpecialTest1' test for Andrew. '''
	self.incr("calls")

	# Shut down all the nodes...
	ret = self.stopall(None)
	if not ret:
	return self.failure("Could not stop all nodes")

	# Test config recovery when the other nodes come up
	self.rsh(node, "rm -f " + BuildOptions.CIB_DIR + "/cib*")

	# Start the selected node
	ret = self.restart1(node)
	if not ret:
	return self.failure("Could not start "+node)

	# Start all remaining nodes
	ret = self.startall(None)
	if not ret:
	return self.failure("Could not start the remaining nodes")

	return self.success()

	def errorstoignore(self):
	'''Return list of errors which should be ignored'''
	# Errors that occur as a result of the CIB being wiped
	return [
	r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed",
	r"error.*: Resource start-up disabled since no STONITH resources have been defined",
	r"error.*: Either configure some or disable STONITH with the stonith-enabled option",
	r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity",
	]

	AllTestClasses.append(SpecialTest1)


	class HAETest(CTSTest):
	'''Set up a custom test to cause quorum failure issues for Andrew'''
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "HAETest"
	self.stopall = SimulStopLite(cm)
	self.startall = SimulStartLite(cm)
	self.is_loop = 1

	def setup(self, node):
	# Start all remaining nodes
	ret = self.startall(None)
	if not ret:
	return self.failure("Couldn't start all nodes")
	return self.success()

	def teardown(self, node):
	# Stop everything
	ret = self.stopall(None)
	if not ret:
	return self.failure("Couldn't stop all nodes")
	return self.success()

	def wait_on_state(self, node, resource, expected_clones, attempts=240):
	while attempts > 0:
	active = 0
	(rc, lines) = self.rsh(node, "crm_resource -r %s -W -Q" % resource, verbose=1)

	# Hack until crm_resource does the right thing
	if rc == 0 and lines:
	active = len(lines)

	if len(lines) == expected_clones:
	return 1

	elif rc == 1:
	self.debug("Resource %s is still inactive" % resource)

	elif rc == 234:
	self.logger.log("Unknown resource %s" % resource)
	return 0

	elif rc == 246:
	self.logger.log("Cluster is inactive")
	return 0

	elif rc != 0:
	self.logger.log("Call to crm_resource failed, rc=%d" % rc)
	return 0

	else:
	self.debug("Resource %s is active on %d times instead of %d" % (resource, active, expected_clones))

	attempts -= 1
	time.sleep(1)

	return 0

	def find_dlm(self, node):
	self.r_dlm = None

	(_, lines) = self.rsh(node, "crm_resource -c", verbose=1)
	for line in lines:
	if re.search("^Resource", line):
	r = AuditResource(self.CM, line)
	if r.rtype == "controld" and r.parent != "NA":
	self.debug("Found dlm: %s" % self.r_dlm)
	self.r_dlm = r.parent
	return 1
	return 0

	def find_hae_resources(self, node):
	self.r_dlm = None
	self.r_o2cb = None
	self.r_ocfs2 = []

	if self.find_dlm(node):
	self.find_ocfs2_resources(node)

	def is_applicable(self):
	if not self.is_applicable_common():
	return False
	if self.Env["Schema"] == "hae":
	return True
	return None


	class HAERoleTest(HAETest):
	def __init__(self, cm):
	'''Lars' mount/unmount test for the HA extension. '''
	HAETest.__init__(self,cm)
	self.name = "HAERoleTest"

	def change_state(self, node, resource, target):
	(rc, _) = self.rsh(node, "crm_resource -V -r %s -p target-role -v %s --meta" % (resource, target))
	return rc

	def __call__(self, node):
	self.incr("calls")
	lpc = 0
	failed = 0
	delay = 2
	done = time.time() + self.Env["loop-minutes"]*60
	self.find_hae_resources(node)

	clone_max = len(self.Env["nodes"])
	while time.time() <= done and not failed:
	lpc = lpc + 1

	self.change_state(node, self.r_dlm, "Stopped")
	if not self.wait_on_state(node, self.r_dlm, 0):
	self.failure("%s did not go down correctly" % self.r_dlm)
	failed = lpc

	self.change_state(node, self.r_dlm, "Started")
	if not self.wait_on_state(node, self.r_dlm, clone_max):
	self.failure("%s did not come up correctly" % self.r_dlm)
	failed = lpc

	if not self.wait_on_state(node, self.r_o2cb, clone_max):
	self.failure("%s did not come up correctly" % self.r_o2cb)
	failed = lpc

	for fs in self.r_ocfs2:
	if not self.wait_on_state(node, fs, clone_max):
	self.failure("%s did not come up correctly" % fs)
	failed = lpc

	if failed:
	return self.failure("iteration %d failed" % failed)
	return self.success()

	AllTestClasses.append(HAERoleTest)


	class HAEStandbyTest(HAETest):
	'''Set up a custom test to cause quorum failure issues for Andrew'''
	def __init__(self, cm):
	HAETest.__init__(self,cm)
	self.name = "HAEStandbyTest"

	def change_state(self, node, resource, target):
	(rc, _) = self.rsh(node, "crm_standby -V -l reboot -v %s" % (target))
	return rc

	def __call__(self, node):
	self.incr("calls")

	lpc = 0
	failed = 0
	done = time.time() + self.Env["loop-minutes"]*60
	self.find_hae_resources(node)

	clone_max = len(self.Env["nodes"])
	while time.time() <= done and not failed:
	lpc = lpc + 1

	self.change_state(node, self.r_dlm, "true")
	if not self.wait_on_state(node, self.r_dlm, clone_max-1):
	self.failure("%s did not go down correctly" % self.r_dlm)
	failed = lpc

	self.change_state(node, self.r_dlm, "false")
	if not self.wait_on_state(node, self.r_dlm, clone_max):
	self.failure("%s did not come up correctly" % self.r_dlm)
	failed = lpc

	if not self.wait_on_state(node, self.r_o2cb, clone_max):
	self.failure("%s did not come up correctly" % self.r_o2cb)
	failed = lpc

	for fs in self.r_ocfs2:
	if not self.wait_on_state(node, fs, clone_max):
	self.failure("%s did not come up correctly" % fs)
	failed = lpc

	if failed:
	return self.failure("iteration %d failed" % failed)
	return self.success()

	AllTestClasses.append(HAEStandbyTest)


	class NearQuorumPointTest(CTSTest):
	'''
	This test brings larger clusters near the quorum point (50%).
	In addition, it will test doing starts and stops at the same time.

	Here is how I think it should work:
	- loop over the nodes and decide randomly which will be up and which
	will be down Use a 50% probability for each of up/down.
	- figure out what to do to get into that state from the current state
	- in parallel, bring up those going up and bring those going down.
	'''

	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "NearQuorumPoint"

	def __call__(self, dummy):
	'''Perform the 'NearQuorumPoint' test. '''
	self.incr("calls")
	startset = []
	stopset = []

	stonith = self.CM.prepare_fencing_watcher("NearQuorumPoint")
	#decide what to do with each node
	for node in self.Env["nodes"]:
	action = self.Env.random_gen.choice(["start","stop"])
	#action = self.Env.random_gen.choice(["start","stop","no change"])
	if action == "start" :
	startset.append(node)
	elif action == "stop" :
	stopset.append(node)

	self.debug("start nodes:" + repr(startset))
	self.debug("stop nodes:" + repr(stopset))

	#add search patterns
	watchpats = [ ]
	for node in stopset:
	if self.CM.ShouldBeStatus[node] == "up":
	watchpats.append(self.templates["Pat:We_stopped"] % node)

	for node in startset:
	if self.CM.ShouldBeStatus[node] == "down":
	#watchpats.append(self.templates["Pat:NonDC_started"] % node)
	watchpats.append(self.templates["Pat:Local_started"] % node)
	else:
	for stopping in stopset:
	if self.CM.ShouldBeStatus[stopping] == "up":
	watchpats.append(self.templates["Pat:They_stopped"] % (node, self.CM.key_for_node(stopping)))

	if len(watchpats) == 0:
	return self.skipped()

	if len(startset) != 0:
	watchpats.append(self.templates["Pat:DC_IDLE"])

	watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)

	watch.set_watch()

	#begin actions
	for node in stopset:
	if self.CM.ShouldBeStatus[node] == "up":
	self.CM.StopaCMnoBlock(node)

	for node in startset:
	if self.CM.ShouldBeStatus[node] == "down":
	self.CM.StartaCMnoBlock(node)

	#get the result
	if watch.look_for_all():
	self.CM.cluster_stable()
	self.CM.fencing_cleanup("NearQuorumPoint", stonith)
	return self.success()

	self.logger.log("Warn: Patterns not found: " + repr(watch.unmatched))

	#get the "bad" nodes
	upnodes = []
	for node in stopset:
	if self.CM.StataCM(node) == 1:
	upnodes.append(node)

	downnodes = []
	for node in startset:
	if self.CM.StataCM(node) == 0:
	downnodes.append(node)

	self.CM.fencing_cleanup("NearQuorumPoint", stonith)
	if upnodes == [] and downnodes == []:
	self.CM.cluster_stable()

	# Make sure they're completely down with no residule
	for node in stopset:
	self.rsh(node, self.templates["StopCmd"])

	return self.success()

	if len(upnodes) > 0:
	self.logger.log("Warn: Unstoppable nodes: " + repr(upnodes))

	if len(downnodes) > 0:
	self.logger.log("Warn: Unstartable nodes: " + repr(downnodes))

	return self.failure()

	def is_applicable(self):
	return True

	AllTestClasses.append(NearQuorumPointTest)


	class RollingUpgradeTest(CTSTest):
	'''Perform a rolling upgrade of the cluster'''
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "RollingUpgrade"
	self.start = StartTest(cm)
	self.stop = StopTest(cm)
	self.stopall = SimulStopLite(cm)
	self.startall = SimulStartLite(cm)

	def setup(self, node):
	# Start all remaining nodes
	ret = self.stopall(None)
	if not ret:
	return self.failure("Couldn't stop all nodes")

	for node in self.Env["nodes"]:
	if not self.downgrade(node, None):
	return self.failure("Couldn't downgrade %s" % node)

	ret = self.startall(None)
	if not ret:
	return self.failure("Couldn't start all nodes")
	return self.success()

	def teardown(self, node):
	# Stop everything
	ret = self.stopall(None)
	if not ret:
	return self.failure("Couldn't stop all nodes")

	for node in self.Env["nodes"]:
	if not self.upgrade(node, None):
	return self.failure("Couldn't upgrade %s" % node)

	return self.success()

	def install(self, node, version, start=1, flags="--force"):

	target_dir = "/tmp/rpm-%s" % version
	src_dir = "%s/%s" % (self.Env["rpm-dir"], version)

	self.logger.log("Installing %s on %s with %s" % (version, node, flags))
	if not self.stop(node):
	return self.failure("stop failure: "+node)

	self.rsh(node, "mkdir -p %s" % target_dir)
	self.rsh(node, "rm -f %s/*.rpm" % target_dir)
	(_, lines) = self.rsh(node, "ls -1 %s/*.rpm" % src_dir, verbose=1)
	for line in lines:
	line = line[:-1]
	rc = self.rsh.copy("%s" % (line), "%s:%s/" % (node, target_dir))
	self.rsh(node, "rpm -Uvh %s %s/*.rpm" % (flags, target_dir))

	if start and not self.start(node):
	return self.failure("start failure: "+node)

	return self.success()

	def upgrade(self, node, start=1):
	return self.install(node, self.Env["current-version"], start)

	def downgrade(self, node, start=1):
	return self.install(node, self.Env["previous-version"], start, "--force --nodeps")

	def __call__(self, node):
	'''Perform the 'Rolling Upgrade' test. '''
	self.incr("calls")

	for node in self.Env["nodes"]:
	if self.upgrade(node):
	return self.failure("Couldn't upgrade %s" % node)

	self.CM.cluster_stable()

	return self.success()

	def is_applicable(self):
	if not self.is_applicable_common():
	return None

	if not "rpm-dir" in list(self.Env.keys()):
	return None
	if not "current-version" in list(self.Env.keys()):
	return None
	if not "previous-version" in list(self.Env.keys()):
	return None

	return 1

	# Register RestartTest as a good test to run
	AllTestClasses.append(RollingUpgradeTest)


	class BSC_AddResource(CTSTest):
	'''Add a resource to the cluster'''
	def __init__(self, cm):
	CTSTest.__init__(self, cm)
	self.name = "AddResource"
	self.resource_offset = 0
	self.cib_cmd = """cibadmin -C -o %s -X '%s' """

	def __call__(self, node):
	self.incr("calls")
	self.resource_offset = self.resource_offset + 1

	r_id = "bsc-rsc-%s-%d" % (node, self.resource_offset)
	start_pat = "pacemaker-controld.%s_start_0.confirmed.*ok"

	patterns = []
	patterns.append(start_pat % r_id)

	watch = self.create_watch(patterns, self.Env["DeadTime"])
	watch.set_watch()

	ip = self.NextIP()
	if not self.make_ip_resource(node, r_id, "ocf", "IPaddr", ip):
	return self.failure("Make resource %s failed" % r_id)

	failed = 0
	watch_result = watch.look_for_all()
	if watch.unmatched:
	for regex in watch.unmatched:
	self.logger.log ("Warn: Pattern not found: %s" % (regex))
	failed = 1

	if failed:
	return self.failure("Resource pattern(s) not found")

	if not self.CM.cluster_stable(self.Env["DeadTime"]):
	return self.failure("Unstable cluster")

	return self.success()

	def NextIP(self):
	ip = self.Env["IPBase"]
	if ":" in ip:
	fields = ip.rpartition(":")
	fields[2] = str(hex(int(fields[2], 16)+1))
	print(str(hex(int(f[2], 16)+1)))
	else:
	fields = ip.rpartition('.')
	fields[2] = str(int(fields[2])+1)

	ip = fields[0] + fields[1] + fields[3];
	self.Env["IPBase"] = ip
	return ip.strip()

	def make_ip_resource(self, node, id, rclass, type, ip):
	self.logger.log("Creating %s:%s:%s (%s) on %s" % (rclass,type,id,ip,node))
	rsc_xml="""
	<primitive id="%s" class="%s" type="%s" provider="heartbeat">
	<instance_attributes id="%s"><attributes>
	<nvpair id="%s" name="ip" value="%s"/>
	</attributes></instance_attributes>
	</primitive>""" % (id, rclass, type, id, id, ip)

	node_constraint = """
	<rsc_location id="run_%s" rsc="%s">
	<rule id="pref_run_%s" score="100">
	<expression id="%s_loc_expr" attribute="#uname" operation="eq" value="%s"/>
	</rule>
	</rsc_location>""" % (id, id, id, id, node)

	rc = 0
	(rc, _) = self.rsh(node, self.cib_cmd % ("constraints", node_constraint), verbose=1)
	if rc != 0:
	self.logger.log("Constraint creation failed: %d" % rc)
	return None

	(rc, _) = self.rsh(node, self.cib_cmd % ("resources", rsc_xml), verbose=1)
	if rc != 0:
	self.logger.log("Resource creation failed: %d" % rc)
	return None

	return 1

	def is_applicable(self):
	if self.Env["DoBSC"]:
	return True
	return None

	AllTestClasses.append(BSC_AddResource)


	class SimulStopLite(CTSTest):
	'''Stop any active nodes ~ simultaneously'''
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "SimulStopLite"

	def __call__(self, dummy):
	'''Perform the 'SimulStopLite' setup work. '''
	self.incr("calls")

	self.debug("Setup: " + self.name)

	# We ignore the "node" parameter...
	watchpats = [ ]

	for node in self.Env["nodes"]:
	if self.CM.ShouldBeStatus[node] == "up":
	self.incr("WasStarted")
	watchpats.append(self.templates["Pat:We_stopped"] % node)

	if len(watchpats) == 0:
	return self.success()

	# Stop all the nodes - at about the same time...
	watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)

	watch.set_watch()
	self.set_timer()
	for node in self.Env["nodes"]:
	if self.CM.ShouldBeStatus[node] == "up":
	self.CM.StopaCMnoBlock(node)
	if watch.look_for_all():
	# Make sure they're completely down with no residule
	for node in self.Env["nodes"]:
	self.rsh(node, self.templates["StopCmd"])

	return self.success()

	did_fail = 0
	up_nodes = []
	for node in self.Env["nodes"]:
	if self.CM.StataCM(node) == 1:
	did_fail = 1
	up_nodes.append(node)

	if did_fail:
	return self.failure("Active nodes exist: " + repr(up_nodes))

	self.logger.log("Warn: All nodes stopped but CTS didn't detect: "
	+ repr(watch.unmatched))

	return self.failure("Missing log message: "+repr(watch.unmatched))

	def is_applicable(self):
	'''SimulStopLite is a setup test and never applicable'''
	return False


	class SimulStartLite(CTSTest):
	'''Start any stopped nodes ~ simultaneously'''
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "SimulStartLite"

	def __call__(self, dummy):
	'''Perform the 'SimulStartList' setup work. '''
	self.incr("calls")
	self.debug("Setup: " + self.name)

	# We ignore the "node" parameter...
	node_list = []
	for node in self.Env["nodes"]:
	if self.CM.ShouldBeStatus[node] == "down":
	self.incr("WasStopped")
	node_list.append(node)

	self.set_timer()
	while len(node_list) > 0:
	# Repeat until all nodes come up
	watchpats = [ ]

	uppat = self.templates["Pat:NonDC_started"]
	if self.CM.upcount() == 0:
	uppat = self.templates["Pat:Local_started"]

	watchpats.append(self.templates["Pat:DC_IDLE"])
	for node in node_list:
	watchpats.append(uppat % node)
	watchpats.append(self.templates["Pat:InfraUp"] % node)
	watchpats.append(self.templates["Pat:PacemakerUp"] % node)

	# Start all the nodes - at about the same time...
	watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
	watch.set_watch()

	stonith = self.CM.prepare_fencing_watcher(self.name)

	for node in node_list:
	self.CM.StartaCMnoBlock(node)

	watch.look_for_all()

	node_list = self.CM.fencing_cleanup(self.name, stonith)

	if node_list == None:
	return self.failure("Cluster did not stabilize")

	# Remove node_list messages from watch.unmatched
	for node in node_list:
	self.logger.debug("Dealing with stonith operations for %s" % repr(node_list))
	if watch.unmatched:
	try:
	watch.unmatched.remove(uppat % node)
	except:
	self.debug("Already matched: %s" % (uppat % node))
	try:
	watch.unmatched.remove(self.templates["Pat:InfraUp"] % node)
	except:
	self.debug("Already matched: %s" % (self.templates["Pat:InfraUp"] % node))
	try:
	watch.unmatched.remove(self.templates["Pat:PacemakerUp"] % node)
	except:
	self.debug("Already matched: %s" % (self.templates["Pat:PacemakerUp"] % node))

	if watch.unmatched:
	for regex in watch.unmatched:
	self.logger.log ("Warn: Startup pattern not found: %s" %(regex))

	if not self.CM.cluster_stable():
	return self.failure("Cluster did not stabilize")

	did_fail = 0
	unstable = []
	for node in self.Env["nodes"]:
	if self.CM.StataCM(node) == 0:
	did_fail = 1
	unstable.append(node)

	if did_fail:
	return self.failure("Unstarted nodes exist: " + repr(unstable))

	unstable = []
	for node in self.Env["nodes"]:
	if not self.CM.node_stable(node):
	did_fail = 1
	unstable.append(node)

	if did_fail:
	return self.failure("Unstable cluster nodes exist: " + repr(unstable))

	return self.success()

	def is_applicable(self):
	'''SimulStartLite is a setup test and never applicable'''
	return False


	def TestList(cm, audits):
	result = []
	for testclass in AllTestClasses:
	bound_test = testclass(cm)
	if bound_test.is_applicable():
	bound_test.Audits = audits
	result.append(bound_test)
	return result


	class RemoteLXC(CTSTest):
	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = "RemoteLXC"
	self.start = StartTest(cm)
	self.startall = SimulStartLite(cm)
	self.num_containers = 2
	self.is_container = 1
	self.failed = 0
	self.fail_string = ""

	def start_lxc_simple(self, node):

	# restore any artifacts laying around from a previous test.
	self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")

	# generate the containers, put them in the config, add some resources to them
	pats = [ ]
	watch = self.create_watch(pats, 120)
	watch.set_watch()
	pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc1"))
	pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc2"))
	pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc-ms"))
	pats.append(self.templates["Pat:RscOpOK"] % ("promote", "lxc-ms"))

	self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -g -a -m -s -c %d &>/dev/null" % self.num_containers)
	self.set_timer("remoteSimpleInit")
	watch.look_for_all()
	self.log_timer("remoteSimpleInit")
	if watch.unmatched:
	self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
	self.failed = 1

	def cleanup_lxc_simple(self, node):

	pats = [ ]
	# if the test failed, attempt to clean up the cib and libvirt environment
	# as best as possible
	if self.failed == 1:
	# restore libvirt and cib
	self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
	return

	watch = self.create_watch(pats, 120)
	watch.set_watch()

	pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container1"))
	pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container2"))

	self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -p &>/dev/null")
	self.set_timer("remoteSimpleCleanup")
	watch.look_for_all()
	self.log_timer("remoteSimpleCleanup")

	if watch.unmatched:
	self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
	self.failed = 1

	# cleanup libvirt
	self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")

	def __call__(self, node):
	'''Perform the 'RemoteLXC' test. '''
	self.incr("calls")

	ret = self.startall(None)
	if not ret:
	return self.failure("Setup failed, start all nodes failed.")

	(rc, _) = self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v &>/dev/null")
	if rc == 1:
	self.log("Environment test for lxc support failed.")
	return self.skipped()

	self.start_lxc_simple(node)
	self.cleanup_lxc_simple(node)

	self.debug("Waiting for the cluster to recover")
	self.CM.cluster_stable()

	if self.failed == 1:
	return self.failure(self.fail_string)

	return self.success()

	def errorstoignore(self):
	'''Return list of errors which should be ignored'''
	return [
	r"Updating failcount for ping",
	r"schedulerd.: Recover\s+(ping\|lxc-ms\|container)\s+$.$",
	# The orphaned lxc-ms resource causes an expected transition error
	# that is a result of the scheduler not having knowledge that the
	# promotable resource used to be a clone. As a result, it looks like that
	# resource is running in multiple locations when it shouldn't... But in
	# this instance we know why this error is occurring and that it is expected.
	r"Calculated [Tt]ransition .*pe-error",
	r"Resource lxc-ms .* is active on 2 nodes attempting recovery",
	r"Unknown operation: fail",
	r"VirtualDomain.*ERROR: Unable to determine emulator",
	]

	AllTestClasses.append(RemoteLXC)


	class RemoteDriver(CTSTest):

	def __init__(self, cm):
	CTSTest.__init__(self,cm)
	self.name = self.__class__.__name__
	self.start = StartTest(cm)
	self.startall = SimulStartLite(cm)
	self.stop = StopTest(cm)
	self.remote_rsc = "remote-rsc"
	self.cib_cmd = """cibadmin -C -o %s -X '%s' """
	self.reset()

	def reset(self):
	self.pcmk_started = 0
	self.failed = False
	self.fail_string = ""
	self.remote_node_added = 0
	self.remote_rsc_added = 0
	self.remote_use_reconnect_interval = self.Env.random_gen.choice([True,False])

	def fail(self, msg):
	""" Mark test as failed. """

	self.failed = True

	# Always log the failure.
	self.logger.log(msg)

	# Use first failure as test status, as it's likely to be most useful.
	if not self.fail_string:
	self.fail_string = msg

	def get_othernode(self, node):
	for othernode in self.Env["nodes"]:
	if othernode == node:
	# we don't want to try and use the cib that we just shutdown.
	# find a cluster node that is not our soon to be remote-node.
	continue
	else:
	return othernode

	def del_rsc(self, node, rsc):
	othernode = self.get_othernode(node)
	(rc, _) = self.rsh(othernode, "crm_resource -D -r %s -t primitive" % (rsc))
	if rc != 0:
	self.fail("Removal of resource '%s' failed" % rsc)

	def add_rsc(self, node, rsc_xml):
	othernode = self.get_othernode(node)
	(rc, _) = self.rsh(othernode, self.cib_cmd % ("resources", rsc_xml))
	if rc != 0:
	self.fail("resource creation failed")

	def add_primitive_rsc(self, node):
	rsc_xml = """
	<primitive class="ocf" id="%(node)s" provider="heartbeat" type="Dummy">
	<meta_attributes id="%(node)s-meta_attributes"/>
	<operations>
	<op id="%(node)s-monitor-interval-20s" interval="20s" name="monitor"/>
	</operations>
	</primitive>""" % { "node": self.remote_rsc }
	self.add_rsc(node, rsc_xml)
	if not self.failed:
	self.remote_rsc_added = 1

	def add_connection_rsc(self, node):
	rsc_xml = """
	<primitive class="ocf" id="%(node)s" provider="pacemaker" type="remote">
	<instance_attributes id="%(node)s-instance_attributes">
	<nvpair id="%(node)s-instance_attributes-server" name="server" value="%(server)s"/>
	""" % { "node": self.remote_node, "server": node }

	if self.remote_use_reconnect_interval:
	# Set reconnect interval on resource
	rsc_xml = rsc_xml + """
	<nvpair id="%s-instance_attributes-reconnect_interval" name="reconnect_interval" value="60s"/>
	""" % (self.remote_node)

	rsc_xml = rsc_xml + """
	</instance_attributes>
	<operations>
	<op id="%(node)s-start" name="start" interval="0" timeout="120s"/>
	<op id="%(node)s-monitor-20s" name="monitor" interval="20s" timeout="45s"/>
	</operations>
	</primitive>
	""" % { "node": self.remote_node }

	self.add_rsc(node, rsc_xml)
	if not self.failed:
	self.remote_node_added = 1

	def disable_services(self, node):
	self.corosync_enabled = self.Env.service_is_enabled(node, "corosync")
	if self.corosync_enabled:
	self.Env.disable_service(node, "corosync")

	self.pacemaker_enabled = self.Env.service_is_enabled(node, "pacemaker")
	if self.pacemaker_enabled:
	self.Env.disable_service(node, "pacemaker")

	def restore_services(self, node):
	if self.corosync_enabled:
	self.Env.enable_service(node, "corosync")

	if self.pacemaker_enabled:
	self.Env.enable_service(node, "pacemaker")

	def stop_pcmk_remote(self, node):
	# disable pcmk remote
	for i in range(10):
	(rc, _) = self.rsh(node, "service pacemaker_remote stop")
	if rc != 0:
	time.sleep(6)
	else:
	break

	def start_pcmk_remote(self, node):
	for i in range(10):
	(rc, _) = self.rsh(node, "service pacemaker_remote start")
	if rc != 0:
	time.sleep(6)
	else:
	self.pcmk_started = 1
	break

	def freeze_pcmk_remote(self, node):
	""" Simulate a Pacemaker Remote daemon failure. """

	# We freeze the process.
	self.rsh(node, "killall -STOP pacemaker-remoted")

	def resume_pcmk_remote(self, node):
	# We resume the process.
	self.rsh(node, "killall -CONT pacemaker-remoted")

	def start_metal(self, node):
	# Cluster nodes are reused as remote nodes in remote tests. If cluster
	# services were enabled at boot, in case the remote node got fenced, the
	# cluster node would join instead of the expected remote one. Meanwhile
	# pacemaker_remote would not be able to start. Depending on the chances,
	# the situations might not be able to be orchestrated gracefully any more.
	#
	# Temporarily disable any enabled cluster serivces.
	self.disable_services(node)

	pcmk_started = 0

	# make sure the resource doesn't already exist for some reason
	self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_rsc))
	self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_node))

	if not self.stop(node):
	self.fail("Failed to shutdown cluster node %s" % node)
	return

	self.start_pcmk_remote(node)

	if self.pcmk_started == 0:
	self.fail("Failed to start pacemaker_remote on node %s" % node)
	return

	# Convert node to baremetal now that it has shutdown the cluster stack
	pats = [ ]
	watch = self.create_watch(pats, 120)
	watch.set_watch()
	pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node))
	pats.append(self.templates["Pat:DC_IDLE"])

	self.add_connection_rsc(node)

	self.set_timer("remoteMetalInit")
	watch.look_for_all()
	self.log_timer("remoteMetalInit")
	if watch.unmatched:
	self.fail("Unmatched patterns: %s" % watch.unmatched)

	def migrate_connection(self, node):
	if self.failed:
	return

	pats = [ ]
	pats.append(self.templates["Pat:RscOpOK"] % ("migrate_to", self.remote_node))
	pats.append(self.templates["Pat:RscOpOK"] % ("migrate_from", self.remote_node))
	pats.append(self.templates["Pat:DC_IDLE"])
	watch = self.create_watch(pats, 120)
	watch.set_watch()

	(rc, _) = self.rsh(node, "crm_resource -M -r %s" % (self.remote_node), verbose=1)
	if rc != 0:
	self.fail("failed to move remote node connection resource")
	return

	self.set_timer("remoteMetalMigrate")
	watch.look_for_all()
	self.log_timer("remoteMetalMigrate")

	if watch.unmatched:
	self.fail("Unmatched patterns: %s" % watch.unmatched)
	return

	def fail_rsc(self, node):
	if self.failed:
	return

	watchpats = [ ]
	watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("stop", self.remote_rsc, self.remote_node))
	watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
	watchpats.append(self.templates["Pat:DC_IDLE"])

	watch = self.create_watch(watchpats, 120)
	watch.set_watch()

	self.debug("causing dummy rsc to fail.")

	self.rsh(node, "rm -f /var/run/resource-agents/Dummy*")

	self.set_timer("remoteRscFail")
	watch.look_for_all()
	self.log_timer("remoteRscFail")
	if watch.unmatched:
	self.fail("Unmatched patterns during rsc fail: %s" % watch.unmatched)

	def fail_connection(self, node):
	if self.failed:
	return

	watchpats = [ ]
	watchpats.append(self.templates["Pat:Fencing_ok"] % self.remote_node)
	watchpats.append(self.templates["Pat:NodeFenced"] % self.remote_node)

	watch = self.create_watch(watchpats, 120)
	watch.set_watch()

	# freeze the pcmk remote daemon. this will result in fencing
	self.debug("Force stopped active remote node")
	self.freeze_pcmk_remote(node)

	self.debug("Waiting for remote node to be fenced.")
	self.set_timer("remoteMetalFence")
	watch.look_for_all()
	self.log_timer("remoteMetalFence")
	if watch.unmatched:
	self.fail("Unmatched patterns: %s" % watch.unmatched)
	return

	self.debug("Waiting for the remote node to come back up")
	self.CM.ns.wait_for_node(node, 120);

	pats = [ ]
	watch = self.create_watch(pats, 240)
	watch.set_watch()
	pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node))
	if self.remote_rsc_added == 1:
	pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))

	# start the remote node again watch it integrate back into cluster.
	self.start_pcmk_remote(node)
	if self.pcmk_started == 0:
	self.fail("Failed to start pacemaker_remote on node %s" % node)
	return

	self.debug("Waiting for remote node to rejoin cluster after being fenced.")
	self.set_timer("remoteMetalRestart")
	watch.look_for_all()
	self.log_timer("remoteMetalRestart")
	if watch.unmatched:
	self.fail("Unmatched patterns: %s" % watch.unmatched)
	return

	def add_dummy_rsc(self, node):
	if self.failed:
	return

	# verify we can put a resource on the remote node
	pats = [ ]
	watch = self.create_watch(pats, 120)
	watch.set_watch()
	pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
	pats.append(self.templates["Pat:DC_IDLE"])

	# Add a resource that must live on remote-node
	self.add_primitive_rsc(node)

	# force that rsc to prefer the remote node.
	(rc, _) = self.CM.rsh(node, "crm_resource -M -r %s -N %s -f" % (self.remote_rsc, self.remote_node), verbose=1)
	if rc != 0:
	self.fail("Failed to place remote resource on remote node.")
	return

	self.set_timer("remoteMetalRsc")
	watch.look_for_all()
	self.log_timer("remoteMetalRsc")
	if watch.unmatched:
	self.fail("Unmatched patterns: %s" % watch.unmatched)

	def test_attributes(self, node):
	if self.failed:
	return

	# This verifies permanent attributes can be set on a remote-node. It also
	# verifies the remote-node can edit its own cib node section remotely.
	(rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -v testval -N %s" % (self.remote_node), verbose=1)
	if rc != 0:
	self.fail("Failed to set remote-node attribute. rc:%s output:%s" % (rc, line))
	return

	(rc, _) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -q -N %s" % (self.remote_node), verbose=1)
	if rc != 0:
	self.fail("Failed to get remote-node attribute")
	return

	(rc, _) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -D -N %s" % (self.remote_node), verbose=1)
	if rc != 0:
	self.fail("Failed to delete remote-node attribute")
	return

	def cleanup_metal(self, node):
	self.restore_services(node)

	if self.pcmk_started == 0:
	return

	pats = [ ]

	watch = self.create_watch(pats, 120)
	watch.set_watch()

	if self.remote_rsc_added == 1:
	pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_rsc))
	if self.remote_node_added == 1:
	pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_node))

	self.set_timer("remoteMetalCleanup")

	self.resume_pcmk_remote(node)

	if self.remote_rsc_added == 1:

	# Remove dummy resource added for remote node tests
	self.debug("Cleaning up dummy rsc put on remote node")
	self.rsh(self.get_othernode(node), "crm_resource -U -r %s" % self.remote_rsc)
	self.del_rsc(node, self.remote_rsc)

	if self.remote_node_added == 1:

	# Remove remote node's connection resource
	self.debug("Cleaning up remote node connection resource")
	self.rsh(self.get_othernode(node), "crm_resource -U -r %s" % (self.remote_node))
	self.del_rsc(node, self.remote_node)

	watch.look_for_all()
	self.log_timer("remoteMetalCleanup")

	if watch.unmatched:
	self.fail("Unmatched patterns: %s" % watch.unmatched)

	self.stop_pcmk_remote(node)

	self.debug("Waiting for the cluster to recover")
	self.CM.cluster_stable()

	if self.remote_node_added == 1:
	# Remove remote node itself
	self.debug("Cleaning up node entry for remote node")
	self.rsh(self.get_othernode(node), "crm_node --force --remove %s" % self.remote_node)

	def setup_env(self, node):

	self.remote_node = "remote-%s" % (node)

	# we are assuming if all nodes have a key, that it is
	# the right key... If any node doesn't have a remote
	# key, we regenerate it everywhere.
	if self.rsh.exists_on_all("/etc/pacemaker/authkey", self.Env["nodes"]):
	return

	# create key locally
	(handle, keyfile) = tempfile.mkstemp(".cts")
	os.close(handle)
	subprocess.check_call(["dd", "if=/dev/urandom", "of=%s" % keyfile, "bs=4096", "count=1"],
	stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

	# sync key throughout the cluster
	for node in self.Env["nodes"]:
	self.rsh(node, "mkdir -p --mode=0750 /etc/pacemaker")
	self.rsh.copy(keyfile, "root@%s:/etc/pacemaker/authkey" % node)
	self.rsh(node, "chgrp haclient /etc/pacemaker /etc/pacemaker/authkey")
	self.rsh(node, "chmod 0640 /etc/pacemaker/authkey")
	os.unlink(keyfile)

	def is_applicable(self):
	if not self.is_applicable_common():
	return False

	for node in self.Env["nodes"]:
	(rc, _) = self.rsh(node, "which pacemaker-remoted >/dev/null 2>&1")
	if rc != 0:
	return False
	return True

	def start_new_test(self, node):
	self.incr("calls")
	self.reset()

	ret = self.startall(None)
	if not ret:
	return self.failure("setup failed: could not start all nodes")

	self.setup_env(node)
	self.start_metal(node)
	self.add_dummy_rsc(node)
	return True

	def __call__(self, node):
	return self.failure("This base class is not meant to be called directly.")

	def errorstoignore(self):
	'''Return list of errors which should be ignored'''
	return [ r"""is running on remote.*which isn't allowed""",
	r"""Connection terminated""",
	r"""Could not send remote""",
	]

	# RemoteDriver is just a base class for other tests, so it is not added to AllTestClasses


	class RemoteBasic(RemoteDriver):

	def __call__(self, node):
	'''Perform the 'RemoteBaremetal' test. '''

	if not self.start_new_test(node):
	return self.failure(self.fail_string)

	self.test_attributes(node)
	self.cleanup_metal(node)

	self.debug("Waiting for the cluster to recover")
	self.CM.cluster_stable()
	if self.failed:
	return self.failure(self.fail_string)

	return self.success()

	AllTestClasses.append(RemoteBasic)

	class RemoteStonithd(RemoteDriver):

	def __call__(self, node):
	'''Perform the 'RemoteStonithd' test. '''

	if not self.start_new_test(node):
	return self.failure(self.fail_string)

	self.fail_connection(node)
	self.cleanup_metal(node)

	self.debug("Waiting for the cluster to recover")
	self.CM.cluster_stable()
	if self.failed:
	return self.failure(self.fail_string)

	return self.success()

	def is_applicable(self):
	if not RemoteDriver.is_applicable(self):
	return False

	if "DoFencing" in list(self.Env.keys()):
	return self.Env["DoFencing"]

	return True

	def errorstoignore(self):
	ignore_pats = [
	r"Lost connection to Pacemaker Remote node",
	r"Software caused connection abort",
	r"pacemaker-controld.:\s+error.: Operation remote-.*_monitor",
	r"pacemaker-controld.:\s+error.: Result of monitor operation for remote-.*",
	r"schedulerd.:\s+Recover\s+remote-.\s+$.*$",
	r"error: Result of monitor operation for .* on remote-.*: Internal communication failure",
	]

	ignore_pats.extend(RemoteDriver.errorstoignore(self))
	return ignore_pats

	AllTestClasses.append(RemoteStonithd)


	class RemoteMigrate(RemoteDriver):

	def __call__(self, node):
	'''Perform the 'RemoteMigrate' test. '''

	if not self.start_new_test(node):
	return self.failure(self.fail_string)

	self.migrate_connection(node)
	self.cleanup_metal(node)

	self.debug("Waiting for the cluster to recover")
	self.CM.cluster_stable()
	if self.failed:
	return self.failure(self.fail_string)

	return self.success()

	def is_applicable(self):
	if not RemoteDriver.is_applicable(self):
	return 0
	# This test requires at least three nodes: one to convert to a
	# remote node, one to host the connection originally, and one
	# to migrate the connection to.
	if len(self.Env["nodes"]) < 3:
	return 0
	return 1

	AllTestClasses.append(RemoteMigrate)


	class RemoteRscFailure(RemoteDriver):

	def __call__(self, node):
	'''Perform the 'RemoteRscFailure' test. '''

	if not self.start_new_test(node):
	return self.failure(self.fail_string)

	# This is an important step. We are migrating the connection
	# before failing the resource. This verifies that the migration
	# has properly maintained control over the remote-node.
	self.migrate_connection(node)

	self.fail_rsc(node)
	self.cleanup_metal(node)

	self.debug("Waiting for the cluster to recover")
	self.CM.cluster_stable()
	if self.failed:
	return self.failure(self.fail_string)

	return self.success()

	def errorstoignore(self):
	ignore_pats = [
	r"schedulerd.: Recover\s+remote-rsc\s+$.$",
	r"Dummy.*: No process state file found",
	]

	ignore_pats.extend(RemoteDriver.errorstoignore(self))
	return ignore_pats

	def is_applicable(self):
	if not RemoteDriver.is_applicable(self):
	return 0
	# This test requires at least three nodes: one to convert to a
	# remote node, one to host the connection originally, and one
	# to migrate the connection to.
	if len(self.Env["nodes"]) < 3:
	return 0
	return 1

	AllTestClasses.append(RemoteRscFailure)

	# vim:ts=4:sw=4:et:
	diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c
	index 38ebeb7ddf..89cb61fa3d 100644
	--- a/daemons/controld/controld_fencing.c
	+++ b/daemons/controld/controld_fencing.c
	@@ -1,1108 +1,1108 @@
	/*
	* Copyright 2004-2023 the Pacemaker project contributors
	*
	* The version control history for this file may have further details.
	*
	* This source code is licensed under the GNU General Public License version 2
	* or later (GPLv2+) WITHOUT ANY WARRANTY.
	*/

	#include <crm_internal.h>
	#include <crm/crm.h>
	#include <crm/msg_xml.h>
	#include <crm/common/xml.h>
	#include <crm/stonith-ng.h>
	#include <crm/fencing/internal.h>

	#include <pacemaker-controld.h>

	static void
	tengine_stonith_history_synced(stonith_t st, stonith_event_t st_event);

	/*
	* stonith failure counting
	*
	* We don't want to get stuck in a permanent fencing loop. Keep track of the
	* number of fencing failures for each target node, and the most we'll restart a
	* transition for.
	*/

	struct st_fail_rec {
	int count;
	};

	static bool fence_reaction_panic = false;
	static unsigned long int stonith_max_attempts = 10;
	static GHashTable *stonith_failures = NULL;

	/*!
	* \internal
	* \brief Update max fencing attempts before giving up
	*
	* \param[in] value New max fencing attempts
	*/
	static void
	update_stonith_max_attempts(const char *value)
	{
	stonith_max_attempts = char2score(value);
	if (stonith_max_attempts < 1UL) {
	stonith_max_attempts = 10UL;
	}
	}

	/*!
	* \internal
	* \brief Configure reaction to notification of local node being fenced
	*
	* \param[in] reaction_s Reaction type
	*/
	static void
	set_fence_reaction(const char *reaction_s)
	{
	if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) {
	fence_reaction_panic = true;

	} else {
	if (!pcmk__str_eq(reaction_s, "stop", pcmk__str_casei)) {
	crm_warn("Invalid value '%s' for %s, using 'stop'",
	reaction_s, XML_CONFIG_ATTR_FENCE_REACTION);
	}
	fence_reaction_panic = false;
	}
	}

	/*!
	* \internal
	* \brief Configure fencing options based on the CIB
	*
	* \param[in,out] options Name/value pairs for configured options
	*/
	void
	controld_configure_fencing(GHashTable *options)
	{
	const char *value = NULL;

	value = g_hash_table_lookup(options, XML_CONFIG_ATTR_FENCE_REACTION);
	set_fence_reaction(value);

	value = g_hash_table_lookup(options, "stonith-max-attempts");
	update_stonith_max_attempts(value);
	}

	static gboolean
	too_many_st_failures(const char *target)
	{
	GHashTableIter iter;
	const char *key = NULL;
	struct st_fail_rec *value = NULL;

	if (stonith_failures == NULL) {
	return FALSE;
	}

	if (target == NULL) {
	g_hash_table_iter_init(&iter, stonith_failures);
	while (g_hash_table_iter_next(&iter, (gpointer *) &key,
	(gpointer *) &value)) {

	if (value->count >= stonith_max_attempts) {
	target = (const char*)key;
	goto too_many;
	}
	}
	} else {
	value = g_hash_table_lookup(stonith_failures, target);
	if ((value != NULL) && (value->count >= stonith_max_attempts)) {
	goto too_many;
	}
	}
	return FALSE;

	too_many:
	crm_warn("Too many failures (%d) to fence %s, giving up",
	value->count, target);
	return TRUE;
	}

	/*!
	* \internal
	* \brief Reset a stonith fail count
	*
	* \param[in] target Name of node to reset, or NULL for all
	*/
	void
	st_fail_count_reset(const char *target)
	{
	if (stonith_failures == NULL) {
	return;
	}

	if (target) {
	struct st_fail_rec *rec = NULL;

	rec = g_hash_table_lookup(stonith_failures, target);
	if (rec) {
	rec->count = 0;
	}
	} else {
	GHashTableIter iter;
	const char *key = NULL;
	struct st_fail_rec *rec = NULL;

	g_hash_table_iter_init(&iter, stonith_failures);
	while (g_hash_table_iter_next(&iter, (gpointer *) &key,
	(gpointer *) &rec)) {
	rec->count = 0;
	}
	}
	}

	static void
	st_fail_count_increment(const char *target)
	{
	struct st_fail_rec *rec = NULL;

	if (stonith_failures == NULL) {
	stonith_failures = pcmk__strkey_table(free, free);
	}

	rec = g_hash_table_lookup(stonith_failures, target);
	if (rec) {
	rec->count++;
	} else {
	rec = malloc(sizeof(struct st_fail_rec));
	if(rec == NULL) {
	return;
	}

	rec->count = 1;
	g_hash_table_insert(stonith_failures, strdup(target), rec);
	}
	}

	/* end stonith fail count functions */


	static void
	cib_fencing_updated(xmlNode msg, int call_id, int rc, xmlNode output,
	void *user_data)
	{
	if (rc < pcmk_ok) {
	crm_err("Fencing update %d for %s: failed - %s (%d)",
	call_id, (char *)user_data, pcmk_strerror(rc), rc);
	crm_log_xml_warn(msg, "Failed update");
	abort_transition(INFINITY, pcmk__graph_shutdown, "CIB update failed",
	NULL);

	} else {
	crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
	}
	}

	static void
	send_stonith_update(pcmk__graph_action_t action, const char target,
	const char *uuid)
	{
	int rc = pcmk_ok;
	crm_node_t *peer = NULL;

	/* We (usually) rely on the membership layer to do node_update_cluster,
	* and the peer status callback to do node_update_peer, because the node
	* might have already rejoined before we get the stonith result here.
	*/
	int flags = node_update_join \| node_update_expected;

	/* zero out the node-status & remove all LRM status info */
	xmlNode *node_state = NULL;

	CRM_CHECK(target != NULL, return);
	CRM_CHECK(uuid != NULL, return);

	/* Make sure the membership and join caches are accurate */
	peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY);

	CRM_CHECK(peer != NULL, return);

	if (peer->state == NULL) {
	/* Usually, we rely on the membership layer to update the cluster state
	* in the CIB. However, if the node has never been seen, do it here, so
	* the node is not considered unclean.
	*/
	flags \|= node_update_cluster;
	}

	if (peer->uuid == NULL) {
	crm_info("Recording uuid '%s' for node '%s'", uuid, target);
	peer->uuid = strdup(uuid);
	}

	crmd_peer_down(peer, TRUE);

	/* Generate a node state update for the CIB */
	node_state = create_node_state_update(peer, flags, NULL, __func__);

	/* we have to mark whether or not remote nodes have already been fenced */
	if (peer->flags & crm_remote_node) {
	char *now_s = pcmk__ttoa(time(NULL));

	crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s);
	free(now_s);
	}

	/* Force our known ID */
	crm_xml_add(node_state, XML_ATTR_ID, uuid);

	rc = controld_globals.cib_conn->cmds->modify(controld_globals.cib_conn,
	XML_CIB_TAG_STATUS, node_state,
	cib_scope_local
	\|cib_can_create);

	/* Delay processing the trigger until the update completes */
	crm_debug("Sending fencing update %d for %s", rc, target);
	fsa_register_cib_callback(rc, strdup(target), cib_fencing_updated);

	// Make sure it sticks
	/* controld_globals.cib_conn->cmds->bump_epoch(controld_globals.cib_conn,
	* cib_scope_local);
	*/

	controld_delete_node_state(peer->uname, controld_section_all,
	cib_scope_local);
	free_xml(node_state);
	return;
	}

	/*!
	* \internal
	* \brief Abort transition due to stonith failure
	*
	* \param[in] abort_action Whether to restart or stop transition
	* \param[in] target Don't restart if this (NULL for any) has too many failures
	* \param[in] reason Log this stonith action XML as abort reason (or NULL)
	*/
	static void
	abort_for_stonith_failure(enum pcmk__graph_next abort_action,
	const char target, const xmlNode reason)
	{
	/* If stonith repeatedly fails, we eventually give up on starting a new
	* transition for that reason.
	*/
	if ((abort_action != pcmk__graph_wait) && too_many_st_failures(target)) {
	abort_action = pcmk__graph_wait;
	}
	abort_transition(INFINITY, abort_action, "Stonith failed", reason);
	}


	/*
	* stonith cleanup list
	*
	* If the DC is shot, proper notifications might not go out.
	* The stonith cleanup list allows the cluster to (re-)send
	* notifications once a new DC is elected.
	*/

	static GList *stonith_cleanup_list = NULL;

	/*!
	* \internal
	* \brief Add a node to the stonith cleanup list
	*
	* \param[in] target Name of node to add
	*/
	void
	add_stonith_cleanup(const char *target) {
	stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target));
	}

	/*!
	* \internal
	* \brief Remove a node from the stonith cleanup list
	*
	* \param[in] Name of node to remove
	*/
	void
	remove_stonith_cleanup(const char *target)
	{
	GList *iter = stonith_cleanup_list;

	while (iter != NULL) {
	GList *tmp = iter;
	char *iter_name = tmp->data;

	iter = iter->next;
	if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) {
	crm_trace("Removing %s from the cleanup list", iter_name);
	stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp);
	free(iter_name);
	}
	}
	}

	/*!
	* \internal
	* \brief Purge all entries from the stonith cleanup list
	*/
	void
	purge_stonith_cleanup(void)
	{
	if (stonith_cleanup_list) {
	GList *iter = NULL;

	for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
	char *target = iter->data;

	crm_info("Purging %s from stonith cleanup list", target);
	free(target);
	}
	g_list_free(stonith_cleanup_list);
	stonith_cleanup_list = NULL;
	}
	}

	/*!
	* \internal
	* \brief Send stonith updates for all entries in cleanup list, then purge it
	*/
	void
	execute_stonith_cleanup(void)
	{
	GList *iter;

	for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
	char *target = iter->data;
	crm_node_t *target_node = crm_get_peer(0, target);
	const char *uuid = crm_peer_uuid(target_node);

	crm_notice("Marking %s, target of a previous stonith action, as clean", target);
	send_stonith_update(NULL, target, uuid);
	free(target);
	}
	g_list_free(stonith_cleanup_list);
	stonith_cleanup_list = NULL;
	}

	/* end stonith cleanup list functions */


	/* stonith API client
	*
	* Functions that need to interact directly with the fencer via its API
	*/

	static stonith_t *stonith_api = NULL;
	static crm_trigger_t *stonith_reconnect = NULL;
	static char *te_client_id = NULL;

	static gboolean
	fail_incompletable_stonith(pcmk__graph_t *graph)
	{
	GList *lpc = NULL;
	const char *task = NULL;
	xmlNode *last_action = NULL;

	if (graph == NULL) {
	return FALSE;
	}

	for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
	GList *lpc2 = NULL;
	pcmk__graph_synapse_t synapse = (pcmk__graph_synapse_t ) lpc->data;

	if (pcmk_is_set(synapse->flags, pcmk__synapse_confirmed)) {
	continue;
	}

	for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) {
	pcmk__graph_action_t action = (pcmk__graph_action_t ) lpc2->data;

	if ((action->type != pcmk__cluster_graph_action)
	\|\| pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) {
	continue;
	}

	task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
	if (task && pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) {
	pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
	last_action = action->xml;
	pcmk__update_graph(graph, action);
	crm_notice("Failing action %d (%s): fencer terminated",
	action->id, ID(action->xml));
	}
	}
	}

	if (last_action != NULL) {
	crm_warn("Fencer failure resulted in unrunnable actions");
	abort_for_stonith_failure(pcmk__graph_restart, NULL, last_action);
	return TRUE;
	}

	return FALSE;
	}

	static void
	tengine_stonith_connection_destroy(stonith_t st, stonith_event_t e)
	{
	te_cleanup_stonith_history_sync(st, FALSE);

	if (pcmk_is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) {
	crm_crit("Fencing daemon connection failed");
	mainloop_set_trigger(stonith_reconnect);

	} else {
	crm_info("Fencing daemon disconnected");
	}

	if (stonith_api) {
	/* the client API won't properly reconnect notifications
	* if they are still in the table - so remove them
	*/
	if (stonith_api->state != stonith_disconnected) {
	stonith_api->cmds->disconnect(st);
	}
	stonith_api->cmds->remove_notification(stonith_api, NULL);
	}

	if (AM_I_DC) {
	fail_incompletable_stonith(controld_globals.transition_graph);
	trigger_graph();
	}
	}

	/*!
	* \internal
	* \brief Handle an event notification from the fencing API
	*
	* \param[in] st Fencing API connection (ignored)
	* \param[in] event Fencing API event notification
	*/
	static void
	handle_fence_notification(stonith_t st, stonith_event_t event)
	{
	bool succeeded = true;
	const char *executioner = "the cluster";
	const char *client = "a client";
	const char *reason = NULL;
	int exec_status;

	if (te_client_id == NULL) {
	te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
	(unsigned long) getpid());
	}

	if (event == NULL) {
	crm_err("Notify data not found");
	return;
	}

	if (event->executioner != NULL) {
	executioner = event->executioner;
	}
	if (event->client_origin != NULL) {
	client = event->client_origin;
	}

	exec_status = stonith__event_execution_status(event);
	if ((stonith__event_exit_status(event) != CRM_EX_OK)
	\|\| (exec_status != PCMK_EXEC_DONE)) {
	succeeded = false;
	if (exec_status == PCMK_EXEC_DONE) {
	exec_status = PCMK_EXEC_ERROR;
	}
	}
	reason = stonith__event_exit_reason(event);

	crmd_alert_fencing_op(event);

	if (pcmk__str_eq("on", event->action, pcmk__str_none)) {
	// Unfencing doesn't need special handling, just a log message
	if (succeeded) {
	crm_notice("%s was unfenced by %s at the request of %s@%s",
	event->target, executioner, client, event->origin);
	} else {
	crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d",
	event->target, executioner,
	pcmk_exec_status_str(exec_status),
	((reason == NULL)? "" : ": "),
	((reason == NULL)? "" : reason),
	stonith__event_exit_status(event));
	}
	return;
	}

	if (succeeded
	&& pcmk__str_eq(event->target, controld_globals.our_nodename,
	pcmk__str_casei)) {
	/* We were notified of our own fencing. Most likely, either fencing was
	* misconfigured, or fabric fencing that doesn't cut cluster
	* communication is in use.
	*
	* Either way, shutting down the local host is a good idea, to require
	* administrator intervention. Also, other nodes would otherwise likely
	* set our status to lost because of the fencing callback and discard
	* our subsequent election votes as "not part of our cluster".
	*/
	crm_crit("We were allegedly just fenced by %s for %s!",
	executioner, event->origin); // Dumps blackbox if enabled
	if (fence_reaction_panic) {
	pcmk__panic(__func__);
	} else {
	crm_exit(CRM_EX_FATAL);
	}
	return; // Should never get here
	}

	/* Update the count of fencing failures for this target, in case we become
	* DC later. The current DC has already updated its fail count in
	* tengine_stonith_callback().
	*/
	if (!AM_I_DC) {
	if (succeeded) {
	st_fail_count_reset(event->target);
	} else {
	st_fail_count_increment(event->target);
	}
	}

	crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: "
	"%s%s%s%s " CRM_XS " event=%s",
	event->target, (succeeded? "" : " not"),
	event->action, executioner, client, event->origin,
	(succeeded? "OK" : pcmk_exec_status_str(exec_status)),
	((reason == NULL)? "" : " ("),
	((reason == NULL)? "" : reason),
	((reason == NULL)? "" : ")"),
	event->id);

	if (succeeded) {
	crm_node_t *peer = pcmk__search_known_node_cache(0, event->target,
	CRM_GET_PEER_ANY);
	const char *uuid = NULL;

	if (peer == NULL) {
	return;
	}

	uuid = crm_peer_uuid(peer);

	if (AM_I_DC) {
	/* The DC always sends updates */
	send_stonith_update(NULL, event->target, uuid);

	/* @TODO Ideally, at this point, we'd check whether the fenced node
	* hosted any guest nodes, and call remote_node_down() for them.
	* Unfortunately, the controller doesn't have a simple, reliable way
	* to map hosts to guests. It might be possible to track this in the
	* peer cache via crm_remote_peer_cache_refresh(). For now, we rely
	* on the scheduler creating fence pseudo-events for the guests.
	*/

	if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) {
	/* Abort the current transition if it wasn't the cluster that
	* initiated fencing.
	*/
	crm_info("External fencing operation from %s fenced %s",
	client, event->target);
	abort_transition(INFINITY, pcmk__graph_restart,
	"External Fencing Operation", NULL);
	}

	} else if (pcmk__str_eq(controld_globals.dc_name, event->target,
	pcmk__str_null_matches\|pcmk__str_casei)
	&& !pcmk_is_set(peer->flags, crm_remote_node)) {
	// Assume the target was our DC if we don't currently have one

	if (controld_globals.dc_name != NULL) {
	crm_notice("Fencing target %s was our DC", event->target);
	} else {
	crm_notice("Fencing target %s may have been our DC",
	event->target);
	}

	/* Given the CIB resyncing that occurs around elections,
	* have one node update the CIB now and, if the new DC is different,
	* have them do so too after the election
	*/
	if (pcmk__str_eq(event->executioner, controld_globals.our_nodename,
	pcmk__str_casei)) {
	send_stonith_update(NULL, event->target, uuid);
	}
	add_stonith_cleanup(event->target);
	}

	/* If the target is a remote node, and we host its connection,
	* immediately fail all monitors so it can be recovered quickly.
	* The connection won't necessarily drop when a remote node is fenced,
	* so the failure might not otherwise be detected until the next poke.
	*/
	if (pcmk_is_set(peer->flags, crm_remote_node)) {
	remote_ra_fail(event->target);
	}

	crmd_peer_down(peer, TRUE);
	}
	}

	/*!
	* \brief Connect to fencer
	*
	* \param[in] user_data If NULL, retry failures now, otherwise retry in main loop
	*
	* \return TRUE
	* \note If user_data is NULL, this will wait 2s between attempts, for up to
	* 30 attempts, meaning the controller could be blocked as long as 58s.
	*/
	static gboolean
	te_connect_stonith(gpointer user_data)
	{
	int rc = pcmk_ok;

	if (stonith_api == NULL) {
	stonith_api = stonith_api_new();
	if (stonith_api == NULL) {
	crm_err("Could not connect to fencer: API memory allocation failed");
	return TRUE;
	}
	}

	if (stonith_api->state != stonith_disconnected) {
	crm_trace("Already connected to fencer, no need to retry");
	return TRUE;
	}

	if (user_data == NULL) {
	// Blocking (retry failures now until successful)
	rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30);
	if (rc != pcmk_ok) {
	crm_err("Could not connect to fencer in 30 attempts: %s "
	CRM_XS " rc=%d", pcmk_strerror(rc), rc);
	}
	} else {
	// Non-blocking (retry failures later in main loop)
	rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
	if (rc != pcmk_ok) {
	if (pcmk_is_set(controld_globals.fsa_input_register,
	R_ST_REQUIRED)) {
	crm_notice("Fencer connection failed (will retry): %s "
	CRM_XS " rc=%d", pcmk_strerror(rc), rc);
	mainloop_set_trigger(stonith_reconnect);
	} else {
	crm_info("Fencer connection failed (ignoring because no longer required): %s "
	CRM_XS " rc=%d", pcmk_strerror(rc), rc);
	}
	return TRUE;
	}
	}

	if (rc == pcmk_ok) {
	stonith_api->cmds->register_notification(stonith_api,
	T_STONITH_NOTIFY_DISCONNECT,
	tengine_stonith_connection_destroy);
	stonith_api->cmds->register_notification(stonith_api,
	T_STONITH_NOTIFY_FENCE,
	handle_fence_notification);
	stonith_api->cmds->register_notification(stonith_api,
	T_STONITH_NOTIFY_HISTORY_SYNCED,
	tengine_stonith_history_synced);
	te_trigger_stonith_history_sync(TRUE);
	crm_notice("Fencer successfully connected");
	}

	return TRUE;
	}

	/*!
	\internal
	\brief Schedule fencer connection attempt in main loop
	*/
	void
	controld_trigger_fencer_connect(void)
	{
	if (stonith_reconnect == NULL) {
	stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW,
	te_connect_stonith,
	GINT_TO_POINTER(TRUE));
	}
	controld_set_fsa_input_flags(R_ST_REQUIRED);
	mainloop_set_trigger(stonith_reconnect);
	}

	void
	controld_disconnect_fencer(bool destroy)
	{
	if (stonith_api) {
	// Prevent fencer connection from coming up again
	controld_clear_fsa_input_flags(R_ST_REQUIRED);

	if (stonith_api->state != stonith_disconnected) {
	stonith_api->cmds->disconnect(stonith_api);
	}
	stonith_api->cmds->remove_notification(stonith_api, NULL);
	}
	if (destroy) {
	if (stonith_api) {
	stonith_api->cmds->free(stonith_api);
	stonith_api = NULL;
	}
	if (stonith_reconnect) {
	mainloop_destroy_trigger(stonith_reconnect);
	stonith_reconnect = NULL;
	}
	if (te_client_id) {
	free(te_client_id);
	te_client_id = NULL;
	}
	}
	}

	static gboolean
	do_stonith_history_sync(gpointer user_data)
	{
	if (stonith_api && (stonith_api->state != stonith_disconnected)) {
	stonith_history_t *history = NULL;

	te_cleanup_stonith_history_sync(stonith_api, FALSE);
	stonith_api->cmds->history(stonith_api,
	st_opt_sync_call \| st_opt_broadcast,
	NULL, &history, 5);
	stonith_history_free(history);
	return TRUE;
	} else {
	crm_info("Skip triggering stonith history-sync as stonith is disconnected");
	return FALSE;
	}
	}

	static void
	tengine_stonith_callback(stonith_t stonith, stonith_callback_data_t data)
	{
	char *uuid = NULL;
	int stonith_id = -1;
	int transition_id = -1;
	pcmk__graph_action_t *action = NULL;
	const char *target = NULL;

	if ((data == NULL) \|\| (data->userdata == NULL)) {
	crm_err("Ignoring fence operation %d result: "
	"No transition key given (bug?)",
	((data == NULL)? -1 : data->call_id));
	return;
	}

	if (!AM_I_DC) {
	const char *reason = stonith__exit_reason(data);

	if (reason == NULL) {
	reason = pcmk_exec_status_str(stonith__execution_status(data));
	}
	crm_notice("Result of fence operation %d: %d (%s) " CRM_XS " key=%s",
	data->call_id, stonith__exit_status(data), reason,
	(const char *) data->userdata);
	return;
	}

	CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id,
	&stonith_id, NULL),
	goto bail);

	if (controld_globals.transition_graph->complete \|\| (stonith_id < 0)
	\|\| !pcmk__str_eq(uuid, controld_globals.te_uuid, pcmk__str_none)
	\|\| (controld_globals.transition_graph->id != transition_id)) {
	crm_info("Ignoring fence operation %d result: "
	"Not from current transition " CRM_XS
	" complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)",
	data->call_id,
	pcmk__btoa(controld_globals.transition_graph->complete),
	stonith_id, uuid, controld_globals.te_uuid, transition_id,
	controld_globals.transition_graph->id);
	goto bail;
	}

	action = controld_get_action(stonith_id);
	if (action == NULL) {
	crm_err("Ignoring fence operation %d result: "
	"Action %d not found in transition graph (bug?) "
	CRM_XS " uuid=%s transition=%d",
	data->call_id, stonith_id, uuid, transition_id);
	goto bail;
	}

	target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
	if (target == NULL) {
	crm_err("Ignoring fence operation %d result: No target given (bug?)",
	data->call_id);
	goto bail;
	}

	stop_te_timer(action);
	if (stonith__exit_status(data) == CRM_EX_OK) {
	const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
	const char *op = crm_meta_value(action->params, "stonith_action");

	crm_info("Fence operation %d for %s succeeded", data->call_id, target);
	if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) {
	te_action_confirmed(action, NULL);
	if (pcmk__str_eq("on", op, pcmk__str_casei)) {
	const char *value = NULL;
	char *now = pcmk__ttoa(time(NULL));
	gboolean is_remote_node = FALSE;

	/* This check is not 100% reliable, since this node is not
	* guaranteed to have the remote node cached. However, it
	* doesn't have to be reliable, since the attribute manager can
	* learn a node's "remoteness" by other means sooner or later.
	* This allows it to learn more quickly if this node does have
	* the information.
	*/
	if (g_hash_table_lookup(crm_remote_peer_cache, uuid) != NULL) {
	is_remote_node = TRUE;
	}

	update_attrd(target, CRM_ATTR_UNFENCED, now, NULL,
	is_remote_node);
	free(now);

	value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL);
	update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL,
	is_remote_node);

	value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE);
	update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL,
	is_remote_node);

	} else if (!(pcmk_is_set(action->flags, pcmk__graph_action_sent_update))) {
	send_stonith_update(action, target, uuid);
	pcmk__set_graph_action_flags(action,
	pcmk__graph_action_sent_update);
	}
	}
	st_fail_count_reset(target);

	} else {
	enum pcmk__graph_next abort_action = pcmk__graph_restart;
	int status = stonith__execution_status(data);
	const char *reason = stonith__exit_reason(data);

	if (reason == NULL) {
	if (status == PCMK_EXEC_DONE) {
	reason = "Agent returned error";
	} else {
	reason = pcmk_exec_status_str(status);
	}
	}
	pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);

	/* If no fence devices were available, there's no use in immediately
	* checking again, so don't start a new transition in that case.
	*/
	if (status == PCMK_EXEC_NO_FENCE_DEVICE) {
	crm_warn("Fence operation %d for %s failed: %s "
	"(aborting transition and giving up for now)",
	data->call_id, target, reason);
	abort_action = pcmk__graph_wait;
	} else {
	crm_notice("Fence operation %d for %s failed: %s "
	"(aborting transition)", data->call_id, target, reason);
	}

	/* Increment the fail count now, so abort_for_stonith_failure() can
	* check it. Non-DC nodes will increment it in
	* handle_fence_notification().
	*/
	st_fail_count_increment(target);
	abort_for_stonith_failure(abort_action, target, NULL);
	}

	pcmk__update_graph(controld_globals.transition_graph, action);
	trigger_graph();

	bail:
	free(data->userdata);
	free(uuid);
	return;
	}

	static int
	fence_with_delay(const char target, const char type, int delay)
	{
	uint32_t options = st_opt_none; // Group of enum stonith_call_options
	int timeout_sec = (int) (controld_globals.transition_graph->stonith_timeout
	/ 1000);

	if (crmd_join_phase_count(crm_join_confirmed) == 1) {
	stonith__set_call_options(options, target, st_opt_allow_suicide);
	}
	return stonith_api->cmds->fence_with_delay(stonith_api, options, target,
	type, timeout_sec, 0, delay);
	}

	/*!
	* \internal
	* \brief Execute a fencing action from a transition graph
	*
	* \param[in] graph Transition graph being executed (ignored)
	* \param[in] action Fencing action to execute
	*
	* \return Standard Pacemaker return code
	*/
	int
	controld_execute_fence_action(pcmk__graph_t *graph,
	pcmk__graph_action_t *action)
	{
	int rc = 0;
	const char *id = ID(action->xml);
	const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
	const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
	const char *type = crm_meta_value(action->params, "stonith_action");
	char *transition_key = NULL;
	const char *priority_delay = NULL;
	int delay_i = 0;
	gboolean invalid_action = FALSE;
	int stonith_timeout = (int) (controld_globals.transition_graph->stonith_timeout
	/ 1000);

	CRM_CHECK(id != NULL, invalid_action = TRUE);
	CRM_CHECK(uuid != NULL, invalid_action = TRUE);
	CRM_CHECK(type != NULL, invalid_action = TRUE);
	CRM_CHECK(target != NULL, invalid_action = TRUE);

	if (invalid_action) {
	crm_log_xml_warn(action->xml, "BadAction");
	return EPROTO;
	}

	priority_delay = crm_meta_value(action->params, XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY);

	- crm_notice("Requesting fencing (%s) of node %s "
	+ crm_notice("Requesting fencing (%s) targeting node %s "
	CRM_XS " action=%s timeout=%i%s%s",
	type, target, id, stonith_timeout,
	priority_delay ? " priority_delay=" : "",
	priority_delay ? priority_delay : "");

	/* Passing NULL means block until we can connect... */
	te_connect_stonith(NULL);

	pcmk__scan_min_int(priority_delay, &delay_i, 0);
	rc = fence_with_delay(target, type, delay_i);
	transition_key = pcmk__transition_key(controld_globals.transition_graph->id,
	action->id, 0,
	controld_globals.te_uuid),
	stonith_api->cmds->register_callback(stonith_api, rc,
	(stonith_timeout
	+ (delay_i > 0 ? delay_i : 0)),
	st_opt_timeout_updates, transition_key,
	"tengine_stonith_callback",
	tengine_stonith_callback);
	return pcmk_rc_ok;
	}

	bool
	controld_verify_stonith_watchdog_timeout(const char *value)
	{
	const char *our_nodename = controld_globals.our_nodename;
	gboolean rv = TRUE;

	if (stonith_api && (stonith_api->state != stonith_disconnected) &&
	stonith__watchdog_fencing_enabled_for_node_api(stonith_api,
	our_nodename)) {
	rv = pcmk__valid_sbd_timeout(value);
	}
	return rv;
	}

	/* end stonith API client functions */


	/*
	* stonith history synchronization
	*
	* Each node's fencer keeps track of a cluster-wide fencing history. When a node
	* joins or leaves, we need to synchronize the history across all nodes.
	*/

	static crm_trigger_t *stonith_history_sync_trigger = NULL;
	static mainloop_timer_t *stonith_history_sync_timer_short = NULL;
	static mainloop_timer_t *stonith_history_sync_timer_long = NULL;

	void
	te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers)
	{
	if (free_timers) {
	mainloop_timer_del(stonith_history_sync_timer_short);
	stonith_history_sync_timer_short = NULL;
	mainloop_timer_del(stonith_history_sync_timer_long);
	stonith_history_sync_timer_long = NULL;
	} else {
	mainloop_timer_stop(stonith_history_sync_timer_short);
	mainloop_timer_stop(stonith_history_sync_timer_long);
	}

	if (st) {
	st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY_SYNCED);
	}
	}

	static void
	tengine_stonith_history_synced(stonith_t st, stonith_event_t st_event)
	{
	te_cleanup_stonith_history_sync(st, FALSE);
	crm_debug("Fence-history synced - cancel all timers");
	}

	static gboolean
	stonith_history_sync_set_trigger(gpointer user_data)
	{
	mainloop_set_trigger(stonith_history_sync_trigger);
	return FALSE;
	}

	void
	te_trigger_stonith_history_sync(bool long_timeout)
	{
	/* trigger a sync in 5s to give more nodes the
	* chance to show up so that we don't create
	* unnecessary stonith-history-sync traffic
	*
	* the long timeout of 30s is there as a fallback
	* so that after a successful connection to fenced
	* we will wait for 30s for the DC to trigger a
	* history-sync
	* if this doesn't happen we trigger a sync locally
	* (e.g. fenced segfaults and is restarted by pacemakerd)
	*/

	/* as we are finally checking the stonith-connection
	* in do_stonith_history_sync we should be fine
	* leaving stonith_history_sync_time & stonith_history_sync_trigger
	* around
	*/
	if (stonith_history_sync_trigger == NULL) {
	stonith_history_sync_trigger =
	mainloop_add_trigger(G_PRIORITY_LOW,
	do_stonith_history_sync, NULL);
	}

	if (long_timeout) {
	if(stonith_history_sync_timer_long == NULL) {
	stonith_history_sync_timer_long =
	mainloop_timer_add("history_sync_long", 30000,
	FALSE, stonith_history_sync_set_trigger,
	NULL);
	}
	crm_info("Fence history will be synchronized cluster-wide within 30 seconds");
	mainloop_timer_start(stonith_history_sync_timer_long);
	} else {
	if(stonith_history_sync_timer_short == NULL) {
	stonith_history_sync_timer_short =
	mainloop_timer_add("history_sync_short", 5000,
	FALSE, stonith_history_sync_set_trigger,
	NULL);
	}
	crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
	mainloop_timer_start(stonith_history_sync_timer_short);
	}

	}

	/* end stonith history synchronization functions */
	diff --git a/python/pacemaker/_cts/patterns.py b/python/pacemaker/_cts/patterns.py
	index 2e4c23d232..880477aa6a 100644
	--- a/python/pacemaker/_cts/patterns.py
	+++ b/python/pacemaker/_cts/patterns.py
	@@ -1,408 +1,408 @@
	""" Pattern-holding classes for Pacemaker's Cluster Test Suite (CTS) """

	__all__ = ["PatternSelector"]
	__copyright__ = "Copyright 2008-2023 the Pacemaker project contributors"
	__license__ = "GNU General Public License version 2 or later (GPLv2+)"

	import argparse

	from pacemaker.buildoptions import BuildOptions

	class BasePatterns:
	""" The base class for holding a stack-specific set of command and log
	file/stdout patterns. Stack-specific classes need to be built on top
	of this one.
	"""

	def __init__(self):
	""" Create a new BasePatterns instance which holds a very minimal set of
	basic patterns.
	"""

	self._bad_news = []
	self._components = {}
	self._name = "crm-base"

	self._ignore = [
	"avoid confusing Valgrind",

	# Logging bug in some versions of libvirtd
	r"libvirtd.*: internal error: Failed to parse PCI config address",

	# pcs can log this when node is fenced, but fencing is OK in some
	# tests (and we will catch it in pacemaker logs when not OK)
	r"pcs.daemon:No response from: .* request: get_configs, error:",
	]

	self._commands = {
	"StatusCmd" : "crmadmin -t 60 -S %s 2>/dev/null",
	"CibQuery" : "cibadmin -Ql",
	"CibAddXml" : "cibadmin --modify -c --xml-text %s",
	"CibDelXpath" : "cibadmin --delete --xpath %s",
	"RscRunning" : BuildOptions.DAEMON_DIR + "/cts-exec-helper -R -r %s",
	"CIBfile" : "%s:" + BuildOptions.CIB_DIR + "/cib.xml",
	"TmpDir" : "/tmp",

	"BreakCommCmd" : "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1",
	"FixCommCmd" : "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1",

	"MaintenanceModeOn" : "cibadmin --modify -c --xml-text '<cluster_property_set id=\"cib-bootstrap-options\"><nvpair id=\"cts-maintenance-mode-setting\" name=\"maintenance-mode\" value=\"true\"/></cluster_property_set>'",
	"MaintenanceModeOff" : "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"",

	"StandbyCmd" : "crm_attribute -Vq -U %s -n standby -l forever -v %s 2>/dev/null",
	"StandbyQueryCmd" : "crm_attribute -qG -U %s -n standby -l forever -d off 2>/dev/null",
	}

	self._search = {
	"Pat:DC_IDLE" : r"pacemaker-controld.State transition.-> S_IDLE",

	# This won't work if we have multiple partitions
	"Pat:Local_started" : r"%s\W.*controller successfully started",
	"Pat:NonDC_started" : r"%s\W.State transition.-> S_NOT_DC",
	"Pat:DC_started" : r"%s\W.State transition.-> S_IDLE",
	"Pat:We_stopped" : r"%s\W.*OVERRIDE THIS PATTERN",
	"Pat:They_stopped" : r"%s\W.LOST:. %s ",
	"Pat:They_dead" : r"node %s.*: is dead",
	"Pat:They_up" : r"%s %s\W.*OVERRIDE THIS PATTERN",
	"Pat:TransitionComplete" : "Transition status: Complete: complete",

	"Pat:Fencing_start" : r"Requesting peer fencing .* targeting %s",
	"Pat:Fencing_ok" : r"pacemaker-fenced.:\sOperation .* targeting %s by .* for .@.: OK",
	"Pat:Fencing_recover" : r"pacemaker-schedulerd.*: Recover\s+%s",
	"Pat:Fencing_active" : r"stonith resource .* is active on 2 nodes (attempting recovery)",
	"Pat:Fencing_probe" : r"pacemaker-controld.* Result of probe operation for %s on .*: Error",

	"Pat:RscOpOK" : r"pacemaker-controld.:\s+Result of %s operation for %s.: (0 \()?ok",
	"Pat:RscOpFail" : r"pacemaker-schedulerd.:.Unexpected result .* recorded for %s of %s ",
	"Pat:CloneOpFail" : r"pacemaker-schedulerd.:.Unexpected result .* recorded for %s of (%s\|%s) ",
	"Pat:RscRemoteOpOK" : r"pacemaker-controld.*:\s+Result of %s operation for %s on %s: (0 \()?ok",
	"Pat:NodeFenced" : r"pacemaker-controld.:\s Peer %s was terminated $.$ by . on behalf of .*: OK",
	}

	def get_component(self, key):
	""" Return the patterns for a single component as a list, given by key.
	This is typically the name of some subprogram (pacemaker-based,
	pacemaker-fenced, etc.) or various special purpose keys. If key is
	unknown, return an empty list.
	"""

	if key in self._components:
	return self._components[key]

	print("Unknown component '%s' for %s" % (key, self._name))
	return []

	def get_patterns(self, key):
	""" Return various patterns supported by this object, given by key.
	Depending on the key, this could either be a list or a hash. If key
	is unknown, return None.
	"""

	if key == "BadNews":
	return self._bad_news
	if key == "BadNewsIgnore":
	return self._ignore
	if key == "Commands":
	return self._commands
	if key == "Search":
	return self._search
	if key == "Components":
	return self._components

	print("Unknown pattern '%s' for %s" % (key, self._name))
	return None

	def __getitem__(self, key):
	if key == "Name":
	return self._name
	if key in self._commands:
	return self._commands[key]
	if key in self._search:
	return self._search[key]

	print("Unknown template '%s' for %s" % (key, self._name))
	return None


	class Corosync2Patterns(BasePatterns):
	""" Patterns for Corosync version 2 cluster manager class """

	def __init__(self):
	BasePatterns.__init__(self)
	self._name = "crm-corosync"

	self._commands.update({
	"StartCmd" : "service corosync start && service pacemaker start",
	"StopCmd" : "service pacemaker stop; [ ! -e /usr/sbin/pacemaker-remoted ] \|\| service pacemaker_remote stop; service corosync stop",

	"EpochCmd" : "crm_node -e",
	"QuorumCmd" : "crm_node -q",
	"PartitionCmd" : "crm_node -p",
	})

	self._search.update({
	# Close enough ... "Corosync Cluster Engine exiting normally" isn't
	# printed reliably.
	"Pat:We_stopped" : r"%s\W.*Unloading all Corosync service engines",
	"Pat:They_stopped" : r"%s\W.pacemaker-controld.Node %s(\[\|\s).*state is now lost",
	"Pat:They_dead" : r"pacemaker-controld.Node %s(\[\|\s).state is now lost",
	"Pat:They_up" : r"\W%s\W.pacemaker-controld.Node %s state is now member",

	"Pat:ChildExit" : r"\[[0-9]+\] exited with status [0-9]+ \(",
	# "with signal 9" == pcmk_child_exit(), "$" == check_active_before_startup_processes()
	"Pat:ChildKilled" : r"%s\W.pacemakerd.%s\[[0-9]+\] terminated( with signal 9\|$)",
	"Pat:ChildRespawn" : r"%s\W.pacemakerd.Respawning %s subdaemon after unexpected exit",

	"Pat:InfraUp" : r"%s\W.corosync.Initializing transport",
	"Pat:PacemakerUp" : r"%s\W.pacemakerd.Starting Pacemaker",
	})

	self._ignore += [
	r"crm_mon:",
	r"crmadmin:",
	r"update_trace_data",
	r"async_notify:.*strange, client not found",
	r"Parse error: Ignoring unknown option .*nodename",
	r"error.: Operation 'reboot' . using FencingFail returned ",
	r"getinfo response error: 1$",
	r"sbd.* error: inquisitor_child: DEBUG MODE IS ACTIVE",
	r"sbd.* pcmk:\serror:.Connection to cib_ro.* (failed\|closed)",
	]

	self._bad_news = [
	r"[^(]error:",
	r"crit:",
	r"ERROR:",
	r"CRIT:",
	r"Shutting down...NOW",
	r"Timer I_TERMINATE just popped",
	r"input=I_ERROR",
	r"input=I_FAIL",
	r"input=I_INTEGRATED cause=C_TIMER_POPPED",
	r"input=I_FINALIZED cause=C_TIMER_POPPED",
	r"input=I_ERROR",
	r"(pacemakerd\|pacemaker-execd\|pacemaker-controld):.*, exiting",
	r"schedulerd.*Attempting recovery of resource",
	r"is taking more than 2x its timeout",
	r"Confirm not received from",
	r"Welcome reply not received from",
	r"Attempting to schedule .* after a stop",
	r"Resource .* was active at shutdown",
	r"duplicate entries for call_id",
	r"Search terminated:",
	r":global_timer_callback",
	r"Faking parameter digest creation",
	r"Parameters to .* action changed:",
	r"Parameters to .* changed",
	r"pacemakerd.*\[[0-9]+\] terminated( with signal\| as IPC server\|$)",
	r"pacemaker-schedulerd.Recover\s+.$.* -\> .*$",
	r"rsyslogd.* imuxsock lost .* messages from pid .* due to rate-limiting",
	r"Peer is not part of our cluster",
	r"We appear to be in an election loop",
	r"Unknown node -> we will not deliver message",
	r"(Blackbox dump requested\|Problem detected)",
	r"pacemakerd.*Could not connect to Cluster Configuration Database API",
	r"Receiving messages from a node we think is dead",
	r"share the same cluster nodeid",
	r"share the same name",

	r"pacemaker-controld:.*Transition failed: terminated",
	r"Local CIB .* differs from .*:",
	r"warn.:\sContinuing but .* will NOT be used",
	r"warn.:\sCluster configuration file .* is corrupt",
	r"Election storm",
	r"stalled the FSA with pending inputs",
	]

	self._components["common-ignore"] = [
	r"Pending action:",
	r"resource( was\|s were) active at shutdown",
	r"pending LRM operations at shutdown",
	r"Lost connection to the CIB manager",
	r"pacemaker-controld.:\sAction A_RECOVER .* not supported",
	r"pacemaker-controld.:\sPerforming A_EXIT_1 - forcefully exiting ",
	- r".:\sRequesting fencing $[^)]+$ of node ",
	+ r".:\sRequesting fencing $[^)]+$ targeting node ",
	r"(Blackbox dump requested\|Problem detected)",
	]

	self._components["corosync-ignore"] = [
	r"Could not connect to Corosync CFG: CS_ERR_LIBRARY",
	r"error:.*Connection to the CPG API failed: Library error",
	r"\[[0-9]+\] exited with status [0-9]+ \(",
	r"\[[0-9]+\] terminated with signal 15",
	r"pacemaker-based.error:.Corosync connection lost",
	r"pacemaker-fenced.error:.Corosync connection terminated",
	r"pacemaker-controld.State transition . S_RECOVERY",
	r"pacemaker-controld.error:.Input (I_ERROR\|I_TERMINATE ) .*received in state",
	r"pacemaker-controld.error:.Could not recover from internal error",
	r"error:.Connection to cib_(shm\|rw). (failed\|closed)",
	r"error:.*cib_(shm\|rw) IPC provider disconnected while waiting",
	r"error:.Connection to (fencer\|stonith-ng). (closed\|failed\|lost)",
	r"crit: Fencing daemon connection failed",
	# This is overbroad, but we don't have a way to say that only
	# certain transition errors are acceptable (if the fencer respawns,
	# fence devices may appear multiply active). We have to rely on
	# other causes of a transition error logging their own error
	# message, which is the usual practice.
	r"pacemaker-schedulerd.* Calculated transition .*/pe-error",
	]

	self._components["corosync"] = [
	# We expect each daemon to lose its cluster connection.
	# However, if the CIB manager loses its connection first,
	# it's possible for another daemon to lose that connection and
	# exit before losing the cluster connection.
	r"pacemakerd.:\swarning:.*Lost connection to cluster layer",
	r"pacemaker-attrd.:\s(crit\|error):.*Lost connection to (cluster layer\|the CIB manager)",
	r"pacemaker-based.:\s(crit\|error):.*Lost connection to cluster layer",
	r"pacemaker-controld.:\s(crit\|error):.*Lost connection to (cluster layer\|the CIB manager)",
	r"pacemaker-fenced.:\s(crit\|error):.*Lost connection to (cluster layer\|the CIB manager)",
	r"schedulerd.Scheduling node . for fencing",
	r"pacemaker-controld.:\sPeer .* was terminated $.$ by . on behalf of .:\sOK",
	]

	self._components["pacemaker-based"] = [
	r"pacemakerd.* pacemaker-attrd\[[0-9]+\] exited with status 102",
	r"pacemakerd.* pacemaker-controld\[[0-9]+\] exited with status 1",
	r"pacemakerd.* Respawning pacemaker-attrd subdaemon after unexpected exit",
	r"pacemakerd.* Respawning pacemaker-based subdaemon after unexpected exit",
	r"pacemakerd.* Respawning pacemaker-controld subdaemon after unexpected exit",
	r"pacemakerd.* Respawning pacemaker-fenced subdaemon after unexpected exit",
	r"pacemaker-.* Connection to cib_.* (failed\|closed)",
	r"pacemaker-attrd.:.Lost connection to the CIB manager",
	r"pacemaker-controld.:.Lost connection to the CIB manager",
	r"pacemaker-controld.I_ERROR.handle_cib_disconnect",
	r"pacemaker-controld.* State transition .* S_RECOVERY",
	r"pacemaker-controld.: Input I_TERMINATE .from do_recover",
	r"pacemaker-controld.*Could not recover from internal error",
	]

	self._components["pacemaker-based-ignore"] = [
	r"pacemaker-execd.Connection to (fencer\|stonith-ng). (closed\|failed\|lost)",
	r"pacemaker-controld.:\s+Result of . operation for Fencing.*Error $Lost connection to fencer$",
	r"pacemaker-controld.*:Could not connect to attrd: Connection refused",
	# This is overbroad, but we don't have a way to say that only
	# certain transition errors are acceptable (if the fencer respawns,
	# fence devices may appear multiply active). We have to rely on
	# other causes of a transition error logging their own error
	# message, which is the usual practice.
	r"pacemaker-schedulerd.* Calculated transition .*/pe-error",
	]

	self._components["pacemaker-execd"] = [
	r"pacemaker-controld.*Connection to executor failed",
	r"pacemaker-controld.I_ERROR.lrm_connection_destroy",
	r"pacemaker-controld.State transition . S_RECOVERY",
	r"pacemaker-controld.: Input I_TERMINATE .from do_recover",
	r"pacemaker-controld.*Could not recover from internal error",
	r"pacemakerd.*pacemaker-controld\[[0-9]+\] exited with status 1",
	r"pacemakerd.* Respawning pacemaker-execd subdaemon after unexpected exit",
	r"pacemakerd.* Respawning pacemaker-controld subdaemon after unexpected exit",
	]

	self._components["pacemaker-execd-ignore"] = [
	r"pacemaker-(attrd\|controld).Connection to lrmd. (failed\|closed)",
	r"pacemaker-(attrd\|controld).*Could not execute alert",
	]

	self._components["pacemaker-controld"] = [
	r"State transition .* -> S_IDLE",
	]

	self._components["pacemaker-controld-ignore"] = []
	self._components["pacemaker-attrd"] = []
	self._components["pacemaker-attrd-ignore"] = []

	self._components["pacemaker-schedulerd"] = [
	r"State transition .* S_RECOVERY",
	r"pacemakerd.* Respawning pacemaker-controld subdaemon after unexpected exit",
	r"pacemaker-controld\[[0-9]+\] exited with status 1 \(",
	r"Connection to the scheduler failed",
	r"pacemaker-controld.I_ERROR.save_cib_contents",
	r"pacemaker-controld.: Input I_TERMINATE .from do_recover",
	r"pacemaker-controld.*Could not recover from internal error",
	]

	self._components["pacemaker-schedulerd-ignore"] = [
	r"Connection to pengine.* (failed\|closed)",
	]

	self._components["pacemaker-fenced"] = [
	r"error:.Connection to (fencer\|stonith-ng). (closed\|failed\|lost)",
	r"Fencing daemon connection failed",
	r"pacemaker-controld.*Fencer successfully connected",
	]

	self._components["pacemaker-fenced-ignore"] = [
	r"(error\|warning):.Connection to (fencer\|stonith-ng). (closed\|failed\|lost)",
	r"crit:.*Fencing daemon connection failed",
	r"error:.*Fencer connection failed $will retry$",
	r"pacemaker-controld.:\s+Result of . operation for Fencing.*Error $Lost connection to fencer$",
	# This is overbroad, but we don't have a way to say that only
	# certain transition errors are acceptable (if the fencer respawns,
	# fence devices may appear multiply active). We have to rely on
	# other causes of a transition error logging their own error
	# message, which is the usual practice.
	r"pacemaker-schedulerd.* Calculated transition .*/pe-error",
	]

	self._components["pacemaker-fenced-ignore"].extend(self._components["common-ignore"])


	patternVariants = {
	"crm-base": BasePatterns,
	"crm-corosync": Corosync2Patterns
	}


	class PatternSelector:
	""" A class for choosing one of several Pattern objects and then extracting
	various pieces of information from that object
	"""

	def __init__(self, name="crm-corosync"):
	""" Create a new PatternSelector object by instantiating whatever class
	is given by name. Defaults to Corosync2Patterns for "crm-corosync" or
	None. While other objects could be supported in the future, only this
	and the base object are supported at this time.
	"""

	self._name = name

	# If no name was given, use the default. Otherwise, look up the appropriate
	# class in patternVariants, instantiate it, and use that.
	if not name:
	self._base = Corosync2Patterns()
	else:
	self._base = patternVariants[name]()

	def get_patterns(self, kind):
	""" Call get_patterns on the previously instantiated pattern object """

	return self._base.get_patterns(kind)

	def get_template(self, key):
	""" Return a single pattern from the previously instantiated pattern
	object as a string, or None if no pattern exists for the given key.
	"""

	return self._base[key]

	def get_component(self, kind):
	""" Call get_component on the previously instantiated pattern object """

	return self._base.get_component(kind)

	def __getitem__(self, key):
	return self.get_template(key)


	# PYTHONPATH=python python python/pacemaker/_cts/patterns.py -k crm-corosync -t StartCmd
	if __name__ == '__main__':
	parser = argparse.ArgumentParser()
	parser.add_argument("-k", "--kind", metavar="KIND")
	parser.add_argument("-t", "--template", metavar="TEMPLATE")

	args = parser.parse_args()

	print(PatternSelector(args.kind)[args.template])

File Metadata

Mime Type: text/x-diff
Expires: Sat, Nov 23, 2:35 PM (19 h, 33 m)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 1018793
Default Alt Text: (166 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions