diff --git a/cts/CTStests.py b/cts/CTStests.py
index c8291a1931..a57a5da805 100644
--- a/cts/CTStests.py
+++ b/cts/CTStests.py
@@ -1,3130 +1,3130 @@
""" Test-specific classes for Pacemaker's Cluster Test Suite (CTS)
"""
# Pacemaker targets compatibility with Python 2.7 and 3.2+
from __future__ import print_function, unicode_literals, absolute_import, division
__copyright__ = "Copyright 2000-2019 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
#
# SPECIAL NOTE:
#
# Tests may NOT implement any cluster-manager-specific code in them.
# EXTEND the ClusterManager object to provide the base capabilities
# the test needs if you need to do something that the current CM classes
# do not. Otherwise you screw up the whole point of the object structure
# in CTS.
#
# Thank you.
#
import os
import re
import time
import subprocess
import tempfile
from stat import *
from cts import CTS
from cts.CTSaudits import *
from cts.CTSvars import *
from cts.patterns import PatternSelector
from cts.logging import LogFactory
from cts.remote import RemoteFactory, input_wrapper
from cts.watcher import LogWatcher
from cts.environment import EnvFactory
AllTestClasses = [ ]
class CTSTest(object):
'''
A Cluster test.
We implement the basic set of properties and behaviors for a generic
cluster test.
Cluster tests track their own statistics.
We keep each of the kinds of counts we track as separate {name,value}
pairs.
'''
def __init__(self, cm):
#self.name="the unnamed test"
self.Stats = {"calls":0
, "success":0
, "failure":0
, "skipped":0
, "auditfail":0}
# if not issubclass(cm.__class__, ClusterManager):
# raise ValueError("Must be a ClusterManager object")
self.CM = cm
self.Env = EnvFactory().getInstance()
self.rsh = RemoteFactory().getInstance()
self.logger = LogFactory()
self.templates = PatternSelector(cm["Name"])
self.Audits = []
self.timeout = 120
self.passed = 1
self.is_loop = 0
self.is_unsafe = 0
self.is_docker_unsafe = 0
self.is_experimental = 0
self.is_container = 0
self.is_valgrind = 0
self.benchmark = 0 # which tests to benchmark
self.timer = {} # timers
def log(self, args):
self.logger.log(args)
def debug(self, args):
self.logger.debug(args)
def has_key(self, key):
return key in self.Stats
def __setitem__(self, key, value):
self.Stats[key] = value
def __getitem__(self, key):
if str(key) == "0":
raise ValueError("Bad call to 'foo in X', should reference 'foo in X.Stats' instead")
if key in self.Stats:
return self.Stats[key]
return None
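    # Stats double as the test's dict-style interface: incr("calls")
    # bumps self["calls"], and reading an unknown key returns None
    # rather than raising.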
def log_mark(self, msg):
self.debug("MARK: test %s %s %d" % (self.name,msg,time.time()))
return
def get_timer(self,key = "test"):
        try: return self.timer[key]
        except KeyError: return 0
def set_timer(self,key = "test"):
self.timer[key] = time.time()
return self.timer[key]
def log_timer(self,key = "test"):
elapsed = 0
if key in self.timer:
elapsed = time.time() - self.timer[key]
        s = self.name if key == "test" else "%s:%s" % (self.name, key)
self.debug("%s runtime: %.2f" % (s, elapsed))
del self.timer[key]
return elapsed
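    # Timers are used in matched pairs, e.g. (see StonithdTest below):
    #   self.set_timer("fence")   # record when fencing was requested
    #   ...wait for the fence patterns...
    #   self.log_timer("fence")   # log "<name>:fence runtime: ..." and clear it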
def incr(self, name):
'''Increment (or initialize) the value associated with the given name'''
        if name not in self.Stats:
            self.Stats[name] = 0
        self.Stats[name] += 1
# Reset the test passed boolean
if name == "calls":
self.passed = 1
def failure(self, reason="none"):
'''Increment the failure count'''
self.passed = 0
self.incr("failure")
self.logger.log(("Test %s" % self.name).ljust(35) + " FAILED: %s" % reason)
return None
def success(self):
'''Increment the success count'''
self.incr("success")
return 1
def skipped(self):
'''Increment the skipped count'''
self.incr("skipped")
return 1
def __call__(self, node):
'''Perform the given test'''
raise ValueError("Abstract Class member (__call__)")
self.incr("calls")
return self.failure()
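    # Concrete tests override __call__() and generally follow the same
    # shape (compare the tests below):
    #   self.incr("calls")
    #   ...do the work, returning self.failure(reason) on any problem...
    #   return self.success()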
def audit(self):
passed = 1
if len(self.Audits) > 0:
for audit in self.Audits:
if not audit():
self.logger.log("Internal %s Audit %s FAILED." % (self.name, audit.name()))
self.incr("auditfail")
passed = 0
return passed
def setup(self, node):
'''Setup the given test'''
return self.success()
def teardown(self, node):
'''Tear down the given test'''
return self.success()
def create_watch(self, patterns, timeout, name=None):
if not name:
name = self.name
return LogWatcher(self.Env["LogFileName"], patterns, name, timeout, kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
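    # Watches are used in three steps: build the pattern list, call
    # setwatch() *before* triggering the action under test, then
    # lookforall() to block until every pattern appears (or time out,
    # leaving the stragglers in watch.unmatched).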
    def local_badnews(self, prefix, watch, local_ignore=None):
        '''Log and count any new watch matches that do not match an
        entry in the ignore list; returns the number found.'''
        errcount = 0
        if not prefix:
            prefix = "LocalBadNews:"
        ignorelist = [" CTS: ", prefix]
        if local_ignore:
            ignorelist.extend(local_ignore)
while errcount < 100:
match = watch.look(0)
if match:
add_err = 1
for ignore in ignorelist:
if add_err == 1 and re.search(ignore, match):
add_err = 0
if add_err == 1:
self.logger.log(prefix + " " + match)
errcount = errcount + 1
else:
break
else:
self.logger.log("Too many errors!")
watch.end()
return errcount
def is_applicable(self):
return self.is_applicable_common()
def is_applicable_common(self):
'''Return TRUE if we are applicable in the current test configuration'''
#raise ValueError("Abstract Class member (is_applicable)")
if self.is_loop and not self.Env["loop-tests"]:
return 0
elif self.is_unsafe and not self.Env["unsafe-tests"]:
return 0
elif self.is_valgrind and not self.Env["valgrind-tests"]:
return 0
elif self.is_experimental and not self.Env["experimental-tests"]:
return 0
elif self.is_docker_unsafe and self.Env["docker"]:
return 0
elif self.is_container and not self.Env["container-tests"]:
return 0
elif self.Env["benchmark"] and self.benchmark == 0:
return 0
return 1
def find_ocfs2_resources(self, node):
self.r_o2cb = None
self.r_ocfs2 = []
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rtype == "o2cb" and r.parent != "NA":
self.debug("Found o2cb: %s" % self.r_o2cb)
self.r_o2cb = r.parent
if re.search("^Constraint", line):
c = AuditConstraint(self.CM, line)
if c.type == "rsc_colocation" and c.target == self.r_o2cb:
self.r_ocfs2.append(c.rsc)
self.debug("Found ocfs2 filesystems: %s" % repr(self.r_ocfs2))
return len(self.r_ocfs2)
def canrunnow(self, node):
'''Return TRUE if we can meaningfully run right now'''
return 1
def errorstoignore(self):
'''Return list of errors which are 'normal' and should be ignored'''
return []
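# The harness drives each test through roughly the same lifecycle:
# is_applicable() and canrunnow() gate selection, then setup(),
# __call__(), audit() and teardown() run in that order.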
class StopTest(CTSTest):
'''Stop (deactivate) the cluster manager on a node'''
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "Stop"
def __call__(self, node):
'''Perform the 'stop' test. '''
self.incr("calls")
if self.CM.ShouldBeStatus[node] != "up":
return self.skipped()
patterns = []
# Technically we should always be able to notice ourselves stopping
patterns.append(self.templates["Pat:We_stopped"] % node)
# Any active node needs to notice this one left
# (note that this won't work if we have multiple partitions)
for other in self.Env["nodes"]:
if self.CM.ShouldBeStatus[other] == "up" and other != node:
patterns.append(self.templates["Pat:They_stopped"] %(other, self.CM.key_for_node(node)))
#self.debug("Checking %s will notice %s left"%(other, node))
watch = self.create_watch(patterns, self.Env["DeadTime"])
watch.setwatch()
if node == self.CM.OurNode:
self.incr("us")
else:
if self.CM.upcount() <= 1:
self.incr("all")
else:
self.incr("them")
self.CM.StopaCM(node)
watch_result = watch.lookforall()
failreason = None
UnmatchedList = "||"
if watch.unmatched:
(rc, output) = self.rsh(node, "/bin/ps axf", None)
for line in output:
self.debug(line)
(rc, output) = self.rsh(node, "/usr/sbin/dlm_tool dump", None)
for line in output:
self.debug(line)
            for regex in watch.unmatched:
                self.logger.log("ERROR: Shutdown pattern not found: %s" % regex)
                UnmatchedList += regex + "||"
            failreason = "Missing shutdown pattern"
self.CM.cluster_stable(self.Env["DeadTime"])
if not watch.unmatched or self.CM.upcount() == 0:
return self.success()
if len(watch.unmatched) >= self.CM.upcount():
return self.failure("no match against (%s)" % UnmatchedList)
        if failreason is None:
return self.success()
else:
return self.failure(failreason)
#
# We don't register StopTest because it's better when called by
# another test...
#
class StartTest(CTSTest):
'''Start (activate) the cluster manager on a node'''
def __init__(self, cm, debug=None):
CTSTest.__init__(self,cm)
self.name = "start"
self.debug = debug
def __call__(self, node):
'''Perform the 'start' test. '''
self.incr("calls")
if self.CM.upcount() == 0:
self.incr("us")
else:
self.incr("them")
if self.CM.ShouldBeStatus[node] != "down":
return self.skipped()
elif self.CM.StartaCM(node):
return self.success()
else:
return self.failure("Startup %s on node %s failed"
% (self.Env["Name"], node))
#
# We don't register StartTest because it's better when called by
# another test...
#
class FlipTest(CTSTest):
    '''If it's running, stop it. If it's stopped, start it.
       Overthrow the status quo...
    '''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Flip"
self.start = StartTest(cm)
self.stop = StopTest(cm)
def __call__(self, node):
'''Perform the 'Flip' test. '''
self.incr("calls")
if self.CM.ShouldBeStatus[node] == "up":
self.incr("stopped")
ret = self.stop(node)
type = "up->down"
# Give the cluster time to recognize it's gone...
time.sleep(self.Env["StableTime"])
elif self.CM.ShouldBeStatus[node] == "down":
self.incr("started")
ret = self.start(node)
type = "down->up"
else:
return self.skipped()
self.incr(type)
if ret:
return self.success()
else:
return self.failure("%s failure" % type)
# Register FlipTest as a good test to run
AllTestClasses.append(FlipTest)
class RestartTest(CTSTest):
'''Stop and restart a node'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Restart"
self.start = StartTest(cm)
self.stop = StopTest(cm)
self.benchmark = 1
def __call__(self, node):
'''Perform the 'restart' test. '''
self.incr("calls")
self.incr("node:" + node)
ret1 = 1
if self.CM.StataCM(node):
self.incr("WasStopped")
if not self.start(node):
return self.failure("start (setup) failure: "+node)
self.set_timer()
if not self.stop(node):
return self.failure("stop failure: "+node)
if not self.start(node):
return self.failure("start failure: "+node)
return self.success()
# Register RestartTest as a good test to run
AllTestClasses.append(RestartTest)
class StonithdTest(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "Stonithd"
self.startall = SimulStartLite(cm)
self.benchmark = 1
def __call__(self, node):
self.incr("calls")
if len(self.Env["nodes"]) < 2:
return self.skipped()
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
is_dc = self.CM.is_node_dc(node)
watchpats = []
watchpats.append(self.templates["Pat:FenceOpOK"] % node)
watchpats.append(self.templates["Pat:NodeFenced"] % node)
if self.Env["at-boot"] == 0:
self.debug("Expecting %s to stay down" % node)
self.CM.ShouldBeStatus[node] = "down"
else:
self.debug("Expecting %s to come up again %d" % (node, self.Env["at-boot"]))
watchpats.append("%s.* S_STARTING -> S_PENDING" % node)
watchpats.append("%s.* S_PENDING -> S_NOT_DC" % node)
watch = self.create_watch(watchpats, 30 + self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"])
watch.setwatch()
origin = self.Env.RandomGen.choice(self.Env["nodes"])
rc = self.rsh(origin, "stonith_admin --reboot %s -VVVVVV" % node)
if rc == 194:
# 194 - 256 = -62 = Timer expired
#
# Look for the patterns, usually this means the required
# device was running on the node to be fenced - or that
# the required devices were in the process of being loaded
# and/or moved
#
# Effectively the node committed suicide so there will be
# no confirmation, but pacemaker should be watching and
# fence the node again
self.logger.log("Fencing command on %s to fence %s timed out" % (origin, node))
elif origin != node and rc != 0:
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
self.debug("Waiting for fenced node to come back up")
self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
self.logger.log("Fencing command on %s failed to fence %s (rc=%d)" % (origin, node, rc))
elif origin == node and rc != 255:
# 255 == broken pipe, ie. the node was fenced as expected
self.logger.log("Locally originated fencing returned %d" % rc)
self.set_timer("fence")
matched = watch.lookforall()
self.log_timer("fence")
self.set_timer("reform")
if watch.unmatched:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
self.debug("Waiting for fenced node to come back up")
self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
self.debug("Waiting for the cluster to re-stabilize with all nodes")
is_stable = self.CM.cluster_stable(self.Env["StartTime"])
if not matched:
return self.failure("Didn't find all expected patterns")
elif not is_stable:
return self.failure("Cluster did not become stable")
self.log_timer("reform")
return self.success()
def errorstoignore(self):
return [
self.templates["Pat:Fencing_start"] % ".*",
self.templates["Pat:Fencing_ok"] % ".*",
r"error.*: Resource .*stonith::.* is active on 2 nodes attempting recovery",
- r"error.*: Operation reboot of .*by .* for stonith_admin.*: Timer expired",
+ r"error.*: Operation 'reboot' targeting .* on .* for stonith_admin.*: Timer expired",
]
def is_applicable(self):
if not self.is_applicable_common():
return 0
if "DoFencing" in list(self.Env.keys()):
return self.Env["DoFencing"]
return 1
AllTestClasses.append(StonithdTest)
class StartOnebyOne(CTSTest):
'''Start all the nodes ~ one by one'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "StartOnebyOne"
self.stopall = SimulStopLite(cm)
self.start = StartTest(cm)
self.ns = CTS.NodeStatus(cm.Env)
def __call__(self, dummy):
'''Perform the 'StartOnebyOne' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Shut down all the nodes...
ret = self.stopall(None)
if not ret:
return self.failure("Test setup failed")
failed = []
self.set_timer()
for node in self.Env["nodes"]:
if not self.start(node):
failed.append(node)
if len(failed) > 0:
return self.failure("Some node failed to start: " + repr(failed))
return self.success()
# Register StartOnebyOne as a good test to run
AllTestClasses.append(StartOnebyOne)
class SimulStart(CTSTest):
'''Start all the nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStart"
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
def __call__(self, dummy):
'''Perform the 'SimulStart' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Shut down all the nodes...
ret = self.stopall(None)
if not ret:
return self.failure("Setup failed")
if not self.startall(None):
return self.failure("Startall failed")
return self.success()
# Register SimulStart as a good test to run
AllTestClasses.append(SimulStart)
class SimulStop(CTSTest):
'''Stop all the nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStop"
self.startall = SimulStartLite(cm)
self.stopall = SimulStopLite(cm)
def __call__(self, dummy):
'''Perform the 'SimulStop' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Start up all the nodes...
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
if not self.stopall(None):
return self.failure("Stopall failed")
return self.success()
# Register SimulStop as a good test to run
AllTestClasses.append(SimulStop)
class StopOnebyOne(CTSTest):
'''Stop all the nodes in order'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "StopOnebyOne"
self.startall = SimulStartLite(cm)
self.stop = StopTest(cm)
def __call__(self, dummy):
'''Perform the 'StopOnebyOne' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Start up all the nodes...
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
failed = []
self.set_timer()
for node in self.Env["nodes"]:
if not self.stop(node):
failed.append(node)
if len(failed) > 0:
return self.failure("Some node failed to stop: " + repr(failed))
return self.success()
# Register StopOnebyOne as a good test to run
AllTestClasses.append(StopOnebyOne)
class RestartOnebyOne(CTSTest):
'''Restart all the nodes in order'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "RestartOnebyOne"
self.startall = SimulStartLite(cm)
def __call__(self, dummy):
'''Perform the 'RestartOnebyOne' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Start up all the nodes...
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
did_fail = []
self.set_timer()
self.restart = RestartTest(self.CM)
for node in self.Env["nodes"]:
if not self.restart(node):
did_fail.append(node)
if did_fail:
return self.failure("Could not restart %d nodes: %s"
% (len(did_fail), repr(did_fail)))
return self.success()
# Register RestartOnebyOne as a good test to run
AllTestClasses.append(RestartOnebyOne)
class PartialStart(CTSTest):
'''Start a node - but tell it to stop before it finishes starting up'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "PartialStart"
self.startall = SimulStartLite(cm)
self.stopall = SimulStopLite(cm)
self.stop = StopTest(cm)
#self.is_unsafe = 1
def __call__(self, node):
'''Perform the 'PartialStart' test. '''
self.incr("calls")
ret = self.stopall(None)
if not ret:
return self.failure("Setup failed")
# FIXME! This should use the CM class to get the pattern
# then it would be applicable in general
watchpats = []
watchpats.append("pacemaker-controld.*Connecting to cluster infrastructure")
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
self.CM.StartaCMnoBlock(node)
ret = watch.lookforall()
if not ret:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
return self.failure("Setup of %s failed" % node)
ret = self.stop(node)
if not ret:
return self.failure("%s did not stop in time" % node)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
# We might do some fencing in the 2-node case if we make it up far enough
return [
r"Executing reboot fencing operation",
r"Requesting fencing \([^)]+\) of node ",
]
# Register PartialStart as a good test to run
AllTestClasses.append(PartialStart)
class StandbyTest(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Standby"
self.benchmark = 1
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
# make sure the node is active
# set the node to standby mode
    # check resources; none should be running on the node
    # set the node to active mode
    # check resources; they should have been migrated back (SHOULD THEY?)
def __call__(self, node):
self.incr("calls")
ret = self.startall(None)
if not ret:
return self.failure("Start all nodes failed")
self.debug("Make sure node %s is active" % node)
if self.CM.StandbyStatus(node) != "off":
if not self.CM.SetStandbyMode(node, "off"):
return self.failure("can't set node %s to active mode" % node)
self.CM.cluster_stable()
status = self.CM.StandbyStatus(node)
if status != "off":
return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status))
self.debug("Getting resources running on node %s" % node)
rsc_on_node = self.CM.active_resources(node)
watchpats = []
watchpats.append(r"State transition .* -> S_POLICY_ENGINE")
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
self.debug("Setting node %s to standby mode" % node)
if not self.CM.SetStandbyMode(node, "on"):
return self.failure("can't set node %s to standby mode" % node)
self.set_timer("on")
ret = watch.lookforall()
if not ret:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
self.CM.SetStandbyMode(node, "off")
return self.failure("cluster didn't react to standby change on %s" % node)
self.CM.cluster_stable()
status = self.CM.StandbyStatus(node)
if status != "on":
return self.failure("standby status of %s is [%s] but we expect [on]" % (node, status))
self.log_timer("on")
self.debug("Checking resources")
bad_run = self.CM.active_resources(node)
if len(bad_run) > 0:
rc = self.failure("%s set to standby, %s is still running on it" % (node, repr(bad_run)))
self.debug("Setting node %s to active mode" % node)
self.CM.SetStandbyMode(node, "off")
return rc
self.debug("Setting node %s to active mode" % node)
if not self.CM.SetStandbyMode(node, "off"):
return self.failure("can't set node %s to active mode" % node)
self.set_timer("off")
self.CM.cluster_stable()
status = self.CM.StandbyStatus(node)
if status != "off":
return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status))
self.log_timer("off")
return self.success()
AllTestClasses.append(StandbyTest)
class ValgrindTest(CTSTest):
'''Check for memory leaks'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Valgrind"
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
self.is_valgrind = 1
self.is_loop = 1
def setup(self, node):
self.incr("calls")
ret = self.stopall(None)
if not ret:
return self.failure("Stop all nodes failed")
# @TODO Edit /etc/sysconfig/pacemaker on all nodes to enable valgrind,
# and clear any valgrind logs from previous runs. For now, we rely on
# the user to do this manually.
ret = self.startall(None)
if not ret:
return self.failure("Start all nodes failed")
return self.success()
def teardown(self, node):
# Return all nodes to normal
# @TODO Edit /etc/sysconfig/pacemaker on all nodes to disable valgrind
ret = self.stopall(None)
if not ret:
return self.failure("Stop all nodes failed")
return self.success()
def find_leaks(self):
# Check for leaks
# (no longer used but kept in case feature is restored)
leaked = []
self.stop = StopTest(self.CM)
for node in self.Env["nodes"]:
rc = self.stop(node)
if not rc:
self.failure("Couldn't shut down %s" % node)
rc = self.rsh(node, "grep -e indirectly.*lost:.*[1-9] -e definitely.*lost:.*[1-9] -e (ERROR|error).*SUMMARY:.*[1-9].*errors %s" % self.logger.logPat, 0)
if rc != 1:
leaked.append(node)
self.failure("Valgrind errors detected on %s" % node)
(rc, output) = self.rsh(node, "grep -e lost: -e SUMMARY: %s" % self.logger.logPat, None)
for line in output:
self.logger.log(line)
(rc, output) = self.rsh(node, "cat %s" % self.logger.logPat, None)
for line in output:
self.debug(line)
self.rsh(node, "rm -f %s" % self.logger.logPat, None)
return leaked
def __call__(self, node):
#leaked = self.find_leaks()
#if len(leaked) > 0:
# return self.failure("Nodes %s leaked" % repr(leaked))
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"pacemaker-based.*: \*\*\*\*\*\*\*\*\*\*\*\*\*",
r"pacemaker-based.*: .* avoid confusing Valgrind",
r"HA_VALGRIND_ENABLED",
]
class StandbyLoopTest(ValgrindTest):
'''Check for memory leaks by putting a node in and out of standby for an hour'''
# @TODO This is not a useful test for memory leaks
def __init__(self, cm):
ValgrindTest.__init__(self,cm)
self.name = "StandbyLoop"
def __call__(self, node):
lpc = 0
delay = 2
failed = 0
done = time.time() + self.Env["loop-minutes"] * 60
while time.time() <= done and not failed:
lpc = lpc + 1
time.sleep(delay)
if not self.CM.SetStandbyMode(node, "on"):
self.failure("can't set node %s to standby mode" % node)
failed = lpc
time.sleep(delay)
if not self.CM.SetStandbyMode(node, "off"):
self.failure("can't set node %s to active mode" % node)
failed = lpc
leaked = self.find_leaks()
if failed:
return self.failure("Iteration %d failed" % failed)
elif len(leaked) > 0:
return self.failure("Nodes %s leaked" % repr(leaked))
return self.success()
#AllTestClasses.append(StandbyLoopTest)
class BandwidthTest(CTSTest):
# Tests should not be cluster-manager-specific
# If you need to find out cluster manager configuration to do this, then
# it should be added to the generic cluster manager API.
'''Test the bandwidth which the cluster uses'''
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "Bandwidth"
self.start = StartTest(cm)
self.__setitem__("min",0)
self.__setitem__("max",0)
self.__setitem__("totalbandwidth",0)
(handle, self.tempfile) = tempfile.mkstemp(".cts")
os.close(handle)
self.startall = SimulStartLite(cm)
def __call__(self, node):
'''Perform the Bandwidth test'''
self.incr("calls")
if self.CM.upcount() < 1:
return self.skipped()
Path = self.CM.InternalCommConfig()
if "ip" not in Path["mediatype"]:
return self.skipped()
port = Path["port"][0]
port = int(port)
ret = self.startall(None)
if not ret:
return self.failure("Test setup failed")
time.sleep(5) # We get extra messages right after startup.
fstmpfile = "/var/run/band_estimate"
dumpcmd = "tcpdump -p -n -c 102 -i any udp port %d > %s 2>&1" \
% (port, fstmpfile)
rc = self.rsh(node, dumpcmd)
if rc == 0:
farfile = "root@%s:%s" % (node, fstmpfile)
self.rsh.cp(farfile, self.tempfile)
Bandwidth = self.countbandwidth(self.tempfile)
if not Bandwidth:
self.logger.log("Could not compute bandwidth.")
return self.success()
intband = int(Bandwidth + 0.5)
self.logger.log("...bandwidth: %d bits/sec" % intband)
self.Stats["totalbandwidth"] = self.Stats["totalbandwidth"] + Bandwidth
if self.Stats["min"] == 0:
self.Stats["min"] = Bandwidth
if Bandwidth > self.Stats["max"]:
self.Stats["max"] = Bandwidth
if Bandwidth < self.Stats["min"]:
self.Stats["min"] = Bandwidth
self.rsh(node, "rm -f %s" % fstmpfile)
os.unlink(self.tempfile)
return self.success()
else:
return self.failure("no response from tcpdump command [%d]!" % rc)
def countbandwidth(self, file):
fp = open(file, "r")
fp.seek(0)
count = 0
sum = 0
while 1:
line = fp.readline()
if not line:
return None
if re.search("udp",line) or re.search("UDP,", line):
count = count + 1
linesplit = line.split(" ")
for j in range(len(linesplit)-1):
if linesplit[j] == "udp": break
if linesplit[j] == "length:": break
try:
sum = sum + int(linesplit[j+1])
except ValueError:
self.logger.log("Invalid tcpdump line: %s" % line)
return None
T1 = linesplit[0]
timesplit = T1.split(":")
time2split = timesplit[2].split(".")
time1 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001
break
while count < 100:
line = fp.readline()
if not line:
return None
if re.search("udp",line) or re.search("UDP,", line):
count = count+1
linessplit = line.split(" ")
for j in range(len(linessplit)-1):
if linessplit[j] == "udp": break
if linessplit[j] == "length:": break
try:
sum = int(linessplit[j+1]) + sum
except ValueError:
self.logger.log("Invalid tcpdump line: %s" % line)
return None
T2 = linessplit[0]
timesplit = T2.split(":")
time2split = timesplit[2].split(".")
time2 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001
        elapsed = time2 - time1
        if elapsed <= 0:
            return 0
        return int((sum * 8) / elapsed)
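    # The estimate above is: total UDP payload bytes seen across ~100
    # captured packets, times 8 (bits), divided by the elapsed time
    # between the first and last packet timestamps, giving bits/sec.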
def is_applicable(self):
'''BandwidthTest never applicable'''
return 0
AllTestClasses.append(BandwidthTest)
###################################################################
class MaintenanceMode(CTSTest):
###################################################################
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "MaintenanceMode"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.max = 30
#self.is_unsafe = 1
self.benchmark = 1
self.action = "asyncmon"
self.interval = 0
self.rid = "maintenanceDummy"
def toggleMaintenanceMode(self, node, action):
pats = []
pats.append(self.templates["Pat:DC_IDLE"])
# fail the resource right after turning Maintenance mode on
# verify it is not recovered until maintenance mode is turned off
if action == "On":
pats.append(r"schedulerd.*:\s+warning:.*Processing failed %s of %s on" % (self.action, self.rid))
else:
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.debug("Turning maintenance mode %s" % action)
self.rsh(node, self.templates["MaintenanceMode%s" % (action)])
if (action == "On"):
self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))
self.set_timer("recover%s" % (action))
watch.lookforall()
self.log_timer("recover%s" % (action))
if watch.unmatched:
self.debug("Failed to find patterns when turning maintenance mode %s" % action)
return repr(watch.unmatched)
return ""
def insertMaintenanceDummy(self, node):
pats = []
pats.append(("%s.*" % node) + (self.templates["Pat:RscOpOK"] % ("start", self.rid)))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.CM.AddDummyRsc(node, self.rid)
self.set_timer("addDummy")
watch.lookforall()
self.log_timer("addDummy")
if watch.unmatched:
self.debug("Failed to find patterns when adding maintenance dummy resource")
return repr(watch.unmatched)
return ""
def removeMaintenanceDummy(self, node):
pats = []
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.CM.RemoveDummyRsc(node, self.rid)
self.set_timer("removeDummy")
watch.lookforall()
self.log_timer("removeDummy")
if watch.unmatched:
self.debug("Failed to find patterns when removing maintenance dummy resource")
return repr(watch.unmatched)
return ""
def managedRscList(self, node):
rscList = []
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
tmp = AuditResource(self.CM, line)
if tmp.managed():
rscList.append(tmp.id)
return rscList
def verifyResources(self, node, rscList, managed):
managedList = list(rscList)
managed_str = "managed"
if not managed:
managed_str = "unmanaged"
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
tmp = AuditResource(self.CM, line)
if managed and not tmp.managed():
continue
elif not managed and tmp.managed():
continue
elif managedList.count(tmp.id):
managedList.remove(tmp.id)
if len(managedList) == 0:
self.debug("Found all %s resources on %s" % (managed_str, node))
return True
self.logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, managedList))
return False
def __call__(self, node):
'''Perform the 'MaintenanceMode' test. '''
self.incr("calls")
verify_managed = False
verify_unmanaged = False
failPat = ""
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
# get a list of all the managed resources. We use this list
# after enabling maintenance mode to verify all managed resources
# become un-managed. After maintenance mode is turned off, we use
# this list to verify all the resources become managed again.
managedResources = self.managedRscList(node)
if len(managedResources) == 0:
self.logger.log("No managed resources on %s" % node)
return self.skipped()
# insert a fake resource we can fail during maintenance mode
# so we can verify recovery does not take place until after maintenance
# mode is disabled.
failPat = failPat + self.insertMaintenanceDummy(node)
# toggle maintenance mode ON, then fail dummy resource.
failPat = failPat + self.toggleMaintenanceMode(node, "On")
# verify all the resources are now unmanaged
if self.verifyResources(node, managedResources, False):
verify_unmanaged = True
# Toggle maintenance mode OFF, verify dummy is recovered.
failPat = failPat + self.toggleMaintenanceMode(node, "Off")
# verify all the resources are now managed again
if self.verifyResources(node, managedResources, True):
verify_managed = True
# Remove our maintenance dummy resource.
failPat = failPat + self.removeMaintenanceDummy(node)
self.CM.cluster_stable()
if failPat != "":
return self.failure("Unmatched patterns: %s" % (failPat))
elif verify_unmanaged is False:
return self.failure("Failed to verify resources became unmanaged during maintenance mode")
elif verify_managed is False:
return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode")
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"Updating failcount for %s" % self.rid,
r"schedulerd.*: Recover %s\s*\(.*\)" % self.rid,
r"Unknown operation: fail",
self.templates["Pat:RscOpOK"] % (self.action, self.rid),
r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
]
AllTestClasses.append(MaintenanceMode)
class ResourceRecover(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "ResourceRecover"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.max = 30
self.rid = None
self.rid_alt = None
#self.is_unsafe = 1
self.benchmark = 1
# these are the values used for the new LRM API call
self.action = "asyncmon"
self.interval = 0
def __call__(self, node):
'''Perform the 'ResourceRecover' test. '''
self.incr("calls")
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
resourcelist = self.CM.active_resources(node)
        # if there are no active resources, return directly
if len(resourcelist) == 0:
self.logger.log("No active resources on %s" % node)
return self.skipped()
self.rid = self.Env.RandomGen.choice(resourcelist)
self.rid_alt = self.rid
rsc = None
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
tmp = AuditResource(self.CM, line)
if tmp.id == self.rid:
rsc = tmp
# Handle anonymous clones that get renamed
self.rid = rsc.clone_id
break
if not rsc:
return self.failure("Could not find %s in the resource list" % self.rid)
self.debug("Shooting %s aka. %s" % (rsc.clone_id, rsc.id))
pats = []
pats.append(r"schedulerd.*:\s+warning:.*Processing failed %s of (%s|%s) on" % (self.action,
rsc.id, rsc.clone_id))
if rsc.managed():
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
if rsc.unique():
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid))
else:
# Anonymous clones may get restarted with a different clone number
pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))
self.set_timer("recover")
watch.lookforall()
self.log_timer("recover")
self.CM.cluster_stable()
recovered = self.CM.ResourceLocation(self.rid)
if watch.unmatched:
return self.failure("Patterns not found: %s" % repr(watch.unmatched))
elif rsc.unique() and len(recovered) > 1:
return self.failure("%s is now active on more than one node: %s"%(self.rid, repr(recovered)))
elif len(recovered) > 0:
self.debug("%s is running on: %s" % (self.rid, repr(recovered)))
elif rsc.managed():
return self.failure("%s was not recovered and is inactive" % self.rid)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"Updating failcount for %s" % self.rid,
r"schedulerd.*: Recover (%s|%s)\s*\(.*\)" % (self.rid, self.rid_alt),
r"Unknown operation: fail",
self.templates["Pat:RscOpOK"] % (self.action, self.rid),
r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
]
AllTestClasses.append(ResourceRecover)
class ComponentFail(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "ComponentFail"
# TODO make this work correctly in docker.
self.is_docker_unsafe = 1
self.startall = SimulStartLite(cm)
self.complist = cm.Components()
self.patterns = []
self.okerrpatterns = []
self.is_unsafe = 1
def __call__(self, node):
'''Perform the 'ComponentFail' test. '''
self.incr("calls")
self.patterns = []
self.okerrpatterns = []
# start all nodes
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
if not self.CM.cluster_stable(self.Env["StableTime"]):
return self.failure("Setup failed - unstable")
node_is_dc = self.CM.is_node_dc(node, None)
# select a component to kill
chosen = self.Env.RandomGen.choice(self.complist)
while chosen.dc_only == 1 and node_is_dc == 0:
chosen = self.Env.RandomGen.choice(self.complist)
self.debug("...component %s (dc=%d,boot=%d)" % (chosen.name, node_is_dc,chosen.triggersreboot))
self.incr(chosen.name)
if chosen.name != "corosync":
self.patterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name))
self.patterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name))
self.patterns.extend(chosen.pats)
if node_is_dc:
self.patterns.extend(chosen.dc_pats)
# @TODO this should be a flag in the Component
if chosen.name in [ "corosync", "pacemaker-based", "pacemaker-fenced" ]:
# Ignore actions for fence devices if fencer will respawn
# (their registration will be lost, and probes will fail)
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rclass == "stonith":
self.okerrpatterns.append(self.templates["Pat:Fencing_recover"] % r.id)
self.okerrpatterns.append(self.templates["Pat:Fencing_active"] % r.id)
self.okerrpatterns.append(self.templates["Pat:Fencing_probe"] % r.id)
# supply a copy so self.patterns doesn't end up empty
tmpPats = []
tmpPats.extend(self.patterns)
self.patterns.extend(chosen.badnews_ignore)
# Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status
stonithPats = []
stonithPats.append(self.templates["Pat:Fencing_ok"] % node)
stonith = self.create_watch(stonithPats, 0)
stonith.setwatch()
# set the watch for stable
watch = self.create_watch(
tmpPats, self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"])
watch.setwatch()
# kill the component
chosen.kill(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
self.debug("Waiting for any fenced node to come back up")
self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
self.debug("Waiting for the cluster to re-stabilize with all nodes")
self.CM.cluster_stable(self.Env["StartTime"])
self.debug("Checking if %s was shot" % node)
shot = stonith.look(60)
if shot:
self.debug("Found: " + repr(shot))
self.okerrpatterns.append(self.templates["Pat:Fencing_start"] % node)
if self.Env["at-boot"] == 0:
self.CM.ShouldBeStatus[node] = "down"
# If fencing occurred, chances are many (if not all) the expected logs
# will not be sent - or will be lost when the node reboots
return self.success()
# check for logs indicating a graceful recovery
matched = watch.lookforall(allow_multiple_matches=1)
if watch.unmatched:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
self.debug("Waiting for the cluster to re-stabilize with all nodes")
is_stable = self.CM.cluster_stable(self.Env["StartTime"])
if not matched:
return self.failure("Didn't find all expected %s patterns" % chosen.name)
elif not is_stable:
return self.failure("Cluster did not become stable after killing %s" % chosen.name)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
# Note that okerrpatterns refers to the last time we ran this test
# The good news is that this works fine for us...
self.okerrpatterns.extend(self.patterns)
return self.okerrpatterns
AllTestClasses.append(ComponentFail)
class SplitBrainTest(CTSTest):
    '''Test split-brain: break the path between the node partitions, then
    check whether both sides take over the resource'''
def __init__(self,cm):
CTSTest.__init__(self,cm)
self.name = "SplitBrain"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.is_experimental = 1
def isolate_partition(self, partition):
other_nodes = []
other_nodes.extend(self.Env["nodes"])
for node in partition:
try:
other_nodes.remove(node)
except ValueError:
self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]) + " from " +repr(partition))
if len(other_nodes) == 0:
return 1
self.debug("Creating partition: " + repr(partition))
self.debug("Everyone else: " + repr(other_nodes))
for node in partition:
if not self.CM.isolate_node(node, other_nodes):
self.logger.log("Could not isolate %s" % node)
return 0
return 1
def heal_partition(self, partition):
other_nodes = []
other_nodes.extend(self.Env["nodes"])
for node in partition:
try:
other_nodes.remove(node)
except ValueError:
self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]))
if len(other_nodes) == 0:
return 1
self.debug("Healing partition: " + repr(partition))
self.debug("Everyone else: " + repr(other_nodes))
for node in partition:
self.CM.unisolate_node(node, other_nodes)
def __call__(self, node):
'''Perform split-brain test'''
self.incr("calls")
self.passed = 1
partitions = {}
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
while 1:
# Retry until we get multiple partitions
partitions = {}
p_max = len(self.Env["nodes"])
for node in self.Env["nodes"]:
p = self.Env.RandomGen.randint(1, p_max)
if not p in partitions:
partitions[p] = []
partitions[p].append(node)
p_max = len(list(partitions.keys()))
if p_max > 1:
break
# else, try again
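            # Example: with nodes [n1, n2, n3] and draws of 1, 2, 1, the
            # result is partitions {1: [n1, n3], 2: [n2]} and p_max == 2.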
self.debug("Created %d partitions" % p_max)
for key in list(partitions.keys()):
self.debug("Partition["+str(key)+"]:\t"+repr(partitions[key]))
# Disabling STONITH to reduce test complexity for now
self.rsh(node, "crm_attribute -V -n stonith-enabled -v false")
for key in list(partitions.keys()):
self.isolate_partition(partitions[key])
        count = 30
        while count > 0:
            if len(self.CM.find_partitions()) != p_max:
                time.sleep(10)
                count -= 1
            else:
                break
        else:
            self.failure("Expected partitions were not created")
# Target number of partitions formed - wait for stability
if not self.CM.cluster_stable():
self.failure("Partitioned cluster not stable")
# Now audit the cluster state
self.CM.partitions_expected = p_max
if not self.audit():
self.failure("Audits failed")
self.CM.partitions_expected = 1
# And heal them again
for key in list(partitions.keys()):
self.heal_partition(partitions[key])
# Wait for a single partition to form
count = 30
while count > 0:
if len(self.CM.find_partitions()) != 1:
time.sleep(10)
count -= 1
else:
break
else:
self.failure("Cluster did not reform")
# Wait for it to have the right number of members
count = 30
while count > 0:
members = []
partitions = self.CM.find_partitions()
if len(partitions) > 0:
members = partitions[0].split()
if len(members) != len(self.Env["nodes"]):
time.sleep(10)
count -= 1
else:
break
else:
self.failure("Cluster did not completely reform")
        # Wait up to 20 minutes - the delay is preferable to
        # trying to continue in a messed-up state
if not self.CM.cluster_stable(1200):
self.failure("Reformed cluster not stable")
if self.Env["continue"] == 1:
answer = "Y"
else:
try:
answer = input_wrapper('Continue? [nY]')
except EOFError as e:
answer = "n"
if answer and answer == "n":
raise ValueError("Reformed cluster not stable")
# Turn fencing back on
if self.Env["DoFencing"]:
self.rsh(node, "crm_attribute -V -D -n stonith-enabled")
self.CM.cluster_stable()
if self.passed:
return self.success()
return self.failure("See previous errors")
def errorstoignore(self):
'''Return list of errors which are 'normal' and should be ignored'''
return [
r"Another DC detected:",
r"(ERROR|error).*: .*Application of an update diff failed",
r"pacemaker-controld.*:.*not in our membership list",
r"CRIT:.*node.*returning after partition",
]
def is_applicable(self):
if not self.is_applicable_common():
return 0
return len(self.Env["nodes"]) > 2
AllTestClasses.append(SplitBrainTest)
class Reattach(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Reattach"
self.startall = SimulStartLite(cm)
self.restart1 = RestartTest(cm)
self.stopall = SimulStopLite(cm)
self.is_unsafe = 0 # Handled by canrunnow()
def _is_managed(self, node):
is_managed = self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", 1)
is_managed = is_managed[:-1] # Strip off the newline
return is_managed == "true"
def _set_unmanaged(self, node):
self.debug("Disable resource management")
self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false")
def _set_managed(self, node):
self.debug("Re-enable resource management")
self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D")
def setup(self, node):
attempt = 0
if not self.startall(None):
return None
# Make sure we are really _really_ stable and that all
# resources, including those that depend on transient node
# attributes, are started
while not self.CM.cluster_stable(double_check=True):
if attempt < 5:
attempt += 1
self.debug("Not stable yet, re-testing")
else:
self.logger.log("Cluster is not stable")
return None
return 1
def teardown(self, node):
# Make sure 'node' is up
start = StartTest(self.CM)
start(node)
if not self._is_managed(node):
self.logger.log("Attempting to re-enable resource management on %s" % node)
self._set_managed(node)
self.CM.cluster_stable()
if not self._is_managed(node):
self.logger.log("Could not re-enable resource management")
return 0
return 1
def canrunnow(self, node):
'''Return TRUE if we can meaningfully run right now'''
if self.find_ocfs2_resources(node):
self.logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present")
return 0
return 1
def __call__(self, node):
self.incr("calls")
pats = []
# Conveniently, the scheduler will display this message when disabling
# management, even if fencing is not enabled, so we can rely on it.
managed = self.create_watch(["Delaying fencing operations"], 60)
managed.setwatch()
self._set_unmanaged(node)
if not managed.lookforall():
self.logger.log("Patterns not found: " + repr(managed.unmatched))
return self.failure("Resource management not disabled")
pats = []
pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("stop", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("promote", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("demote", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("migrate", ".*"))
watch = self.create_watch(pats, 60, "ShutdownActivity")
watch.setwatch()
self.debug("Shutting down the cluster")
ret = self.stopall(None)
if not ret:
self._set_managed(node)
return self.failure("Couldn't shut down the cluster")
self.debug("Bringing the cluster back up")
ret = self.startall(None)
time.sleep(5) # allow ping to update the CIB
if not ret:
self._set_managed(node)
return self.failure("Couldn't restart the cluster")
if self.local_badnews("ResourceActivity:", watch):
self._set_managed(node)
return self.failure("Resources stopped or started during cluster restart")
watch = self.create_watch(pats, 60, "StartupActivity")
watch.setwatch()
# Re-enable resource management (and verify it happened).
self._set_managed(node)
self.CM.cluster_stable()
if not self._is_managed(node):
return self.failure("Could not re-enable resource management")
# Ignore actions for STONITH resources
ignore = []
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rclass == "stonith":
self.debug("Ignoring start actions for %s" % r.id)
ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id))
if self.local_badnews("ResourceActivity:", watch, ignore):
return self.failure("Resources stopped or started after resource management was re-enabled")
return ret
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"resource( was|s were) active at shutdown",
]
def is_applicable(self):
return 1
AllTestClasses.append(Reattach)
class SpecialTest1(CTSTest):
'''Set up a custom test to cause quorum failure issues for Andrew'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SpecialTest1"
self.startall = SimulStartLite(cm)
self.restart1 = RestartTest(cm)
self.stopall = SimulStopLite(cm)
def __call__(self, node):
'''Perform the 'SpecialTest1' test for Andrew. '''
self.incr("calls")
# Shut down all the nodes...
ret = self.stopall(None)
if not ret:
return self.failure("Could not stop all nodes")
# Test config recovery when the other nodes come up
self.rsh(node, "rm -f "+CTSvars.CRM_CONFIG_DIR+"/cib*")
# Start the selected node
ret = self.restart1(node)
if not ret:
return self.failure("Could not start "+node)
# Start all remaining nodes
ret = self.startall(None)
if not ret:
return self.failure("Could not start the remaining nodes")
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
# Errors that occur as a result of the CIB being wiped
return [
r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed",
r"error.*: Resource start-up disabled since no STONITH resources have been defined",
r"error.*: Either configure some or disable STONITH with the stonith-enabled option",
r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity",
]
AllTestClasses.append(SpecialTest1)
class HAETest(CTSTest):
    '''Base class for tests of the HA Extension (DLM/O2CB/OCFS2) stack'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "HAETest"
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
self.is_loop = 1
def setup(self, node):
        # Start all nodes
ret = self.startall(None)
if not ret:
return self.failure("Couldn't start all nodes")
return self.success()
def teardown(self, node):
# Stop everything
ret = self.stopall(None)
if not ret:
return self.failure("Couldn't stop all nodes")
return self.success()
def wait_on_state(self, node, resource, expected_clones, attempts=240):
while attempts > 0:
active = 0
(rc, lines) = self.rsh(node, "crm_resource -r %s -W -Q" % resource, stdout=None)
# Hack until crm_resource does the right thing
if rc == 0 and lines:
active = len(lines)
if len(lines) == expected_clones:
return 1
elif rc == 1:
self.debug("Resource %s is still inactive" % resource)
elif rc == 234:
self.logger.log("Unknown resource %s" % resource)
return 0
elif rc == 246:
self.logger.log("Cluster is inactive")
return 0
elif rc != 0:
self.logger.log("Call to crm_resource failed, rc=%d" % rc)
return 0
else:
self.debug("Resource %s is active on %d times instead of %d" % (resource, active, expected_clones))
attempts -= 1
time.sleep(1)
return 0
def find_dlm(self, node):
self.r_dlm = None
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rtype == "controld" and r.parent != "NA":
self.debug("Found dlm: %s" % self.r_dlm)
self.r_dlm = r.parent
return 1
return 0
def find_hae_resources(self, node):
self.r_dlm = None
self.r_o2cb = None
self.r_ocfs2 = []
if self.find_dlm(node):
self.find_ocfs2_resources(node)
def is_applicable(self):
if not self.is_applicable_common():
return 0
if self.Env["Schema"] == "hae":
return 1
return None
class HAERoleTest(HAETest):
def __init__(self, cm):
'''Lars' mount/unmount test for the HA extension. '''
HAETest.__init__(self,cm)
self.name = "HAERoleTest"
def change_state(self, node, resource, target):
rc = self.rsh(node, "crm_resource -V -r %s -p target-role -v %s --meta" % (resource, target))
return rc
def __call__(self, node):
self.incr("calls")
lpc = 0
failed = 0
delay = 2
done = time.time() + self.Env["loop-minutes"]*60
self.find_hae_resources(node)
clone_max = len(self.Env["nodes"])
while time.time() <= done and not failed:
lpc = lpc + 1
self.change_state(node, self.r_dlm, "Stopped")
if not self.wait_on_state(node, self.r_dlm, 0):
self.failure("%s did not go down correctly" % self.r_dlm)
failed = lpc
self.change_state(node, self.r_dlm, "Started")
if not self.wait_on_state(node, self.r_dlm, clone_max):
self.failure("%s did not come up correctly" % self.r_dlm)
failed = lpc
if not self.wait_on_state(node, self.r_o2cb, clone_max):
self.failure("%s did not come up correctly" % self.r_o2cb)
failed = lpc
for fs in self.r_ocfs2:
if not self.wait_on_state(node, fs, clone_max):
self.failure("%s did not come up correctly" % fs)
failed = lpc
if failed:
return self.failure("iteration %d failed" % failed)
return self.success()
AllTestClasses.append(HAERoleTest)
class HAEStandbyTest(HAETest):
    '''Toggle nodes in and out of standby and verify the HAE stack recovers'''
def __init__(self, cm):
HAETest.__init__(self,cm)
self.name = "HAEStandbyTest"
def change_state(self, node, resource, target):
rc = self.rsh(node, "crm_standby -V -l reboot -v %s" % (target))
return rc
def __call__(self, node):
self.incr("calls")
lpc = 0
failed = 0
done = time.time() + self.Env["loop-minutes"]*60
self.find_hae_resources(node)
clone_max = len(self.Env["nodes"])
while time.time() <= done and not failed:
lpc = lpc + 1
self.change_state(node, self.r_dlm, "true")
if not self.wait_on_state(node, self.r_dlm, clone_max-1):
self.failure("%s did not go down correctly" % self.r_dlm)
failed = lpc
self.change_state(node, self.r_dlm, "false")
if not self.wait_on_state(node, self.r_dlm, clone_max):
self.failure("%s did not come up correctly" % self.r_dlm)
failed = lpc
if not self.wait_on_state(node, self.r_o2cb, clone_max):
self.failure("%s did not come up correctly" % self.r_o2cb)
failed = lpc
for fs in self.r_ocfs2:
if not self.wait_on_state(node, fs, clone_max):
self.failure("%s did not come up correctly" % fs)
failed = lpc
if failed:
return self.failure("iteration %d failed" % failed)
return self.success()
AllTestClasses.append(HAEStandbyTest)
class NearQuorumPointTest(CTSTest):
'''
This test brings larger clusters near the quorum point (50%).
In addition, it will test doing starts and stops at the same time.
Here is how I think it should work:
    - loop over the nodes and decide randomly which will be up and which
      will be down. Use a 50% probability for each of up/down.
- figure out what to do to get into that state from the current state
- in parallel, bring up those going up and bring those going down.
'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "NearQuorumPoint"
def __call__(self, dummy):
'''Perform the 'NearQuorumPoint' test. '''
self.incr("calls")
startset = []
stopset = []
stonith = self.CM.prepare_fencing_watcher("NearQuorumPoint")
#decide what to do with each node
for node in self.Env["nodes"]:
action = self.Env.RandomGen.choice(["start","stop"])
#action = self.Env.RandomGen.choice(["start","stop","no change"])
if action == "start" :
startset.append(node)
elif action == "stop" :
stopset.append(node)
self.debug("start nodes:" + repr(startset))
self.debug("stop nodes:" + repr(stopset))
#add search patterns
watchpats = [ ]
for node in stopset:
if self.CM.ShouldBeStatus[node] == "up":
watchpats.append(self.templates["Pat:We_stopped"] % node)
for node in startset:
if self.CM.ShouldBeStatus[node] == "down":
#watchpats.append(self.templates["Pat:NonDC_started"] % node)
watchpats.append(self.templates["Pat:Local_started"] % node)
else:
for stopping in stopset:
if self.CM.ShouldBeStatus[stopping] == "up":
watchpats.append(self.templates["Pat:They_stopped"] % (node, self.CM.key_for_node(stopping)))
if len(watchpats) == 0:
return self.skipped()
if len(startset) != 0:
watchpats.append(self.templates["Pat:DC_IDLE"])
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
#begin actions
for node in stopset:
if self.CM.ShouldBeStatus[node] == "up":
self.CM.StopaCMnoBlock(node)
for node in startset:
if self.CM.ShouldBeStatus[node] == "down":
self.CM.StartaCMnoBlock(node)
#get the result
if watch.lookforall():
self.CM.cluster_stable()
self.CM.fencing_cleanup("NearQuorumPoint", stonith)
return self.success()
self.logger.log("Warn: Patterns not found: " + repr(watch.unmatched))
#get the "bad" nodes
upnodes = []
for node in stopset:
if self.CM.StataCM(node) == 1:
upnodes.append(node)
downnodes = []
for node in startset:
if self.CM.StataCM(node) == 0:
downnodes.append(node)
self.CM.fencing_cleanup("NearQuorumPoint", stonith)
if upnodes == [] and downnodes == []:
self.CM.cluster_stable()
            # Make sure they're completely down with no residual processes
for node in stopset:
self.rsh(node, self.templates["StopCmd"])
return self.success()
if len(upnodes) > 0:
self.logger.log("Warn: Unstoppable nodes: " + repr(upnodes))
if len(downnodes) > 0:
self.logger.log("Warn: Unstartable nodes: " + repr(downnodes))
return self.failure()
def is_applicable(self):
return 1
AllTestClasses.append(NearQuorumPointTest)
class RollingUpgradeTest(CTSTest):
'''Perform a rolling upgrade of the cluster'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "RollingUpgrade"
self.start = StartTest(cm)
self.stop = StopTest(cm)
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
def setup(self, node):
        # Stop all nodes, downgrade them, then start everything back up
ret = self.stopall(None)
if not ret:
return self.failure("Couldn't stop all nodes")
for node in self.Env["nodes"]:
if not self.downgrade(node, None):
return self.failure("Couldn't downgrade %s" % node)
ret = self.startall(None)
if not ret:
return self.failure("Couldn't start all nodes")
return self.success()
def teardown(self, node):
        # Stop everything and upgrade all nodes back to the current version
ret = self.stopall(None)
if not ret:
return self.failure("Couldn't stop all nodes")
for node in self.Env["nodes"]:
if not self.upgrade(node, None):
return self.failure("Couldn't upgrade %s" % node)
return self.success()
def install(self, node, version, start=1, flags="--force"):
target_dir = "/tmp/rpm-%s" % version
src_dir = "%s/%s" % (self.Env["rpm-dir"], version)
self.logger.log("Installing %s on %s with %s" % (version, node, flags))
if not self.stop(node):
return self.failure("stop failure: "+node)
rc = self.rsh(node, "mkdir -p %s" % target_dir)
rc = self.rsh(node, "rm -f %s/*.rpm" % target_dir)
(rc, lines) = self.rsh(node, "ls -1 %s/*.rpm" % src_dir, None)
for line in lines:
line = line[:-1]
rc = self.rsh.cp("%s" % (line), "%s:%s/" % (node, target_dir))
rc = self.rsh(node, "rpm -Uvh %s %s/*.rpm" % (flags, target_dir))
if start and not self.start(node):
return self.failure("start failure: "+node)
return self.success()
def upgrade(self, node, start=1):
return self.install(node, self.Env["current-version"], start)
def downgrade(self, node, start=1):
return self.install(node, self.Env["previous-version"], start, "--force --nodeps")
def __call__(self, node):
'''Perform the 'Rolling Upgrade' test. '''
self.incr("calls")
for node in self.Env["nodes"]:
if not self.upgrade(node):
return self.failure("Couldn't upgrade %s" % node)
self.CM.cluster_stable()
return self.success()
def is_applicable(self):
if not self.is_applicable_common():
return None
if not "rpm-dir" in list(self.Env.keys()):
return None
if not "current-version" in list(self.Env.keys()):
return None
if not "previous-version" in list(self.Env.keys()):
return None
return 1
# Register RollingUpgradeTest as a good test to run
AllTestClasses.append(RollingUpgradeTest)
class BSC_AddResource(CTSTest):
'''Add a resource to the cluster'''
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "AddResource"
self.resource_offset = 0
self.cib_cmd = """cibadmin -C -o %s -X '%s' """
def __call__(self, node):
self.incr("calls")
self.resource_offset = self.resource_offset + 1
r_id = "bsc-rsc-%s-%d" % (node, self.resource_offset)
start_pat = "pacemaker-controld.*%s_start_0.*confirmed.*ok"
patterns = []
patterns.append(start_pat % r_id)
watch = self.create_watch(patterns, self.Env["DeadTime"])
watch.setwatch()
ip = self.NextIP()
if not self.make_ip_resource(node, r_id, "ocf", "IPaddr", ip):
return self.failure("Make resource %s failed" % r_id)
failed = 0
watch_result = watch.lookforall()
if watch.unmatched:
for regex in watch.unmatched:
self.logger.log ("Warn: Pattern not found: %s" % (regex))
failed = 1
if failed:
return self.failure("Resource pattern(s) not found")
if not self.CM.cluster_stable(self.Env["DeadTime"]):
return self.failure("Unstable cluster")
return self.success()
def NextIP(self):
ip = self.Env["IPBase"]
if ":" in ip:
# IPv6: increment the last hextet
(prefix, sep, suffix) = ip.rpartition(":")
suffix = format(int(suffix, 16) + 1, "x")
else:
# IPv4: increment the last octet
(prefix, sep, suffix) = ip.rpartition(".")
suffix = str(int(suffix) + 1)
ip = prefix + sep + suffix
self.Env["IPBase"] = ip
return ip.strip()
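# Example of the intended increments (editor's sketch; actual addresses
# depend on the configured IPBase):
#   "10.1.1.10" -> "10.1.1.11"
#   "fe80::a"   -> "fe80::b"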
def make_ip_resource(self, node, id, rclass, type, ip):
self.logger.log("Creating %s::%s:%s (%s) on %s" % (rclass,type,id,ip,node))
rsc_xml="""
""" % (id, rclass, type, id, id, ip)
node_constraint = """
""" % (id, id, id, id, node)
rc = 0
(rc, lines) = self.rsh(node, self.cib_cmd % ("constraints", node_constraint), None)
if rc != 0:
self.logger.log("Constraint creation failed: %d" % rc)
return None
(rc, lines) = self.rsh(node, self.cib_cmd % ("resources", rsc_xml), None)
if rc != 0:
self.logger.log("Resource creation failed: %d" % rc)
return None
return 1
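# Hypothetical usage sketch (values for illustration only), mirroring what
# __call__() above does:
#   self.make_ip_resource(node, "bsc-rsc-%s-1" % node, "ocf", "IPaddr", "10.1.1.11")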
def is_applicable(self):
if self.Env["DoBSC"]:
return 1
return None
AllTestClasses.append(BSC_AddResource)
class SimulStopLite(CTSTest):
'''Stop any active nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStopLite"
def __call__(self, dummy):
'''Perform the 'SimulStopLite' setup work. '''
self.incr("calls")
self.debug("Setup: " + self.name)
# We ignore the "node" parameter...
watchpats = [ ]
for node in self.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == "up":
self.incr("WasStarted")
watchpats.append(self.templates["Pat:We_stopped"] % node)
if len(watchpats) == 0:
return self.success()
# Stop all the nodes - at about the same time...
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
self.set_timer()
for node in self.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == "up":
self.CM.StopaCMnoBlock(node)
if watch.lookforall():
# Make sure they're completely down with no residue
for node in self.Env["nodes"]:
self.rsh(node, self.templates["StopCmd"])
return self.success()
did_fail = 0
up_nodes = []
for node in self.Env["nodes"]:
if self.CM.StataCM(node) == 1:
did_fail = 1
up_nodes.append(node)
if did_fail:
return self.failure("Active nodes exist: " + repr(up_nodes))
self.logger.log("Warn: All nodes stopped but CTS didnt detect: "
+ repr(watch.unmatched))
return self.failure("Missing log message: "+repr(watch.unmatched))
def is_applicable(self):
'''SimulStopLite is a setup test and never applicable'''
return 0
class SimulStartLite(CTSTest):
'''Start any stopped nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStartLite"
def __call__(self, dummy):
'''Perform the 'SimulStartLite' setup work. '''
self.incr("calls")
self.debug("Setup: " + self.name)
# We ignore the "node" parameter...
node_list = []
for node in self.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == "down":
self.incr("WasStopped")
node_list.append(node)
self.set_timer()
while len(node_list) > 0:
# Repeat until all nodes come up
watchpats = [ ]
uppat = self.templates["Pat:NonDC_started"]
if self.CM.upcount() == 0:
uppat = self.templates["Pat:Local_started"]
watchpats.append(self.templates["Pat:DC_IDLE"])
for node in node_list:
watchpats.append(uppat % node)
watchpats.append(self.templates["Pat:InfraUp"] % node)
watchpats.append(self.templates["Pat:PacemakerUp"] % node)
# Start all the nodes - at about the same time...
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
stonith = self.CM.prepare_fencing_watcher(self.name)
for node in node_list:
self.CM.StartaCMnoBlock(node)
watch.lookforall()
node_list = self.CM.fencing_cleanup(self.name, stonith)
if node_list is None:
return self.failure("Cluster did not stabilize")
# Remove node_list messages from watch.unmatched
for node in node_list:
self.logger.debug("Dealing with stonith operations for %s" % repr(node_list))
if watch.unmatched:
try:
watch.unmatched.remove(uppat % node)
except ValueError:
self.debug("Already matched: %s" % (uppat % node))
try:
watch.unmatched.remove(self.templates["Pat:InfraUp"] % node)
except ValueError:
self.debug("Already matched: %s" % (self.templates["Pat:InfraUp"] % node))
try:
watch.unmatched.remove(self.templates["Pat:PacemakerUp"] % node)
except ValueError:
self.debug("Already matched: %s" % (self.templates["Pat:PacemakerUp"] % node))
if watch.unmatched:
for regex in watch.unmatched:
self.logger.log ("Warn: Startup pattern not found: %s" %(regex))
if not self.CM.cluster_stable():
return self.failure("Cluster did not stabilize")
did_fail = 0
unstable = []
for node in self.Env["nodes"]:
if self.CM.StataCM(node) == 0:
did_fail = 1
unstable.append(node)
if did_fail:
return self.failure("Unstarted nodes exist: " + repr(unstable))
unstable = []
for node in self.Env["nodes"]:
if not self.CM.node_stable(node):
did_fail = 1
unstable.append(node)
if did_fail:
return self.failure("Unstable cluster nodes exist: " + repr(unstable))
return self.success()
def is_applicable(self):
'''SimulStartLite is a setup test and never applicable'''
return 0
def TestList(cm, audits):
result = []
for testclass in AllTestClasses:
bound_test = testclass(cm)
if bound_test.is_applicable():
bound_test.Audits = audits
result.append(bound_test)
return result
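# Hypothetical usage sketch (the cm object and audits come from the CTS
# driver; AuditList is imported from cts.CTSaudits):
#   tests = TestList(cm, AuditList(cm))
#   for test in tests:
#       test(node)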
class RemoteLXC(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "RemoteLXC"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.num_containers = 2
self.is_container = 1
self.is_docker_unsafe = 1
self.failed = 0
self.fail_string = ""
def start_lxc_simple(self, node):
# restore any artifacts lying around from a previous test.
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
# generate the containers, put them in the config, add some resources to them
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc1"))
pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc2"))
pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc-ms"))
pats.append(self.templates["Pat:RscOpOK"] % ("promote", "lxc-ms"))
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -g -a -m -s -c %d &>/dev/null" % self.num_containers)
self.set_timer("remoteSimpleInit")
watch.lookforall()
self.log_timer("remoteSimpleInit")
if watch.unmatched:
self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
self.failed = 1
def cleanup_lxc_simple(self, node):
pats = [ ]
# if the test failed, attempt to clean up the cib and libvirt environment
# as best as possible
if self.failed == 1:
# restore libvirt and cib
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
return
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container1"))
pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container2"))
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -p &>/dev/null")
self.set_timer("remoteSimpleCleanup")
watch.lookforall()
self.log_timer("remoteSimpleCleanup")
if watch.unmatched:
self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
self.failed = 1
# cleanup libvirt
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
def __call__(self, node):
'''Perform the 'RemoteLXC' test. '''
self.incr("calls")
ret = self.startall(None)
if not ret:
return self.failure("Setup failed, start all nodes failed.")
rc = self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v &>/dev/null")
if rc == 1:
self.log("Environment test for lxc support failed.")
return self.skipped()
self.start_lxc_simple(node)
self.cleanup_lxc_simple(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed == 1:
return self.failure(self.fail_string)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"Updating failcount for ping",
r"schedulerd.*: Recover (ping|lxc-ms|container)\s*\(.*\)",
# The orphaned lxc-ms resource causes an expected transition error
# that is a result of the scheduler not having knowledge that the
# promotable resource used to be a clone. As a result, it looks like that
# resource is running in multiple locations when it shouldn't... But in
# this instance we know why this error is occurring and that it is expected.
r"Calculated [Tt]ransition .*pe-error",
r"Resource lxc-ms .* is active on 2 nodes attempting recovery",
r"Unknown operation: fail",
r"VirtualDomain.*ERROR: Unable to determine emulator",
]
AllTestClasses.append(RemoteLXC)
class RemoteDriver(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = self.__class__.__name__
self.is_docker_unsafe = 1
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.stop = StopTest(cm)
self.remote_rsc = "remote-rsc"
self.cib_cmd = """cibadmin -C -o %s -X '%s' """
self.reset()
def reset(self):
self.pcmk_started = 0
self.failed = False
self.fail_string = ""
self.remote_node_added = 0
self.remote_rsc_added = 0
self.remote_use_reconnect_interval = self.Env.RandomGen.choice([True,False])
def fail(self, msg):
""" Mark test as failed. """
self.failed = True
# Always log the failure.
self.logger.log(msg)
# Use first failure as test status, as it's likely to be most useful.
if not self.fail_string:
self.fail_string = msg
def get_othernode(self, node):
for othernode in self.Env["nodes"]:
if othernode == node:
# we don't want to try to use the CIB on the node we just shut down.
# find a cluster node that is not our soon-to-be remote node.
continue
else:
return othernode
def del_rsc(self, node, rsc):
othernode = self.get_othernode(node)
rc = self.rsh(othernode, "crm_resource -D -r %s -t primitive" % (rsc))
if rc != 0:
self.fail("Removal of resource '%s' failed" % rsc)
def add_rsc(self, node, rsc_xml):
othernode = self.get_othernode(node)
rc = self.rsh(othernode, self.cib_cmd % ("resources", rsc_xml))
if rc != 0:
self.fail("resource creation failed")
def add_primitive_rsc(self, node):
rsc_xml = """
""" % { "node": self.remote_rsc }
self.add_rsc(node, rsc_xml)
if not self.failed:
self.remote_rsc_added = 1
def add_connection_rsc(self, node):
rsc_xml = """
""" % { "node": self.remote_node, "server": node }
if self.remote_use_reconnect_interval:
# Set reconnect interval on resource
rsc_xml = rsc_xml + """
""" % (self.remote_node)
rsc_xml = rsc_xml + """
""" % { "node": self.remote_node }
self.add_rsc(node, rsc_xml)
if not self.failed:
self.remote_node_added = 1
def disable_services(self, node):
self.corosync_enabled = self.Env.service_is_enabled(node, "corosync")
if self.corosync_enabled:
self.Env.disable_service(node, "corosync")
self.pacemaker_enabled = self.Env.service_is_enabled(node, "pacemaker")
if self.pacemaker_enabled:
self.Env.disable_service(node, "pacemaker")
def restore_services(self, node):
if self.corosync_enabled:
self.Env.enable_service(node, "corosync")
if self.pacemaker_enabled:
self.Env.enable_service(node, "pacemaker")
def stop_pcmk_remote(self, node):
# stop pacemaker_remote, retrying a few times if the service is slow to stop
for i in range(10):
rc = self.rsh(node, "service pacemaker_remote stop")
if rc != 0:
time.sleep(6)
else:
break
def start_pcmk_remote(self, node):
for i in range(10):
rc = self.rsh(node, "service pacemaker_remote start")
if rc != 0:
time.sleep(6)
else:
self.pcmk_started = 1
break
def freeze_pcmk_remote(self, node):
""" Simulate a Pacemaker Remote daemon failure. """
# We freeze the process.
self.rsh(node, "killall -STOP pacemaker-remoted")
def resume_pcmk_remote(self, node):
# We resume the process.
self.rsh(node, "killall -CONT pacemaker-remoted")
def start_metal(self, node):
# Cluster nodes are reused as remote nodes in remote tests. If cluster
# services were enabled at boot and the remote node later got fenced, the
# rebooted host would rejoin as a cluster node rather than as the expected
# remote node, and pacemaker_remote would be unable to start. The test
# might then be unable to recover gracefully.
#
# Temporarily disable any enabled cluster services.
self.disable_services(node)
pcmk_started = 0
# make sure the resource doesn't already exist for some reason
self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_rsc))
self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_node))
if not self.stop(node):
self.fail("Failed to shutdown cluster node %s" % node)
return
self.start_pcmk_remote(node)
if self.pcmk_started == 0:
self.fail("Failed to start pacemaker_remote on node %s" % node)
return
# Convert node to baremetal now that it has shutdown the cluster stack
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node))
pats.append(self.templates["Pat:DC_IDLE"])
self.add_connection_rsc(node)
self.set_timer("remoteMetalInit")
watch.lookforall()
self.log_timer("remoteMetalInit")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
def migrate_connection(self, node):
if self.failed:
return
pats = [ ]
pats.append(self.templates["Pat:RscOpOK"] % ("migrate_to", self.remote_node))
pats.append(self.templates["Pat:RscOpOK"] % ("migrate_from", self.remote_node))
pats.append(self.templates["Pat:DC_IDLE"])
watch = self.create_watch(pats, 120)
watch.setwatch()
(rc, lines) = self.rsh(node, "crm_resource -M -r %s" % (self.remote_node), None)
if rc != 0:
self.fail("failed to move remote node connection resource")
return
self.set_timer("remoteMetalMigrate")
watch.lookforall()
self.log_timer("remoteMetalMigrate")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
return
def fail_rsc(self, node):
if self.failed:
return
watchpats = [ ]
watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("stop", self.remote_rsc, self.remote_node))
watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
watchpats.append(self.templates["Pat:DC_IDLE"])
watch = self.create_watch(watchpats, 120)
watch.setwatch()
self.debug("causing dummy rsc to fail.")
rc = self.rsh(node, "rm -f /var/run/resource-agents/Dummy*")
self.set_timer("remoteRscFail")
watch.lookforall()
self.log_timer("remoteRscFail")
if watch.unmatched:
self.fail("Unmatched patterns during rsc fail: %s" % watch.unmatched)
def fail_connection(self, node):
if self.failed:
return
watchpats = [ ]
watchpats.append(self.templates["Pat:FenceOpOK"] % self.remote_node)
watchpats.append(self.templates["Pat:NodeFenced"] % self.remote_node)
watch = self.create_watch(watchpats, 120)
watch.setwatch()
# Freeze the pcmk remote daemon; its connection monitor will time out,
# resulting in fencing of the remote node.
self.debug("Freezing pacemaker_remote on active remote node")
self.freeze_pcmk_remote(node)
self.debug("Waiting for remote node to be fenced.")
self.set_timer("remoteMetalFence")
watch.lookforall()
self.log_timer("remoteMetalFence")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
return
self.debug("Waiting for the remote node to come back up")
self.CM.ns.WaitForNodeToComeUp(node, 120)
pats = [ ]
watch = self.create_watch(pats, 240)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node))
if self.remote_rsc_added == 1:
pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
# start the remote node again watch it integrate back into cluster.
self.start_pcmk_remote(node)
if self.pcmk_started == 0:
self.fail("Failed to start pacemaker_remote on node %s" % node)
return
self.debug("Waiting for remote node to rejoin cluster after being fenced.")
self.set_timer("remoteMetalRestart")
watch.lookforall()
self.log_timer("remoteMetalRestart")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
return
def add_dummy_rsc(self, node):
if self.failed:
return
# verify we can put a resource on the remote node
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
pats.append(self.templates["Pat:DC_IDLE"])
# Add a resource that must live on remote-node
self.add_primitive_rsc(node)
# force that rsc to prefer the remote node.
(rc, line) = self.CM.rsh(node, "crm_resource -M -r %s -N %s -f" % (self.remote_rsc, self.remote_node), None)
if rc != 0:
self.fail("Failed to place remote resource on remote node.")
return
self.set_timer("remoteMetalRsc")
watch.lookforall()
self.log_timer("remoteMetalRsc")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
def test_attributes(self, node):
if self.failed:
return
# This verifies permanent attributes can be set on a remote-node. It also
# verifies the remote-node can edit its own cib node section remotely.
(rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -v testval -N %s" % (self.remote_node), None)
if rc != 0:
self.fail("Failed to set remote-node attribute. rc:%s output:%s" % (rc, line))
return
(rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -q -N %s" % (self.remote_node), None)
if rc != 0:
self.fail("Failed to get remote-node attribute")
return
(rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -D -N %s" % (self.remote_node), None)
if rc != 0:
self.fail("Failed to delete remote-node attribute")
return
def cleanup_metal(self, node):
self.restore_services(node)
if self.pcmk_started == 0:
return
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
if self.remote_rsc_added == 1:
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_rsc))
if self.remote_node_added == 1:
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_node))
self.set_timer("remoteMetalCleanup")
self.resume_pcmk_remote(node)
if self.remote_rsc_added == 1:
# Remove dummy resource added for remote node tests
self.debug("Cleaning up dummy rsc put on remote node")
self.rsh(self.get_othernode(node), "crm_resource -U -r %s" % self.remote_rsc)
self.del_rsc(node, self.remote_rsc)
if self.remote_node_added == 1:
# Remove remote node's connection resource
self.debug("Cleaning up remote node connection resource")
self.rsh(self.get_othernode(node), "crm_resource -U -r %s" % (self.remote_node))
self.del_rsc(node, self.remote_node)
watch.lookforall()
self.log_timer("remoteMetalCleanup")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
self.stop_pcmk_remote(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.remote_node_added == 1:
# Remove remote node itself
self.debug("Cleaning up node entry for remote node")
self.rsh(self.get_othernode(node), "crm_node --force --remove %s" % self.remote_node)
def setup_env(self, node):
self.remote_node = "remote-%s" % (node)
# we are assuming if all nodes have a key, that it is
# the right key... If any node doesn't have a remote
# key, we regenerate it everywhere.
if self.rsh.exists_on_all("/etc/pacemaker/authkey", self.Env["nodes"]):
return
# create key locally
(handle, keyfile) = tempfile.mkstemp(".cts")
os.close(handle)
devnull = open(os.devnull, 'wb')
subprocess.check_call(["dd", "if=/dev/urandom", "of=%s" % keyfile, "bs=4096", "count=1"],
stdout=devnull, stderr=devnull)
devnull.close()
# sync key throughout the cluster
for node in self.Env["nodes"]:
self.rsh(node, "mkdir -p --mode=0750 /etc/pacemaker")
self.rsh.cp(keyfile, "root@%s:/etc/pacemaker/authkey" % node)
self.rsh(node, "chgrp haclient /etc/pacemaker /etc/pacemaker/authkey")
self.rsh(node, "chmod 0640 /etc/pacemaker/authkey")
os.unlink(keyfile)
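# For reference (editor's sketch), the setup above is equivalent to
# generating a key by hand and copying it to every node:
#   dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1
#   chgrp haclient /etc/pacemaker /etc/pacemaker/authkey
#   chmod 0640 /etc/pacemaker/authkey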
def is_applicable(self):
if not self.is_applicable_common():
return False
for node in self.Env["nodes"]:
rc = self.rsh(node, "which pacemaker-remoted >/dev/null 2>&1")
if rc != 0:
return False
return True
def start_new_test(self, node):
self.incr("calls")
self.reset()
ret = self.startall(None)
if not ret:
return self.failure("setup failed: could not start all nodes")
self.setup_env(node)
self.start_metal(node)
self.add_dummy_rsc(node)
return True
def __call__(self, node):
return self.failure("This base class is not meant to be called directly.")
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [ r"""is running on remote.*which isn't allowed""",
r"""Connection terminated""",
r"""Could not send remote""",
]
# RemoteDriver is just a base class for other tests, so it is not added to AllTestClasses
class RemoteBasic(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteBaremetal' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
self.test_attributes(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
AllTestClasses.append(RemoteBasic)
class RemoteStonithd(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteStonithd' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
self.fail_connection(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
def is_applicable(self):
if not RemoteDriver.is_applicable(self):
return False
if "DoFencing" in list(self.Env.keys()):
return self.Env["DoFencing"]
return True
def errorstoignore(self):
ignore_pats = [
r"Lost connection to Pacemaker Remote node",
r"Software caused connection abort",
r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor",
r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*",
r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)",
r"error: Result of monitor operation for .* on remote-.*: No executor connection",
]
ignore_pats.extend(RemoteDriver.errorstoignore(self))
return ignore_pats
AllTestClasses.append(RemoteStonithd)
class RemoteMigrate(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteMigrate' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
self.migrate_connection(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
AllTestClasses.append(RemoteMigrate)
class RemoteRscFailure(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteRscFailure' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
# This is an important step. We are migrating the connection
# before failing the resource. This verifies that the migration
# has properly maintained control over the remote-node.
self.migrate_connection(node)
self.fail_rsc(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
def errorstoignore(self):
ignore_pats = [
r"schedulerd.*: Recover remote-rsc\s*\(.*\)",
r"Dummy.*: No process state file found",
]
ignore_pats.extend(RemoteDriver.errorstoignore(self))
return ignore_pats
AllTestClasses.append(RemoteRscFailure)
# vim:ts=4:sw=4:et:
diff --git a/cts/cts-fencing.in b/cts/cts-fencing.in
index 65a9916629..2d9999ca01 100644
--- a/cts/cts-fencing.in
+++ b/cts/cts-fencing.in
@@ -1,1527 +1,1527 @@
#!@PYTHON@
""" Regression tests for Pacemaker's fencer
"""
# Pacemaker targets compatibility with Python 2.7 and 3.2+
from __future__ import print_function, unicode_literals, absolute_import, division
-__copyright__ = "Copyright 2012-2018 the Pacemaker project contributors"
+__copyright__ = "Copyright 2012-2019 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
import io
import os
import re
import sys
import subprocess
import shlex
import time
import tempfile
import signal
# Where to find test binaries
# Prefer the source tree if available
BUILD_DIR = "@abs_top_builddir@"
SCHEMA_DIR = "@CRM_SCHEMA_DIRECTORY@"
TEST_DIR = sys.path[0]
AUTOGEN_COROSYNC_TEMPLATE = """
totem {
version: 2
cluster_name: cts-fencing
crypto_cipher: none
crypto_hash: none
transport: udp
}
nodelist {
node {
nodeid: 1
name: %s
ring0_addr: 127.0.0.1
}
}
logging {
debug: off
to_syslog: no
to_stderr: no
to_logfile: yes
logfile: %s
}
"""
# These values must be kept in sync with include/crm/crm.h
class CrmExit(object):
OK = 0
ERROR = 1
INVALID_PARAM = 2
UNIMPLEMENT_FEATURE = 3
INSUFFICIENT_PRIV = 4
NOT_INSTALLED = 5
NOT_CONFIGURED = 6
NOT_RUNNING = 7
USAGE = 64
DATAERR = 65
NOINPUT = 66
NOUSER = 67
NOHOST = 68
UNAVAILABLE = 69
SOFTWARE = 70
OSERR = 71
OSFILE = 72
CANTCREAT = 73
IOERR = 74
TEMPFAIL = 75
PROTOCOL = 76
NOPERM = 77
CONFIG = 78
FATAL = 100
PANIC = 101
DISCONNECT = 102
SOLO = 103
DIGEST = 104
NOSUCH = 105
QUORUM = 106
UNSAFE = 107
EXISTS = 108
MULTIPLE = 109
OLD = 110
TIMEOUT = 124
MAX = 255
def update_path():
""" Set the PATH environment variable appropriately for the tests """
new_path = os.environ['PATH']
if os.path.exists("%s/cts-fencing.in" % TEST_DIR):
print("Running tests from the source tree: %s (%s)" % (BUILD_DIR, TEST_DIR))
# For pacemaker-fenced and cts-fence-helper
new_path = "%s/daemons/fenced:%s" % (BUILD_DIR, new_path)
new_path = "%s/tools:%s" % (BUILD_DIR, new_path) # For stonith_admin
new_path = "%s/cts:%s" % (BUILD_DIR, new_path) # For cts-support
else:
print("Running tests from the install tree: @CRM_DAEMON_DIR@ (not %s)" % TEST_DIR)
# For pacemaker-fenced, cts-fence-helper, and cts-support
new_path = "@CRM_DAEMON_DIR@:%s" % (new_path)
print('Using PATH="{}"'.format(new_path))
os.environ['PATH'] = new_path
def find_validator(rng_file):
if os.access("/usr/bin/xmllint", os.X_OK):
return ["xmllint", "--relaxng", rng_file, "-"]
else:
return None
def rng_directory():
if "PCMK_schema_directory" in os.environ:
return os.environ["PCMK_schema_directory"]
elif os.path.exists("%s/cts-fencing.in" % TEST_DIR):
return "xml"
else:
return SCHEMA_DIR
def pipe_communicate(pipes, stdout=True, stderr=False, stdin=None):
""" Wrapper to get text output from pipes regardless of Python version """
output = ""
if stdin:
if sys.version_info < (3,):
pipe_outputs = pipes.communicate(input=stdin)
else:
pipe_outputs = pipes.communicate(input=stdin.encode())
else:
pipe_outputs = pipes.communicate()
if sys.version_info < (3,):
if stdout:
output = output + pipe_outputs[0]
if stderr:
output = output + pipe_outputs[1]
else:
if stdout:
output = output + pipe_outputs[0].decode(sys.stdout.encoding)
if stderr:
output = output + pipe_outputs[1].decode(sys.stderr.encoding)
return output
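# Typical usage (editor's sketch): capture decoded stdout and stderr from a
# finished subprocess regardless of Python version:
#   proc = subprocess.Popen(["uname", "-n"], stdout=subprocess.PIPE,
#                           stderr=subprocess.PIPE)
#   proc.wait()
#   text = pipe_communicate(proc, stderr=True)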
def output_from_command(command):
""" Execute command and return its standard output """
test = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE)
test.wait()
return pipe_communicate(test).split("\n")
def localname():
""" Return the uname of the local host """
our_uname = output_from_command("uname -n")
if our_uname:
our_uname = our_uname[0]
else:
our_uname = "localhost"
return our_uname
def killall(process):
""" Kill all instances of a process """
cmd = shlex.split("killall -9 -q %s" % process)
test = subprocess.Popen(cmd, stdout=subprocess.PIPE)
test.wait()
class TestError(Exception):
""" Base class for exceptions in this module """
pass
class ExitCodeError(TestError):
""" Exception raised when command exit status is unexpected """
def __init__(self, exit_code):
self.exit_code = exit_code
def __str__(self):
return repr(self.exit_code)
class OutputNotFoundError(TestError):
""" Exception raised when command output does not contain wanted string """
def __init__(self, output):
self.output = output
def __str__(self):
return repr(self.output)
class OutputFoundError(TestError):
""" Exception raised when command output contains unwanted string """
def __init__(self, output):
self.output = output
def __str__(self):
return repr(self.output)
class XmlValidationError(TestError):
""" Exception raised when xmllint fails """
def __init__(self, output):
self.output = output
def __str__(self):
return repr(self.output)
class Test(object):
""" Executor for a single test """
def __init__(self, name, description, verbose=0, with_cpg=0):
self.name = name
self.description = description
self.cmds = []
self.verbose = verbose
self.result_txt = ""
self.cmd_tool_output = ""
self.result_exitcode = CrmExit.OK
if with_cpg:
self.stonith_options = "-c"
self.enable_corosync = 1
else:
self.stonith_options = "-s"
self.enable_corosync = 0
self.stonith_process = None
self.stonith_output = ""
self.stonith_patterns = []
self.negative_stonith_patterns = []
self.executed = 0
def __new_cmd(self, cmd, args, exitcode, stdout_match="", no_wait=0, stdout_negative_match="", kill=None, validate=True):
""" Add a command to be executed as part of this test """
self.cmds.append(
{
"cmd" : cmd,
"kill" : kill,
"args" : args,
"expected_exitcode" : exitcode,
"stdout_match" : stdout_match,
"stdout_negative_match" : stdout_negative_match,
"no_wait" : no_wait,
"validate" : validate,
}
)
def start_environment(self):
""" Prepare the host for executing a test """
# Make sure we are in full control
killall("pacemakerd")
killall("pacemaker-fenced")
if self.verbose:
self.stonith_options = self.stonith_options + " -V"
print("Starting pacemaker-fenced with %s" % self.stonith_options)
if os.path.exists("/tmp/stonith-regression.log"):
os.remove('/tmp/stonith-regression.log')
cmd = "pacemaker-fenced %s -l /tmp/stonith-regression.log" % self.stonith_options
self.stonith_process = subprocess.Popen(shlex.split(cmd))
time.sleep(1)
def clean_environment(self):
""" Clean up the host after executing a test """
if self.stonith_process:
if self.stonith_process.poll() is None:
self.stonith_process.terminate()
self.stonith_process.wait()
else:
return_code = {
getattr(signal, _signame): _signame
for _signame in dir(signal)
if _signame.startswith('SIG') and not _signame.startswith("SIG_")
}.get(-self.stonith_process.returncode, "RET=%d" % (self.stonith_process.returncode))
msg = "FAILURE - '%s' failed. pacemaker-fenced abnormally exited during test (%s)."
self.result_txt = msg % (self.name, return_code)
self.result_exitcode = CrmExit.ERROR
self.stonith_output = ""
self.stonith_process = None
# The default utf-8 encoding would error out if, e.g., memory corruption
# made fenced emit arbitrary 8-bit values. Such output is still interesting
# for debugging, and we'd still like the regression test to run the full
# set of test cases.
logfile = io.open('/tmp/stonith-regression.log', 'rt', encoding = "ISO-8859-1")
for line in logfile.readlines():
self.stonith_output = self.stonith_output + line
if self.verbose:
print("Daemon Output Start")
print(self.stonith_output)
print("Daemon Output End")
os.remove('/tmp/stonith-regression.log')
def add_stonith_log_pattern(self, pattern):
""" Add a log pattern to expect from this test """
self.stonith_patterns.append(pattern)
def add_stonith_neg_log_pattern(self, pattern):
""" Add a log pattern that should not occur with this test """
self.negative_stonith_patterns.append(pattern)
def add_cmd(self, cmd, args, validate=True):
""" Add a simple command to be executed as part of this test """
self.__new_cmd(cmd, args, CrmExit.OK, "", validate=validate)
def add_cmd_no_wait(self, cmd, args):
""" Add a simple command to be executed (without waiting) as part of this test """
self.__new_cmd(cmd, args, CrmExit.OK, "", 1)
def add_cmd_check_stdout(self, cmd, args, match, no_match=""):
""" Add a simple command with expected output to be executed as part of this test """
self.__new_cmd(cmd, args, CrmExit.OK, match, 0, no_match)
def add_expected_fail_cmd(self, cmd, args, exitcode=CrmExit.ERROR):
""" Add a command to be executed as part of this test and expected to fail """
self.__new_cmd(cmd, args, exitcode, "")
def get_exitcode(self):
""" Return the exit status of the last test execution """
return self.result_exitcode
def print_result(self, filler):
""" Print the result of the last test execution """
print("%s%s" % (filler, self.result_txt))
def run_cmd(self, args):
""" Execute a command as part of this test """
cmd = shlex.split(args['args'])
cmd.insert(0, args['cmd'])
if self.verbose:
print("\n\nRunning: "+" ".join(cmd))
test = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if args['kill']:
if self.verbose:
print("Also running: "+args['kill'])
subprocess.Popen(shlex.split(args['kill']))
if args['no_wait'] == 0:
test.wait()
else:
return CrmExit.OK
output = pipe_communicate(test, stderr=True)
if self.verbose:
print(output)
if test.returncode != args['expected_exitcode']:
raise ExitCodeError(test.returncode)
if (args['stdout_match'] != "" and
re.search(args['stdout_match'], output) is None):
raise OutputNotFoundError(output)
if (args['stdout_negative_match'] != "" and
re.search(args['stdout_negative_match'], output) is not None):
raise OutputFoundError(output)
if args['validate']:
rng_file = rng_directory() + "/api/api-result.rng"
cmd = find_validator(rng_file)
if not cmd:
return
if self.verbose:
print("\nRunning: "+" ".join(cmd))
validator = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output = pipe_communicate(validator, stderr=True, stdin=output)
if self.verbose:
print(output)
if validator.returncode != 0:
raise XmlValidationError(output)
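# For reference (sketch): the same check can be reproduced by hand by
# piping a command's XML output to xmllint, e.g.
#   stonith_admin --output-as=xml ... | xmllint --relaxng <schema-dir>/api/api-result.rng -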
def count_negative_matches(self, outline):
""" Return 1 if a line matches patterns that shouldn't have occurred """
count = 0
for line in self.negative_stonith_patterns:
if outline.count(line):
count = 1
if self.verbose:
print("This pattern should not have matched = '%s" % (line))
return count
def match_stonith_patterns(self):
""" Check test output for expected patterns """
negative_matches = 0
cur = 0
pats = self.stonith_patterns
total_patterns = len(self.stonith_patterns)
if len(self.stonith_patterns) == 0 and len(self.negative_stonith_patterns) == 0:
return
for line in self.stonith_output.split("\n"):
negative_matches = negative_matches + self.count_negative_matches(line)
if len(pats) == 0:
continue
cur = -1
for pat in pats:
cur = cur + 1
if line.count(pats[cur]):
del pats[cur]
break
if len(pats) > 0 or negative_matches:
if self.verbose:
for pat in pats:
print("Pattern Not Matched = '%s'" % pat)
msg = "FAILURE - '%s' failed. %d patterns out of %d not matched. %d negative matches."
self.result_txt = msg % (self.name, len(pats), total_patterns, negative_matches)
self.result_exitcode = CrmExit.ERROR
def set_error(self, step, cmd):
""" Record failure of this test """
msg = "FAILURE - '%s' failed at step %d. Command: %s %s"
self.result_txt = msg % (self.name, step, cmd['cmd'], cmd['args'])
self.result_exitcode = CrmExit.ERROR
def run(self):
""" Execute this test. """
res = 0
i = 1
self.start_environment()
if self.verbose:
print("\n--- START TEST - %s" % self.name)
self.result_txt = "SUCCESS - '%s'" % (self.name)
self.result_exitcode = CrmExit.OK
for cmd in self.cmds:
try:
self.run_cmd(cmd)
except ExitCodeError as e:
print("Step %d FAILED - command returned %s, expected %d" % (i, e, cmd['expected_exitcode']))
self.set_error(i, cmd)
break
except OutputNotFoundError as e:
print("Step %d FAILED - '%s' was not found in command output: %s" % (i, cmd['stdout_match'], e))
self.set_error(i, cmd)
break
except OutputFoundError as e:
print("Step %d FAILED - '%s' was found in command output: %s" % (i, cmd['stdout_negative_match'], e))
self.set_error(i, cmd)
break
if self.verbose:
print("Step %d SUCCESS" % (i))
i = i + 1
self.clean_environment()
if self.result_exitcode == CrmExit.OK:
self.match_stonith_patterns()
print(self.result_txt)
if self.verbose:
print("--- END TEST - %s\n" % self.name)
self.executed = 1
return res
class Tests(object):
""" Collection of all fencing regression tests """
def __init__(self, verbose=0):
self.tests = []
self.verbose = verbose
self.autogen_corosync_cfg = not os.path.exists("/etc/corosync/corosync.conf")
def new_test(self, name, description, with_cpg=0):
""" Create a named test """
test = Test(name, description, self.verbose, with_cpg)
self.tests.append(test)
return test
def print_list(self):
""" List all registered tests """
print("\n==== %d TESTS FOUND ====" % (len(self.tests)))
print("%35s - %s" % ("TEST NAME", "TEST DESCRIPTION"))
print("%35s - %s" % ("--------------------", "--------------------"))
for test in self.tests:
print("%35s - %s" % (test.name, test.description))
print("==== END OF LIST ====\n")
def start_corosync(self):
""" Start the corosync process """
if self.verbose:
print("Starting corosync")
test = subprocess.Popen("corosync", stdout=subprocess.PIPE)
test.wait()
time.sleep(10)
def run_single(self, name):
""" Run a single named test """
for test in self.tests:
if test.name == name:
test.run()
break
def run_tests_matching(self, pattern):
""" Run all tests whose name matches a pattern """
for test in self.tests:
if test.name.count(pattern) != 0:
test.run()
def run_cpg_only(self):
""" Run all corosync-enabled tests """
for test in self.tests:
if test.enable_corosync:
test.run()
def run_no_cpg(self):
""" Run all standalone tests """
for test in self.tests:
if not test.enable_corosync:
test.run()
def run_tests(self):
""" Run all tests """
for test in self.tests:
test.run()
def exit(self):
""" Exit (with error status code if any test failed) """
for test in self.tests:
if test.executed == 0:
continue
if test.get_exitcode() != CrmExit.OK:
sys.exit(CrmExit.ERROR)
sys.exit(CrmExit.OK)
def print_results(self):
""" Print summary of results of executed tests """
failures = 0
success = 0
print("\n\n======= FINAL RESULTS ==========")
print("\n--- FAILURE RESULTS:")
for test in self.tests:
if test.executed == 0:
continue
if test.get_exitcode() != CrmExit.OK:
failures = failures + 1
test.print_result(" ")
else:
success = success + 1
if failures == 0:
print(" None")
print("\n--- TOTALS\n Pass:%d\n Fail:%d\n" % (success, failures))
def build_api_sanity_tests(self):
""" Register tests to verify basic API usage """
verbose_arg = ""
if self.verbose:
verbose_arg = "-V"
test = self.new_test("standalone_low_level_api_test", "Sanity test client api in standalone mode.")
test.add_cmd("cts-fence-helper", "-t %s" % (verbose_arg), validate=False)
test = self.new_test("cpg_low_level_api_test", "Sanity test client api using mainloop and cpg.", 1)
test.add_cmd("cts-fence-helper", "-m %s" % (verbose_arg), validate=False)
def build_custom_timeout_tests(self):
""" Register tests to verify custom timeout usage """
# custom timeout without topology
test = self.new_test("cpg_custom_timeout_1",
"Verify per device timeouts work as expected without using topology.", 1)
test.add_cmd('stonith_admin',
'--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node1 node2 node3"')
test.add_cmd('stonith_admin',
'--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node3" -o "pcmk_off_timeout=1"')
test.add_cmd('stonith_admin',
'--output-as=xml -R false2 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node3" -o "pcmk_off_timeout=4"')
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5")
# timeout is 5+1+4 = 10
test.add_stonith_log_pattern("Total timeout set to 10")
# custom timeout _WITH_ topology
test = self.new_test("cpg_custom_timeout_2",
"Verify per device timeouts work as expected _WITH_ topology.", 1)
test.add_cmd('stonith_admin',
'--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node1 node2 node3"')
test.add_cmd('stonith_admin',
'--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node3" -o "pcmk_off_timeout=1"')
test.add_cmd('stonith_admin',
'--output-as=xml -R false2 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node3" -o "pcmk_off_timeout=4000"')
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v false2")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5")
# timeout is 5+1+4000 = 4006
test.add_stonith_log_pattern("Total timeout set to 4006")
def build_fence_merge_tests(self):
""" Register tests to verify when fence operations should be merged """
### Simple test that overlapping fencing operations get merged
test = self.new_test("cpg_custom_merge_single",
"Verify overlapping identical fencing operations are merged, no fencing levels used.", 1)
test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ")
test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 10")
### one merger will happen
test.add_stonith_log_pattern("Merging stonith action 'off' targeting node3 originating from client")
### the pattern below signifies that both the original and duplicate operation completed
- test.add_stonith_log_pattern("Operation off of node3 by")
- test.add_stonith_log_pattern("Operation off of node3 by")
+ test.add_stonith_log_pattern("Operation 'off' targeting node3 on")
+ test.add_stonith_log_pattern("Operation 'off' targeting node3 on")
### Test that multiple mergers occur
test = self.new_test("cpg_custom_merge_multiple",
"Verify multiple overlapping identical fencing operations are merged", 1)
test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"delay=2\" -o \"pcmk_host_list=node3\" ")
test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 10")
### 4 mergers should occur
test.add_stonith_log_pattern("Merging stonith action 'off' targeting node3 originating from client")
test.add_stonith_log_pattern("Merging stonith action 'off' targeting node3 originating from client")
test.add_stonith_log_pattern("Merging stonith action 'off' targeting node3 originating from client")
test.add_stonith_log_pattern("Merging stonith action 'off' targeting node3 originating from client")
### the pattern below signifies that both the original and duplicate operation completed
- test.add_stonith_log_pattern("Operation off of node3 by")
- test.add_stonith_log_pattern("Operation off of node3 by")
- test.add_stonith_log_pattern("Operation off of node3 by")
- test.add_stonith_log_pattern("Operation off of node3 by")
- test.add_stonith_log_pattern("Operation off of node3 by")
+ test.add_stonith_log_pattern("Operation 'off' targeting node3 on")
+ test.add_stonith_log_pattern("Operation 'off' targeting node3 on")
+ test.add_stonith_log_pattern("Operation 'off' targeting node3 on")
+ test.add_stonith_log_pattern("Operation 'off' targeting node3 on")
+ test.add_stonith_log_pattern("Operation 'off' targeting node3 on")
### Test that multiple mergers occur with topologies used
test = self.new_test("cpg_custom_merge_with_topology",
"Verify multiple overlapping identical fencing operations are merged with fencing levels.",
1)
test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ")
test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true1")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 10")
### 4 mergers should occur
test.add_stonith_log_pattern("Merging stonith action 'off' targeting node3 originating from client")
test.add_stonith_log_pattern("Merging stonith action 'off' targeting node3 originating from client")
test.add_stonith_log_pattern("Merging stonith action 'off' targeting node3 originating from client")
test.add_stonith_log_pattern("Merging stonith action 'off' targeting node3 originating from client")
### the pattern below signifies that both the original and duplicate operation completed
- test.add_stonith_log_pattern("Operation off of node3 by")
- test.add_stonith_log_pattern("Operation off of node3 by")
- test.add_stonith_log_pattern("Operation off of node3 by")
- test.add_stonith_log_pattern("Operation off of node3 by")
- test.add_stonith_log_pattern("Operation off of node3 by")
+ test.add_stonith_log_pattern("Operation 'off' targeting node3 on")
+ test.add_stonith_log_pattern("Operation 'off' targeting node3 on")
+ test.add_stonith_log_pattern("Operation 'off' targeting node3 on")
+ test.add_stonith_log_pattern("Operation 'off' targeting node3 on")
+ test.add_stonith_log_pattern("Operation 'off' targeting node3 on")
def build_fence_no_merge_tests(self):
""" Register tests to verify when fence operations should not be merged """
test = self.new_test("cpg_custom_no_merge",
"Verify differing fencing operations are not merged", 1)
test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3 node2\"")
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3 node2\" ")
test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3 node2\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true1")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node2 -t 10")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_stonith_neg_log_pattern("Merging stonith action 'off' targeting node3 originating from client")
def build_standalone_tests(self):
""" Register a grab bag of tests that can be executed in standalone or corosync mode """
test_types = [
{
"prefix" : "standalone",
"use_cpg" : 0,
},
{
"prefix" : "cpg",
"use_cpg" : 1,
},
]
# test what happens when all devices timeout
for test_type in test_types:
test = self.new_test("%s_fence_multi_device_failure" % test_type["prefix"],
"Verify that all devices timeout, a fencing failure is returned.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false3 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
if test_type["use_cpg"] == 1:
test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -F node3 -t 2", CrmExit.TIMEOUT)
test.add_stonith_log_pattern("Total timeout set to 6")
else:
test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -F node3 -t 2", CrmExit.ERROR)
test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: ")
test.add_stonith_log_pattern("for host 'node3' with device 'false2' returned: ")
test.add_stonith_log_pattern("for host 'node3' with device 'false3' returned: ")
# test what happens when multiple devices can fence a node, but the first device fails.
for test_type in test_types:
test = self.new_test("%s_fence_device_failure_rollover" % test_type["prefix"],
"Verify that when one fence device fails for a node, the others are tried.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5")
if test_type["use_cpg"] == 1:
test.add_stonith_log_pattern("Total timeout set to 15")
# test what happens when we try to use a missing fence-agent.
for test_type in test_types:
test = self.new_test("%s_fence_missing_agent" % test_type["prefix"],
"Verify proper error-handling when using a non-existent fence-agent.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_missing -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node2\"")
test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -F node3 -t 5", CrmExit.ERROR)
test.add_cmd("stonith_admin", "--output-as=xml -F node2 -t 5")
# simple topology test for one device
for test_type in test_types:
if test_type["use_cpg"] == 0:
continue
test = self.new_test("%s_topology_simple" % test_type["prefix"],
"Verify all fencing devices at a level are used.", test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5")
test.add_stonith_log_pattern("Total timeout set to 5")
test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0")
# add topology, delete topology, verify fencing still works
for test_type in test_types:
if test_type["use_cpg"] == 0:
continue
test = self.new_test("%s_topology_add_remove" % test_type["prefix"],
"Verify fencing occurrs after all topology levels are removed",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true")
test.add_cmd("stonith_admin", "--output-as=xml -d node3 -i 1")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5")
test.add_stonith_log_pattern("Total timeout set to 5")
test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0")
# test what happens when the first fencing level has multiple devices.
for test_type in test_types:
if test_type["use_cpg"] == 0:
continue
test = self.new_test("%s_topology_device_fails" % test_type["prefix"],
"Verify if one device in a level fails, the other is tried.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R false -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 20")
test.add_stonith_log_pattern("Total timeout set to 40")
test.add_stonith_log_pattern("for host 'node3' with device 'false' returned: -201")
test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0")
# test what happens when the first fencing level fails.
for test_type in test_types:
if test_type["use_cpg"] == 0:
continue
test = self.new_test("%s_topology_multi_level_fails" % test_type["prefix"],
"Verify if one level fails, the next leve is tried.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v false2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true3")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true4")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 3")
test.add_stonith_log_pattern("Total timeout set to 18")
test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: -201")
test.add_stonith_log_pattern("for host 'node3' with device 'false2' returned: -201")
test.add_stonith_log_pattern("for host 'node3' with device 'true3' returned: 0")
test.add_stonith_log_pattern("for host 'node3' with device 'true4' returned: 0")
# test what happens when the first fencing level had devices that no one has registered
for test_type in test_types:
if test_type["use_cpg"] == 0:
continue
test = self.new_test("%s_topology_missing_devices" % test_type["prefix"],
"Verify topology can continue with missing devices.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v false2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true3")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true4")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5")
# Test what happens if multiple fencing levels are defined, and then the first one is removed.
for test_type in test_types:
if test_type["use_cpg"] == 0:
continue
test = self.new_test("%s_topology_level_removal" % test_type["prefix"],
"Verify level removal works.", test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v false2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true3")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true4")
# Now remove level 2, verify none of the devices in level two are hit.
test.add_cmd("stonith_admin", "--output-as=xml -d node3 -i 2")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 20")
test.add_stonith_log_pattern("Total timeout set to 8")
test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: -201")
test.add_stonith_neg_log_pattern("for host 'node3' with device 'false2' returned: ")
test.add_stonith_log_pattern("for host 'node3' with device 'true3' returned: 0")
test.add_stonith_log_pattern("for host 'node3' with device 'true4' returned: 0")
# Test targeting a topology level by node name pattern.
for test_type in test_types:
if test_type["use_cpg"] == 0:
continue
test = self.new_test("%s_topology_level_pattern" % test_type["prefix"],
"Verify targeting topology by node name pattern works.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"""--output-as=xml -R true -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node1 node2 node3" """)
test.add_cmd("stonith_admin", """--output-as=xml -r '@node.*' -i 1 -v true""")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5")
test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0")
# test allowing commas and semicolons as delimiters in pcmk_host_list
for test_type in test_types:
test = self.new_test("%s_host_list_delimiters" % test_type["prefix"],
"Verify commas and semicolons can be used as pcmk_host_list delimiters",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"""--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node1,node2,node3" """)
test.add_cmd("stonith_admin",
"""--output-as=xml -R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=pcmk1;pcmk2;pcmk3" """)
test.add_cmd("stonith_admin", "stonith_admin --output-as=xml -F node2 -t 5")
test.add_cmd("stonith_admin", "stonith_admin --output-as=xml -F pcmk3 -t 5")
test.add_stonith_log_pattern("for host 'node2' with device 'true1' returned: 0")
test.add_stonith_log_pattern("for host 'pcmk3' with device 'true2' returned: 0")
# test that the fencer builds the correct list of devices that can fence a node.
for test_type in test_types:
test = self.new_test("%s_list_devices" % test_type["prefix"],
"Verify list of devices that can fence a node is correct",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -l node1 -V", "true2", "true1")
test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -l node1 -V", "true3", "true1")
# simple test of device monitor
for test_type in test_types:
test = self.new_test("%s_monitor" % test_type["prefix"],
"Verify device is reachable", test_type["use_cpg"])
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -Q true1")
test.add_cmd("stonith_admin", "--output-as=xml -Q false1")
test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -Q true2", CrmExit.ERROR)
# Verify monitor occurs for duration of timeout period on failure
for test_type in test_types:
test = self.new_test("%s_monitor_timeout" % test_type["prefix"],
"Verify monitor uses duration of timeout period given.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
'--output-as=xml -R true1 -a fence_dummy -o "mode=fail" -o "monitor_mode=fail" -o "pcmk_host_list=node3"')
test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -Q true1 -t 5", CrmExit.ERROR)
test.add_stonith_log_pattern("Attempt 2 to execute")
# Verify monitor occurs for duration of timeout period on failure, but stops at max retries
for test_type in test_types:
test = self.new_test("%s_monitor_timeout_max_retries" % test_type["prefix"],
"Verify monitor retries until max retry value or timeout is hit.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
'--output-as=xml -R true1 -a fence_dummy -o "mode=fail" -o "monitor_mode=fail" -o "pcmk_host_list=node3"')
test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -Q true1 -t 15", CrmExit.ERROR)
test.add_stonith_log_pattern("Attempted to execute agent fence_dummy (list) the maximum number of times")
# simple register test
for test_type in test_types:
test = self.new_test("%s_register" % test_type["prefix"],
"Verify devices can be registered and un-registered",
test_type["use_cpg"])
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -Q true1")
test.add_cmd("stonith_admin", "--output-as=xml -D true1")
test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -Q true1", CrmExit.ERROR)
# simple reboot test
for test_type in test_types:
test = self.new_test("%s_reboot" % test_type["prefix"],
"Verify devices can be rebooted",
test_type["use_cpg"])
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -B node3 -t 5")
test.add_cmd("stonith_admin", "--output-as=xml -D true1")
test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -Q true1", CrmExit.ERROR)
# test fencing history.
for test_type in test_types:
if test_type["use_cpg"] == 0:
continue
test = self.new_test("%s_fence_history" % test_type["prefix"],
"Verify last fencing operation is returned.",
test_type["use_cpg"])
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5 -V")
test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -H node3", 'status="success" .* action="off" target="node3"')
# simple test of dynamic list query
for test_type in test_types:
test = self.new_test("%s_dynamic_list_query" % test_type["prefix"],
"Verify dynamic list of fencing devices can be retrieved.",
test_type["use_cpg"])
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1")
test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1")
test.add_cmd("stonith_admin", "--output-as=xml -R true3 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1")
test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -l fake_port_1", 'count="3"')
# fence using dynamic list query
for test_type in test_types:
test = self.new_test("%s_fence_dynamic_list_query" % test_type["prefix"],
"Verify dynamic list of fencing devices can be retrieved.",
test_type["use_cpg"])
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1")
test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1")
test.add_cmd("stonith_admin", "--output-as=xml -R true3 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1")
test.add_cmd("stonith_admin", "--output-as=xml -F fake_port_1 -t 5 -V")
# simple test of query using status action
for test_type in test_types:
test = self.new_test("%s_status_query" % test_type["prefix"],
"Verify dynamic list of fencing devices can be retrieved.",
test_type["use_cpg"])
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"")
test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"")
test.add_cmd("stonith_admin", "--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"")
test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -l fake_port_1", 'count="3"')
# test what happens when no reboot action is advertised
for test_type in test_types:
test = self.new_test("%s_no_reboot_support" % test_type["prefix"],
"Verify reboot action defaults to off when no reboot action is advertised by agent.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy_no_reboot -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -B node1 -t 5 -V")
test.add_stonith_log_pattern("does not advertise support for 'reboot', performing 'off'")
test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)")
# make sure reboot is used when reboot action is advertised
for test_type in test_types:
test = self.new_test("%s_with_reboot_support" % test_type["prefix"],
"Verify reboot action can be used when metadata advertises it.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -B node1 -t 5 -V")
test.add_stonith_neg_log_pattern("does not advertise support for 'reboot', performing 'off'")
test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)")
def build_nodeid_tests(self):
""" Register tests that use a corosync node id """
our_uname = localname()
### verify nodeid is supplied when nodeid is in the metadata parameters
test = self.new_test("cpg_supply_nodeid",
"Verify nodeid is given when fence agent has nodeid as parameter", 1)
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -F %s -t 3" % (our_uname))
test.add_stonith_log_pattern("For stonith action (off) for victim %s, adding nodeid" % (our_uname))
### verify nodeid is _NOT_ supplied when nodeid is not in the metadata parameters
test = self.new_test("cpg_do_not_supply_nodeid",
"Verify nodeid is _NOT_ given when fence agent does not have nodeid as parameter",
1)
# use a host name that won't be in corosync.conf
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=regr-test\"")
test.add_cmd("stonith_admin", "--output-as=xml -F regr-test -t 3")
test.add_stonith_neg_log_pattern("For stonith action (off) for victim regr-test, adding nodeid")
### verify nodeid use doesn't explode standalone mode
test = self.new_test("standalone_do_not_supply_nodeid",
"Verify nodeid in metadata parameter list doesn't kill standalone mode",
0)
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -F %s -t 3" % (our_uname))
test.add_stonith_neg_log_pattern("For stonith action (off) for victim %s, adding nodeid" % (our_uname))
def build_unfence_tests(self):
""" Register tests that verify unfencing """
our_uname = localname()
### verify unfencing using automatic unfencing
test = self.new_test("cpg_unfence_required_1",
"Verify require unfencing on all devices when automatic=true in agent's metadata",
1)
test.add_cmd('stonith_admin',
'--output-as=xml -R true1 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s"' % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname))
# both devices should be executed
test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)")
test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)")
### verify unfencing using automatic unfencing fails if any of the required agents fail
test = self.new_test("cpg_unfence_required_2",
"Verify require unfencing on all devices when automatic=true in agent's metadata",
1)
test.add_cmd('stonith_admin',
'--output-as=xml -R true1 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=fail" -o "pcmk_host_list=%s"' % (our_uname))
test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -U %s -t 6" % (our_uname), CrmExit.ERROR)
### verify unfencing using automatic devices with topology
test = self.new_test("cpg_unfence_required_3",
"Verify require unfencing on all devices even when at different topology levels",
1)
test.add_cmd('stonith_admin',
'--output-as=xml -R true1 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v true1" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true2" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname))
test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)")
test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)")
### verify unfencing using automatic devices with topology
test = self.new_test("cpg_unfence_required_4",
"Verify all required devices are executed even with topology levels fail.",
1)
test.add_cmd('stonith_admin',
'--output-as=xml -R true1 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R true3 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R true4 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R false2 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R false3 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R false4 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v true1" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v false1" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v false2" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true2" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v false3" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true3" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 3 -v false4" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 4 -v true4" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname))
test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)")
test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)")
test.add_stonith_log_pattern("with device 'true3' returned: 0 (OK)")
test.add_stonith_log_pattern("with device 'true4' returned: 0 (OK)")
def build_unfence_on_target_tests(self):
""" Register tests that verify unfencing that runs on the target """
our_uname = localname()
### verify unfencing using on_target device
test = self.new_test("cpg_unfence_on_target_1",
"Verify unfencing with on_target = true", 1)
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname))
test.add_stonith_log_pattern("(on) to be executed on the target node")
### verify failure of unfencing using on_target device
test = self.new_test("cpg_unfence_on_target_2",
"Verify failure unfencing with on_target = true",
1)
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake_1234\"" % (our_uname))
test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -U node_fake_1234 -t 3", CrmExit.ERROR)
test.add_stonith_log_pattern("(on) to be executed on the target node")
### verify unfencing using on_target device with topology
test = self.new_test("cpg_unfence_on_target_3",
"Verify unfencing with on_target = true using topology",
1)
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v true1" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true2" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname))
test.add_stonith_log_pattern("(on) to be executed on the target node")
### verify unfencing using on_target device with topology fails when victim node doesn't exist
test = self.new_test("cpg_unfence_on_target_4",
"Verify unfencing failure with on_target = true using topology",
1)
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake\"" % (our_uname))
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake\"" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v true1")
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 2 -v true2")
test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -U node_fake -t 3", CrmExit.ERROR)
test.add_stonith_log_pattern("(on) to be executed on the target node")
def build_remap_tests(self):
""" Register tests that verify remapping of reboots to off-on """
test = self.new_test("cpg_remap_simple",
"Verify sequential topology reboot is remapped to all-off-then-all-on", 1)
test.add_cmd("stonith_admin",
"""--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """
"""-o "pcmk_off_timeout=1" -o "pcmk_reboot_timeout=10" """)
test.add_cmd("stonith_admin",
"""--output-as=xml -R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """
"""-o "pcmk_off_timeout=2" -o "pcmk_reboot_timeout=20" """)
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v true1 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -B node_fake -t 5")
- test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake")
+ test.add_stonith_log_pattern("Remapping multiple-device reboot targeting node_fake")
# timeout should be sum of off timeouts (1+2=3), not reboot timeouts (10+20=30)
- test.add_stonith_log_pattern("Total timeout set to 3 for peer's fencing of node_fake")
- test.add_stonith_log_pattern("perform op 'node_fake off' with 'true1'")
- test.add_stonith_log_pattern("perform op 'node_fake off' with 'true2'")
- test.add_stonith_log_pattern("Remapped off of node_fake complete, remapping to on")
+ test.add_stonith_log_pattern("Total timeout set to 3 for peer's fencing targeting node_fake")
+ test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'true1'")
+ test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'true2'")
+ test.add_stonith_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'")
# fence_dummy sets "on" as an on_target action
- test.add_stonith_log_pattern("Ignoring true1 'on' failure (no capable peers) for node_fake")
- test.add_stonith_log_pattern("Ignoring true2 'on' failure (no capable peers) for node_fake")
- test.add_stonith_log_pattern("Undoing remap of reboot of node_fake")
+ test.add_stonith_log_pattern("Ignoring true1 'on' failure (no capable peers) targeting node_fake")
+ test.add_stonith_log_pattern("Ignoring true2 'on' failure (no capable peers) targeting node_fake")
+ test.add_stonith_log_pattern("Undoing remap of reboot targeting node_fake")
test = self.new_test("cpg_remap_automatic",
"Verify remapped topology reboot skips automatic 'on'", 1)
test.add_cmd("stonith_admin",
"""--output-as=xml -R true1 -a fence_dummy_auto_unfence """
"""-o "mode=pass" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin",
"""--output-as=xml -R true2 -a fence_dummy_auto_unfence """
"""-o "mode=pass" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v true1 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -B node_fake -t 5")
- test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake")
- test.add_stonith_log_pattern("perform op 'node_fake off' with 'true1'")
- test.add_stonith_log_pattern("perform op 'node_fake off' with 'true2'")
- test.add_stonith_log_pattern("Remapped off of node_fake complete, remapping to on")
- test.add_stonith_log_pattern("Undoing remap of reboot of node_fake")
- test.add_stonith_neg_log_pattern("perform op 'node_fake on' with")
+ test.add_stonith_log_pattern("Remapping multiple-device reboot targeting node_fake")
+ test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'true1'")
+ test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'true2'")
+ test.add_stonith_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'")
+ test.add_stonith_log_pattern("Undoing remap of reboot targeting node_fake")
+ test.add_stonith_neg_log_pattern("perform 'on' action targeting node_fake using")
test.add_stonith_neg_log_pattern("'on' failure")
test = self.new_test("cpg_remap_complex_1",
"Verify remapped topology reboot in second level works if non-remapped first level fails",
1)
test.add_cmd("stonith_admin", """--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", """--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", """--output-as=xml -R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 2 -v true1 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -B node_fake -t 5")
- test.add_stonith_log_pattern("perform op 'node_fake reboot' with 'false1'")
- test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake")
- test.add_stonith_log_pattern("perform op 'node_fake off' with 'true1'")
- test.add_stonith_log_pattern("perform op 'node_fake off' with 'true2'")
- test.add_stonith_log_pattern("Remapped off of node_fake complete, remapping to on")
- test.add_stonith_log_pattern("Ignoring true1 'on' failure (no capable peers) for node_fake")
- test.add_stonith_log_pattern("Ignoring true2 'on' failure (no capable peers) for node_fake")
- test.add_stonith_log_pattern("Undoing remap of reboot of node_fake")
+ test.add_stonith_log_pattern("perform 'reboot' action targeting node_fake using 'false1'")
+ test.add_stonith_log_pattern("Remapping multiple-device reboot targeting node_fake")
+ test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'true1'")
+ test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'true2'")
+ test.add_stonith_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'")
+ test.add_stonith_log_pattern("Ignoring true1 'on' failure (no capable peers) targeting node_fake")
+ test.add_stonith_log_pattern("Ignoring true2 'on' failure (no capable peers) targeting node_fake")
+ test.add_stonith_log_pattern("Undoing remap of reboot targeting node_fake")
test = self.new_test("cpg_remap_complex_2",
"Verify remapped topology reboot failure in second level proceeds to third level",
1)
test.add_cmd("stonith_admin", """--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", """--output-as=xml -R false2 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", """--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", """--output-as=xml -R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", """--output-as=xml -R true3 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 2 -v true1 -v false2 -v true3")
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 3 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -B node_fake -t 5")
- test.add_stonith_log_pattern("perform op 'node_fake reboot' with 'false1'")
- test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake")
- test.add_stonith_log_pattern("perform op 'node_fake off' with 'true1'")
- test.add_stonith_log_pattern("perform op 'node_fake off' with 'false2'")
+ test.add_stonith_log_pattern("perform 'reboot' action targeting node_fake using 'false1'")
+ test.add_stonith_log_pattern("Remapping multiple-device reboot targeting node_fake")
+ test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'true1'")
+ test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'false2'")
test.add_stonith_log_pattern("Attempted to execute agent fence_dummy (off) the maximum number of times")
- test.add_stonith_log_pattern("Undoing remap of reboot of node_fake")
- test.add_stonith_log_pattern("perform op 'node_fake reboot' with 'true2'")
+ test.add_stonith_log_pattern("Undoing remap of reboot targeting node_fake")
+ test.add_stonith_log_pattern("perform 'reboot' action targeting node_fake using 'true2'")
test.add_stonith_neg_log_pattern("node_fake with true3")
def setup_environment(self, use_corosync):
""" Prepare the host before executing any tests """
if use_corosync:
if self.autogen_corosync_cfg:
(handle, self.autogen_corosync_log) = tempfile.mkstemp(prefix="cts-fencing-",
suffix=".corosync.log")
os.close(handle)
corosync_cfg = io.open("/etc/corosync/corosync.conf", "w")
corosync_cfg.write(AUTOGEN_COROSYNC_TEMPLATE % (localname(), self.autogen_corosync_log))
corosync_cfg.close()
### make sure we are in control ###
killall("corosync")
self.start_corosync()
subprocess.call(["cts-support", "install"])
def cleanup_environment(self, use_corosync):
""" Clean up the host after executing desired tests """
if use_corosync:
killall("corosync")
if self.autogen_corosync_cfg:
if self.verbose:
print("Corosync output")
logfile = io.open(self.autogen_corosync_log, 'rt')
for line in logfile.readlines():
print(line.strip())
logfile.close()
os.remove(self.autogen_corosync_log)
os.remove("/etc/corosync/corosync.conf")
subprocess.call(["cts-support", "uninstall"])
class TestOptions(object):
""" Option handler """
def __init__(self):
self.options = {}
self.options['list-tests'] = 0
self.options['run-all'] = 1
self.options['run-only'] = ""
self.options['run-only-pattern'] = ""
self.options['verbose'] = 0
self.options['invalid-arg'] = ""
self.options['cpg-only'] = 0
self.options['no-cpg'] = 0
self.options['show-usage'] = 0
def build_options(self, argv):
""" Set options based on command-line arguments """
args = argv[1:]
skip = 0
for i in range(0, len(args)):
if skip:
skip = 0
continue
elif args[i] == "-h" or args[i] == "--help":
self.options['show-usage'] = 1
elif args[i] == "-l" or args[i] == "--list-tests":
self.options['list-tests'] = 1
elif args[i] == "-V" or args[i] == "--verbose":
self.options['verbose'] = 1
elif args[i] == "-n" or args[i] == "--no-cpg":
self.options['no-cpg'] = 1
elif args[i] == "-c" or args[i] == "--cpg-only":
self.options['cpg-only'] = 1
elif args[i] == "-r" or args[i] == "--run-only":
self.options['run-only'] = args[i+1]
skip = 1
elif args[i] == "-p" or args[i] == "--run-only-pattern":
self.options['run-only-pattern'] = args[i+1]
skip = 1
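# Sketch of how these flags combine (the invocation name is hypothetical):
#   ./cts-fencing -p topology -V
# sets run-only-pattern="topology" and verbose=1, so main() runs only
# tests whose names contain "topology", with verbose output.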
def show_usage(self):
""" Show command usage """
print("usage: " + sys.argv[0] + " [options]")
print("If no options are provided, all tests will run")
print("Options:")
print("\t [--help | -h] Show usage")
print("\t [--list-tests | -l] Print out all registered tests.")
print("\t [--cpg-only | -c] Only run tests that require corosync.")
print("\t [--no-cpg | -n] Only run tests that do not require corosync")
print("\t [--run-only | -r 'testname'] Run a specific test")
print("\t [--verbose | -V] Verbose output")
print("\t [--run-only-pattern | -p 'string'] Run only tests containing the string value")
print("\n\tExample: Run only the test 'start_stop'")
print("\t\t " + sys.argv[0] + " --run-only start_stop")
print("\n\tExample: Run only the tests with the string 'systemd' present in them")
print("\t\t " + sys.argv[0] + " --run-only-pattern systemd")
def main(argv):
""" Run fencing regression tests as specified by arguments """
update_path()
opts = TestOptions()
opts.build_options(argv)
use_corosync = 1
tests = Tests(opts.options['verbose'])
tests.build_standalone_tests()
tests.build_custom_timeout_tests()
tests.build_api_sanity_tests()
tests.build_fence_merge_tests()
tests.build_fence_no_merge_tests()
tests.build_unfence_tests()
tests.build_unfence_on_target_tests()
tests.build_nodeid_tests()
tests.build_remap_tests()
if opts.options['list-tests']:
tests.print_list()
sys.exit(CrmExit.OK)
elif opts.options['show-usage']:
opts.show_usage()
sys.exit(CrmExit.OK)
print("Starting ...")
if opts.options['no-cpg']:
use_corosync = 0
tests.setup_environment(use_corosync)
if opts.options['run-only-pattern'] != "":
tests.run_tests_matching(opts.options['run-only-pattern'])
tests.print_results()
elif opts.options['run-only'] != "":
tests.run_single(opts.options['run-only'])
tests.print_results()
elif opts.options['no-cpg']:
tests.run_no_cpg()
tests.print_results()
elif opts.options['cpg-only']:
tests.run_cpg_only()
tests.print_results()
else:
tests.run_tests()
tests.print_results()
tests.cleanup_environment(use_corosync)
tests.exit()
if __name__ == "__main__":
main(sys.argv)
diff --git a/cts/patterns.py b/cts/patterns.py
index 85eddb24fd..b0b8784a02 100644
--- a/cts/patterns.py
+++ b/cts/patterns.py
@@ -1,411 +1,411 @@
""" Pattern-holding classes for Pacemaker's Cluster Test Suite (CTS)
"""
# Pacemaker targets compatibility with Python 2.7 and 3.2+
from __future__ import print_function, unicode_literals, absolute_import, division
__copyright__ = "Copyright 2008-2019 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
import sys, os
from cts.CTSvars import *
patternvariants = {}
class BasePatterns(object):
def __init__(self, name):
self.name = name
patternvariants[name] = self
self.ignore = [
"avoid confusing Valgrind",
# Logging bug in some versions of libvirtd
r"libvirtd.*: internal error: Failed to parse PCI config address",
]
self.BadNews = []
self.components = {}
self.commands = {
"StatusCmd" : "crmadmin -t 60000 -S %s 2>/dev/null",
"CibQuery" : "cibadmin -Ql",
"CibAddXml" : "cibadmin --modify -c --xml-text %s",
"CibDelXpath" : "cibadmin --delete --xpath %s",
# 300,000 == 5 minutes
"RscRunning" : CTSvars.CRM_DAEMON_DIR + "/cts-exec-helper -R -r %s",
"CIBfile" : "%s:"+CTSvars.CRM_CONFIG_DIR+"/cib.xml",
"TmpDir" : "/tmp",
"BreakCommCmd" : "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1",
"FixCommCmd" : "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1",
# tc qdisc add dev lo root handle 1: cbq avpkt 1000 bandwidth 1000mbit
# tc class add dev lo parent 1: classid 1:1 cbq rate "$RATE"kbps allot 17000 prio 5 bounded isolated
# tc filter add dev lo parent 1: protocol ip prio 16 u32 match ip dst 127.0.0.1 match ip sport $PORT 0xFFFF flowid 1:1
# tc qdisc add dev lo parent 1: netem delay "$LATENCY"msec "$(($LATENCY/4))"msec 10% 2> /dev/null > /dev/null
"ReduceCommCmd" : "",
"RestoreCommCmd" : "tc qdisc del dev lo root",
"MaintenanceModeOn" : "cibadmin --modify -c --xml-text ''",
"MaintenanceModeOff" : "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"",
"StandbyCmd" : "crm_attribute -Vq -U %s -n standby -l forever -v %s 2>/dev/null",
"StandbyQueryCmd" : "crm_attribute -qG -U %s -n standby -l forever -d off 2>/dev/null",
}
self.search = {
"Pat:DC_IDLE" : "pacemaker-controld.*State transition.*-> S_IDLE",
# This won't work if we have multiple partitions
"Pat:Local_started" : "%s\W.*controller successfully started",
"Pat:NonDC_started" : r"%s\W.*State transition.*-> S_NOT_DC",
"Pat:DC_started" : r"%s\W.*State transition.*-> S_IDLE",
"Pat:We_stopped" : "%s\W.*OVERRIDE THIS PATTERN",
"Pat:They_stopped" : "%s\W.*LOST:.* %s ",
"Pat:They_dead" : "node %s.*: is dead",
"Pat:TransitionComplete" : "Transition status: Complete: complete",
- "Pat:Fencing_start" : r"(Initiating remote operation|Requesting peer fencing ).* (for|of) %s",
- "Pat:Fencing_ok" : r"pacemaker-fenced.*:\s*Operation .* of %s by .* for .*@.*: OK",
+ "Pat:Fencing_start" : r"Requesting peer fencing .* targeting %s",
+ "Pat:Fencing_ok" : r"pacemaker-fenced.*:\s*Operation .* targeting %s on .* for .*@.*: OK",
"Pat:Fencing_recover" : r"pacemaker-schedulerd.*: Recover %s",
"Pat:Fencing_active" : r"pacemaker-schedulerd.*: Resource %s is active on .* nodes",
"Pat:Fencing_probe" : r"pacemaker-controld.* Result of probe operation for %s on .*: Error",
"Pat:RscOpOK" : r"pacemaker-controld.*:\s+Result of %s operation for %s.*: (0 \()?ok",
"Pat:RscRemoteOpOK" : r"pacemaker-controld.*:\s+Result of %s operation for %s on %s: (0 \()?ok",
"Pat:NodeFenced" : r"pacemaker-controld.*:\s* Peer %s was terminated \(.*\) by .* on behalf of .*: OK",
"Pat:FenceOpOK" : "Operation .* for host '%s' with device .* returned: 0",
}
def get_component(self, key):
if key in self.components:
return self.components[key]
print("Unknown component '%s' for %s" % (key, self.name))
return []
def get_patterns(self, key):
if key == "BadNews":
return self.BadNews
elif key == "BadNewsIgnore":
return self.ignore
elif key == "Commands":
return self.commands
elif key == "Search":
return self.search
elif key == "Components":
return self.components
def __getitem__(self, key):
if key == "Name":
return self.name
elif key in self.commands:
return self.commands[key]
elif key in self.search:
return self.search[key]
else:
print("Unknown template '%s' for %s" % (key, self.name))
return None
class crm_corosync(BasePatterns):
'''
Patterns for Corosync version 2 cluster manager class
'''
def __init__(self, name):
BasePatterns.__init__(self, name)
self.commands.update({
"StartCmd" : "service corosync start && service pacemaker start",
"StopCmd" : "service pacemaker stop; [ ! -e /usr/sbin/pacemaker-remoted ] || service pacemaker_remote stop; service corosync stop",
"EpochCmd" : "crm_node -e",
"QuorumCmd" : "crm_node -q",
"PartitionCmd" : "crm_node -p",
})
self.search.update({
# Close enough ... "Corosync Cluster Engine exiting normally" isn't
# printed reliably.
"Pat:We_stopped" : "%s\W.*Unloading all Corosync service engines",
"Pat:They_stopped" : "%s\W.*pacemaker-controld.*Node %s(\[|\s).*state is now lost",
"Pat:They_dead" : "pacemaker-controld.*Node %s(\[|\s).*state is now lost",
"Pat:ChildExit" : r"\[[0-9]+\] exited with status [0-9]+ \(",
# "with signal 9" == pcmk_child_exit(), "$" == check_active_before_startup_processes()
"Pat:ChildKilled" : r"%s\W.*pacemakerd.*%s\[[0-9]+\] terminated( with signal 9|$)",
"Pat:ChildRespawn" : "%s\W.*pacemakerd.*Respawning failed child process: %s",
"Pat:InfraUp" : "%s\W.*corosync.*Initializing transport",
"Pat:PacemakerUp" : "%s\W.*pacemakerd.*Starting Pacemaker",
})
self.ignore = self.ignore + [
r"crm_mon:",
r"crmadmin:",
r"update_trace_data",
r"async_notify:.*strange, client not found",
r"Parse error: Ignoring unknown option .*nodename",
r"error.*: Operation 'reboot' .* with device 'FencingFail' returned:",
r"getinfo response error: 1$",
r"sbd.* error: inquisitor_child: DEBUG MODE IS ACTIVE",
r"sbd.* pcmk:\s*error:.*Connection to cib_ro.* (failed|closed)",
]
self.BadNews = [
r"error:",
r"crit:",
r"ERROR:",
r"CRIT:",
r"Shutting down...NOW",
r"Timer I_TERMINATE just popped",
r"input=I_ERROR",
r"input=I_FAIL",
r"input=I_INTEGRATED cause=C_TIMER_POPPED",
r"input=I_FINALIZED cause=C_TIMER_POPPED",
r"input=I_ERROR",
r"(pacemakerd|pacemaker-execd|pacemaker-controld):.*, exiting",
r"schedulerd.*Attempting recovery of resource",
r"is taking more than 2x its timeout",
r"Confirm not received from",
r"Welcome reply not received from",
r"Attempting to schedule .* after a stop",
r"Resource .* was active at shutdown",
r"duplicate entries for call_id",
r"Search terminated:",
r":global_timer_callback",
r"Faking parameter digest creation",
r"Parameters to .* action changed:",
r"Parameters to .* changed",
r"pacemakerd.*\[[0-9]+\] terminated( with signal| as IPC server|$)",
r"pacemaker-schedulerd.*Recover .*\(.* -\> .*\)",
r"rsyslogd.* imuxsock lost .* messages from pid .* due to rate-limiting",
r"Peer is not part of our cluster",
r"We appear to be in an election loop",
r"Unknown node -> we will not deliver message",
r"(Blackbox dump requested|Problem detected)",
r"pacemakerd.*Could not connect to Cluster Configuration Database API",
r"Receiving messages from a node we think is dead",
r"share the same cluster nodeid",
r"share the same name",
#r"crm_ipc_send:.*Request .* failed",
#r"crm_ipc_send:.*Sending to .* is disabled until pending reply is received",
# Not inherently bad, but worth tracking
#r"No need to invoke the TE",
#r"ping.*: DEBUG: Updated connected = 0",
#r"Digest mis-match:",
r"pacemaker-controld:.*Transition failed: terminated",
r"Local CIB .* differs from .*:",
r"warn.*:\s*Continuing but .* will NOT be used",
r"warn.*:\s*Cluster configuration file .* is corrupt",
#r"Executing .* fencing operation",
r"Election storm",
r"stalled the FSA with pending inputs",
]
self.components["common-ignore"] = [
r"Pending action:",
r"resource( was|s were) active at shutdown",
r"pending LRM operations at shutdown",
r"Lost connection to the CIB manager",
r"pacemaker-controld.*:\s*Action A_RECOVER .* not supported",
r"pacemaker-controld.*:\s*Performing A_EXIT_1 - forcefully exiting ",
r".*:\s*Executing .* fencing operation \(.*\) on ",
r".*:\s*Requesting fencing \([^)]+\) of node ",
r"(Blackbox dump requested|Problem detected)",
# "Resource .*stonith::.* is active on 2 nodes attempting recovery",
# "Transition .* ERRORs found during PE processing",
]
self.components["corosync-ignore"] = [
r"error:.*Connection to the CPG API failed: Library error",
r"\[[0-9]+\] exited with status [0-9]+ \(",
r"pacemaker-based.*error:.*Corosync connection lost",
r"pacemaker-fenced.*error:.*Corosync connection terminated",
r"pacemaker-controld.*State transition .* S_RECOVERY",
r"pacemaker-controld.*error:.*Input (I_ERROR|I_TERMINATE ) .*received in state",
r"pacemaker-controld.*error:.*Could not recover from internal error",
r"error:.*Connection to cib_(shm|rw).* (failed|closed)",
r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)",
r"crit: Fencing daemon connection failed",
# This is overbroad, but we don't have a way to say that only
# certain transition errors are acceptable (if the fencer respawns,
# fence devices may appear multiply active). We have to rely on
# other causes of a transition error logging their own error
# message, which is the usual practice.
r"pacemaker-schedulerd.* Calculated transition .*/pe-error",
]
self.components["corosync"] = [
# We expect each daemon to lose its cluster connection.
# However, if the CIB manager loses its connection first,
# it's possible for another daemon to lose that connection and
# exit before losing the cluster connection.
r"pacemakerd.*:\s*(crit|error):.*Lost connection to cluster layer",
r"pacemaker-attrd.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)",
r"pacemaker-based.*:\s*(crit|error):.*Lost connection to cluster layer",
r"pacemaker-controld.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)",
r"pacemaker-fenced.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)",
r"schedulerd.*Scheduling Node .* for STONITH",
r"pacemaker-controld.*:\s*Peer .* was terminated \(.*\) by .* on behalf of .*:\s*OK",
]
self.components["pacemaker-based"] = [
r"pacemakerd.* pacemaker-attrd\[[0-9]+\] exited with status 102",
r"pacemakerd.* pacemaker-controld\[[0-9]+\] exited with status 1",
r"pacemakerd.* Respawning failed child process: pacemaker-attrd",
r"pacemakerd.* Respawning failed child process: pacemaker-based",
r"pacemakerd.* Respawning failed child process: pacemaker-controld",
r"pacemakerd.* Respawning failed child process: pacemaker-fenced",
r"pacemaker-.* Connection to cib_.* (failed|closed)",
r"pacemaker-attrd.*:.*Lost connection to the CIB manager",
r"pacemaker-controld.*:.*Lost connection to the CIB manager",
r"pacemaker-controld.*I_ERROR.*crmd_cib_connection_destroy",
r"pacemaker-controld.* State transition .* S_RECOVERY",
r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover",
r"pacemaker-controld.*Could not recover from internal error",
]
self.components["pacemaker-based-ignore"] = [
r"pacemaker-execd.*Connection to (fencer|stonith-ng).* (closed|failed|lost)",
# This is overbroad, but we don't have a way to say that only
# certain transition errors are acceptable (if the fencer respawns,
# fence devices may appear multiply active). We have to rely on
# other causes of a transition error logging their own error
# message, which is the usual practice.
r"pacemaker-schedulerd.* Calculated transition .*/pe-error",
]
self.components["pacemaker-execd"] = [
r"pacemaker-controld.*Connection to (pacemaker-execd|lrmd|executor) (failed|closed)",
r"pacemaker-controld.*I_ERROR.*lrm_connection_destroy",
r"pacemaker-controld.*State transition .* S_RECOVERY",
r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover",
r"pacemaker-controld.*Could not recover from internal error",
r"pacemakerd.*pacemaker-controld\[[0-9]+\] exited with status 1",
r"pacemakerd.*Respawning failed child process: pacemaker-execd",
r"pacemakerd.*Respawning failed child process: pacemaker-controld",
]
self.components["pacemaker-execd-ignore"] = [
r"pacemaker-attrd.*Connection to lrmd (failed|closed)",
r"pacemaker-(attrd|controld).*Could not execute alert",
]
self.components["pacemaker-controld"] = [
# "WARN: determine_online_status: Node .* is unclean",
# "Scheduling Node .* for STONITH",
# "Executing .* fencing operation",
# Only if the node wasn't the DC: "State transition S_IDLE",
"State transition .* -> S_IDLE",
]
self.components["pacemaker-controld-ignore"] = []
self.components["pacemaker-attrd"] = []
self.components["pacemaker-attrd-ignore"] = []
self.components["pacemaker-schedulerd"] = [
"State transition .* S_RECOVERY",
r"Respawning failed child process: pacemaker-controld",
r"pacemaker-controld\[[0-9]+\] exited with status 1 \(",
"Connection to pengine failed",
"Connection to pengine.* closed",
r"Connection to the scheduler failed",
"pacemaker-controld.*I_ERROR.*save_cib_contents",
r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover",
"pacemaker-controld.*Could not recover from internal error",
]
self.components["pacemaker-schedulerd-ignore"] = []
self.components["pacemaker-fenced"] = [
r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)",
r"Fencing daemon connection failed",
r"pacemaker-controld.*Fencer successfully connected",
]
self.components["pacemaker-fenced-ignore"] = [
r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)",
r"crit:.*Fencing daemon connection failed",
r"error:.*Fencer connection failed \(will retry\)",
r"Connection to (fencer|stonith-ng) failed, finalizing .* pending operations",
r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error",
# This is overbroad, but we don't have a way to say that only
# certain transition errors are acceptable (if the fencer respawns,
# fence devices may appear multiply active). We have to rely on
# other causes of a transition error logging their own error
# message, which is the usual practice.
r"pacemaker-schedulerd.* Calculated transition .*/pe-error",
]
self.components["pacemaker-fenced-ignore"].extend(self.components["common-ignore"])
class crm_corosync_docker(crm_corosync):
'''
Patterns for Corosync version 2 cluster manager class
'''
def __init__(self, name):
crm_corosync.__init__(self, name)
self.commands.update({
"StartCmd" : "pcmk_start",
"StopCmd" : "pcmk_stop",
})
class PatternSelector(object):
def __init__(self, name=None):
self.name = name
self.base = BasePatterns("crm-base")
if not name:
crm_corosync("crm-corosync")
elif name == "crm-corosync":
crm_corosync(name)
elif name == "crm-corosync-docker":
crm_corosync_docker(name)
def get_variant(self, variant):
if variant in patternvariants:
return patternvariants[variant]
print("defaulting to crm-base for %s" % variant)
return self.base
def get_patterns(self, variant, kind):
return self.get_variant(variant).get_patterns(kind)
def get_template(self, variant, key):
v = self.get_variant(variant)
return v[key]
def get_component(self, variant, kind):
return self.get_variant(variant).get_component(kind)
def __getitem__(self, key):
return self.get_template(self.name, key)
# python cts/patterns.py -k crm-corosync -t StartCmd
if __name__ == '__main__':
pdir=os.path.dirname(sys.path[0])
sys.path.insert(0, pdir) # So that things work from the source directory
kind=None
template=None
skipthis=None
args=sys.argv[1:]
for i in range(0, len(args)):
if skipthis:
skipthis=None
continue
elif args[i] == "-k" or args[i] == "--kind":
skipthis=1
kind = args[i+1]
elif args[i] == "-t" or args[i] == "--template":
skipthis=1
template = args[i+1]
else:
print("Illegal argument " + args[i])
print(PatternSelector(kind)[template])
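# A usage sketch (assuming the module is importable as cts.patterns):
#
#   from cts.patterns import PatternSelector
#   selector = PatternSelector("crm-corosync")
#   print(selector["StartCmd"])   # -> "service corosync start && service pacemaker start"
#   print(selector.get_component("crm-corosync", "corosync-ignore"))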
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
index 9c27e6553d..76a8a0ed57 100644
--- a/daemons/fenced/fenced_remote.c
+++ b/daemons/fenced/fenced_remote.c
@@ -1,2064 +1,2086 @@
/*
* Copyright 2009-2019 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#define TIMEOUT_MULTIPLY_FACTOR 1.2
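/* Assumption: this factor pads computed fencing timeouts so that the overall
 * operation timer fires somewhat later than the per-peer timeouts it waits on.
 */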
/* When one fencer queries its peers for devices able to handle a fencing
* request, each peer will reply with a list of such devices available to it.
* Each reply will be parsed into a st_query_result_t, with each device's
* information kept in a device_properties_t.
*/
typedef struct device_properties_s {
/* Whether access to this device has been verified */
gboolean verified;
/* The remaining members are indexed by the operation's "phase" */
/* Whether this device has been executed in each phase */
gboolean executed[st_phase_max];
/* Whether this device is disallowed from executing in each phase */
gboolean disallowed[st_phase_max];
/* Action-specific timeout for each phase */
int custom_action_timeout[st_phase_max];
/* Action-specific maximum random delay for each phase */
int delay_max[st_phase_max];
/* Action-specific base delay for each phase */
int delay_base[st_phase_max];
} device_properties_t;
typedef struct st_query_result_s {
/* Name of peer that sent this result */
char *host;
/* Only try peers for non-topology based operations once */
gboolean tried;
/* Number of entries in the devices table */
int ndevices;
/* Devices available to this host that are capable of fencing the target */
GHashTable *devices;
} st_query_result_t;
GHashTable *stonith_remote_op_list = NULL;
void call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer);
static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup);
extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data,
int call_options);
static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
static int get_op_total_timeout(const remote_fencing_op_t *op,
const st_query_result_t *chosen_peer);
static gint
sort_strings(gconstpointer a, gconstpointer b)
{
return strcmp(a, b);
}
static void
free_remote_query(gpointer data)
{
if (data) {
st_query_result_t *query = data;
crm_trace("Free'ing query result from %s", query->host);
g_hash_table_destroy(query->devices);
free(query->host);
free(query);
}
}
void
free_stonith_remote_op_list()
{
if (stonith_remote_op_list != NULL) {
g_hash_table_destroy(stonith_remote_op_list);
stonith_remote_op_list = NULL;
}
}
struct peer_count_data {
const remote_fencing_op_t *op;
gboolean verified_only;
int count;
};
/*!
* \internal
* \brief Increment a counter if a device has not been executed yet
*
* \param[in] key Device ID (ignored)
* \param[in] value Device properties
* \param[in] user_data Peer count data
*/
static void
count_peer_device(gpointer key, gpointer value, gpointer user_data)
{
device_properties_t *props = (device_properties_t*)value;
struct peer_count_data *data = user_data;
if (!props->executed[data->op->phase]
&& (!data->verified_only || props->verified)) {
++(data->count);
}
}
/*!
* \internal
* \brief Check the number of available devices in a peer's query results
*
* \param[in] op Operation that results are for
* \param[in] peer Peer to count
* \param[in] verified_only Whether to count only verified devices
*
* \return Number of devices available to peer that were not already executed
*/
static int
count_peer_devices(const remote_fencing_op_t *op, const st_query_result_t *peer,
gboolean verified_only)
{
struct peer_count_data data;
data.op = op;
data.verified_only = verified_only;
data.count = 0;
if (peer) {
g_hash_table_foreach(peer->devices, count_peer_device, &data);
}
return data.count;
}
/*!
* \internal
* \brief Search for a device in a query result
*
* \param[in] op Operation that result is for
* \param[in] peer Query result for a peer
* \param[in] device Device ID to search for
*
* \return Device properties if found, NULL otherwise
*/
static device_properties_t *
find_peer_device(const remote_fencing_op_t *op, const st_query_result_t *peer,
const char *device)
{
device_properties_t *props = g_hash_table_lookup(peer->devices, device);
return (props && !props->executed[op->phase]
&& !props->disallowed[op->phase])? props : NULL;
}
/*!
* \internal
* \brief Find a device in a peer's device list and mark it as executed
*
* \param[in] op Operation that peer result is for
* \param[in,out] peer Peer with results to search
* \param[in] device ID of device to mark as done
* \param[in] verified_devices_only Only consider verified devices
*
* \return TRUE if device was found and marked, FALSE otherwise
*/
static gboolean
grab_peer_device(const remote_fencing_op_t *op, st_query_result_t *peer,
const char *device, gboolean verified_devices_only)
{
device_properties_t *props = find_peer_device(op, peer, device);
if ((props == NULL) || (verified_devices_only && !props->verified)) {
return FALSE;
}
crm_trace("Removing %s from %s (%d remaining)",
device, peer->host, count_peer_devices(op, peer, FALSE));
props->executed[op->phase] = TRUE;
return TRUE;
}
static void
clear_remote_op_timers(remote_fencing_op_t * op)
{
if (op->query_timer) {
g_source_remove(op->query_timer);
op->query_timer = 0;
}
if (op->op_timer_total) {
g_source_remove(op->op_timer_total);
op->op_timer_total = 0;
}
if (op->op_timer_one) {
g_source_remove(op->op_timer_one);
op->op_timer_one = 0;
}
}
static void
free_remote_op(gpointer data)
{
remote_fencing_op_t *op = data;
crm_trace("Free'ing op %s for %s", op->id, op->target);
crm_log_xml_debug(op->request, "Destroying");
clear_remote_op_timers(op);
free(op->id);
free(op->action);
free(op->delegate);
free(op->target);
free(op->client_id);
free(op->client_name);
free(op->originator);
if (op->query_results) {
g_list_free_full(op->query_results, free_remote_query);
}
if (op->request) {
free_xml(op->request);
op->request = NULL;
}
if (op->devices_list) {
g_list_free_full(op->devices_list, free);
op->devices_list = NULL;
}
g_list_free_full(op->automatic_list, free);
g_list_free(op->duplicates);
free(op);
}
void
init_stonith_remote_op_hash_table(GHashTable **table)
{
if (*table == NULL) {
*table = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, free_remote_op);
}
}
/*!
* \internal
* \brief Return an operation's originally requested action (before any remap)
*
* \param[in] op Operation to check
*
* \return Operation's original action
*/
static const char *
op_requested_action(const remote_fencing_op_t *op)
{
return ((op->phase > st_phase_requested)? "reboot" : op->action);
}
/*!
* \internal
* \brief Remap a "reboot" operation to the "off" phase
*
* \param[in,out] op Operation to remap
*/
static void
op_phase_off(remote_fencing_op_t *op)
{
- crm_info("Remapping multiple-device reboot of %s (%s) to off",
+ crm_info("Remapping multiple-device reboot targeting %s (%s) to 'off'",
op->target, op->id);
op->phase = st_phase_off;
/* Happily, "off" and "on" are shorter than "reboot", so we can reuse the
* memory allocation at each phase.
*/
strcpy(op->action, "off");
}
/*!
* \internal
* \brief Advance a remapped reboot operation to the "on" phase
*
* \param[in,out] op Operation to remap
*/
static void
op_phase_on(remote_fencing_op_t *op)
{
GListPtr iter = NULL;
- crm_info("Remapped off of %s complete, remapping to on for %s.%.8s",
+ crm_info("Remapped 'off' targeting %s complete, "
+ "remapping to 'on' for %s.%.8s",
op->target, op->client_name, op->id);
op->phase = st_phase_on;
strcpy(op->action, "on");
/* Skip devices with automatic unfencing, because the cluster will handle it
* when the node rejoins.
*/
for (iter = op->automatic_list; iter != NULL; iter = iter->next) {
GListPtr match = g_list_find_custom(op->devices_list, iter->data,
sort_strings);
if (match) {
op->devices_list = g_list_remove(op->devices_list, match->data);
}
}
g_list_free_full(op->automatic_list, free);
op->automatic_list = NULL;
/* Rewind device list pointer */
op->devices = op->devices_list;
}
/*!
* \internal
* \brief Reset a remapped reboot operation
*
* \param[in,out] op Operation to reset
*/
static void
undo_op_remap(remote_fencing_op_t *op)
{
if (op->phase > 0) {
- crm_info("Undoing remap of reboot of %s for %s.%.8s",
+ crm_info("Undoing remap of reboot targeting %s for %s.%.8s",
op->target, op->client_name, op->id);
op->phase = st_phase_requested;
strcpy(op->action, "reboot");
}
}
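/*!
 * \internal
 * \brief Create the XML notification payload for a completed operation
 *
 * \param[in] op  Fencing operation that completed
 * \param[in] rc  Return code of the operation
 *
 * \return Newly created XML notification data (caller is responsible for
 *         freeing it)
 */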
static xmlNode *
create_op_done_notify(remote_fencing_op_t * op, int rc)
{
xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE);
crm_xml_add_int(notify_data, "state", op->state);
crm_xml_add_int(notify_data, F_STONITH_RC, rc);
crm_xml_add(notify_data, F_STONITH_TARGET, op->target);
crm_xml_add(notify_data, F_STONITH_ACTION, op->action);
crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate);
crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, op->id);
crm_xml_add(notify_data, F_STONITH_ORIGIN, op->originator);
crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id);
crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name);
return notify_data;
}
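/*!
 * \brief Broadcast a fencing operation's result to all cluster peers
 *
 * \param[in] op  Fencing operation that completed
 * \param[in] rc  Return code of the operation
 */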
void
stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc)
{
static int count = 0;
xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY);
xmlNode *notify_data = create_op_done_notify(op, rc);
count++;
crm_trace("Broadcasting result to peers");
crm_xml_add(bcast, F_TYPE, T_STONITH_NOTIFY);
crm_xml_add(bcast, F_SUBTYPE, "broadcast");
crm_xml_add(bcast, F_STONITH_OPERATION, T_STONITH_NOTIFY);
crm_xml_add_int(bcast, "count", count);
add_message_xml(bcast, F_STONITH_CALLDATA, notify_data);
send_cluster_message(NULL, crm_msg_stonith_ng, bcast, FALSE);
free_xml(notify_data);
free_xml(bcast);
return;
}
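/*!
 * \internal
 * \brief Reply to the local requester and notify local clients of a result
 *
 * \param[in,out] op    Fencing operation that completed
 * \param[in,out] data  XML to use as the base of the reply
 * \param[in]     rc    Return code of the operation
 */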
static void
handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc)
{
xmlNode *notify_data = NULL;
xmlNode *reply = NULL;
if (op->notify_sent == TRUE) {
/* nothing to do */
return;
}
/* Do notification with a clean data object */
notify_data = create_op_done_notify(op, rc);
crm_xml_add_int(data, "state", op->state);
crm_xml_add(data, F_STONITH_TARGET, op->target);
crm_xml_add(data, F_STONITH_OPERATION, op->action);
reply = stonith_construct_reply(op->request, NULL, data, rc);
crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate);
/* Send fencing OP reply to local client that initiated fencing */
do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE);
    /* bcast to all local clients that the fencing operation happened */
do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data);
do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL);
    /* mark this op as having notifications already sent */
op->notify_sent = TRUE;
free_xml(reply);
free_xml(notify_data);
}
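/*!
 * \internal
 * \brief Finalize all operations that were merged into this one as duplicates
 *
 * \param[in,out] op    Fencing operation whose duplicates should be finalized
 * \param[in,out] data  XML to pass to remote_op_done() for each duplicate
 * \param[in]     rc    Return code to report for each duplicate
 */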
static void
handle_duplicates(remote_fencing_op_t * op, xmlNode * data, int rc)
{
GListPtr iter = NULL;
for (iter = op->duplicates; iter != NULL; iter = iter->next) {
remote_fencing_op_t *other = iter->data;
if (other->state == st_duplicate) {
other->state = op->state;
crm_debug("Performing duplicate notification for %s@%s.%.8s = %s",
other->client_name, other->originator, other->id,
pcmk_strerror(rc));
remote_op_done(other, data, rc, TRUE);
} else {
// Possible if (for example) it timed out already
crm_err("Skipping duplicate notification for %s@%s - %d", other->client_name,
other->originator, other->state);
}
}
}
/*!
 * \internal
 * \brief Finalize a remote fencing operation
 *
 * This function has two code paths.
 *
 * Path 1: This node owns the operation and must broadcast the result to the
 * CPG group once the operation completes.
 *
 * Path 2: The CPG broadcast has been received, and every node notifies its
 * local stonith clients of the result.
 *
 * In other words, the owner first notifies the cluster of the result, and
 * only once that CPG notification is received back does it notify its local
 * clients. Nodes that are passive watchers of the operation receive the
 * broadcast and only need to notify their local clients that the operation
 * finished.
 *
 * \param[in,out] op    Fencing operation to finalize
 * \param[in,out] data  XML reply (if any) of the last delegated fencing
 *                      operation
 * \param[in]     rc    Return code of the operation
 * \param[in]     dup   Whether this operation is a duplicate (in which case
 *                      the broadcast must not be sent again)
 */
static void
remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup)
{
int level = LOG_ERR;
const char *subt = NULL;
xmlNode *local_data = NULL;
op->completed = time(NULL);
clear_remote_op_timers(op);
undo_op_remap(op);
if (op->notify_sent == TRUE) {
- crm_err("Already sent notifications for '%s of %s by %s' (for=%s@%s.%.8s, state=%d): %s",
- op->action, op->target, op->delegate ? op->delegate : "",
- op->client_name, op->originator, op->id, op->state, pcmk_strerror(rc));
+ crm_err("Already sent notifications for '%s' targeting %s on %s for "
+ "client %s@%s.%.8s: %s " CRM_XS " rc=%d state=%d",
+ op->action, op->target,
+ (op->delegate? op->delegate : "unknown node"),
+ op->client_name, op->originator, op->id, pcmk_strerror(rc),
+ rc, op->state);
goto remote_op_done_cleanup;
}
if (!op->delegate && data && rc != -ENODEV && rc != -EHOSTUNREACH) {
xmlNode *ndata = get_xpath_object("//@" F_STONITH_DELEGATE, data, LOG_TRACE);
if(ndata) {
op->delegate = crm_element_value_copy(ndata, F_STONITH_DELEGATE);
} else {
op->delegate = crm_element_value_copy(data, F_ORIG);
}
}
if (data == NULL) {
data = create_xml_node(NULL, "remote-op");
local_data = data;
}
    /* Tell everyone the operation is done; we will continue with the local
     * notifications once we receive the broadcast back. */
subt = crm_element_value(data, F_SUBTYPE);
if (dup == FALSE && safe_str_neq(subt, "broadcast")) {
/* Defer notification until the bcast message arrives */
stonith_bcast_result_to_peers(op, rc);
goto remote_op_done_cleanup;
}
if (rc == pcmk_ok || dup) {
level = LOG_NOTICE;
} else if (safe_str_neq(op->originator, stonith_our_uname)) {
level = LOG_NOTICE;
}
- do_crm_log(level,
- "Operation %s of %s by %s for %s@%s.%.8s: %s",
- op->action, op->target, op->delegate ? op->delegate : "",
+ do_crm_log(level, "Operation '%s'%s%s on %s for %s@%s.%.8s: %s",
+ op->action, (op->target? " targeting " : ""),
+ (op->target? op->target : ""),
+ (op->delegate? op->delegate : ""),
op->client_name, op->originator, op->id, pcmk_strerror(rc));
handle_local_reply_and_notify(op, data, rc);
if (dup == FALSE) {
handle_duplicates(op, data, rc);
}
/* Free non-essential parts of the record
* Keep the record around so we can query the history
*/
if (op->query_results) {
g_list_free_full(op->query_results, free_remote_query);
op->query_results = NULL;
}
if (op->request) {
free_xml(op->request);
op->request = NULL;
}
remote_op_done_cleanup:
free_xml(local_data);
}
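/*!
 * \internal
 * \brief Finalize an operation whose watchdog self-fencing timer expired
 *        (timer callback)
 *
 * When watchdog self-fencing is in use, assume the target has fenced itself
 * once the watchdog timeout has elapsed, and finalize the operation as
 * successful.
 *
 * \param[in] userdata  Fencing operation in question
 *
 * \return FALSE (meaning the timer should not fire again)
 */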
static gboolean
remote_op_watchdog_done(gpointer userdata)
{
remote_fencing_op_t *op = userdata;
op->op_timer_one = 0;
crm_notice("Self-fencing (%s) by %s for %s.%8s assumed complete",
op->action, op->target, op->client_name, op->id);
op->state = st_done;
remote_op_done(op, NULL, pcmk_ok, FALSE);
return FALSE;
}
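/*!
 * \internal
 * \brief Handle the expiration of a single peer's fencing attempt (timer
 *        callback)
 *
 * Escalate by asking another capable peer (if any) to attempt the fencing.
 *
 * \param[in] userdata  Fencing operation that timed out
 *
 * \return FALSE (meaning the timer should not fire again)
 */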
static gboolean
remote_op_timeout_one(gpointer userdata)
{
remote_fencing_op_t *op = userdata;
op->op_timer_one = 0;
- crm_notice("Peer's fencing (%s) of %s for %s timed out" CRM_XS "id=%s",
- op->action, op->target, op->client_name, op->id);
+ crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS
+ " id=%s", op->action, op->target, op->client_name, op->id);
call_remote_stonith(op, NULL);
return FALSE;
}
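/*!
 * \internal
 * \brief Handle expiration of an operation's total timeout (timer callback)
 *
 * Unless the operation is in the "on" phase of a remapped reboot (where the
 * "off" already succeeded), mark the operation as failed and finalize it.
 *
 * \param[in] userdata  Fencing operation that timed out
 *
 * \return FALSE (meaning the timer should not fire again)
 */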
static gboolean
remote_op_timeout(gpointer userdata)
{
remote_fencing_op_t *op = userdata;
op->op_timer_total = 0;
if (op->state == st_done) {
- crm_debug("Action %s (%s) for %s (%s) already completed",
- op->action, op->id, op->target, op->client_name);
+ crm_debug("Action '%s' targeting %s for client %s already completed "
+ CRM_XS " id=%s",
+ op->action, op->target, op->client_name, op->id);
return FALSE;
}
- crm_debug("Action %s (%s) for %s (%s) timed out",
- op->action, op->id, op->target, op->client_name);
+ crm_debug("Action '%s' targeting %s for client %s timed out "
+ CRM_XS " id=%s",
+ op->action, op->target, op->client_name, op->id);
if (op->phase == st_phase_on) {
/* A remapped reboot operation timed out in the "on" phase, but the
* "off" phase completed successfully, so quit trying any further
* devices, and return success.
*/
remote_op_done(op, NULL, pcmk_ok, FALSE);
return FALSE;
}
op->state = st_failed;
remote_op_done(op, NULL, -ETIME, FALSE);
return FALSE;
}
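/*!
 * \internal
 * \brief Handle expiration of an operation's query phase (timer callback)
 *
 * If the operation has not already completed or started executing, proceed
 * with fencing using whatever query results arrived, or treat the operation
 * as timed out if none did.
 *
 * \param[in] data  Fencing operation whose query phase expired
 *
 * \return FALSE (meaning the timer should not fire again)
 */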
static gboolean
remote_op_query_timeout(gpointer data)
{
remote_fencing_op_t *op = data;
op->query_timer = 0;
if (op->state == st_done) {
- crm_debug("Operation %s for %s already completed", op->id, op->target);
+ crm_debug("Operation %s targeting %s already completed",
+ op->id, op->target);
} else if (op->state == st_exec) {
- crm_debug("Operation %s for %s already in progress", op->id, op->target);
+ crm_debug("Operation %s targeting %s already in progress",
+ op->id, op->target);
} else if (op->query_results) {
- crm_debug("Query %s for %s complete: %d", op->id, op->target, op->state);
+ crm_debug("Query %s targeting %s complete (state=%d)",
+ op->id, op->target, op->state);
call_remote_stonith(op, NULL);
} else {
- crm_debug("Query %s for %s timed out: %d", op->id, op->target, op->state);
+ crm_debug("Query %s targeting %s timed out (state=%d)",
+ op->id, op->target, op->state);
if (op->op_timer_total) {
g_source_remove(op->op_timer_total);
op->op_timer_total = 0;
}
remote_op_timeout(op);
}
return FALSE;
}
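/*!
 * \internal
 * \brief Check whether a topology table entry has no devices at any level
 *
 * \param[in] tp  Topology table entry to check
 *
 * \return TRUE if tp is NULL or has no devices at any level, otherwise FALSE
 */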
static gboolean
topology_is_empty(stonith_topology_t *tp)
{
int i;
if (tp == NULL) {
return TRUE;
}
for (i = 0; i < ST_LEVEL_MAX; i++) {
if (tp->levels[i] != NULL) {
return FALSE;
}
}
return TRUE;
}
/*!
* \internal
* \brief Add a device to an operation's automatic unfencing list
*
* \param[in,out] op Operation to modify
* \param[in] device Device ID to add
*/
static void
add_required_device(remote_fencing_op_t *op, const char *device)
{
GListPtr match = g_list_find_custom(op->automatic_list, device,
sort_strings);
if (!match) {
op->automatic_list = g_list_prepend(op->automatic_list, strdup(device));
}
}
/*!
* \internal
* \brief Remove a device from the automatic unfencing list
*
* \param[in,out] op Operation to modify
* \param[in] device Device ID to remove
*/
static void
remove_required_device(remote_fencing_op_t *op, const char *device)
{
GListPtr match = g_list_find_custom(op->automatic_list, device,
sort_strings);
if (match) {
op->automatic_list = g_list_remove(op->automatic_list, match->data);
}
}
/*!
 * \internal
 * \brief Set an operation's device list to a deep copy of the given list
 *
 * \param[in,out] op       Operation to modify
 * \param[in]     devices  List of device IDs to copy
 */
static void
set_op_device_list(remote_fencing_op_t * op, GListPtr devices)
{
GListPtr lpc = NULL;
if (op->devices_list) {
g_list_free_full(op->devices_list, free);
op->devices_list = NULL;
}
for (lpc = devices; lpc != NULL; lpc = lpc->next) {
op->devices_list = g_list_append(op->devices_list, strdup(lpc->data));
}
op->devices = op->devices_list;
}
/*!
* \internal
* \brief Check whether a node matches a topology target
*
* \param[in] tp Topology table entry to check
* \param[in] node Name of node to check
*
* \return TRUE if node matches topology target
*/
static gboolean
topology_matches(const stonith_topology_t *tp, const char *node)
{
regex_t r_patt;
CRM_CHECK(node && tp && tp->target, return FALSE);
switch(tp->kind) {
case 2:
/* This level targets by attribute, so tp->target is a NAME=VALUE pair
* of a permanent attribute applied to targeted nodes. The test below
         * relies on the locally cached copy of the CIB, so if fencing is
         * needed before the initial CIB has been received or after a
         * malformed CIB has been received, the topology cannot be used.
*/
if (node_has_attr(node, tp->target_attribute, tp->target_value)) {
crm_notice("Matched %s with %s by attribute", node, tp->target);
return TRUE;
}
break;
case 1:
/* This level targets by name, so tp->target is a regular expression
* matching names of nodes to be targeted.
*/
if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) {
crm_info("Bad regex '%s' for fencing level", tp->target);
} else {
int status = regexec(&r_patt, node, 0, NULL, 0);
regfree(&r_patt);
if (status == 0) {
crm_notice("Matched %s with %s by name", node, tp->target);
return TRUE;
}
}
break;
case 0:
crm_trace("Testing %s against %s", node, tp->target);
return safe_str_eq(tp->target, node);
}
crm_trace("No match for %s with %s", node, tp->target);
return FALSE;
}
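/*!
 * \brief Find the topology table entry matching a given node
 *
 * Check first for an entry keyed directly by the node name, then fall back
 * to scanning all entries for a name, pattern, or attribute match.
 *
 * \param[in] host  Name of node to find a topology entry for
 *
 * \return Matching topology table entry, or NULL if none
 */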
stonith_topology_t *
find_topology_for_host(const char *host)
{
GHashTableIter tIter;
stonith_topology_t *tp = g_hash_table_lookup(topology, host);
if(tp != NULL) {
crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
return tp;
}
g_hash_table_iter_init(&tIter, topology);
while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) {
if (topology_matches(tp, host)) {
crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
return tp;
}
}
crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology));
return NULL;
}
/*!
* \internal
* \brief Set fencing operation's device list to target's next topology level
*
* \param[in,out] op Remote fencing operation to modify
*
 * \return pcmk_ok if successful, if the target was not specified (as with
 *         queries), or if the target has no topology; -EINVAL if there are
 *         no more topology levels to try
*/
static int
stonith_topology_next(remote_fencing_op_t * op)
{
stonith_topology_t *tp = NULL;
if (op->target) {
/* Queries don't have a target set */
tp = find_topology_for_host(op->target);
}
if (topology_is_empty(tp)) {
return pcmk_ok;
}
set_bit(op->call_options, st_opt_topology);
/* This is a new level, so undo any remapping left over from previous */
undo_op_remap(op);
do {
op->level++;
} while (op->level < ST_LEVEL_MAX && tp->levels[op->level] == NULL);
if (op->level < ST_LEVEL_MAX) {
- crm_trace("Attempting fencing level %d for %s (%d devices) - %s@%s.%.8s",
+ crm_trace("Attempting fencing level %d targeting %s (%d devices) "
+ "for client %s@%s.%.8s",
op->level, op->target, g_list_length(tp->levels[op->level]),
op->client_name, op->originator, op->id);
set_op_device_list(op, tp->levels[op->level]);
if (g_list_next(op->devices_list) && safe_str_eq(op->action, "reboot")) {
/* A reboot has been requested for a topology level with multiple
* devices. Instead of rebooting the devices sequentially, we will
* turn them all off, then turn them all on again. (Think about
* switched power outlets for redundant power supplies.)
*/
op_phase_off(op);
}
return pcmk_ok;
}
- crm_notice("All fencing options to fence %s for %s@%s.%.8s failed",
+ crm_notice("All fencing options targeting %s for client %s@%s.%.8s failed",
op->target, op->client_name, op->originator, op->id);
return -EINVAL;
}
/*!
 * \internal
 * \brief Check whether an operation duplicates another in-flight operation
 *
 * If this operation duplicates another operation that is still in progress,
 * merge it into that operation (adding it to the other operation's duplicate
 * list) and mark it as a duplicate.
 *
 * \param[in,out] op  Operation to check
 */
static void
merge_duplicates(remote_fencing_op_t * op)
{
GHashTableIter iter;
remote_fencing_op_t *other = NULL;
time_t now = time(NULL);
g_hash_table_iter_init(&iter, stonith_remote_op_list);
while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
crm_node_t *peer = NULL;
const char *other_action = op_requested_action(other);
if (other->state > st_exec) {
/* Must be in-progress */
continue;
} else if (safe_str_neq(op->target, other->target)) {
/* Must be for the same node */
continue;
} else if (safe_str_neq(op->action, other_action)) {
crm_trace("Must be for the same action: %s vs. %s",
op->action, other_action);
continue;
} else if (safe_str_eq(op->client_name, other->client_name)) {
crm_trace("Must be for different clients: %s", op->client_name);
continue;
} else if (safe_str_eq(other->target, other->originator)) {
crm_trace("Can't be a suicide operation: %s", other->target);
continue;
}
peer = crm_get_peer(0, other->originator);
if(fencing_peer_active(peer) == FALSE) {
- crm_notice("Failing stonith action %s for node %s originating from %s@%s.%.8s: Originator is dead",
+ crm_notice("Failing action '%s' targeting %s originating from "
+ "client %s@%s.%.8s: Originator is dead",
other->action, other->target, other->client_name, other->originator, other->id);
other->state = st_failed;
continue;
} else if(other->total_timeout > 0 && now > (other->total_timeout + other->created)) {
- crm_info("Stonith action %s for node %s originating from %s@%s.%.8s is too old: %ld vs. %ld + %d",
+ crm_info("Action '%s' targeting %s originating from client "
+ "%s@%s.%.8s is too old: %ld vs. %ld + %d",
other->action, other->target, other->client_name, other->originator, other->id,
now, other->created, other->total_timeout);
continue;
}
/* There is another in-flight request to fence the same host
* Piggyback on that instead. If it fails, so do we.
*/
other->duplicates = g_list_append(other->duplicates, op);
if (other->total_timeout == 0) {
crm_trace("Making a best-guess as to the timeout used");
other->total_timeout = op->total_timeout =
TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
}
- crm_notice
- ("Merging stonith action %s for node %s originating from client %s.%.8s with identical request from %s@%s.%.8s (%ds)",
- op->action, op->target, op->client_name, op->id, other->client_name, other->originator,
- other->id, other->total_timeout);
+ crm_notice("Merging stonith action '%s' targeting %s originating from "
+ "client %s.%.8s with identical request from %s@%s.%.8s (%ds)",
+ op->action, op->target, op->client_name, op->id,
+ other->client_name, other->originator, other->id,
+ other->total_timeout);
report_timeout_period(op, other->total_timeout);
op->state = st_duplicate;
}
}
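/*!
 * \internal
 * \brief Count how many cluster peers are currently able to fence
 *
 * \return Number of peers in the membership cache for which
 *         fencing_peer_active() returns true
 */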
static uint32_t fencing_active_peers(void)
{
uint32_t count = 0;
crm_node_t *entry;
GHashTableIter gIter;
g_hash_table_iter_init(&gIter, crm_peer_cache);
while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
if(fencing_peer_active(entry)) {
count++;
}
}
return count;
}
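/*!
 * \brief Finalize a fencing operation due to manual confirmation
 *
 * \param[in]     msg  Request XML containing the fencing target
 * \param[in,out] op   Fencing operation to confirm and finalize
 *
 * \return -EINPROGRESS (the reply is sent asynchronously via callbacks)
 */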
int
stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op)
{
xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR);
op->state = st_done;
op->completed = time(NULL);
op->delegate = strdup("a human");
crm_notice("Injecting manual confirmation that %s is safely off/down",
crm_element_value(dev, F_STONITH_TARGET));
remote_op_done(op, msg, pcmk_ok, FALSE);
/* Replies are sent via done_cb->stonith_send_async_reply()->do_local_reply() */
return -EINPROGRESS;
}
/*!
* \internal
* \brief Create a new remote stonith operation
*
* \param[in] client ID of local stonith client that initiated the operation
* \param[in] request The request from the client that started the operation
* \param[in] peer TRUE if this operation is owned by another stonith peer
* (an operation owned by one peer is stored on all peers,
* but only the owner executes it; all nodes get the results
* once the owner finishes execution)
 *
 * \return Newly created operation, the existing operation if the peer had
 *         already created it, or NULL on error
 */
void *
create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer)
{
remote_fencing_op_t *op = NULL;
xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE);
int call_options = 0;
init_stonith_remote_op_hash_table(&stonith_remote_op_list);
/* If this operation is owned by another node, check to make
* sure we haven't already created this operation. */
if (peer && dev) {
const char *op_id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
CRM_CHECK(op_id != NULL, return NULL);
op = g_hash_table_lookup(stonith_remote_op_list, op_id);
if (op) {
crm_debug("%s already exists", op_id);
return op;
}
}
op = calloc(1, sizeof(remote_fencing_op_t));
crm_element_value_int(request, F_STONITH_TIMEOUT, &(op->base_timeout));
if (peer && dev) {
op->id = crm_element_value_copy(dev, F_STONITH_REMOTE_OP_ID);
} else {
op->id = crm_generate_uuid();
}
g_hash_table_replace(stonith_remote_op_list, op->id, op);
CRM_LOG_ASSERT(g_hash_table_lookup(stonith_remote_op_list, op->id) != NULL);
crm_trace("Created %s", op->id);
op->state = st_query;
op->replies_expected = fencing_active_peers();
op->action = crm_element_value_copy(dev, F_STONITH_ACTION);
op->originator = crm_element_value_copy(dev, F_STONITH_ORIGIN);
op->delegate = crm_element_value_copy(dev, F_STONITH_DELEGATE); /* May not be set */
op->created = time(NULL);
if (op->originator == NULL) {
/* Local or relayed request */
op->originator = strdup(stonith_our_uname);
}
CRM_LOG_ASSERT(client != NULL);
if (client) {
op->client_id = strdup(client);
}
op->client_name = crm_element_value_copy(request, F_STONITH_CLIENTNAME);
op->target = crm_element_value_copy(dev, F_STONITH_TARGET);
op->request = copy_xml(request); /* TODO: Figure out how to avoid this */
crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options);
op->call_options = call_options;
crm_element_value_int(request, F_STONITH_CALLID, &(op->client_callid));
- crm_trace("%s new stonith op: %s - %s of %s for %s",
- (peer
- && dev) ? "Recorded" : "Generated", op->id, op->action, op->target, op->client_name);
+ crm_trace("%s new stonith op %s ('%s' targeting %s for client %s)",
+ (peer && dev)? "Recorded" : "Generated", op->id, op->action,
+ op->target, op->client_name);
if (op->call_options & st_opt_cs_nodeid) {
int nodeid = crm_atoi(op->target, NULL);
crm_node_t *node = crm_find_known_peer_full(nodeid, NULL, CRM_GET_PEER_ANY);
/* Ensure the conversion only happens once */
op->call_options &= ~st_opt_cs_nodeid;
if (node && node->uname) {
free(op->target);
op->target = strdup(node->uname);
} else {
crm_warn("Could not expand nodeid '%s' into a host name", op->target);
}
}
    /* check whether this is a duplicate of another in-flight operation */
merge_duplicates(op);
if (op->state != st_duplicate) {
/* kick history readers */
do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL);
}
/* safe to trim as long as that doesn't touch pending ops */
stonith_fence_history_trim();
return op;
}
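/*!
 * \brief Create a remote fencing operation and broadcast a query for peers
 *        capable of executing it
 *
 * \param[in] client      Local client that initiated the request, if any
 * \param[in] request     Request XML from the client
 * \param[in] manual_ack  TRUE if this is a manual fencing confirmation
 *
 * \return Newly created operation, or NULL on error
 */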
remote_fencing_op_t *
initiate_remote_stonith_op(crm_client_t * client, xmlNode * request, gboolean manual_ack)
{
int query_timeout = 0;
xmlNode *query = NULL;
const char *client_id = NULL;
remote_fencing_op_t *op = NULL;
if (client) {
client_id = client->id;
} else {
client_id = crm_element_value(request, F_STONITH_CLIENTID);
}
CRM_LOG_ASSERT(client_id != NULL);
op = create_remote_stonith_op(client_id, request, FALSE);
op->owner = TRUE;
if (manual_ack) {
crm_notice("Initiating manual confirmation for %s: %s",
op->target, op->id);
return op;
}
CRM_CHECK(op->action, return NULL);
if (stonith_topology_next(op) != pcmk_ok) {
op->state = st_failed;
}
switch (op->state) {
case st_failed:
- crm_warn("Could not request peer fencing (%s) of %s "
+ crm_warn("Could not request peer fencing (%s) targeting %s "
CRM_XS " id=%s", op->action, op->target, op->id);
remote_op_done(op, NULL, -EINVAL, FALSE);
return op;
case st_duplicate:
- crm_info("Requesting peer fencing (%s) of %s (duplicate) "
+ crm_info("Requesting peer fencing (%s) targeting %s (duplicate) "
CRM_XS " id=%s", op->action, op->target, op->id);
return op;
default:
- crm_notice("Requesting peer fencing (%s) of %s "
+ crm_notice("Requesting peer fencing (%s) targeting %s "
CRM_XS " id=%s state=%d",
op->action, op->target, op->id, op->state);
}
query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
NULL, op->call_options);
crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id);
crm_xml_add(query, F_STONITH_TARGET, op->target);
crm_xml_add(query, F_STONITH_ACTION, op_requested_action(op));
crm_xml_add(query, F_STONITH_ORIGIN, op->originator);
crm_xml_add(query, F_STONITH_CLIENTID, op->client_id);
crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name);
crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout);
send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE);
free_xml(query);
query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op);
return op;
}
enum find_best_peer_options {
/*! Skip checking the target peer for capable fencing devices */
FIND_PEER_SKIP_TARGET = 0x0001,
/*! Only check the target peer for capable fencing devices */
FIND_PEER_TARGET_ONLY = 0x0002,
/*! Skip peers and devices that are not verified */
FIND_PEER_VERIFIED_ONLY = 0x0004,
};
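/*!
 * \internal
 * \brief Find a peer with an untried device capable of the current action
 *
 * \param[in]     device   ID of device needed (used with topology only)
 * \param[in,out] op       Fencing operation in question
 * \param[in]     options  Group of enum find_best_peer_options flags
 *
 * \return Query result of a matching peer, or NULL if none
 */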
static st_query_result_t *
find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
{
GListPtr iter = NULL;
gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE;
if (!device && is_set(op->call_options, st_opt_topology)) {
return NULL;
}
for (iter = op->query_results; iter != NULL; iter = iter->next) {
st_query_result_t *peer = iter->data;
- crm_trace("Testing result from %s for %s with %d devices: %d %x",
+ crm_trace("Testing result from %s targeting %s with %d devices: %d %x",
peer->host, op->target, peer->ndevices, peer->tried, options);
if ((options & FIND_PEER_SKIP_TARGET) && safe_str_eq(peer->host, op->target)) {
continue;
}
if ((options & FIND_PEER_TARGET_ONLY) && safe_str_neq(peer->host, op->target)) {
continue;
}
if (is_set(op->call_options, st_opt_topology)) {
if (grab_peer_device(op, peer, device, verified_devices_only)) {
return peer;
}
} else if ((peer->tried == FALSE)
&& count_peer_devices(op, peer, verified_devices_only)) {
/* No topology: Use the current best peer */
crm_trace("Simple fencing");
return peer;
}
}
return NULL;
}
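/*!
 * \internal
 * \brief Choose the best available peer to execute the next fencing action
 *
 * Prefer a verified peer other than the target, then any peer other than the
 * target, and allow self-fencing only as a last resort (and never for the
 * "on" phase of a remapped reboot), advancing topology levels as needed.
 *
 * \param[in,out] op  Fencing operation to choose a peer for
 *
 * \return Query result of the chosen peer, or NULL if none is usable yet
 */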
static st_query_result_t *
stonith_choose_peer(remote_fencing_op_t * op)
{
const char *device = NULL;
st_query_result_t *peer = NULL;
uint32_t active = fencing_active_peers();
do {
if (op->devices) {
device = op->devices->data;
crm_trace("Checking for someone to fence (%s) %s with %s",
op->action, op->target, device);
} else {
crm_trace("Checking for someone to fence (%s) %s",
op->action, op->target);
}
/* Best choice is a peer other than the target with verified access */
peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
if (peer) {
crm_trace("Found verified peer %s for %s", peer->host, device?device:"");
return peer;
}
if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) {
crm_trace("Waiting before looking for unverified devices to fence %s", op->target);
return NULL;
}
/* If no other peer has verified access, next best is unverified access */
peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
if (peer) {
crm_trace("Found best unverified peer %s", peer->host);
return peer;
}
/* If no other peer can do it, last option is self-fencing
* (which is never allowed for the "on" phase of a remapped reboot)
*/
if (op->phase != st_phase_on) {
peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
if (peer) {
crm_trace("%s will fence itself", peer->host);
return peer;
}
}
/* Try the next fencing level if there is one (unless we're in the "on"
* phase of a remapped "reboot", because we ignore errors in that case)
*/
} while ((op->phase != st_phase_on)
&& is_set(op->call_options, st_opt_topology)
&& stonith_topology_next(op) == pcmk_ok);
crm_notice("Couldn't find anyone to fence (%s) %s with %s",
op->action, op->target, (device? device : "any device"));
return NULL;
}
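/*!
 * \internal
 * \brief Calculate the timeout to use for a device on a given peer
 *
 * Use the device's action-specific timeout if the peer reported one,
 * otherwise the operation's base timeout, plus the device's maximum random
 * delay, if any.
 *
 * \param[in] op      Fencing operation in question
 * \param[in] peer    Peer that would execute the action (may be NULL)
 * \param[in] device  ID of device that would be used (may be NULL)
 *
 * \return Timeout (in seconds) appropriate for the device
 */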
static int
get_device_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer,
const char *device)
{
device_properties_t *props;
if (!peer || !device) {
return op->base_timeout;
}
props = g_hash_table_lookup(peer->devices, device);
if (!props) {
return op->base_timeout;
}
return (props->custom_action_timeout[op->phase]?
props->custom_action_timeout[op->phase] : op->base_timeout)
+ props->delay_max[op->phase];
}
struct timeout_data {
const remote_fencing_op_t *op;
const st_query_result_t *peer;
int total_timeout;
};
/*!
* \internal
* \brief Add timeout to a total if device has not been executed yet
*
* \param[in] key GHashTable key (device ID)
* \param[in] value GHashTable value (device properties)
* \param[in] user_data Timeout data
*/
static void
add_device_timeout(gpointer key, gpointer value, gpointer user_data)
{
const char *device_id = key;
device_properties_t *props = value;
struct timeout_data *timeout = user_data;
if (!props->executed[timeout->op->phase]
&& !props->disallowed[timeout->op->phase]) {
timeout->total_timeout += get_device_timeout(timeout->op,
timeout->peer, device_id);
}
}
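/*!
 * \internal
 * \brief Calculate how long to wait for a peer to execute all of its devices
 *
 * \param[in] op    Fencing operation in question
 * \param[in] peer  Peer whose device timeouts should be summed
 *
 * \return Sum of the peer's device timeouts, skipping devices already
 *         executed or disallowed in this phase, or the operation's base
 *         timeout if that sum is zero
 */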
static int
get_peer_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer)
{
struct timeout_data timeout;
timeout.op = op;
timeout.peer = peer;
timeout.total_timeout = 0;
g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
}
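/*!
 * \internal
 * \brief Calculate the total timeout needed for a fencing operation
 *
 * With topology, sum the timeouts of every device at every level, as
 * reported by the peer that owns each device; otherwise, use the chosen
 * peer's total (or the operation's base timeout if no peer was chosen).
 *
 * \param[in] op           Fencing operation in question
 * \param[in] chosen_peer  Peer chosen to execute the operation, if any
 *
 * \return Total timeout (in seconds) needed for the operation
 */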
static int
get_op_total_timeout(const remote_fencing_op_t *op,
const st_query_result_t *chosen_peer)
{
int total_timeout = 0;
stonith_topology_t *tp = find_topology_for_host(op->target);
if (is_set(op->call_options, st_opt_topology) && tp) {
int i;
GListPtr device_list = NULL;
GListPtr iter = NULL;
/* Yep, this looks scary, nested loops all over the place.
* Here is what is going on.
* Loop1: Iterate through fencing levels.
* Loop2: If a fencing level has devices, loop through each device
* Loop3: For each device in a fencing level, see what peer owns it
* and what that peer has reported the timeout is for the device.
*/
for (i = 0; i < ST_LEVEL_MAX; i++) {
if (!tp->levels[i]) {
continue;
}
for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
for (iter = op->query_results; iter != NULL; iter = iter->next) {
const st_query_result_t *peer = iter->data;
if (find_peer_device(op, peer, device_list->data)) {
total_timeout += get_device_timeout(op, peer,
device_list->data);
break;
}
} /* End Loop3: match device with peer that owns device, find device's timeout period */
} /* End Loop2: iterate through devices at a specific level */
} /*End Loop1: iterate through fencing levels */
} else if (chosen_peer) {
total_timeout = get_peer_timeout(op, chosen_peer);
} else {
total_timeout = op->base_timeout;
}
return total_timeout ? total_timeout : op->base_timeout;
}
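/*!
 * \internal
 * \brief Report an operation's total timeout to the client that initiated it
 *
 * Asynchronous clients need this update so their callbacks know how long to
 * wait for results. The update is sent directly if the client is connected
 * locally, relayed through the client's node otherwise, and repeated for any
 * operations merged into this one as duplicates.
 *
 * \param[in,out] op          Fencing operation in question
 * \param[in]     op_timeout  Total timeout (in seconds) to report
 */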
static void
report_timeout_period(remote_fencing_op_t * op, int op_timeout)
{
GListPtr iter = NULL;
xmlNode *update = NULL;
const char *client_node = NULL;
const char *client_id = NULL;
const char *call_id = NULL;
if (op->call_options & st_opt_sync_call) {
/* There is no reason to report the timeout for a synchronous call. It
* is impossible to use the reported timeout to do anything when the client
* is blocking for the response. This update is only important for
* async calls that require a callback to report the results in. */
return;
} else if (!op->request) {
return;
}
crm_trace("Reporting timeout for %s.%.8s", op->client_name, op->id);
client_node = crm_element_value(op->request, F_STONITH_CLIENTNODE);
call_id = crm_element_value(op->request, F_STONITH_CALLID);
client_id = crm_element_value(op->request, F_STONITH_CLIENTID);
if (!client_node || !call_id || !client_id) {
return;
}
if (safe_str_eq(client_node, stonith_our_uname)) {
        /* The client is connected to this node, so send the update directly to them */
do_stonith_async_timeout_update(client_id, call_id, op_timeout);
return;
}
/* The client is connected to another node, relay this update to them */
update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
crm_xml_add(update, F_STONITH_REMOTE_OP_ID, op->id);
crm_xml_add(update, F_STONITH_CLIENTID, client_id);
crm_xml_add(update, F_STONITH_CALLID, call_id);
crm_xml_add_int(update, F_STONITH_TIMEOUT, op_timeout);
send_cluster_message(crm_get_peer(0, client_node), crm_msg_stonith_ng, update, FALSE);
free_xml(update);
for (iter = op->duplicates; iter != NULL; iter = iter->next) {
remote_fencing_op_t *dup = iter->data;
crm_trace("Reporting timeout for duplicate %s.%.8s", dup->client_name, dup->id);
report_timeout_period(iter->data, op_timeout);
}
}
/*!
* \internal
* \brief Advance an operation to the next device in its topology
*
* \param[in,out] op Operation to advance
* \param[in] device ID of device just completed
* \param[in] msg XML reply that contained device result (if available)
* \param[in] rc Return code of device's execution
*/
static void
advance_op_topology(remote_fencing_op_t *op, const char *device, xmlNode *msg,
int rc)
{
/* Advance to the next device at this topology level, if any */
if (op->devices) {
op->devices = op->devices->next;
}
/* Handle automatic unfencing if an "on" action was requested */
if ((op->phase == st_phase_requested) && safe_str_eq(op->action, "on")) {
/* If the device we just executed was required, it's not anymore */
remove_required_device(op, device);
/* If there are no more devices at this topology level, run through any
* remaining devices with automatic unfencing
*/
if (op->devices == NULL) {
op->devices = op->automatic_list;
}
}
if ((op->devices == NULL) && (op->phase == st_phase_off)) {
/* We're done with this level and with required devices, but we had
* remapped "reboot" to "off", so start over with "on". If any devices
* need to be turned back on, op->devices will be non-NULL after this.
*/
op_phase_on(op);
}
if (op->devices) {
/* Necessary devices remain, so execute the next one */
- crm_trace("Next for %s on behalf of %s@%s (rc was %d)",
+ crm_trace("Next targeting %s on behalf of %s@%s (rc was %d)",
                  op->target, op->client_name, op->originator, rc);
call_remote_stonith(op, NULL);
} else {
/* We're done with all devices and phases, so finalize operation */
- crm_trace("Marking complex fencing op for %s as complete", op->target);
+ crm_trace("Marking complex fencing op targeting %s as complete",
+ op->target);
op->state = st_done;
remote_op_done(op, msg, rc, FALSE);
}
}
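/*!
 * \brief Ask a peer to execute the next step of a remote fencing operation
 *
 * Choose a peer (if one was not supplied), start the operation's timers, and
 * send the peer a fencing request for the next device; if no peer is
 * capable, fail or finalize the operation as appropriate.
 *
 * \param[in,out] op    Fencing operation to execute
 * \param[in,out] peer  Preferred peer to ask, or NULL to choose one
 */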
void
call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer)
{
const char *device = NULL;
int timeout = op->base_timeout;
crm_trace("State for %s.%.8s: %s %d", op->target, op->client_name, op->id, op->state);
if (peer == NULL && !is_set(op->call_options, st_opt_topology)) {
peer = stonith_choose_peer(op);
}
if (!op->op_timer_total) {
int total_timeout = get_op_total_timeout(op, peer);
op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * total_timeout;
op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
report_timeout_period(op, op->total_timeout);
- crm_info("Total timeout set to %d for peer's fencing of %s for %s"
+ crm_info("Total timeout set to %d for peer's fencing targeting %s for %s"
CRM_XS "id=%s",
total_timeout, op->target, op->client_name, op->id);
}
if (is_set(op->call_options, st_opt_topology) && op->devices) {
/* Ignore any peer preference, they might not have the device we need */
/* When using topology, stonith_choose_peer() removes the device from
* further consideration, so be sure to calculate timeout beforehand */
peer = stonith_choose_peer(op);
device = op->devices->data;
timeout = get_device_timeout(op, peer, device);
}
if (peer) {
int timeout_one = 0;
xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);
crm_xml_add(remote_op, F_STONITH_REMOTE_OP_ID, op->id);
crm_xml_add(remote_op, F_STONITH_TARGET, op->target);
crm_xml_add(remote_op, F_STONITH_ACTION, op->action);
crm_xml_add(remote_op, F_STONITH_ORIGIN, op->originator);
crm_xml_add(remote_op, F_STONITH_CLIENTID, op->client_id);
crm_xml_add(remote_op, F_STONITH_CLIENTNAME, op->client_name);
crm_xml_add_int(remote_op, F_STONITH_TIMEOUT, timeout);
crm_xml_add_int(remote_op, F_STONITH_CALLOPTS, op->call_options);
if (device) {
timeout_one = TIMEOUT_MULTIPLY_FACTOR *
get_device_timeout(op, peer, device);
- crm_notice("Requesting that '%s' perform op '%s %s' with '%s' " CRM_XS " for %s (%ds)", peer->host,
- op->target, op->action, device, op->client_name, timeout_one);
+ crm_notice("Requesting that %s perform '%s' action targeting %s "
+ "using '%s' " CRM_XS " for client %s (%ds)",
+ peer->host, op->action, op->target, device,
+ op->client_name, timeout_one);
crm_xml_add(remote_op, F_STONITH_DEVICE, device);
crm_xml_add(remote_op, F_STONITH_MODE, "slave");
} else {
timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
- crm_notice("Requesting that '%s' perform op '%s %s' " CRM_XS " for %s (%ds, %lds)",
- peer->host, op->target, op->action, op->client_name, timeout_one, stonith_watchdog_timeout_ms);
+ crm_notice("Requesting that %s perform '%s' action targeting %s "
+ CRM_XS " for client %s (%ds, %lds)",
+ peer->host, op->action, op->target, op->client_name,
+ timeout_one, stonith_watchdog_timeout_ms);
crm_xml_add(remote_op, F_STONITH_MODE, "smart");
-
}
op->state = st_exec;
if (op->op_timer_one) {
g_source_remove(op->op_timer_one);
}
if(stonith_watchdog_timeout_ms > 0 && device && safe_str_eq(device, "watchdog")) {
- crm_notice("Waiting %lds for %s to self-fence (%s) for %s.%.8s (%p)",
- stonith_watchdog_timeout_ms/1000, op->target,
- op->action, op->client_name, op->id, device);
+ crm_notice("Waiting %lds for %s to self-fence (%s) for client %s.%.8s",
+ stonith_watchdog_timeout_ms/1000, op->target, op->action,
+ op->client_name, op->id);
op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
/* TODO check devices to verify watchdog will be in use */
} else if(stonith_watchdog_timeout_ms > 0
&& safe_str_eq(peer->host, op->target)
&& safe_str_neq(op->action, "on")) {
- crm_notice("Waiting %lds for %s to self-fence (%s) for %s.%.8s (%p)",
- stonith_watchdog_timeout_ms/1000, op->target,
- op->action, op->client_name, op->id, device);
+ crm_notice("Waiting %lds for %s to self-fence (%s) for client %s.%.8s",
+ stonith_watchdog_timeout_ms/1000, op->target, op->action,
+ op->client_name, op->id);
op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
} else {
op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);
}
send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, remote_op, FALSE);
peer->tried = TRUE;
free_xml(remote_op);
return;
} else if (op->phase == st_phase_on) {
/* A remapped "on" cannot be executed, but the node was already
* turned off successfully, so ignore the error and continue.
*/
- crm_warn("Ignoring %s 'on' failure (no capable peers) for %s after successful 'off'",
- device, op->target);
+ crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s "
+ "after successful 'off'", device, op->target);
advance_op_topology(op, device, NULL, pcmk_ok);
return;
} else if (op->owner == FALSE) {
- crm_err("Fencing (%s) of %s for %s is not ours to control",
+ crm_err("Fencing (%s) targeting %s for client %s is not ours to control",
op->action, op->target, op->client_name);
} else if (op->query_timer == 0) {
/* We've exhausted all available peers */
- crm_info("No remaining peers capable of fencing (%s) %s for %s (%d)",
- op->target, op->action, op->client_name, op->state);
+ crm_info("No remaining peers capable of fencing (%s) %s for client %s "
+ CRM_XS " state=%d",
+ op->action, op->target, op->client_name, op->state);
CRM_LOG_ASSERT(op->state < st_done);
remote_op_timeout(op);
} else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
int rc = -EHOSTUNREACH;
/* if the operation never left the query state,
* but we have all the expected replies, then no devices
* are available to execute the fencing operation. */
if(stonith_watchdog_timeout_ms && (device == NULL || safe_str_eq(device, "watchdog"))) {
- crm_notice("Waiting %lds for %s to self-fence (%s) for %s.%.8s (%p)",
+ crm_notice("Waiting %lds for %s to self-fence (%s) for client %s.%.8s",
stonith_watchdog_timeout_ms/1000, op->target,
- op->action, op->client_name, op->id, device);
+ op->action, op->client_name, op->id);
op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
return;
}
if (op->state == st_query) {
- crm_info("No peers (out of %d) have devices capable of fencing (%s) %s for %s (%d)",
- op->replies, op->action, op->target, op->client_name,
- op->state);
+ crm_info("No peers (out of %d) have devices capable of fencing "
+ "(%s) %s for client %s " CRM_XS " state=%d",
+ op->replies, op->action, op->target, op->client_name,
+ op->state);
rc = -ENODEV;
} else {
- crm_info("No peers (out of %d) are capable of fencing (%s) %s for %s (%d)",
- op->replies, op->action, op->target, op->client_name,
- op->state);
+ crm_info("No peers (out of %d) are capable of fencing (%s) %s "
+ "for client %s " CRM_XS " state=%d",
+ op->replies, op->action, op->target, op->client_name,
+ op->state);
}
op->state = st_failed;
remote_op_done(op, NULL, rc, FALSE);
- } else if (device) {
- crm_info("Waiting for additional peers capable of fencing (%s) %s with %s for %s.%.8s",
- op->action, op->target, device, op->client_name, op->id);
} else {
- crm_info("Waiting for additional peers capable of fencing (%s) %s for %s%.8s",
- op->action, op->target, op->client_name, op->id);
+ crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s "
+ "for client %s%.8s",
+ op->action, op->target, (device? " with " : ""),
+ (device? device : ""), op->client_name, op->id);
}
}
/*!
* \internal
* \brief Comparison function for sorting query results
*
* \param[in] a GList item to compare
* \param[in] b GList item to compare
*
* \return Per the glib documentation, "a negative integer if the first value
* comes before the second, 0 if they are equal, or a positive integer
* if the first value comes after the second."
*/
static gint
sort_peers(gconstpointer a, gconstpointer b)
{
const st_query_result_t *peer_a = a;
const st_query_result_t *peer_b = b;
return (peer_b->ndevices - peer_a->ndevices);
}
/*!
 * \internal
 * \brief Check whether all devices in the target's topology have been found
 *
 * \param[in] op  Fencing operation in question
 *
 * \return TRUE if every device at every level of the target's topology was
 *         reported by some eligible peer, otherwise FALSE
 */
static gboolean
all_topology_devices_found(remote_fencing_op_t * op)
{
GListPtr device = NULL;
GListPtr iter = NULL;
device_properties_t *match = NULL;
stonith_topology_t *tp = NULL;
gboolean skip_target = FALSE;
int i;
tp = find_topology_for_host(op->target);
if (!tp) {
return FALSE;
}
if (safe_str_eq(op->action, "off") || safe_str_eq(op->action, "reboot")) {
/* Don't count the devices on the target node if we are killing
* the target node. */
skip_target = TRUE;
}
for (i = 0; i < ST_LEVEL_MAX; i++) {
for (device = tp->levels[i]; device; device = device->next) {
match = NULL;
for (iter = op->query_results; iter && !match; iter = iter->next) {
st_query_result_t *peer = iter->data;
if (skip_target && safe_str_eq(peer->host, op->target)) {
continue;
}
match = find_peer_device(op, peer, device->data);
}
if (!match) {
return FALSE;
}
}
}
return TRUE;
}
/*!
* \internal
* \brief Parse action-specific device properties from XML
*
 * \param[in]     xml     XML element containing the properties
 * \param[in]     peer    Name of peer that sent the XML (for logs)
 * \param[in]     device  Device ID (for logs)
 * \param[in]     action  Action the properties relate to (for logs)
 * \param[in,out] op      Operation to update (for automatic unfencing)
 * \param[in]     phase   Phase the properties relate to
 * \param[in,out] props   Device properties to update
*/
static void
parse_action_specific(xmlNode *xml, const char *peer, const char *device,
const char *action, remote_fencing_op_t *op,
enum st_remap_phase phase, device_properties_t *props)
{
props->custom_action_timeout[phase] = 0;
crm_element_value_int(xml, F_STONITH_ACTION_TIMEOUT,
&props->custom_action_timeout[phase]);
if (props->custom_action_timeout[phase]) {
crm_trace("Peer %s with device %s returned %s action timeout %d",
peer, device, action, props->custom_action_timeout[phase]);
}
props->delay_max[phase] = 0;
crm_element_value_int(xml, F_STONITH_DELAY_MAX, &props->delay_max[phase]);
if (props->delay_max[phase]) {
crm_trace("Peer %s with device %s returned maximum of random delay %d for %s",
peer, device, props->delay_max[phase], action);
}
props->delay_base[phase] = 0;
crm_element_value_int(xml, F_STONITH_DELAY_BASE, &props->delay_base[phase]);
if (props->delay_base[phase]) {
crm_trace("Peer %s with device %s returned base delay %d for %s",
peer, device, props->delay_base[phase], action);
}
/* Handle devices with automatic unfencing */
if (safe_str_eq(action, "on")) {
int required = 0;
crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required);
if (required) {
crm_trace("Peer %s requires device %s to execute for action %s",
peer, device, action);
add_required_device(op, device);
}
}
/* If a reboot is remapped to off+on, it's possible that a node is allowed
* to perform one action but not another.
*/
if (crm_is_true(crm_element_value(xml, F_STONITH_ACTION_DISALLOWED))) {
props->disallowed[phase] = TRUE;
crm_trace("Peer %s is disallowed from executing %s for device %s",
peer, action, device);
}
}
/*!
* \internal
* \brief Parse one device's properties from peer's XML query reply
*
* \param[in] xml XML node containing device properties
* \param[in,out] op Operation that query and reply relate to
* \param[in,out] result Peer's results
* \param[in] device ID of device being parsed
*/
static void
add_device_properties(xmlNode *xml, remote_fencing_op_t *op,
st_query_result_t *result, const char *device)
{
xmlNode *child;
int verified = 0;
device_properties_t *props = calloc(1, sizeof(device_properties_t));
/* Add a new entry to this result's devices list */
CRM_ASSERT(props != NULL);
g_hash_table_insert(result->devices, strdup(device), props);
/* Peers with verified (monitored) access will be preferred */
crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified);
if (verified) {
crm_trace("Peer %s has confirmed a verified device %s",
result->host, device);
props->verified = TRUE;
}
/* Parse action-specific device properties */
parse_action_specific(xml, result->host, device, op_requested_action(op),
op, st_phase_requested, props);
for (child = __xml_first_child(xml); child != NULL; child = __xml_next(child)) {
/* Replies for "reboot" operations will include the action-specific
* values for "off" and "on" in child elements, just in case the reboot
* winds up getting remapped.
*/
if (safe_str_eq(ID(child), "off")) {
parse_action_specific(child, result->host, device, "off",
op, st_phase_off, props);
} else if (safe_str_eq(ID(child), "on")) {
parse_action_specific(child, result->host, device, "on",
op, st_phase_on, props);
}
}
}
/*!
* \internal
* \brief Parse a peer's XML query reply and add it to operation's results
*
* \param[in,out] op Operation that query and reply relate to
* \param[in] host Name of peer that sent this reply
* \param[in] ndevices Number of devices expected in reply
* \param[in] xml XML node containing device list
*
* \return Newly allocated result structure with parsed reply
*/
static st_query_result_t *
add_result(remote_fencing_op_t *op, const char *host, int ndevices, xmlNode *xml)
{
st_query_result_t *result = calloc(1, sizeof(st_query_result_t));
xmlNode *child;
CRM_CHECK(result != NULL, return NULL);
result->host = strdup(host);
result->devices = crm_str_table_new();
/* Each child element describes one capable device available to the peer */
for (child = __xml_first_child(xml); child != NULL; child = __xml_next(child)) {
const char *device = ID(child);
if (device) {
add_device_properties(child, op, result, device);
}
}
result->ndevices = g_hash_table_size(result->devices);
CRM_CHECK(ndevices == result->ndevices,
crm_err("Query claimed to have %d devices but %d found",
ndevices, result->ndevices));
op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers);
return result;
}
/*!
* \internal
* \brief Handle a peer's reply to our fencing query
*
* Parse a query result from XML and store it in the remote operation
* table, and when enough replies have been received, issue a fencing request.
*
* \param[in] msg XML reply received
*
* \return pcmk_ok on success, -errno on error
*
* \note See initiate_remote_stonith_op() for how the XML query was initially
* formed, and stonith_query() for how the peer formed its XML reply.
*/
int
process_remote_stonith_query(xmlNode * msg)
{
int ndevices = 0;
gboolean host_is_target = FALSE;
gboolean have_all_replies = FALSE;
const char *id = NULL;
const char *host = NULL;
remote_fencing_op_t *op = NULL;
st_query_result_t *result = NULL;
uint32_t replies_expected;
xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
CRM_CHECK(dev != NULL, return -EPROTO);
id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
CRM_CHECK(id != NULL, return -EPROTO);
dev = get_xpath_object("//@" F_STONITH_AVAILABLE_DEVICES, msg, LOG_ERR);
CRM_CHECK(dev != NULL, return -EPROTO);
crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &ndevices);
op = g_hash_table_lookup(stonith_remote_op_list, id);
if (op == NULL) {
crm_debug("Received query reply for unknown or expired operation %s",
id);
return -EOPNOTSUPP;
}
replies_expected = fencing_active_peers();
if (op->replies_expected < replies_expected) {
replies_expected = op->replies_expected;
}
if ((++op->replies >= replies_expected) && (op->state == st_query)) {
have_all_replies = TRUE;
}
host = crm_element_value(msg, F_ORIG);
host_is_target = safe_str_eq(host, op->target);
crm_info("Query result %d of %d from %s for %s/%s (%d devices) %s",
op->replies, replies_expected, host,
op->target, op->action, ndevices, id);
if (ndevices > 0) {
result = add_result(op, host, ndevices, dev);
}
if (is_set(op->call_options, st_opt_topology)) {
/* If we start the fencing before all the topology results are in,
* it is possible fencing levels will be skipped because of the missing
* query results. */
if (op->state == st_query && all_topology_devices_found(op)) {
/* All the query results are in for the topology, start the fencing ops. */
crm_trace("All topology devices found");
call_remote_stonith(op, result);
} else if (have_all_replies) {
crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ",
replies_expected, op->replies);
call_remote_stonith(op, NULL);
}
} else if (op->state == st_query) {
int nverified = count_peer_devices(op, result, TRUE);
/* We have a result for a non-topology fencing op that looks promising,
* go ahead and start fencing before query timeout */
if (result && (host_is_target == FALSE) && nverified) {
/* we have a verified device living on a peer that is not the target */
crm_trace("Found %d verified devices", nverified);
call_remote_stonith(op, result);
} else if (have_all_replies) {
crm_info("All query replies have arrived, continuing (%d expected/%d received) ",
replies_expected, op->replies);
call_remote_stonith(op, NULL);
} else {
crm_trace("Waiting for more peer results before launching fencing operation");
}
} else if (result && (op->state == st_done)) {
crm_info("Discarding query result from %s (%d devices): Operation is in state %d",
result->host, result->ndevices, op->state);
}
return pcmk_ok;
}
/*!
* \internal
* \brief Handle a peer's reply to a fencing request
*
* Parse a fencing reply from XML, and either finalize the operation
* or attempt another device as appropriate.
*
* \param[in] msg XML reply received
*
* \return pcmk_ok on success, -errno on error
*/
int
process_remote_stonith_exec(xmlNode * msg)
{
int rc = 0;
const char *id = NULL;
const char *device = NULL;
remote_fencing_op_t *op = NULL;
xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
CRM_CHECK(dev != NULL, return -EPROTO);
id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
CRM_CHECK(id != NULL, return -EPROTO);
dev = get_xpath_object("//@" F_STONITH_RC, msg, LOG_ERR);
CRM_CHECK(dev != NULL, return -EPROTO);
crm_element_value_int(dev, F_STONITH_RC, &rc);
device = crm_element_value(dev, F_STONITH_DEVICE);
if (stonith_remote_op_list) {
op = g_hash_table_lookup(stonith_remote_op_list, id);
}
if (op == NULL && rc == pcmk_ok) {
/* Record successful fencing operations */
const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID);
op = create_remote_stonith_op(client_id, dev, TRUE);
}
if (op == NULL) {
/* Could be for an event that began before we started */
/* TODO: Record the op for later querying */
crm_info("Received peer result of unknown or expired operation %s", id);
return -EOPNOTSUPP;
}
if (op->devices && device && safe_str_neq(op->devices->data, device)) {
crm_err("Received outdated reply for device %s (instead of %s) to "
"fence (%s) %s. Operation already timed out at peer level.",
device, (const char *) op->devices->data, op->action, op->target);
return rc;
}
if (safe_str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast")) {
crm_debug("Marking call to %s for %s on behalf of %s@%s.%.8s: %s (%d)",
op->action, op->target, op->client_name, op->id, op->originator,
pcmk_strerror(rc), rc);
if (rc == pcmk_ok) {
op->state = st_done;
} else {
op->state = st_failed;
}
remote_op_done(op, msg, rc, FALSE);
return pcmk_ok;
} else if (safe_str_neq(op->originator, stonith_our_uname)) {
/* If this isn't a remote level broadcast, and we are not the
* originator of the operation, we should not be receiving this msg. */
        crm_err("%s received non-broadcast fencing result for operation "
                "it does not own (device %s targeting %s)",
                stonith_our_uname, device, op->target);
return rc;
}
if (is_set(op->call_options, st_opt_topology)) {
const char *device = crm_element_value(msg, F_STONITH_DEVICE);
- crm_notice("Call to %s for '%s %s' on behalf of %s@%s: %s (%d)",
- device, op->target, op->action, op->client_name, op->originator,
- pcmk_strerror(rc), rc);
+ crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s "
+ CRM_XS " rc=%d",
+ op->action, op->target, device, op->client_name,
+ op->originator, pcmk_strerror(rc), rc);
/* We own the op, and it is complete. broadcast the result to all nodes
* and notify our local clients. */
if (op->state == st_done) {
remote_op_done(op, msg, rc, FALSE);
return rc;
}
if ((op->phase == 2) && (rc != pcmk_ok)) {
/* A remapped "on" failed, but the node was already turned off
* successfully, so ignore the error and continue.
*/
- crm_warn("Ignoring %s 'on' failure (exit code %d) for %s after successful 'off'",
- device, rc, op->target);
+ crm_warn("Ignoring %s 'on' failure (exit code %d) targeting %s "
+ "after successful 'off'", device, rc, op->target);
rc = pcmk_ok;
}
if (rc == pcmk_ok) {
/* An operation completed successfully. Try another device if
* necessary, otherwise mark the operation as done. */
advance_op_topology(op, device, msg, rc);
return rc;
} else {
/* This device failed, time to try another topology level. If no other
* levels are available, mark this operation as failed and report results. */
if (stonith_topology_next(op) != pcmk_ok) {
op->state = st_failed;
remote_op_done(op, msg, rc, FALSE);
return rc;
}
}
} else if (rc == pcmk_ok && op->devices == NULL) {
crm_trace("All done for %s", op->target);
op->state = st_done;
remote_op_done(op, msg, rc, FALSE);
return rc;
} else if (rc == -ETIME && op->devices == NULL) {
/* If the operation timed out don't bother retrying other peers. */
op->state = st_failed;
remote_op_done(op, msg, rc, FALSE);
return rc;
} else {
/* fall-through and attempt other fencing action using another peer */
}
/* Retry on failure */
crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator,
op->client_name, rc);
call_remote_stonith(op, NULL);
return rc;
}
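/*!
 * \brief Check whether a target was already fenced recently enough to skip
 *        a new fencing request
 *
 * \param[in] tolerance  Number of seconds to treat a previous result as valid
 * \param[in] target     Name of node to check recent results for
 * \param[in] action     Fencing action to check for
 *
 * \return TRUE if an identical fencing operation against the target
 *         completed within the last tolerance seconds, otherwise FALSE
 */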
gboolean
stonith_check_fence_tolerance(int tolerance, const char *target, const char *action)
{
GHashTableIter iter;
time_t now = time(NULL);
remote_fencing_op_t *rop = NULL;
crm_trace("tolerance=%d, stonith_remote_op_list=%p", tolerance,
stonith_remote_op_list);
if (tolerance <= 0 || !stonith_remote_op_list || target == NULL ||
action == NULL) {
return FALSE;
}
g_hash_table_iter_init(&iter, stonith_remote_op_list);
while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) {
if (strcmp(rop->target, target) != 0) {
continue;
} else if (rop->state != st_done) {
continue;
/* We don't have to worry about remapped reboots here
* because if state is done, any remapping has been undone
*/
} else if (strcmp(rop->action, action) != 0) {
continue;
} else if ((rop->completed + tolerance) < now) {
continue;
}
crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s",
target, action, tolerance, rop->delegate, rop->originator);
return TRUE;
}
return FALSE;
}