diff --git a/cts/CTStests.py b/cts/CTStests.py index a57a5da805..f8f2cb626f 100644 --- a/cts/CTStests.py +++ b/cts/CTStests.py @@ -1,3130 +1,3129 @@ """ Test-specific classes for Pacemaker's Cluster Test Suite (CTS) """ # Pacemaker targets compatibility with Python 2.7 and 3.2+ from __future__ import print_function, unicode_literals, absolute_import, division __copyright__ = "Copyright 2000-2019 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" # # SPECIAL NOTE: # # Tests may NOT implement any cluster-manager-specific code in them. # EXTEND the ClusterManager object to provide the base capabilities # the test needs if you need to do something that the current CM classes # do not. Otherwise you screw up the whole point of the object structure # in CTS. # # Thank you. # import os import re import time import subprocess import tempfile from stat import * from cts import CTS from cts.CTSaudits import * from cts.CTSvars import * from cts.patterns import PatternSelector from cts.logging import LogFactory from cts.remote import RemoteFactory, input_wrapper from cts.watcher import LogWatcher from cts.environment import EnvFactory AllTestClasses = [ ] class CTSTest(object): ''' A Cluster test. We implement the basic set of properties and behaviors for a generic cluster test. Cluster tests track their own statistics. We keep each of the kinds of counts we track as separate {name,value} pairs. ''' def __init__(self, cm): #self.name="the unnamed test" self.Stats = {"calls":0 , "success":0 , "failure":0 , "skipped":0 , "auditfail":0} # if not issubclass(cm.__class__, ClusterManager): # raise ValueError("Must be a ClusterManager object") self.CM = cm self.Env = EnvFactory().getInstance() self.rsh = RemoteFactory().getInstance() self.logger = LogFactory() self.templates = PatternSelector(cm["Name"]) self.Audits = [] self.timeout = 120 self.passed = 1 self.is_loop = 0 self.is_unsafe = 0 self.is_docker_unsafe = 0 self.is_experimental = 0 self.is_container = 0 self.is_valgrind = 0 self.benchmark = 0 # which tests to benchmark self.timer = {} # timers def log(self, args): self.logger.log(args) def debug(self, args): self.logger.debug(args) def has_key(self, key): return key in self.Stats def __setitem__(self, key, value): self.Stats[key] = value def __getitem__(self, key): if str(key) == "0": raise ValueError("Bad call to 'foo in X', should reference 'foo in X.Stats' instead") if key in self.Stats: return self.Stats[key] return None def log_mark(self, msg): self.debug("MARK: test %s %s %d" % (self.name,msg,time.time())) return def get_timer(self,key = "test"): try: return self.timer[key] except: return 0 def set_timer(self,key = "test"): self.timer[key] = time.time() return self.timer[key] def log_timer(self,key = "test"): elapsed = 0 if key in self.timer: elapsed = time.time() - self.timer[key] s = key == "test" and self.name or "%s:%s" % (self.name,key) self.debug("%s runtime: %.2f" % (s, elapsed)) del self.timer[key] return elapsed def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not name in self.Stats: self.Stats[name] = 0 self.Stats[name] = self.Stats[name]+1 # Reset the test passed boolean if name == "calls": self.passed = 1 def failure(self, reason="none"): '''Increment the failure count''' self.passed = 0 self.incr("failure") self.logger.log(("Test %s" % self.name).ljust(35) + " FAILED: %s" % reason) return None def success(self): '''Increment the success count''' 
self.incr("success") return 1 def skipped(self): '''Increment the skipped count''' self.incr("skipped") return 1 def __call__(self, node): '''Perform the given test''' raise ValueError("Abstract Class member (__call__)") self.incr("calls") return self.failure() def audit(self): passed = 1 if len(self.Audits) > 0: for audit in self.Audits: if not audit(): self.logger.log("Internal %s Audit %s FAILED." % (self.name, audit.name())) self.incr("auditfail") passed = 0 return passed def setup(self, node): '''Setup the given test''' return self.success() def teardown(self, node): '''Tear down the given test''' return self.success() def create_watch(self, patterns, timeout, name=None): if not name: name = self.name return LogWatcher(self.Env["LogFileName"], patterns, name, timeout, kind=self.Env["LogWatcher"], hosts=self.Env["nodes"]) def local_badnews(self, prefix, watch, local_ignore=[]): errcount = 0 if not prefix: prefix = "LocalBadNews:" ignorelist = [] ignorelist.append(" CTS: ") ignorelist.append(prefix) ignorelist.extend(local_ignore) while errcount < 100: match = watch.look(0) if match: add_err = 1 for ignore in ignorelist: if add_err == 1 and re.search(ignore, match): add_err = 0 if add_err == 1: self.logger.log(prefix + " " + match) errcount = errcount + 1 else: break else: self.logger.log("Too many errors!") watch.end() return errcount def is_applicable(self): return self.is_applicable_common() def is_applicable_common(self): '''Return TRUE if we are applicable in the current test configuration''' #raise ValueError("Abstract Class member (is_applicable)") if self.is_loop and not self.Env["loop-tests"]: return 0 elif self.is_unsafe and not self.Env["unsafe-tests"]: return 0 elif self.is_valgrind and not self.Env["valgrind-tests"]: return 0 elif self.is_experimental and not self.Env["experimental-tests"]: return 0 elif self.is_docker_unsafe and self.Env["docker"]: return 0 elif self.is_container and not self.Env["container-tests"]: return 0 elif self.Env["benchmark"] and self.benchmark == 0: return 0 return 1 def find_ocfs2_resources(self, node): self.r_o2cb = None self.r_ocfs2 = [] (rc, lines) = self.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): r = AuditResource(self.CM, line) if r.rtype == "o2cb" and r.parent != "NA": self.debug("Found o2cb: %s" % self.r_o2cb) self.r_o2cb = r.parent if re.search("^Constraint", line): c = AuditConstraint(self.CM, line) if c.type == "rsc_colocation" and c.target == self.r_o2cb: self.r_ocfs2.append(c.rsc) self.debug("Found ocfs2 filesystems: %s" % repr(self.r_ocfs2)) return len(self.r_ocfs2) def canrunnow(self, node): '''Return TRUE if we can meaningfully run right now''' return 1 def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [] class StopTest(CTSTest): '''Stop (deactivate) the cluster manager on a node''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name = "Stop" def __call__(self, node): '''Perform the 'stop' test. 
''' self.incr("calls") if self.CM.ShouldBeStatus[node] != "up": return self.skipped() patterns = [] # Technically we should always be able to notice ourselves stopping patterns.append(self.templates["Pat:We_stopped"] % node) # Any active node needs to notice this one left # (note that this won't work if we have multiple partitions) for other in self.Env["nodes"]: if self.CM.ShouldBeStatus[other] == "up" and other != node: patterns.append(self.templates["Pat:They_stopped"] %(other, self.CM.key_for_node(node))) #self.debug("Checking %s will notice %s left"%(other, node)) watch = self.create_watch(patterns, self.Env["DeadTime"]) watch.setwatch() if node == self.CM.OurNode: self.incr("us") else: if self.CM.upcount() <= 1: self.incr("all") else: self.incr("them") self.CM.StopaCM(node) watch_result = watch.lookforall() failreason = None UnmatchedList = "||" if watch.unmatched: (rc, output) = self.rsh(node, "/bin/ps axf", None) for line in output: self.debug(line) (rc, output) = self.rsh(node, "/usr/sbin/dlm_tool dump", None) for line in output: self.debug(line) for regex in watch.unmatched: self.logger.log ("ERROR: Shutdown pattern not found: %s" % (regex)) UnmatchedList += regex + "||"; failreason = "Missing shutdown pattern" self.CM.cluster_stable(self.Env["DeadTime"]) if not watch.unmatched or self.CM.upcount() == 0: return self.success() if len(watch.unmatched) >= self.CM.upcount(): return self.failure("no match against (%s)" % UnmatchedList) if failreason == None: return self.success() else: return self.failure(failreason) # # We don't register StopTest because it's better when called by # another test... # class StartTest(CTSTest): '''Start (activate) the cluster manager on a node''' def __init__(self, cm, debug=None): CTSTest.__init__(self,cm) self.name = "start" self.debug = debug def __call__(self, node): '''Perform the 'start' test. ''' self.incr("calls") if self.CM.upcount() == 0: self.incr("us") else: self.incr("them") if self.CM.ShouldBeStatus[node] != "down": return self.skipped() elif self.CM.StartaCM(node): return self.success() else: return self.failure("Startup %s on node %s failed" % (self.Env["Name"], node)) # # We don't register StartTest because it's better when called by # another test... # class FlipTest(CTSTest): '''If it's running, stop it. If it's stopped start it. Overthrow the status quo... ''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "Flip" self.start = StartTest(cm) self.stop = StopTest(cm) def __call__(self, node): '''Perform the 'Flip' test. ''' self.incr("calls") if self.CM.ShouldBeStatus[node] == "up": self.incr("stopped") ret = self.stop(node) type = "up->down" # Give the cluster time to recognize it's gone... time.sleep(self.Env["StableTime"]) elif self.CM.ShouldBeStatus[node] == "down": self.incr("started") ret = self.start(node) type = "down->up" else: return self.skipped() self.incr(type) if ret: return self.success() else: return self.failure("%s failure" % type) # Register FlipTest as a good test to run AllTestClasses.append(FlipTest) class RestartTest(CTSTest): '''Stop and restart a node''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "Restart" self.start = StartTest(cm) self.stop = StopTest(cm) self.benchmark = 1 def __call__(self, node): '''Perform the 'restart' test. 
''' self.incr("calls") self.incr("node:" + node) ret1 = 1 if self.CM.StataCM(node): self.incr("WasStopped") if not self.start(node): return self.failure("start (setup) failure: "+node) self.set_timer() if not self.stop(node): return self.failure("stop failure: "+node) if not self.start(node): return self.failure("start failure: "+node) return self.success() # Register RestartTest as a good test to run AllTestClasses.append(RestartTest) class StonithdTest(CTSTest): def __init__(self, cm): CTSTest.__init__(self, cm) self.name = "Stonithd" self.startall = SimulStartLite(cm) self.benchmark = 1 def __call__(self, node): self.incr("calls") if len(self.Env["nodes"]) < 2: return self.skipped() ret = self.startall(None) if not ret: return self.failure("Setup failed") is_dc = self.CM.is_node_dc(node) watchpats = [] watchpats.append(self.templates["Pat:FenceOpOK"] % node) watchpats.append(self.templates["Pat:NodeFenced"] % node) if self.Env["at-boot"] == 0: self.debug("Expecting %s to stay down" % node) self.CM.ShouldBeStatus[node] = "down" else: self.debug("Expecting %s to come up again %d" % (node, self.Env["at-boot"])) watchpats.append("%s.* S_STARTING -> S_PENDING" % node) watchpats.append("%s.* S_PENDING -> S_NOT_DC" % node) watch = self.create_watch(watchpats, 30 + self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"]) watch.setwatch() origin = self.Env.RandomGen.choice(self.Env["nodes"]) rc = self.rsh(origin, "stonith_admin --reboot %s -VVVVVV" % node) if rc == 194: # 194 - 256 = -62 = Timer expired # # Look for the patterns, usually this means the required # device was running on the node to be fenced - or that # the required devices were in the process of being loaded # and/or moved # # Effectively the node committed suicide so there will be # no confirmation, but pacemaker should be watching and # fence the node again self.logger.log("Fencing command on %s to fence %s timed out" % (origin, node)) elif origin != node and rc != 0: self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() self.debug("Waiting for fenced node to come back up") self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600) self.logger.log("Fencing command on %s failed to fence %s (rc=%d)" % (origin, node, rc)) elif origin == node and rc != 255: # 255 == broken pipe, ie. 
the node was fenced as expected self.logger.log("Locally originated fencing returned %d" % rc) self.set_timer("fence") matched = watch.lookforall() self.log_timer("fence") self.set_timer("reform") if watch.unmatched: self.logger.log("Patterns not found: " + repr(watch.unmatched)) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() self.debug("Waiting for fenced node to come back up") self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600) self.debug("Waiting for the cluster to re-stabilize with all nodes") is_stable = self.CM.cluster_stable(self.Env["StartTime"]) if not matched: return self.failure("Didn't find all expected patterns") elif not is_stable: return self.failure("Cluster did not become stable") self.log_timer("reform") return self.success() def errorstoignore(self): return [ self.templates["Pat:Fencing_start"] % ".*", self.templates["Pat:Fencing_ok"] % ".*", r"error.*: Resource .*stonith::.* is active on 2 nodes attempting recovery", r"error.*: Operation 'reboot' targeting .* on .* for stonith_admin.*: Timer expired", ] def is_applicable(self): if not self.is_applicable_common(): return 0 if "DoFencing" in list(self.Env.keys()): return self.Env["DoFencing"] return 1 AllTestClasses.append(StonithdTest) class StartOnebyOne(CTSTest): '''Start all the nodes ~ one by one''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "StartOnebyOne" self.stopall = SimulStopLite(cm) self.start = StartTest(cm) self.ns = CTS.NodeStatus(cm.Env) def __call__(self, dummy): '''Perform the 'StartOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Shut down all the nodes... ret = self.stopall(None) if not ret: return self.failure("Test setup failed") failed = [] self.set_timer() for node in self.Env["nodes"]: if not self.start(node): failed.append(node) if len(failed) > 0: return self.failure("Some node failed to start: " + repr(failed)) return self.success() # Register StartOnebyOne as a good test to run AllTestClasses.append(StartOnebyOne) class SimulStart(CTSTest): '''Start all the nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "SimulStart" self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) def __call__(self, dummy): '''Perform the 'SimulStart' test. ''' self.incr("calls") # We ignore the "node" parameter... # Shut down all the nodes... ret = self.stopall(None) if not ret: return self.failure("Setup failed") if not self.startall(None): return self.failure("Startall failed") return self.success() # Register SimulStart as a good test to run AllTestClasses.append(SimulStart) class SimulStop(CTSTest): '''Stop all the nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "SimulStop" self.startall = SimulStartLite(cm) self.stopall = SimulStopLite(cm) def __call__(self, dummy): '''Perform the 'SimulStop' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... ret = self.startall(None) if not ret: return self.failure("Setup failed") if not self.stopall(None): return self.failure("Stopall failed") return self.success() # Register SimulStop as a good test to run AllTestClasses.append(SimulStop) class StopOnebyOne(CTSTest): '''Stop all the nodes in order''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "StopOnebyOne" self.startall = SimulStartLite(cm) self.stop = StopTest(cm) def __call__(self, dummy): '''Perform the 'StopOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... 
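        # (all nodes are started first so that each one can then be stopped in
        #  turn; any node that fails to stop is collected and reported
        #  together at the end)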
# Start up all the nodes... ret = self.startall(None) if not ret: return self.failure("Setup failed") failed = [] self.set_timer() for node in self.Env["nodes"]: if not self.stop(node): failed.append(node) if len(failed) > 0: return self.failure("Some node failed to stop: " + repr(failed)) return self.success() # Register StopOnebyOne as a good test to run AllTestClasses.append(StopOnebyOne) class RestartOnebyOne(CTSTest): '''Restart all the nodes in order''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "RestartOnebyOne" self.startall = SimulStartLite(cm) def __call__(self, dummy): '''Perform the 'RestartOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... ret = self.startall(None) if not ret: return self.failure("Setup failed") did_fail = [] self.set_timer() self.restart = RestartTest(self.CM) for node in self.Env["nodes"]: if not self.restart(node): did_fail.append(node) if did_fail: return self.failure("Could not restart %d nodes: %s" % (len(did_fail), repr(did_fail))) return self.success() # Register StopOnebyOne as a good test to run AllTestClasses.append(RestartOnebyOne) class PartialStart(CTSTest): '''Start a node - but tell it to stop before it finishes starting up''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "PartialStart" self.startall = SimulStartLite(cm) self.stopall = SimulStopLite(cm) self.stop = StopTest(cm) #self.is_unsafe = 1 def __call__(self, node): '''Perform the 'PartialStart' test. ''' self.incr("calls") ret = self.stopall(None) if not ret: return self.failure("Setup failed") # FIXME! This should use the CM class to get the pattern # then it would be applicable in general watchpats = [] watchpats.append("pacemaker-controld.*Connecting to cluster infrastructure") watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) watch.setwatch() self.CM.StartaCMnoBlock(node) ret = watch.lookforall() if not ret: self.logger.log("Patterns not found: " + repr(watch.unmatched)) return self.failure("Setup of %s failed" % node) ret = self.stop(node) if not ret: return self.failure("%s did not stop in time" % node) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' # We might do some fencing in the 2-node case if we make it up far enough return [ r"Executing reboot fencing operation", r"Requesting fencing \([^)]+\) of node ", ] # Register StopOnebyOne as a good test to run AllTestClasses.append(PartialStart) class StandbyTest(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "Standby" self.benchmark = 1 self.start = StartTest(cm) self.startall = SimulStartLite(cm) # make sure the node is active # set the node to standby mode # check resources, none resource should be running on the node # set the node to active mode # check resouces, resources should have been migrated back (SHOULD THEY?) 
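    # For reference, SetStandbyMode()/StandbyStatus() are expected to act on
    # the node's "standby" attribute; a rough hand-run equivalent
    # (illustrative only, option spellings may vary by version) would be:
    #
    #   crm_attribute -N <node> -n standby -v on    # put the node in standby
    #   crm_attribute -N <node> -n standby --query  # check the current value
    #   crm_attribute -N <node> -n standby -v off   # make the node active again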
def __call__(self, node): self.incr("calls") ret = self.startall(None) if not ret: return self.failure("Start all nodes failed") self.debug("Make sure node %s is active" % node) if self.CM.StandbyStatus(node) != "off": if not self.CM.SetStandbyMode(node, "off"): return self.failure("can't set node %s to active mode" % node) self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "off": return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status)) self.debug("Getting resources running on node %s" % node) rsc_on_node = self.CM.active_resources(node) watchpats = [] watchpats.append(r"State transition .* -> S_POLICY_ENGINE") watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) watch.setwatch() self.debug("Setting node %s to standby mode" % node) if not self.CM.SetStandbyMode(node, "on"): return self.failure("can't set node %s to standby mode" % node) self.set_timer("on") ret = watch.lookforall() if not ret: self.logger.log("Patterns not found: " + repr(watch.unmatched)) self.CM.SetStandbyMode(node, "off") return self.failure("cluster didn't react to standby change on %s" % node) self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "on": return self.failure("standby status of %s is [%s] but we expect [on]" % (node, status)) self.log_timer("on") self.debug("Checking resources") bad_run = self.CM.active_resources(node) if len(bad_run) > 0: rc = self.failure("%s set to standby, %s is still running on it" % (node, repr(bad_run))) self.debug("Setting node %s to active mode" % node) self.CM.SetStandbyMode(node, "off") return rc self.debug("Setting node %s to active mode" % node) if not self.CM.SetStandbyMode(node, "off"): return self.failure("can't set node %s to active mode" % node) self.set_timer("off") self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "off": return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status)) self.log_timer("off") return self.success() AllTestClasses.append(StandbyTest) class ValgrindTest(CTSTest): '''Check for memory leaks''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "Valgrind" self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) self.is_valgrind = 1 self.is_loop = 1 def setup(self, node): self.incr("calls") ret = self.stopall(None) if not ret: return self.failure("Stop all nodes failed") # @TODO Edit /etc/sysconfig/pacemaker on all nodes to enable valgrind, # and clear any valgrind logs from previous runs. For now, we rely on # the user to do this manually. 
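        # A rough sketch of what that manual step could look like (assumed
        # file location and variable names), kept as a comment for now:
        #
        #   for n in self.Env["nodes"]:
        #       self.rsh(n, "echo PCMK_valgrind_enabled=yes >> /etc/sysconfig/pacemaker")
        #       self.rsh(n, "rm -f /var/lib/pacemaker/valgrind-*")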
ret = self.startall(None) if not ret: return self.failure("Start all nodes failed") return self.success() def teardown(self, node): # Return all nodes to normal # @TODO Edit /etc/sysconfig/pacemaker on all nodes to disable valgrind ret = self.stopall(None) if not ret: return self.failure("Stop all nodes failed") return self.success() def find_leaks(self): # Check for leaks # (no longer used but kept in case feature is restored) leaked = [] self.stop = StopTest(self.CM) for node in self.Env["nodes"]: rc = self.stop(node) if not rc: self.failure("Couldn't shut down %s" % node) rc = self.rsh(node, "grep -e indirectly.*lost:.*[1-9] -e definitely.*lost:.*[1-9] -e (ERROR|error).*SUMMARY:.*[1-9].*errors %s" % self.logger.logPat, 0) if rc != 1: leaked.append(node) self.failure("Valgrind errors detected on %s" % node) (rc, output) = self.rsh(node, "grep -e lost: -e SUMMARY: %s" % self.logger.logPat, None) for line in output: self.logger.log(line) (rc, output) = self.rsh(node, "cat %s" % self.logger.logPat, None) for line in output: self.debug(line) self.rsh(node, "rm -f %s" % self.logger.logPat, None) return leaked def __call__(self, node): #leaked = self.find_leaks() #if len(leaked) > 0: # return self.failure("Nodes %s leaked" % repr(leaked)) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [ r"pacemaker-based.*: \*\*\*\*\*\*\*\*\*\*\*\*\*", r"pacemaker-based.*: .* avoid confusing Valgrind", r"HA_VALGRIND_ENABLED", ] class StandbyLoopTest(ValgrindTest): '''Check for memory leaks by putting a node in and out of standby for an hour''' # @TODO This is not a useful test for memory leaks def __init__(self, cm): ValgrindTest.__init__(self,cm) self.name = "StandbyLoop" def __call__(self, node): lpc = 0 delay = 2 failed = 0 done = time.time() + self.Env["loop-minutes"] * 60 while time.time() <= done and not failed: lpc = lpc + 1 time.sleep(delay) if not self.CM.SetStandbyMode(node, "on"): self.failure("can't set node %s to standby mode" % node) failed = lpc time.sleep(delay) if not self.CM.SetStandbyMode(node, "off"): self.failure("can't set node %s to active mode" % node) failed = lpc leaked = self.find_leaks() if failed: return self.failure("Iteration %d failed" % failed) elif len(leaked) > 0: return self.failure("Nodes %s leaked" % repr(leaked)) return self.success() #AllTestClasses.append(StandbyLoopTest) class BandwidthTest(CTSTest): # Tests should not be cluster-manager-specific # If you need to find out cluster manager configuration to do this, then # it should be added to the generic cluster manager API. '''Test the bandwidth which the cluster uses''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name = "Bandwidth" self.start = StartTest(cm) self.__setitem__("min",0) self.__setitem__("max",0) self.__setitem__("totalbandwidth",0) (handle, self.tempfile) = tempfile.mkstemp(".cts") os.close(handle) self.startall = SimulStartLite(cm) def __call__(self, node): '''Perform the Bandwidth test''' self.incr("calls") if self.CM.upcount() < 1: return self.skipped() Path = self.CM.InternalCommConfig() if "ip" not in Path["mediatype"]: return self.skipped() port = Path["port"][0] port = int(port) ret = self.startall(None) if not ret: return self.failure("Test setup failed") time.sleep(5) # We get extra messages right after startup. 
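        # The estimate below is deliberately crude: tcpdump captures ~100 UDP
        # packets on the cluster port, and countbandwidth() sums their
        # "length:" fields and divides by the time between the first and last
        # packet to report bits/sec.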
fstmpfile = "/var/run/band_estimate" dumpcmd = "tcpdump -p -n -c 102 -i any udp port %d > %s 2>&1" \ % (port, fstmpfile) rc = self.rsh(node, dumpcmd) if rc == 0: farfile = "root@%s:%s" % (node, fstmpfile) self.rsh.cp(farfile, self.tempfile) Bandwidth = self.countbandwidth(self.tempfile) if not Bandwidth: self.logger.log("Could not compute bandwidth.") return self.success() intband = int(Bandwidth + 0.5) self.logger.log("...bandwidth: %d bits/sec" % intband) self.Stats["totalbandwidth"] = self.Stats["totalbandwidth"] + Bandwidth if self.Stats["min"] == 0: self.Stats["min"] = Bandwidth if Bandwidth > self.Stats["max"]: self.Stats["max"] = Bandwidth if Bandwidth < self.Stats["min"]: self.Stats["min"] = Bandwidth self.rsh(node, "rm -f %s" % fstmpfile) os.unlink(self.tempfile) return self.success() else: return self.failure("no response from tcpdump command [%d]!" % rc) def countbandwidth(self, file): fp = open(file, "r") fp.seek(0) count = 0 sum = 0 while 1: line = fp.readline() if not line: return None if re.search("udp",line) or re.search("UDP,", line): count = count + 1 linesplit = line.split(" ") for j in range(len(linesplit)-1): if linesplit[j] == "udp": break if linesplit[j] == "length:": break try: sum = sum + int(linesplit[j+1]) except ValueError: self.logger.log("Invalid tcpdump line: %s" % line) return None T1 = linesplit[0] timesplit = T1.split(":") time2split = timesplit[2].split(".") time1 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001 break while count < 100: line = fp.readline() if not line: return None if re.search("udp",line) or re.search("UDP,", line): count = count+1 linessplit = line.split(" ") for j in range(len(linessplit)-1): if linessplit[j] == "udp": break if linessplit[j] == "length:": break try: sum = int(linessplit[j+1]) + sum except ValueError: self.logger.log("Invalid tcpdump line: %s" % line) return None T2 = linessplit[0] timesplit = T2.split(":") time2split = timesplit[2].split(".") time2 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001 time = time2-time1 if (time <= 0): return 0 return int((sum*8)/time) def is_applicable(self): '''BandwidthTest never applicable''' return 0 AllTestClasses.append(BandwidthTest) ################################################################### class MaintenanceMode(CTSTest): ################################################################### def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "MaintenanceMode" self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.max = 30 #self.is_unsafe = 1 self.benchmark = 1 self.action = "asyncmon" self.interval = 0 self.rid = "maintenanceDummy" def toggleMaintenanceMode(self, node, action): pats = [] pats.append(self.templates["Pat:DC_IDLE"]) # fail the resource right after turning Maintenance mode on # verify it is not recovered until maintenance mode is turned off if action == "On": - pats.append(r"schedulerd.*:\s+warning:.*Processing failed %s of %s on" % (self.action, self.rid)) + pats.append(self.templates["Pat:RscOpFail"] % (self.action, self.rid)) else: pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid)) pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid)) watch = self.create_watch(pats, 60) watch.setwatch() self.debug("Turning maintenance mode %s" % action) self.rsh(node, self.templates["MaintenanceMode%s" % (action)]) if (action == "On"): self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node)) self.set_timer("recover%s" % 
(action)) watch.lookforall() self.log_timer("recover%s" % (action)) if watch.unmatched: self.debug("Failed to find patterns when turning maintenance mode %s" % action) return repr(watch.unmatched) return "" def insertMaintenanceDummy(self, node): pats = [] pats.append(("%s.*" % node) + (self.templates["Pat:RscOpOK"] % ("start", self.rid))) watch = self.create_watch(pats, 60) watch.setwatch() self.CM.AddDummyRsc(node, self.rid) self.set_timer("addDummy") watch.lookforall() self.log_timer("addDummy") if watch.unmatched: self.debug("Failed to find patterns when adding maintenance dummy resource") return repr(watch.unmatched) return "" def removeMaintenanceDummy(self, node): pats = [] pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid)) watch = self.create_watch(pats, 60) watch.setwatch() self.CM.RemoveDummyRsc(node, self.rid) self.set_timer("removeDummy") watch.lookforall() self.log_timer("removeDummy") if watch.unmatched: self.debug("Failed to find patterns when removing maintenance dummy resource") return repr(watch.unmatched) return "" def managedRscList(self, node): rscList = [] (rc, lines) = self.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): tmp = AuditResource(self.CM, line) if tmp.managed(): rscList.append(tmp.id) return rscList def verifyResources(self, node, rscList, managed): managedList = list(rscList) managed_str = "managed" if not managed: managed_str = "unmanaged" (rc, lines) = self.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): tmp = AuditResource(self.CM, line) if managed and not tmp.managed(): continue elif not managed and tmp.managed(): continue elif managedList.count(tmp.id): managedList.remove(tmp.id) if len(managedList) == 0: self.debug("Found all %s resources on %s" % (managed_str, node)) return True self.logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, managedList)) return False def __call__(self, node): '''Perform the 'MaintenanceMode' test. ''' self.incr("calls") verify_managed = False verify_unmanaged = False failPat = "" ret = self.startall(None) if not ret: return self.failure("Setup failed") # get a list of all the managed resources. We use this list # after enabling maintenance mode to verify all managed resources # become un-managed. After maintenance mode is turned off, we use # this list to verify all the resources become managed again. managedResources = self.managedRscList(node) if len(managedResources) == 0: self.logger.log("No managed resources on %s" % node) return self.skipped() # insert a fake resource we can fail during maintenance mode # so we can verify recovery does not take place until after maintenance # mode is disabled. failPat = failPat + self.insertMaintenanceDummy(node) # toggle maintenance mode ON, then fail dummy resource. failPat = failPat + self.toggleMaintenanceMode(node, "On") # verify all the resources are now unmanaged if self.verifyResources(node, managedResources, False): verify_unmanaged = True # Toggle maintenance mode OFF, verify dummy is recovered. failPat = failPat + self.toggleMaintenanceMode(node, "Off") # verify all the resources are now managed again if self.verifyResources(node, managedResources, True): verify_managed = True # Remove our maintenance dummy resource. 
failPat = failPat + self.removeMaintenanceDummy(node) self.CM.cluster_stable() if failPat != "": return self.failure("Unmatched patterns: %s" % (failPat)) elif verify_unmanaged is False: return self.failure("Failed to verify resources became unmanaged during maintenance mode") elif verify_managed is False: return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode") return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [ r"Updating failcount for %s" % self.rid, r"schedulerd.*: Recover %s\s*\(.*\)" % self.rid, r"Unknown operation: fail", self.templates["Pat:RscOpOK"] % (self.action, self.rid), r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval), ] AllTestClasses.append(MaintenanceMode) class ResourceRecover(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "ResourceRecover" self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.max = 30 self.rid = None self.rid_alt = None #self.is_unsafe = 1 self.benchmark = 1 # these are the values used for the new LRM API call self.action = "asyncmon" self.interval = 0 def __call__(self, node): '''Perform the 'ResourceRecover' test. ''' self.incr("calls") ret = self.startall(None) if not ret: return self.failure("Setup failed") resourcelist = self.CM.active_resources(node) # if there are no resourcelist, return directly if len(resourcelist) == 0: self.logger.log("No active resources on %s" % node) return self.skipped() self.rid = self.Env.RandomGen.choice(resourcelist) self.rid_alt = self.rid rsc = None (rc, lines) = self.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): tmp = AuditResource(self.CM, line) if tmp.id == self.rid: rsc = tmp # Handle anonymous clones that get renamed self.rid = rsc.clone_id break if not rsc: return self.failure("Could not find %s in the resource list" % self.rid) self.debug("Shooting %s aka. 
%s" % (rsc.clone_id, rsc.id)) pats = [] - pats.append(r"schedulerd.*:\s+warning:.*Processing failed %s of (%s|%s) on" % (self.action, - rsc.id, rsc.clone_id)) + pats.append(self.templates["Pat:CloneOpFail"] % (self.action, rsc.id, rsc.clone_id)) if rsc.managed(): pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid)) if rsc.unique(): pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid)) else: # Anonymous clones may get restarted with a different clone number pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*")) watch = self.create_watch(pats, 60) watch.setwatch() self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node)) self.set_timer("recover") watch.lookforall() self.log_timer("recover") self.CM.cluster_stable() recovered = self.CM.ResourceLocation(self.rid) if watch.unmatched: return self.failure("Patterns not found: %s" % repr(watch.unmatched)) elif rsc.unique() and len(recovered) > 1: return self.failure("%s is now active on more than one node: %s"%(self.rid, repr(recovered))) elif len(recovered) > 0: self.debug("%s is running on: %s" % (self.rid, repr(recovered))) elif rsc.managed(): return self.failure("%s was not recovered and is inactive" % self.rid) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [ r"Updating failcount for %s" % self.rid, r"schedulerd.*: Recover (%s|%s)\s*\(.*\)" % (self.rid, self.rid_alt), r"Unknown operation: fail", self.templates["Pat:RscOpOK"] % (self.action, self.rid), r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval), ] AllTestClasses.append(ResourceRecover) class ComponentFail(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "ComponentFail" # TODO make this work correctly in docker. self.is_docker_unsafe = 1 self.startall = SimulStartLite(cm) self.complist = cm.Components() self.patterns = [] self.okerrpatterns = [] self.is_unsafe = 1 def __call__(self, node): '''Perform the 'ComponentFail' test. 
''' self.incr("calls") self.patterns = [] self.okerrpatterns = [] # start all nodes ret = self.startall(None) if not ret: return self.failure("Setup failed") if not self.CM.cluster_stable(self.Env["StableTime"]): return self.failure("Setup failed - unstable") node_is_dc = self.CM.is_node_dc(node, None) # select a component to kill chosen = self.Env.RandomGen.choice(self.complist) while chosen.dc_only == 1 and node_is_dc == 0: chosen = self.Env.RandomGen.choice(self.complist) self.debug("...component %s (dc=%d,boot=%d)" % (chosen.name, node_is_dc,chosen.triggersreboot)) self.incr(chosen.name) if chosen.name != "corosync": self.patterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name)) self.patterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name)) self.patterns.extend(chosen.pats) if node_is_dc: self.patterns.extend(chosen.dc_pats) # @TODO this should be a flag in the Component if chosen.name in [ "corosync", "pacemaker-based", "pacemaker-fenced" ]: # Ignore actions for fence devices if fencer will respawn # (their registration will be lost, and probes will fail) (rc, lines) = self.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): r = AuditResource(self.CM, line) if r.rclass == "stonith": self.okerrpatterns.append(self.templates["Pat:Fencing_recover"] % r.id) self.okerrpatterns.append(self.templates["Pat:Fencing_active"] % r.id) self.okerrpatterns.append(self.templates["Pat:Fencing_probe"] % r.id) # supply a copy so self.patterns doesn't end up empty tmpPats = [] tmpPats.extend(self.patterns) self.patterns.extend(chosen.badnews_ignore) # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status stonithPats = [] stonithPats.append(self.templates["Pat:Fencing_ok"] % node) stonith = self.create_watch(stonithPats, 0) stonith.setwatch() # set the watch for stable watch = self.create_watch( tmpPats, self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"]) watch.setwatch() # kill the component chosen.kill(node) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() self.debug("Waiting for any fenced node to come back up") self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600) self.debug("Waiting for the cluster to re-stabilize with all nodes") self.CM.cluster_stable(self.Env["StartTime"]) self.debug("Checking if %s was shot" % node) shot = stonith.look(60) if shot: self.debug("Found: " + repr(shot)) self.okerrpatterns.append(self.templates["Pat:Fencing_start"] % node) if self.Env["at-boot"] == 0: self.CM.ShouldBeStatus[node] = "down" # If fencing occurred, chances are many (if not all) the expected logs # will not be sent - or will be lost when the node reboots return self.success() # check for logs indicating a graceful recovery matched = watch.lookforall(allow_multiple_matches=1) if watch.unmatched: self.logger.log("Patterns not found: " + repr(watch.unmatched)) self.debug("Waiting for the cluster to re-stabilize with all nodes") is_stable = self.CM.cluster_stable(self.Env["StartTime"]) if not matched: return self.failure("Didn't find all expected %s patterns" % chosen.name) elif not is_stable: return self.failure("Cluster did not become stable after killing %s" % chosen.name) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' # Note that okerrpatterns refers to the last time we ran this test # The good news is that this works fine for us... 
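        # (the log scan that consults this list happens right after the test
        #  runs, so the patterns accumulated in the preceding __call__ are
        #  still the relevant ones)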
self.okerrpatterns.extend(self.patterns) return self.okerrpatterns AllTestClasses.append(ComponentFail) class SplitBrainTest(CTSTest): '''It is used to test split-brain. when the path between the two nodes break check the two nodes both take over the resource''' def __init__(self,cm): CTSTest.__init__(self,cm) self.name = "SplitBrain" self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.is_experimental = 1 def isolate_partition(self, partition): other_nodes = [] other_nodes.extend(self.Env["nodes"]) for node in partition: try: other_nodes.remove(node) except ValueError: self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]) + " from " +repr(partition)) if len(other_nodes) == 0: return 1 self.debug("Creating partition: " + repr(partition)) self.debug("Everyone else: " + repr(other_nodes)) for node in partition: if not self.CM.isolate_node(node, other_nodes): self.logger.log("Could not isolate %s" % node) return 0 return 1 def heal_partition(self, partition): other_nodes = [] other_nodes.extend(self.Env["nodes"]) for node in partition: try: other_nodes.remove(node) except ValueError: self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"])) if len(other_nodes) == 0: return 1 self.debug("Healing partition: " + repr(partition)) self.debug("Everyone else: " + repr(other_nodes)) for node in partition: self.CM.unisolate_node(node, other_nodes) def __call__(self, node): '''Perform split-brain test''' self.incr("calls") self.passed = 1 partitions = {} ret = self.startall(None) if not ret: return self.failure("Setup failed") while 1: # Retry until we get multiple partitions partitions = {} p_max = len(self.Env["nodes"]) for node in self.Env["nodes"]: p = self.Env.RandomGen.randint(1, p_max) if not p in partitions: partitions[p] = [] partitions[p].append(node) p_max = len(list(partitions.keys())) if p_max > 1: break # else, try again self.debug("Created %d partitions" % p_max) for key in list(partitions.keys()): self.debug("Partition["+str(key)+"]:\t"+repr(partitions[key])) # Disabling STONITH to reduce test complexity for now self.rsh(node, "crm_attribute -V -n stonith-enabled -v false") for key in list(partitions.keys()): self.isolate_partition(partitions[key]) count = 30 while count > 0: if len(self.CM.find_partitions()) != p_max: time.sleep(10) else: break else: self.failure("Expected partitions were not created") # Target number of partitions formed - wait for stability if not self.CM.cluster_stable(): self.failure("Partitioned cluster not stable") # Now audit the cluster state self.CM.partitions_expected = p_max if not self.audit(): self.failure("Audits failed") self.CM.partitions_expected = 1 # And heal them again for key in list(partitions.keys()): self.heal_partition(partitions[key]) # Wait for a single partition to form count = 30 while count > 0: if len(self.CM.find_partitions()) != 1: time.sleep(10) count -= 1 else: break else: self.failure("Cluster did not reform") # Wait for it to have the right number of members count = 30 while count > 0: members = [] partitions = self.CM.find_partitions() if len(partitions) > 0: members = partitions[0].split() if len(members) != len(self.Env["nodes"]): time.sleep(10) count -= 1 else: break else: self.failure("Cluster did not completely reform") # Wait up to 20 minutes - the delay is more preferable than # trying to continue with in a messed up state if not self.CM.cluster_stable(1200): self.failure("Reformed cluster not stable") if self.Env["continue"] == 1: answer = "Y" else: try: answer = input_wrapper('Continue? 
[nY]') except EOFError as e: answer = "n" if answer and answer == "n": raise ValueError("Reformed cluster not stable") # Turn fencing back on if self.Env["DoFencing"]: self.rsh(node, "crm_attribute -V -D -n stonith-enabled") self.CM.cluster_stable() if self.passed: return self.success() return self.failure("See previous errors") def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [ r"Another DC detected:", r"(ERROR|error).*: .*Application of an update diff failed", r"pacemaker-controld.*:.*not in our membership list", r"CRIT:.*node.*returning after partition", ] def is_applicable(self): if not self.is_applicable_common(): return 0 return len(self.Env["nodes"]) > 2 AllTestClasses.append(SplitBrainTest) class Reattach(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "Reattach" self.startall = SimulStartLite(cm) self.restart1 = RestartTest(cm) self.stopall = SimulStopLite(cm) self.is_unsafe = 0 # Handled by canrunnow() def _is_managed(self, node): is_managed = self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", 1) is_managed = is_managed[:-1] # Strip off the newline return is_managed == "true" def _set_unmanaged(self, node): self.debug("Disable resource management") self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false") def _set_managed(self, node): self.debug("Re-enable resource management") self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D") def setup(self, node): attempt = 0 if not self.startall(None): return None # Make sure we are really _really_ stable and that all # resources, including those that depend on transient node # attributes, are started while not self.CM.cluster_stable(double_check=True): if attempt < 5: attempt += 1 self.debug("Not stable yet, re-testing") else: self.logger.log("Cluster is not stable") return None return 1 def teardown(self, node): # Make sure 'node' is up start = StartTest(self.CM) start(node) if not self._is_managed(node): self.logger.log("Attempting to re-enable resource management on %s" % node) self._set_managed(node) self.CM.cluster_stable() if not self._is_managed(node): self.logger.log("Could not re-enable resource management") return 0 return 1 def canrunnow(self, node): '''Return TRUE if we can meaningfully run right now''' if self.find_ocfs2_resources(node): self.logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present") return 0 return 1 def __call__(self, node): self.incr("calls") pats = [] # Conveniently, the scheduler will display this message when disabling # management, even if fencing is not enabled, so we can rely on it. 
managed = self.create_watch(["Delaying fencing operations"], 60) managed.setwatch() self._set_unmanaged(node) if not managed.lookforall(): self.logger.log("Patterns not found: " + repr(managed.unmatched)) return self.failure("Resource management not disabled") pats = [] pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*")) pats.append(self.templates["Pat:RscOpOK"] % ("stop", ".*")) pats.append(self.templates["Pat:RscOpOK"] % ("promote", ".*")) pats.append(self.templates["Pat:RscOpOK"] % ("demote", ".*")) pats.append(self.templates["Pat:RscOpOK"] % ("migrate", ".*")) watch = self.create_watch(pats, 60, "ShutdownActivity") watch.setwatch() self.debug("Shutting down the cluster") ret = self.stopall(None) if not ret: self._set_managed(node) return self.failure("Couldn't shut down the cluster") self.debug("Bringing the cluster back up") ret = self.startall(None) time.sleep(5) # allow ping to update the CIB if not ret: self._set_managed(node) return self.failure("Couldn't restart the cluster") if self.local_badnews("ResourceActivity:", watch): self._set_managed(node) return self.failure("Resources stopped or started during cluster restart") watch = self.create_watch(pats, 60, "StartupActivity") watch.setwatch() # Re-enable resource management (and verify it happened). self._set_managed(node) self.CM.cluster_stable() if not self._is_managed(node): return self.failure("Could not re-enable resource management") # Ignore actions for STONITH resources ignore = [] (rc, lines) = self.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): r = AuditResource(self.CM, line) if r.rclass == "stonith": self.debug("Ignoring start actions for %s" % r.id) ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id)) if self.local_badnews("ResourceActivity:", watch, ignore): return self.failure("Resources stopped or started after resource management was re-enabled") return ret def errorstoignore(self): '''Return list of errors which should be ignored''' return [ r"resource( was|s were) active at shutdown", ] def is_applicable(self): return 1 AllTestClasses.append(Reattach) class SpecialTest1(CTSTest): '''Set up a custom test to cause quorum failure issues for Andrew''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "SpecialTest1" self.startall = SimulStartLite(cm) self.restart1 = RestartTest(cm) self.stopall = SimulStopLite(cm) def __call__(self, node): '''Perform the 'SpecialTest1' test for Andrew. ''' self.incr("calls") # Shut down all the nodes... 
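        # (the CIB is then wiped on the target node, which is started alone
        #  first; config recovery is exercised when the remaining nodes
        #  come up)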
ret = self.stopall(None) if not ret: return self.failure("Could not stop all nodes") # Test config recovery when the other nodes come up self.rsh(node, "rm -f "+CTSvars.CRM_CONFIG_DIR+"/cib*") # Start the selected node ret = self.restart1(node) if not ret: return self.failure("Could not start "+node) # Start all remaining nodes ret = self.startall(None) if not ret: return self.failure("Could not start the remaining nodes") return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' # Errors that occur as a result of the CIB being wiped return [ r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed", r"error.*: Resource start-up disabled since no STONITH resources have been defined", r"error.*: Either configure some or disable STONITH with the stonith-enabled option", r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity", ] AllTestClasses.append(SpecialTest1) class HAETest(CTSTest): '''Set up a custom test to cause quorum failure issues for Andrew''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "HAETest" self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) self.is_loop = 1 def setup(self, node): # Start all remaining nodes ret = self.startall(None) if not ret: return self.failure("Couldn't start all nodes") return self.success() def teardown(self, node): # Stop everything ret = self.stopall(None) if not ret: return self.failure("Couldn't stop all nodes") return self.success() def wait_on_state(self, node, resource, expected_clones, attempts=240): while attempts > 0: active = 0 (rc, lines) = self.rsh(node, "crm_resource -r %s -W -Q" % resource, stdout=None) # Hack until crm_resource does the right thing if rc == 0 and lines: active = len(lines) if len(lines) == expected_clones: return 1 elif rc == 1: self.debug("Resource %s is still inactive" % resource) elif rc == 234: self.logger.log("Unknown resource %s" % resource) return 0 elif rc == 246: self.logger.log("Cluster is inactive") return 0 elif rc != 0: self.logger.log("Call to crm_resource failed, rc=%d" % rc) return 0 else: self.debug("Resource %s is active on %d times instead of %d" % (resource, active, expected_clones)) attempts -= 1 time.sleep(1) return 0 def find_dlm(self, node): self.r_dlm = None (rc, lines) = self.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): r = AuditResource(self.CM, line) if r.rtype == "controld" and r.parent != "NA": self.debug("Found dlm: %s" % self.r_dlm) self.r_dlm = r.parent return 1 return 0 def find_hae_resources(self, node): self.r_dlm = None self.r_o2cb = None self.r_ocfs2 = [] if self.find_dlm(node): self.find_ocfs2_resources(node) def is_applicable(self): if not self.is_applicable_common(): return 0 if self.Env["Schema"] == "hae": return 1 return None class HAERoleTest(HAETest): def __init__(self, cm): '''Lars' mount/unmount test for the HA extension. 
''' HAETest.__init__(self,cm) self.name = "HAERoleTest" def change_state(self, node, resource, target): rc = self.rsh(node, "crm_resource -V -r %s -p target-role -v %s --meta" % (resource, target)) return rc def __call__(self, node): self.incr("calls") lpc = 0 failed = 0 delay = 2 done = time.time() + self.Env["loop-minutes"]*60 self.find_hae_resources(node) clone_max = len(self.Env["nodes"]) while time.time() <= done and not failed: lpc = lpc + 1 self.change_state(node, self.r_dlm, "Stopped") if not self.wait_on_state(node, self.r_dlm, 0): self.failure("%s did not go down correctly" % self.r_dlm) failed = lpc self.change_state(node, self.r_dlm, "Started") if not self.wait_on_state(node, self.r_dlm, clone_max): self.failure("%s did not come up correctly" % self.r_dlm) failed = lpc if not self.wait_on_state(node, self.r_o2cb, clone_max): self.failure("%s did not come up correctly" % self.r_o2cb) failed = lpc for fs in self.r_ocfs2: if not self.wait_on_state(node, fs, clone_max): self.failure("%s did not come up correctly" % fs) failed = lpc if failed: return self.failure("iteration %d failed" % failed) return self.success() AllTestClasses.append(HAERoleTest) class HAEStandbyTest(HAETest): '''Set up a custom test to cause quorum failure issues for Andrew''' def __init__(self, cm): HAETest.__init__(self,cm) self.name = "HAEStandbyTest" def change_state(self, node, resource, target): rc = self.rsh(node, "crm_standby -V -l reboot -v %s" % (target)) return rc def __call__(self, node): self.incr("calls") lpc = 0 failed = 0 done = time.time() + self.Env["loop-minutes"]*60 self.find_hae_resources(node) clone_max = len(self.Env["nodes"]) while time.time() <= done and not failed: lpc = lpc + 1 self.change_state(node, self.r_dlm, "true") if not self.wait_on_state(node, self.r_dlm, clone_max-1): self.failure("%s did not go down correctly" % self.r_dlm) failed = lpc self.change_state(node, self.r_dlm, "false") if not self.wait_on_state(node, self.r_dlm, clone_max): self.failure("%s did not come up correctly" % self.r_dlm) failed = lpc if not self.wait_on_state(node, self.r_o2cb, clone_max): self.failure("%s did not come up correctly" % self.r_o2cb) failed = lpc for fs in self.r_ocfs2: if not self.wait_on_state(node, fs, clone_max): self.failure("%s did not come up correctly" % fs) failed = lpc if failed: return self.failure("iteration %d failed" % failed) return self.success() AllTestClasses.append(HAEStandbyTest) class NearQuorumPointTest(CTSTest): ''' This test brings larger clusters near the quorum point (50%). In addition, it will test doing starts and stops at the same time. Here is how I think it should work: - loop over the nodes and decide randomly which will be up and which will be down Use a 50% probability for each of up/down. - figure out what to do to get into that state from the current state - in parallel, bring up those going up and bring those going down. ''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "NearQuorumPoint" def __call__(self, dummy): '''Perform the 'NearQuorumPoint' test. 
''' self.incr("calls") startset = [] stopset = [] stonith = self.CM.prepare_fencing_watcher("NearQuorumPoint") #decide what to do with each node for node in self.Env["nodes"]: action = self.Env.RandomGen.choice(["start","stop"]) #action = self.Env.RandomGen.choice(["start","stop","no change"]) if action == "start" : startset.append(node) elif action == "stop" : stopset.append(node) self.debug("start nodes:" + repr(startset)) self.debug("stop nodes:" + repr(stopset)) #add search patterns watchpats = [ ] for node in stopset: if self.CM.ShouldBeStatus[node] == "up": watchpats.append(self.templates["Pat:We_stopped"] % node) for node in startset: if self.CM.ShouldBeStatus[node] == "down": #watchpats.append(self.templates["Pat:NonDC_started"] % node) watchpats.append(self.templates["Pat:Local_started"] % node) else: for stopping in stopset: if self.CM.ShouldBeStatus[stopping] == "up": watchpats.append(self.templates["Pat:They_stopped"] % (node, self.CM.key_for_node(stopping))) if len(watchpats) == 0: return self.skipped() if len(startset) != 0: watchpats.append(self.templates["Pat:DC_IDLE"]) watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) watch.setwatch() #begin actions for node in stopset: if self.CM.ShouldBeStatus[node] == "up": self.CM.StopaCMnoBlock(node) for node in startset: if self.CM.ShouldBeStatus[node] == "down": self.CM.StartaCMnoBlock(node) #get the result if watch.lookforall(): self.CM.cluster_stable() self.CM.fencing_cleanup("NearQuorumPoint", stonith) return self.success() self.logger.log("Warn: Patterns not found: " + repr(watch.unmatched)) #get the "bad" nodes upnodes = [] for node in stopset: if self.CM.StataCM(node) == 1: upnodes.append(node) downnodes = [] for node in startset: if self.CM.StataCM(node) == 0: downnodes.append(node) self.CM.fencing_cleanup("NearQuorumPoint", stonith) if upnodes == [] and downnodes == []: self.CM.cluster_stable() # Make sure they're completely down with no residule for node in stopset: self.rsh(node, self.templates["StopCmd"]) return self.success() if len(upnodes) > 0: self.logger.log("Warn: Unstoppable nodes: " + repr(upnodes)) if len(downnodes) > 0: self.logger.log("Warn: Unstartable nodes: " + repr(downnodes)) return self.failure() def is_applicable(self): return 1 AllTestClasses.append(NearQuorumPointTest) class RollingUpgradeTest(CTSTest): '''Perform a rolling upgrade of the cluster''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "RollingUpgrade" self.start = StartTest(cm) self.stop = StopTest(cm) self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) def setup(self, node): # Start all remaining nodes ret = self.stopall(None) if not ret: return self.failure("Couldn't stop all nodes") for node in self.Env["nodes"]: if not self.downgrade(node, None): return self.failure("Couldn't downgrade %s" % node) ret = self.startall(None) if not ret: return self.failure("Couldn't start all nodes") return self.success() def teardown(self, node): # Stop everything ret = self.stopall(None) if not ret: return self.failure("Couldn't stop all nodes") for node in self.Env["nodes"]: if not self.upgrade(node, None): return self.failure("Couldn't upgrade %s" % node) return self.success() def install(self, node, version, start=1, flags="--force"): target_dir = "/tmp/rpm-%s" % version src_dir = "%s/%s" % (self.Env["rpm-dir"], version) self.logger.log("Installing %s on %s with %s" % (version, node, flags)) if not self.stop(node): return self.failure("stop failure: "+node) rc = self.rsh(node, "mkdir -p %s" % target_dir) rc = 
self.rsh(node, "rm -f %s/*.rpm" % target_dir) (rc, lines) = self.rsh(node, "ls -1 %s/*.rpm" % src_dir, None) for line in lines: line = line[:-1] rc = self.rsh.cp("%s" % (line), "%s:%s/" % (node, target_dir)) rc = self.rsh(node, "rpm -Uvh %s %s/*.rpm" % (flags, target_dir)) if start and not self.start(node): return self.failure("start failure: "+node) return self.success() def upgrade(self, node, start=1): return self.install(node, self.Env["current-version"], start) def downgrade(self, node, start=1): return self.install(node, self.Env["previous-version"], start, "--force --nodeps") def __call__(self, node): '''Perform the 'Rolling Upgrade' test. ''' self.incr("calls") for node in self.Env["nodes"]: if self.upgrade(node): return self.failure("Couldn't upgrade %s" % node) self.CM.cluster_stable() return self.success() def is_applicable(self): if not self.is_applicable_common(): return None if not "rpm-dir" in list(self.Env.keys()): return None if not "current-version" in list(self.Env.keys()): return None if not "previous-version" in list(self.Env.keys()): return None return 1 # Register RestartTest as a good test to run AllTestClasses.append(RollingUpgradeTest) class BSC_AddResource(CTSTest): '''Add a resource to the cluster''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name = "AddResource" self.resource_offset = 0 self.cib_cmd = """cibadmin -C -o %s -X '%s' """ def __call__(self, node): self.incr("calls") self.resource_offset = self.resource_offset + 1 r_id = "bsc-rsc-%s-%d" % (node, self.resource_offset) start_pat = "pacemaker-controld.*%s_start_0.*confirmed.*ok" patterns = [] patterns.append(start_pat % r_id) watch = self.create_watch(patterns, self.Env["DeadTime"]) watch.setwatch() ip = self.NextIP() if not self.make_ip_resource(node, r_id, "ocf", "IPaddr", ip): return self.failure("Make resource %s failed" % r_id) failed = 0 watch_result = watch.lookforall() if watch.unmatched: for regex in watch.unmatched: self.logger.log ("Warn: Pattern not found: %s" % (regex)) failed = 1 if failed: return self.failure("Resource pattern(s) not found") if not self.CM.cluster_stable(self.Env["DeadTime"]): return self.failure("Unstable cluster") return self.success() def NextIP(self): ip = self.Env["IPBase"] if ":" in ip: fields = ip.rpartition(":") fields[2] = str(hex(int(fields[2], 16)+1)) print(str(hex(int(f[2], 16)+1))) else: fields = ip.rpartition('.') fields[2] = str(int(fields[2])+1) ip = fields[0] + fields[1] + fields[3]; self.Env["IPBase"] = ip return ip.strip() def make_ip_resource(self, node, id, rclass, type, ip): self.logger.log("Creating %s::%s:%s (%s) on %s" % (rclass,type,id,ip,node)) rsc_xml=""" """ % (id, rclass, type, id, id, ip) node_constraint = """ """ % (id, id, id, id, node) rc = 0 (rc, lines) = self.rsh(node, self.cib_cmd % ("constraints", node_constraint), None) if rc != 0: self.logger.log("Constraint creation failed: %d" % rc) return None (rc, lines) = self.rsh(node, self.cib_cmd % ("resources", rsc_xml), None) if rc != 0: self.logger.log("Resource creation failed: %d" % rc) return None return 1 def is_applicable(self): if self.Env["DoBSC"]: return 1 return None AllTestClasses.append(BSC_AddResource) class SimulStopLite(CTSTest): '''Stop any active nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "SimulStopLite" def __call__(self, dummy): '''Perform the 'SimulStopLite' setup work. ''' self.incr("calls") self.debug("Setup: " + self.name) # We ignore the "node" parameter... 
watchpats = [ ] for node in self.Env["nodes"]: if self.CM.ShouldBeStatus[node] == "up": self.incr("WasStarted") watchpats.append(self.templates["Pat:We_stopped"] % node) if len(watchpats) == 0: return self.success() # Stop all the nodes - at about the same time... watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) watch.setwatch() self.set_timer() for node in self.Env["nodes"]: if self.CM.ShouldBeStatus[node] == "up": self.CM.StopaCMnoBlock(node) if watch.lookforall(): # Make sure they're completely down with no residue for node in self.Env["nodes"]: self.rsh(node, self.templates["StopCmd"]) return self.success() did_fail = 0 up_nodes = [] for node in self.Env["nodes"]: if self.CM.StataCM(node) == 1: did_fail = 1 up_nodes.append(node) if did_fail: return self.failure("Active nodes exist: " + repr(up_nodes)) self.logger.log("Warn: All nodes stopped but CTS didn't detect: " + repr(watch.unmatched)) return self.failure("Missing log message: "+repr(watch.unmatched)) def is_applicable(self): '''SimulStopLite is a setup test and never applicable''' return 0 class SimulStartLite(CTSTest): '''Start any stopped nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "SimulStartLite" def __call__(self, dummy): '''Perform the 'SimulStartLite' setup work. ''' self.incr("calls") self.debug("Setup: " + self.name) # We ignore the "node" parameter... node_list = [] for node in self.Env["nodes"]: if self.CM.ShouldBeStatus[node] == "down": self.incr("WasStopped") node_list.append(node) self.set_timer() while len(node_list) > 0: # Repeat until all nodes come up watchpats = [ ] uppat = self.templates["Pat:NonDC_started"] if self.CM.upcount() == 0: uppat = self.templates["Pat:Local_started"] watchpats.append(self.templates["Pat:DC_IDLE"]) for node in node_list: watchpats.append(uppat % node) watchpats.append(self.templates["Pat:InfraUp"] % node) watchpats.append(self.templates["Pat:PacemakerUp"] % node) # Start all the nodes - at about the same time...
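# prepare_fencing_watcher() below pairs with fencing_cleanup(): any nodes
# that get fenced while coming up are returned as the new node_list and are
# simply retried on the next pass of the surrounding while loop.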
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) watch.setwatch() stonith = self.CM.prepare_fencing_watcher(self.name) for node in node_list: self.CM.StartaCMnoBlock(node) watch.lookforall() node_list = self.CM.fencing_cleanup(self.name, stonith) if node_list == None: return self.failure("Cluster did not stabilize") # Remove node_list messages from watch.unmatched for node in node_list: self.logger.debug("Dealing with stonith operations for %s" % repr(node_list)) if watch.unmatched: try: watch.unmatched.remove(uppat % node) except: self.debug("Already matched: %s" % (uppat % node)) try: watch.unmatched.remove(self.templates["Pat:InfraUp"] % node) except: self.debug("Already matched: %s" % (self.templates["Pat:InfraUp"] % node)) try: watch.unmatched.remove(self.templates["Pat:PacemakerUp"] % node) except: self.debug("Already matched: %s" % (self.templates["Pat:PacemakerUp"] % node)) if watch.unmatched: for regex in watch.unmatched: self.logger.log ("Warn: Startup pattern not found: %s" %(regex)) if not self.CM.cluster_stable(): return self.failure("Cluster did not stabilize") did_fail = 0 unstable = [] for node in self.Env["nodes"]: if self.CM.StataCM(node) == 0: did_fail = 1 unstable.append(node) if did_fail: return self.failure("Unstarted nodes exist: " + repr(unstable)) unstable = [] for node in self.Env["nodes"]: if not self.CM.node_stable(node): did_fail = 1 unstable.append(node) if did_fail: return self.failure("Unstable cluster nodes exist: " + repr(unstable)) return self.success() def is_applicable(self): '''SimulStartLite is a setup test and never applicable''' return 0 def TestList(cm, audits): result = [] for testclass in AllTestClasses: bound_test = testclass(cm) if bound_test.is_applicable(): bound_test.Audits = audits result.append(bound_test) return result class RemoteLXC(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "RemoteLXC" self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.num_containers = 2 self.is_container = 1 self.is_docker_unsafe = 1 self.failed = 0 self.fail_string = "" def start_lxc_simple(self, node): # restore any artifacts laying around from a previous test. 
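# The "-s -R" invocation below restores the original CIB and libvirt
# configuration, in case an earlier run left containers or resources behind.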
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null") # generate the containers, put them in the config, add some resources to them pats = [ ] watch = self.create_watch(pats, 120) watch.setwatch() pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc1")) pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc2")) pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc-ms")) pats.append(self.templates["Pat:RscOpOK"] % ("promote", "lxc-ms")) self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -g -a -m -s -c %d &>/dev/null" % self.num_containers) self.set_timer("remoteSimpleInit") watch.lookforall() self.log_timer("remoteSimpleInit") if watch.unmatched: self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched)) self.failed = 1 def cleanup_lxc_simple(self, node): pats = [ ] # if the test failed, attempt to clean up the cib and libvirt environment # as best as possible if self.failed == 1: # restore libvirt and cib self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null") return watch = self.create_watch(pats, 120) watch.setwatch() pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container1")) pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container2")) self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -p &>/dev/null") self.set_timer("remoteSimpleCleanup") watch.lookforall() self.log_timer("remoteSimpleCleanup") if watch.unmatched: self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched)) self.failed = 1 # cleanup libvirt self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null") def __call__(self, node): '''Perform the 'RemoteLXC' test. ''' self.incr("calls") ret = self.startall(None) if not ret: return self.failure("Setup failed, start all nodes failed.") rc = self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v &>/dev/null") if rc == 1: self.log("Environment test for lxc support failed.") return self.skipped() self.start_lxc_simple(node) self.cleanup_lxc_simple(node) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() if self.failed == 1: return self.failure(self.fail_string) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [ r"Updating failcount for ping", r"schedulerd.*: Recover (ping|lxc-ms|container)\s*\(.*\)", # The orphaned lxc-ms resource causes an expected transition error # that is a result of the scheduler not having knowledge that the # promotable resource used to be a clone. As a result, it looks like that # resource is running in multiple locations when it shouldn't... But in # this instance we know why this error is occurring and that it is expected. r"Calculated [Tt]ransition .*pe-error", r"Resource lxc-ms .* is active on 2 nodes attempting recovery", r"Unknown operation: fail", r"VirtualDomain.*ERROR: Unable to determine emulator", ] AllTestClasses.append(RemoteLXC) class RemoteDriver(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = self.__class__.__name__ self.is_docker_unsafe = 1 self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.stop = StopTest(cm) self.remote_rsc = "remote-rsc" self.cib_cmd = """cibadmin -C -o %s -X '%s' """ self.reset() def reset(self): self.pcmk_started = 0 self.failed = False self.fail_string = "" self.remote_node_added = 0 self.remote_rsc_added = 0 self.remote_use_reconnect_interval = self.Env.RandomGen.choice([True,False]) def fail(self, msg): """ Mark test as failed. 
""" self.failed = True # Always log the failure. self.logger.log(msg) # Use first failure as test status, as it's likely to be most useful. if not self.fail_string: self.fail_string = msg def get_othernode(self, node): for othernode in self.Env["nodes"]: if othernode == node: # we don't want to try and use the cib that we just shutdown. # find a cluster node that is not our soon to be remote-node. continue else: return othernode def del_rsc(self, node, rsc): othernode = self.get_othernode(node) rc = self.rsh(othernode, "crm_resource -D -r %s -t primitive" % (rsc)) if rc != 0: self.fail("Removal of resource '%s' failed" % rsc) def add_rsc(self, node, rsc_xml): othernode = self.get_othernode(node) rc = self.rsh(othernode, self.cib_cmd % ("resources", rsc_xml)) if rc != 0: self.fail("resource creation failed") def add_primitive_rsc(self, node): rsc_xml = """ """ % { "node": self.remote_rsc } self.add_rsc(node, rsc_xml) if not self.failed: self.remote_rsc_added = 1 def add_connection_rsc(self, node): rsc_xml = """ """ % { "node": self.remote_node, "server": node } if self.remote_use_reconnect_interval: # Set reconnect interval on resource rsc_xml = rsc_xml + """ """ % (self.remote_node) rsc_xml = rsc_xml + """ """ % { "node": self.remote_node } self.add_rsc(node, rsc_xml) if not self.failed: self.remote_node_added = 1 def disable_services(self, node): self.corosync_enabled = self.Env.service_is_enabled(node, "corosync") if self.corosync_enabled: self.Env.disable_service(node, "corosync") self.pacemaker_enabled = self.Env.service_is_enabled(node, "pacemaker") if self.pacemaker_enabled: self.Env.disable_service(node, "pacemaker") def restore_services(self, node): if self.corosync_enabled: self.Env.enable_service(node, "corosync") if self.pacemaker_enabled: self.Env.enable_service(node, "pacemaker") def stop_pcmk_remote(self, node): # disable pcmk remote for i in range(10): rc = self.rsh(node, "service pacemaker_remote stop") if rc != 0: time.sleep(6) else: break def start_pcmk_remote(self, node): for i in range(10): rc = self.rsh(node, "service pacemaker_remote start") if rc != 0: time.sleep(6) else: self.pcmk_started = 1 break def freeze_pcmk_remote(self, node): """ Simulate a Pacemaker Remote daemon failure. """ # We freeze the process. self.rsh(node, "killall -STOP pacemaker-remoted") def resume_pcmk_remote(self, node): # We resume the process. self.rsh(node, "killall -CONT pacemaker-remoted") def start_metal(self, node): # Cluster nodes are reused as remote nodes in remote tests. If cluster # services were enabled at boot, in case the remote node got fenced, the # cluster node would join instead of the expected remote one. Meanwhile # pacemaker_remote would not be able to start. Depending on the chances, # the situations might not be able to be orchestrated gracefully any more. # # Temporarily disable any enabled cluster serivces. 
self.disable_services(node) pcmk_started = 0 # make sure the resource doesn't already exist for some reason self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_rsc)) self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_node)) if not self.stop(node): self.fail("Failed to shutdown cluster node %s" % node) return self.start_pcmk_remote(node) if self.pcmk_started == 0: self.fail("Failed to start pacemaker_remote on node %s" % node) return # Convert node to baremetal now that it has shutdown the cluster stack pats = [ ] watch = self.create_watch(pats, 120) watch.setwatch() pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node)) pats.append(self.templates["Pat:DC_IDLE"]) self.add_connection_rsc(node) self.set_timer("remoteMetalInit") watch.lookforall() self.log_timer("remoteMetalInit") if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) def migrate_connection(self, node): if self.failed: return pats = [ ] pats.append(self.templates["Pat:RscOpOK"] % ("migrate_to", self.remote_node)) pats.append(self.templates["Pat:RscOpOK"] % ("migrate_from", self.remote_node)) pats.append(self.templates["Pat:DC_IDLE"]) watch = self.create_watch(pats, 120) watch.setwatch() (rc, lines) = self.rsh(node, "crm_resource -M -r %s" % (self.remote_node), None) if rc != 0: self.fail("failed to move remote node connection resource") return self.set_timer("remoteMetalMigrate") watch.lookforall() self.log_timer("remoteMetalMigrate") if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) return def fail_rsc(self, node): if self.failed: return watchpats = [ ] watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("stop", self.remote_rsc, self.remote_node)) watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node)) watchpats.append(self.templates["Pat:DC_IDLE"]) watch = self.create_watch(watchpats, 120) watch.setwatch() self.debug("causing dummy rsc to fail.") rc = self.rsh(node, "rm -f /var/run/resource-agents/Dummy*") self.set_timer("remoteRscFail") watch.lookforall() self.log_timer("remoteRscFail") if watch.unmatched: self.fail("Unmatched patterns during rsc fail: %s" % watch.unmatched) def fail_connection(self, node): if self.failed: return watchpats = [ ] watchpats.append(self.templates["Pat:FenceOpOK"] % self.remote_node) watchpats.append(self.templates["Pat:NodeFenced"] % self.remote_node) watch = self.create_watch(watchpats, 120) watch.setwatch() # freeze the pcmk remote daemon. this will result in fencing self.debug("Force stopped active remote node") self.freeze_pcmk_remote(node) self.debug("Waiting for remote node to be fenced.") self.set_timer("remoteMetalFence") watch.lookforall() self.log_timer("remoteMetalFence") if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) return self.debug("Waiting for the remote node to come back up") self.CM.ns.WaitForNodeToComeUp(node, 120); pats = [ ] watch = self.create_watch(pats, 240) watch.setwatch() pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node)) if self.remote_rsc_added == 1: pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node)) # start the remote node again watch it integrate back into cluster. 
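# start_pcmk_remote() retries "service pacemaker_remote start" several times
# with a short sleep in between, so a slow reboot does not immediately fail
# the test.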
self.start_pcmk_remote(node) if self.pcmk_started == 0: self.fail("Failed to start pacemaker_remote on node %s" % node) return self.debug("Waiting for remote node to rejoin cluster after being fenced.") self.set_timer("remoteMetalRestart") watch.lookforall() self.log_timer("remoteMetalRestart") if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) return def add_dummy_rsc(self, node): if self.failed: return # verify we can put a resource on the remote node pats = [ ] watch = self.create_watch(pats, 120) watch.setwatch() pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node)) pats.append(self.templates["Pat:DC_IDLE"]) # Add a resource that must live on remote-node self.add_primitive_rsc(node) # force that rsc to prefer the remote node. (rc, line) = self.CM.rsh(node, "crm_resource -M -r %s -N %s -f" % (self.remote_rsc, self.remote_node), None) if rc != 0: self.fail("Failed to place remote resource on remote node.") return self.set_timer("remoteMetalRsc") watch.lookforall() self.log_timer("remoteMetalRsc") if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) def test_attributes(self, node): if self.failed: return # This verifies permanent attributes can be set on a remote-node. It also # verifies the remote-node can edit its own cib node section remotely. (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -v testval -N %s" % (self.remote_node), None) if rc != 0: self.fail("Failed to set remote-node attribute. rc:%s output:%s" % (rc, line)) return (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -q -N %s" % (self.remote_node), None) if rc != 0: self.fail("Failed to get remote-node attribute") return (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -D -N %s" % (self.remote_node), None) if rc != 0: self.fail("Failed to delete remote-node attribute") return def cleanup_metal(self, node): self.restore_services(node) if self.pcmk_started == 0: return pats = [ ] watch = self.create_watch(pats, 120) watch.setwatch() if self.remote_rsc_added == 1: pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_rsc)) if self.remote_node_added == 1: pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_node)) self.set_timer("remoteMetalCleanup") self.resume_pcmk_remote(node) if self.remote_rsc_added == 1: # Remove dummy resource added for remote node tests self.debug("Cleaning up dummy rsc put on remote node") self.rsh(self.get_othernode(node), "crm_resource -U -r %s" % self.remote_rsc) self.del_rsc(node, self.remote_rsc) if self.remote_node_added == 1: # Remove remote node's connection resource self.debug("Cleaning up remote node connection resource") self.rsh(self.get_othernode(node), "crm_resource -U -r %s" % (self.remote_node)) self.del_rsc(node, self.remote_node) watch.lookforall() self.log_timer("remoteMetalCleanup") if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) self.stop_pcmk_remote(node) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() if self.remote_node_added == 1: # Remove remote node itself self.debug("Cleaning up node entry for remote node") self.rsh(self.get_othernode(node), "crm_node --force --remove %s" % self.remote_node) def setup_env(self, node): self.remote_node = "remote-%s" % (node) # we are assuming if all nodes have a key, that it is # the right key... If any node doesn't have a remote # key, we regenerate it everywhere. 
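# The code below generates the key once in a local temporary file and then
# pushes it to /etc/pacemaker/authkey on every cluster node, so the cluster
# stack and pacemaker-remoted share the same secret. Roughly equivalent to
# this sketch (illustrative only; the real code uses dd and self.rsh.cp()):
#
#     with open(keyfile, "wb") as f:
#         f.write(os.urandom(4096))      # same size as the dd invocation below
#     for n in self.Env["nodes"]:
#         self.rsh.cp(keyfile, "root@%s:/etc/pacemaker/authkey" % n)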
if self.rsh.exists_on_all("/etc/pacemaker/authkey", self.Env["nodes"]): return # create key locally (handle, keyfile) = tempfile.mkstemp(".cts") os.close(handle) devnull = open(os.devnull, 'wb') subprocess.check_call(["dd", "if=/dev/urandom", "of=%s" % keyfile, "bs=4096", "count=1"], stdout=devnull, stderr=devnull) devnull.close() # sync key throughout the cluster for node in self.Env["nodes"]: self.rsh(node, "mkdir -p --mode=0750 /etc/pacemaker") self.rsh.cp(keyfile, "root@%s:/etc/pacemaker/authkey" % node) self.rsh(node, "chgrp haclient /etc/pacemaker /etc/pacemaker/authkey") self.rsh(node, "chmod 0640 /etc/pacemaker/authkey") os.unlink(keyfile) def is_applicable(self): if not self.is_applicable_common(): return False for node in self.Env["nodes"]: rc = self.rsh(node, "which pacemaker-remoted >/dev/null 2>&1") if rc != 0: return False return True def start_new_test(self, node): self.incr("calls") self.reset() ret = self.startall(None) if not ret: return self.failure("setup failed: could not start all nodes") self.setup_env(node) self.start_metal(node) self.add_dummy_rsc(node) return True def __call__(self, node): return self.failure("This base class is not meant to be called directly.") def errorstoignore(self): '''Return list of errors which should be ignored''' return [ r"""is running on remote.*which isn't allowed""", r"""Connection terminated""", r"""Could not send remote""", ] # RemoteDriver is just a base class for other tests, so it is not added to AllTestClasses class RemoteBasic(RemoteDriver): def __call__(self, node): '''Perform the 'RemoteBaremetal' test. ''' if not self.start_new_test(node): return self.failure(self.fail_string) self.test_attributes(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() AllTestClasses.append(RemoteBasic) class RemoteStonithd(RemoteDriver): def __call__(self, node): '''Perform the 'RemoteStonithd' test. ''' if not self.start_new_test(node): return self.failure(self.fail_string) self.fail_connection(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() def is_applicable(self): if not RemoteDriver.is_applicable(self): return False if "DoFencing" in list(self.Env.keys()): return self.Env["DoFencing"] return True def errorstoignore(self): ignore_pats = [ r"Lost connection to Pacemaker Remote node", r"Software caused connection abort", r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor", r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*", r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)", r"error: Result of monitor operation for .* on remote-.*: No executor connection", ] ignore_pats.extend(RemoteDriver.errorstoignore(self)) return ignore_pats AllTestClasses.append(RemoteStonithd) class RemoteMigrate(RemoteDriver): def __call__(self, node): '''Perform the 'RemoteMigrate' test. ''' if not self.start_new_test(node): return self.failure(self.fail_string) self.migrate_connection(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() AllTestClasses.append(RemoteMigrate) class RemoteRscFailure(RemoteDriver): def __call__(self, node): '''Perform the 'RemoteRscFailure' test. 
''' if not self.start_new_test(node): return self.failure(self.fail_string) # This is an important step. We are migrating the connection # before failing the resource. This verifies that the migration # has properly maintained control over the remote-node. self.migrate_connection(node) self.fail_rsc(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() def errorstoignore(self): ignore_pats = [ r"schedulerd.*: Recover remote-rsc\s*\(.*\)", r"Dummy.*: No process state file found", ] ignore_pats.extend(RemoteDriver.errorstoignore(self)) return ignore_pats AllTestClasses.append(RemoteRscFailure) # vim:ts=4:sw=4:et: diff --git a/cts/cts-exec.in b/cts/cts-exec.in index 8cc203fcb9..592d850b4e 100644 --- a/cts/cts-exec.in +++ b/cts/cts-exec.in @@ -1,1219 +1,1219 @@ #!@PYTHON@ """ Regression tests for Pacemaker's pacemaker-execd """ # Pacemaker targets compatibility with Python 2.7 and 3.2+ from __future__ import print_function, unicode_literals, absolute_import, division __copyright__ = "Copyright 2012-2019 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import io import os import stat import sys import subprocess import shlex import shutil import time # Where to find test binaries # Prefer the source tree if available BUILD_DIR = "@abs_top_builddir@" TEST_DIR = sys.path[0] SBIN_DIR = "@sbindir@" # File permissions for executable scripts we create EXECMODE = stat.S_IRUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH # These values must be kept in sync with include/crm/crm.h class CrmExit(object): OK = 0 ERROR = 1 INVALID_PARAM = 2 UNIMPLEMENT_FEATURE = 3 INSUFFICIENT_PRIV = 4 NOT_INSTALLED = 5 NOT_CONFIGURED = 6 NOT_RUNNING = 7 USAGE = 64 DATAERR = 65 NOINPUT = 66 NOUSER = 67 NOHOST = 68 UNAVAILABLE = 69 SOFTWARE = 70 OSERR = 71 OSFILE = 72 CANTCREAT = 73 IOERR = 74 TEMPFAIL = 75 PROTOCOL = 76 NOPERM = 77 CONFIG = 78 FATAL = 100 PANIC = 101 DISCONNECT = 102 SOLO = 103 DIGEST = 104 NOSUCH = 105 QUORUM = 106 UNSAFE = 107 EXISTS = 108 MULTIPLE = 109 OLD = 110 TIMEOUT = 124 MAX = 255 def update_path(): """ Set the PATH environment variable appropriately for the tests """ new_path = os.environ['PATH'] if os.path.exists("%s/cts-exec.in" % TEST_DIR): print("Running tests from the source tree: %s (%s)" % (BUILD_DIR, TEST_DIR)) # For pacemaker-execd, cts-exec-helper, and pacemaker-remoted new_path = "%s/daemons/execd:%s" % (BUILD_DIR, new_path) new_path = "%s/tools:%s" % (BUILD_DIR, new_path) # For crm_resource # For pacemaker-fenced new_path = "%s/daemons/fenced:%s" % (BUILD_DIR, new_path) # For cts-support new_path = "%s/cts:%s" % (BUILD_DIR, new_path) else: print("Running tests from the install tree: @CRM_DAEMON_DIR@ (not %s)" % TEST_DIR) # For cts-exec-helper, cts-support, pacemaker-execd, pacemaker-fenced, # and pacemaker-remoted new_path = "@CRM_DAEMON_DIR@:%s" % (new_path) print('Using PATH="{}"'.format(new_path)) os.environ['PATH'] = new_path def pipe_output(pipes, stdout=True, stderr=False): """ Wrapper to get text output from pipes regardless of Python version """ output = "" pipe_outputs = pipes.communicate() if sys.version_info < (3,): if stdout: output = output + pipe_outputs[0] if stderr: output = output + pipe_outputs[1] else: if stdout: output = output + pipe_outputs[0].decode(sys.stdout.encoding) if stderr: output = output + 
pipe_outputs[1].decode(sys.stderr.encoding) return output def output_from_command(command): """ Run a command, and return its standard output. """ test = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE) test.wait() return pipe_output(test).split("\n") class TestError(Exception): """ Base class for exceptions in this module """ pass class ExitCodeError(TestError): """ Exception raised when command exit status is unexpected """ def __init__(self, exit_code): self.exit_code = exit_code def __str__(self): return repr(self.exit_code) class OutputNotFoundError(TestError): """ Exception raised when command output does not contain wanted string """ def __init__(self, output): self.output = output def __str__(self): return repr(self.output) class OutputFoundError(TestError): """ Exception raised when command output contains unwanted string """ def __init__(self, output): self.output = output def __str__(self): return repr(self.output) class Test(object): """ Executor for a single pacemaker-execd regression test """ def __init__(self, name, description, verbose=0, tls=0): self.name = name self.description = description self.cmds = [] if tls: self.daemon_location = "pacemaker-remoted" else: self.daemon_location = "pacemaker-execd" self.test_tool_location = "cts-exec-helper" self.verbose = verbose self.tls = tls self.result_txt = "" self.cmd_tool_output = "" self.result_exitcode = CrmExit.OK self.execd_process = None self.stonith_process = None self.executed = 0 def __new_cmd(self, cmd, args, exitcode, stdout_match="", no_wait=0, stdout_negative_match="", kill=None): """ Add a command to be executed as part of this test """ if self.verbose and cmd == self.test_tool_location: args = args + " -V " if (cmd == self.test_tool_location) and self.tls: args = args + " -S " self.cmds.append( { "cmd" : cmd, "kill" : kill, "args" : args, "expected_exitcode" : exitcode, "stdout_match" : stdout_match, "stdout_negative_match" : stdout_negative_match, "no_wait" : no_wait, "cmd_output" : "", } ) def start_environment(self): """ Prepare the host for running a test """ ### make sure we are in full control here ### cmd = shlex.split("killall -q -9 pacemaker-fenced lt-pacemaker-fenced pacemaker-execd lt-pacemaker-execd cts-exec-helper lt-cts-exec-helper pacemaker-remoted") test = subprocess.Popen(cmd, stdout=subprocess.PIPE) test.wait() additional_args = "" if self.tls == 0: self.stonith_process = subprocess.Popen(shlex.split("pacemaker-fenced -s")) if self.verbose: additional_args = additional_args + " -V" self.execd_process = subprocess.Popen(shlex.split("%s %s -l /tmp/pacemaker-execd-regression.log" % (self.daemon_location, additional_args))) time.sleep(1) def clean_environment(self): """ Clean up the host after running a test """ if self.execd_process: self.execd_process.terminate() self.execd_process.wait() if self.verbose: print("Daemon output") logfile = io.open('/tmp/pacemaker-execd-regression.log', 'rt', errors='replace') for line in logfile: print(line.strip().encode('utf-8', 'replace')) os.remove('/tmp/pacemaker-execd-regression.log') if self.stonith_process: self.stonith_process.terminate() self.stonith_process.wait() self.execd_process = None self.stonith_process = None def add_sys_cmd(self, cmd, args): """ Add a simple command to be executed as part of this test """ self.__new_cmd(cmd, args, CrmExit.OK, "") def add_cmd_check_stdout(self, args, match, no_match=""): """ Add a command with expected output to be executed as part of this test """ self.__new_cmd(self.test_tool_location, args, 
CrmExit.OK, match, 0, no_match) def add_cmd(self, args): """ Add a cts-exec-helper command to be executed as part of this test """ self.__new_cmd(self.test_tool_location, args, CrmExit.OK, "") def add_cmd_and_kill(self, kill_proc, args): """ Add a cts-exec-helper command and system command to be executed as part of this test """ self.__new_cmd(self.test_tool_location, args, CrmExit.OK, "", kill=kill_proc) def add_expected_fail_cmd(self, args, exitcode=CrmExit.ERROR): """ Add a cts-exec-helper command to be executed as part of this test and expected to fail """ self.__new_cmd(self.test_tool_location, args, exitcode, "") def get_exitcode(self): """ Return the exit status of the last test execution """ return self.result_exitcode def print_result(self, filler): """ Print the result of the last test execution """ print("%s%s" % (filler, self.result_txt)) def run_cmd(self, args): """ Execute a command as part of this test """ cmd = shlex.split(args['args']) cmd.insert(0, args['cmd']) if self.verbose: print("\n\nRunning: "+" ".join(cmd)) test = subprocess.Popen(cmd, stdout=subprocess.PIPE) if args['kill']: if self.verbose: print("Also running: "+args['kill']) ### Typically, the kill argument is used to detect some sort of ### failure. Without yielding for a few seconds here, the process ### launched earlier that is listening for the failure may not have ### time to connect to pacemaker-execd. time.sleep(2) subprocess.Popen(shlex.split(args['kill'])) if args['no_wait'] == 0: test.wait() else: return CrmExit.OK output = pipe_output(test) args['cmd_output'] = output if test.returncode != args['expected_exitcode']: raise ExitCodeError(test.returncode) if args['stdout_match'] != "" and output.count(args['stdout_match']) == 0: raise OutputNotFoundError(output) if args['stdout_negative_match'] != "" and output.count(args['stdout_negative_match']) != 0: raise OutputFoundError(output) def set_error(self, step, cmd): """ Record failure of this test """ msg = "FAILURE - '%s' failed at step %d. Command: %s %s" self.result_txt = msg % (self.name, step, cmd['cmd'], cmd['args']) self.result_exitcode = CrmExit.ERROR def run(self): """ Execute this test. 
""" res = 0 i = 1 if self.tls and self.name.count("stonith") != 0: self.result_txt = "SKIPPED - '%s' - disabled when testing pacemaker_remote" % (self.name) print(self.result_txt) return res self.start_environment() if self.verbose: print("\n--- START TEST - %s" % self.name) self.result_txt = "SUCCESS - '%s'" % (self.name) self.result_exitcode = CrmExit.OK for cmd in self.cmds: try: self.run_cmd(cmd) except ExitCodeError as e: print(cmd['cmd_output']) print("Step %d FAILED - command returned %s, expected %d" % (i, e, cmd['expected_exitcode'])) self.set_error(i, cmd); break except OutputNotFoundError as e: print("Step %d FAILED - '%s' was not found in command output: %s" % (i, cmd['stdout_match'], e)) self.set_error(i, cmd); break except OutputFoundError as e: print("Step %d FAILED - '%s' was found in command output: %s" % (i, cmd['stdout_negative_match'], e)) self.set_error(i, cmd); break if self.verbose: print(cmd['cmd_output'].strip()) print("Step %d SUCCESS" % (i)) i = i + 1 self.clean_environment() print(self.result_txt) if self.verbose: print("--- END TEST - %s\n" % self.name) self.executed = 1 return res class Tests(object): """ Collection of all pacemaker-execd regression tests """ def __init__(self, verbose=0, tls=0): self.tests = [] self.verbose = verbose self.tls = tls self.rsc_classes = output_from_command("crm_resource --list-standards") self.rsc_classes = self.rsc_classes[:-1] # Strip trailing empty line self.installed_files = [] self.action_timeout = " -t 9000 " if self.tls: self.rsc_classes.remove("stonith") if "systemd" in self.rsc_classes: try: # This code doesn't need this import, but pacemaker-cts-dummyd # does, so ensure the dependency is available rather than cause # all systemd tests to fail. import systemd.daemon except ImportError: print("Python systemd bindings not found.") print("The tests for systemd class are not going to be run.") self.rsc_classes.remove("systemd") print("Testing resource classes", repr(self.rsc_classes)) self.common_cmds = { "ocf_reg_line" : "-c register_rsc -r ocf_test_rsc "+self.action_timeout+" -C ocf -P pacemaker -T Dummy", "ocf_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:ocf_test_rsc action:none rc:ok op_status:complete\"", "ocf_unreg_line" : "-c unregister_rsc -r \"ocf_test_rsc\" "+self.action_timeout, "ocf_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:ocf_test_rsc action:none rc:ok op_status:complete\"", "ocf_start_line" : "-c exec -r \"ocf_test_rsc\" -a \"start\" "+self.action_timeout, "ocf_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:start rc:ok op_status:complete\" ", "ocf_stop_line" : "-c exec -r \"ocf_test_rsc\" -a \"stop\" "+self.action_timeout, "ocf_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:stop rc:ok op_status:complete\" ", "ocf_monitor_line" : '-c exec -r ocf_test_rsc -a monitor -i 2s ' + self.action_timeout, "ocf_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, "ocf_cancel_line" : '-c cancel -r ocf_test_rsc -a monitor -i 2s ' + self.action_timeout, "ocf_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:monitor rc:ok op_status:Cancelled\" ", "systemd_reg_line" : "-c register_rsc -r systemd_test_rsc " + self.action_timeout + " -C systemd -T pacemaker-cts-dummyd@3", "systemd_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:systemd_test_rsc action:none rc:ok op_status:complete\"", 
"systemd_unreg_line" : "-c unregister_rsc -r \"systemd_test_rsc\" "+self.action_timeout, "systemd_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:systemd_test_rsc action:none rc:ok op_status:complete\"", "systemd_start_line" : "-c exec -r \"systemd_test_rsc\" -a \"start\" "+self.action_timeout, "systemd_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:start rc:ok op_status:complete\" ", "systemd_stop_line" : "-c exec -r \"systemd_test_rsc\" -a \"stop\" "+self.action_timeout, "systemd_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:stop rc:ok op_status:complete\" ", "systemd_monitor_line" : '-c exec -r systemd_test_rsc -a monitor -i 2s ' + self.action_timeout, "systemd_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:monitor rc:ok op_status:complete\" -t 15000 ", "systemd_cancel_line" : '-c cancel -r systemd_test_rsc -a monitor -i 2s ' + self.action_timeout, "systemd_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:monitor rc:ok op_status:Cancelled\" ", "upstart_reg_line" : "-c register_rsc -r upstart_test_rsc "+self.action_timeout+" -C upstart -T pacemaker-cts-dummyd", "upstart_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:upstart_test_rsc action:none rc:ok op_status:complete\"", "upstart_unreg_line" : "-c unregister_rsc -r \"upstart_test_rsc\" "+self.action_timeout, "upstart_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:upstart_test_rsc action:none rc:ok op_status:complete\"", "upstart_start_line" : "-c exec -r \"upstart_test_rsc\" -a \"start\" "+self.action_timeout, "upstart_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:upstart_test_rsc action:start rc:ok op_status:complete\" ", "upstart_stop_line" : "-c exec -r \"upstart_test_rsc\" -a \"stop\" "+self.action_timeout, "upstart_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:upstart_test_rsc action:stop rc:ok op_status:complete\" ", "upstart_monitor_line" : '-c exec -r upstart_test_rsc -a monitor -i 2s ' + self.action_timeout, "upstart_monitor_event" : '-l "NEW_EVENT event_type:exec_complete rsc_id:upstart_test_rsc action:monitor rc:ok op_status:complete" -t 15000', "upstart_cancel_line" : '-c cancel -r upstart_test_rsc -a monitor -i 2s ' + self.action_timeout, "upstart_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:upstart_test_rsc action:monitor rc:ok op_status:Cancelled\" ", "service_reg_line" : "-c register_rsc -r service_test_rsc "+self.action_timeout+" -C service -T LSBDummy", "service_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:service_test_rsc action:none rc:ok op_status:complete\"", "service_unreg_line" : "-c unregister_rsc -r \"service_test_rsc\" "+self.action_timeout, "service_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:service_test_rsc action:none rc:ok op_status:complete\"", "service_start_line" : "-c exec -r \"service_test_rsc\" -a \"start\" "+self.action_timeout, "service_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:start rc:ok op_status:complete\" ", "service_stop_line" : "-c exec -r \"service_test_rsc\" -a \"stop\" "+self.action_timeout, "service_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:stop rc:ok op_status:complete\" ", "service_monitor_line" : '-c exec -r service_test_rsc -a monitor -i 2s ' + self.action_timeout, "service_monitor_event" : "-l \"NEW_EVENT 
event_type:exec_complete rsc_id:service_test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, "service_cancel_line" : '-c cancel -r service_test_rsc -a monitor -i 2s ' + self.action_timeout, "service_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:monitor rc:ok op_status:Cancelled\" ", "lsb_reg_line" : "-c register_rsc -r lsb_test_rsc "+self.action_timeout+" -C lsb -T LSBDummy", "lsb_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:lsb_test_rsc action:none rc:ok op_status:complete\" ", "lsb_unreg_line" : "-c unregister_rsc -r \"lsb_test_rsc\" "+self.action_timeout, "lsb_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:lsb_test_rsc action:none rc:ok op_status:complete\"", "lsb_start_line" : "-c exec -r \"lsb_test_rsc\" -a \"start\" "+self.action_timeout, "lsb_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:start rc:ok op_status:complete\" ", "lsb_stop_line" : "-c exec -r \"lsb_test_rsc\" -a \"stop\" "+self.action_timeout, "lsb_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:stop rc:ok op_status:complete\" ", "lsb_monitor_line" : '-c exec -r lsb_test_rsc -a status -i 2s ' + self.action_timeout, "lsb_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:status rc:ok op_status:complete\" "+self.action_timeout, "lsb_cancel_line" : '-c cancel -r lsb_test_rsc -a status -i 2s ' + self.action_timeout, "lsb_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:status rc:ok op_status:Cancelled\" ", "stonith_reg_line" : "-c register_rsc -r stonith_test_rsc " + self.action_timeout + " -C stonith -P pacemaker -T fence_dummy", "stonith_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:stonith_test_rsc action:none rc:ok op_status:complete\" ", "stonith_unreg_line" : "-c unregister_rsc -r \"stonith_test_rsc\" "+self.action_timeout, "stonith_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:stonith_test_rsc action:none rc:ok op_status:complete\"", "stonith_start_line" : '-c exec -r stonith_test_rsc -a start ' + self.action_timeout, "stonith_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:start rc:ok op_status:complete\" ", "stonith_stop_line" : "-c exec -r \"stonith_test_rsc\" -a \"stop\" "+self.action_timeout, "stonith_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:stop rc:ok op_status:complete\" ", "stonith_monitor_line" : '-c exec -r stonith_test_rsc -a monitor -i 2s ' + self.action_timeout, "stonith_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, "stonith_cancel_line" : '-c cancel -r stonith_test_rsc -a monitor -i 2s ' + self.action_timeout, "stonith_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:ok op_status:Cancelled\" ", } def new_test(self, name, description): """ Create a named test """ test = Test(name, description, self.verbose, self.tls) self.tests.append(test) return test def setup_test_environment(self): """ Prepare the host before executing any tests """ os.system("service pacemaker_remote stop") self.cleanup_test_environment() if self.tls and not os.path.isfile("/etc/pacemaker/authkey"): print("Installing /etc/pacemaker/authkey ...") os.system("mkdir -p /etc/pacemaker") os.system("dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1") 
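# Everything created from here on is appended to self.installed_files so that
# cleanup_test_environment() can remove it again after the run.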
self.installed_files.append("/etc/pacemaker/authkey") # If we're in build directory, install agents if not already installed if os.path.exists("%s/cts/cts-exec.in" % BUILD_DIR): if not os.path.exists("@OCF_RA_DIR@/pacemaker"): # @TODO remember which components were created and remove them os.makedirs("@OCF_RA_DIR@/pacemaker", 0o755) for agent in ["Dummy", "Stateful", "ping"]: agent_source = "%s/extra/resources/%s" % (BUILD_DIR, agent) agent_dest = "@OCF_RA_DIR@/pacemaker/%s" % (agent) if not os.path.exists(agent_dest): print("Installing %s ..." % (agent_dest)) shutil.copyfile(agent_source, agent_dest) os.chmod(agent_dest, EXECMODE) self.installed_files.append(agent_dest) subprocess.call(["cts-support", "install"]) def cleanup_test_environment(self): """ Clean up the host after executing desired tests """ for installed_file in self.installed_files: print("Removing %s ..." % (installed_file)) os.remove(installed_file) subprocess.call(["cts-support", "uninstall"]) def build_generic_tests(self): """ Register tests that apply to all resource classes """ common_cmds = self.common_cmds ### register/unregister tests ### for rsc in self.rsc_classes: test = self.new_test("generic_registration_%s" % (rsc), "Simple resource registration test for %s standard" % (rsc)) test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)]) test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)]) ### start/stop tests ### for rsc in self.rsc_classes: test = self.new_test("generic_start_stop_%s" % (rsc), "Simple start and stop test for %s standard" % (rsc)) test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)]) test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)]) test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)]) test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)]) ### monitor cancel test ### for rsc in self.rsc_classes: test = self.new_test("generic_monitor_cancel_%s" % (rsc), "Simple monitor cancel test for %s standard" % (rsc)) test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)]) test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)]) test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)]) ### If this fails, that means the monitor may not be getting rescheduled #### test.add_cmd(common_cmds["%s_monitor_event" % (rsc)]) ### If this fails, that means the monitor may not be getting rescheduled #### test.add_cmd(common_cmds["%s_monitor_event" % (rsc)]) test.add_cmd(common_cmds["%s_cancel_line" % (rsc)] + " " + common_cmds["%s_cancel_event" % (rsc)]) ### If this happens the monitor did not actually cancel correctly. ### test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT) ### If this happens the monitor did not actually cancel correctly. 
### test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT) test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)]) test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)]) ### monitor duplicate test ### for rsc in self.rsc_classes: test = self.new_test("generic_monitor_duplicate_%s" % (rsc), "Test creation and canceling of duplicate monitors for %s standard" % (rsc)) test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)]) test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)]) test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)]) ### If this fails, that means the monitor may not be getting rescheduled #### test.add_cmd(common_cmds["%s_monitor_event" % (rsc)]) ### If this fails, that means the monitor may not be getting rescheduled #### test.add_cmd(common_cmds["%s_monitor_event" % (rsc)]) # Add the duplicate monitors test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)]) test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)]) test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)]) test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)]) # verify we still get update events ### If this fails, that means the monitor may not be getting rescheduled #### test.add_cmd(common_cmds["%s_monitor_event" % (rsc)]) # cancel the monitor, if the duplicate merged with the original, we should no longer see monitor updates test.add_cmd(common_cmds["%s_cancel_line" % (rsc)] + " " + common_cmds["%s_cancel_event" % (rsc)]) ### If this happens the monitor did not actually cancel correctly. ### test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT) ### If this happens the monitor did not actually cancel correctly. ### test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT) test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)]) test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)]) ### stop implies cancel test ### for rsc in self.rsc_classes: test = self.new_test("generic_stop_implies_cancel_%s" % (rsc), "Verify stopping a resource implies cancel of recurring ops for %s standard" % (rsc)) test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)]) test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)]) test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)]) ### If this fails, that means the monitor may not be getting rescheduled #### test.add_cmd(common_cmds["%s_monitor_event" % (rsc)]) ### If this fails, that means the monitor may not be getting rescheduled #### test.add_cmd(common_cmds["%s_monitor_event" % (rsc)]) test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)]) ### If this happens the monitor did not actually cancel correctly. ### test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT) ### If this happens the monitor did not actually cancel correctly. 
### test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT) test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)]) def build_multi_rsc_tests(self): """ Register complex tests that involve managing multiple resouces of different types """ common_cmds = self.common_cmds # do not use service and systemd at the same time, it is the same resource. ### register start monitor stop unregister resources of each type at the same time. ### test = self.new_test("multi_rsc_start_stop_all", "Start, monitor, and stop resources of multiple types and classes") for rsc in self.rsc_classes: test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)]) for rsc in self.rsc_classes: test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)]) for rsc in self.rsc_classes: test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)]) for rsc in self.rsc_classes: ### If this fails, that means the monitor is not being rescheduled #### test.add_cmd(common_cmds["%s_monitor_event" % (rsc)]) for rsc in self.rsc_classes: test.add_cmd(common_cmds["%s_cancel_line" % (rsc)] + " " + common_cmds["%s_cancel_event" % (rsc)]) for rsc in self.rsc_classes: test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)]) for rsc in self.rsc_classes: test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)]) def build_negative_tests(self): """ Register tests related to how pacemaker-execd handles failures """ ### ocf start timeout test ### test = self.new_test("ocf_start_timeout", "Force start timeout to occur, verify start failure.") test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" " + self.action_timeout + "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ") # -t must be less than self.action_timeout test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" -k \"op_sleep\" -v \"5\" -t 1000 -w") - test.add_cmd('-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:unknown error op_status:Timed Out" ' + test.add_cmd('-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:error op_status:Timed Out" ' + self.action_timeout) test.add_cmd("-c exec -r test_rsc -a stop " + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\" ") test.add_cmd("-c unregister_rsc -r test_rsc " + self.action_timeout + "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") ### stonith start timeout test ### test = self.new_test("stonith_start_timeout", "Force start timeout to occur, verify start failure.") test.add_cmd('-c register_rsc -r test_rsc ' + '-C stonith -P pacemaker -T fence_dummy ' + self.action_timeout + '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete"') test.add_cmd('-c exec -r test_rsc -a start -k monitor_delay -v 30 ' + '-t 1000 -w') # -t must be less than self.action_timeout test.add_cmd('-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:OCF_TIMEOUT op_status:Timed Out" ' + self.action_timeout) test.add_cmd("-c exec -r test_rsc -a stop " + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\" ") test.add_cmd("-c unregister_rsc -r test_rsc " + self.action_timeout + "-l 
\"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") ### stonith component fail ### common_cmds = self.common_cmds test = self.new_test("stonith_component_fail", "Kill stonith component after pacemaker-execd connects") test.add_cmd(common_cmds["stonith_reg_line"] + " " + common_cmds["stonith_reg_event"]) test.add_cmd(common_cmds["stonith_start_line"] + " " + common_cmds["stonith_start_event"]) test.add_cmd('-c exec -r stonith_test_rsc -a monitor -i 600s ' '-l "NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:ok op_status:complete" ' + self.action_timeout) test.add_cmd_and_kill("killall -9 -q pacemaker-fenced lt-pacemaker-fenced", - '-l "NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:unknown error op_status:error" -t 15000') + '-l "NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:error op_status:error" -t 15000') test.add_cmd(common_cmds["stonith_unreg_line"] + " " + common_cmds["stonith_unreg_event"]) ### monitor fail for ocf resources ### test = self.new_test("monitor_fail_ocf", "Force ocf monitor to fail, verify failure is reported.") test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" " + self.action_timeout + "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ") test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" " + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ") test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" " + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ") test.add_cmd('-c exec -r test_rsc -a monitor -i 1s ' + self.action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"') test.add_cmd('-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"' + self.action_timeout) test.add_cmd('-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"' + self.action_timeout) test.add_cmd_and_kill("rm -f @localstatedir@/run/Dummy-test_rsc.state", '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete" ' + self.action_timeout) test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s ' + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ") test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" " + self.action_timeout, CrmExit.TIMEOUT) test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" " + self.action_timeout, CrmExit.TIMEOUT) test.add_cmd("-c unregister_rsc -r \"test_rsc\" " + self.action_timeout + "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") ### verify notify changes only for monitor operation. 
### test = self.new_test("monitor_changes_only", "Verify when flag is set, only monitor changes are notified.") test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ") test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+" -o " "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ") test.add_cmd('-c exec -r test_rsc -a monitor -i 1s ' + self.action_timeout + ' -o -l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete" ') test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT) test.add_cmd_and_kill('rm -f @localstatedir@/run/Dummy-test_rsc.state', '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete"' + self.action_timeout) test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s' + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ") test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT) test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT) test.add_cmd('-c unregister_rsc -r "test_rsc" ' + self.action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete"') ### monitor fail for systemd resource ### if "systemd" in self.rsc_classes: test = self.new_test("monitor_fail_systemd", "Force systemd monitor to fail, verify failure is reported..") test.add_cmd("-c register_rsc -r \"test_rsc\" -C systemd -T pacemaker-cts-dummyd@3 " + self.action_timeout + "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ") test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ") test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ") test.add_cmd('-c exec -r test_rsc -a monitor -i 1s ' + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" ") test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout) test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout) test.add_cmd_and_kill("killall -9 -q pacemaker-cts-dummyd", '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete"' + self.action_timeout) test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s' + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ") test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT) test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete 
rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT) test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") ### monitor fail for upstart resource ### if "upstart" in self.rsc_classes: test = self.new_test("monitor_fail_upstart", "Force upstart monitor to fail, verify failure is reported..") test.add_cmd("-c register_rsc -r \"test_rsc\" -C upstart -T pacemaker-cts-dummyd "+self.action_timeout+ "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ") test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ") test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ") test.add_cmd('-c exec -r test_rsc -a monitor -i 1s ' + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" ") test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout) test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout) test.add_cmd_and_kill('killall -9 -q dd', '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete"' + self.action_timeout) test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s' + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ") test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT) test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT) test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") ### Cancel non-existent operation on a resource ### test = self.new_test("cancel_non_existent_op", "Attempt to cancel the wrong monitor operation, verify expected failure") test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ") test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ") test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ") test.add_cmd('-c exec -r test_rsc -a monitor -i 1s ' + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" ") test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout) ### interval is wrong, should fail test.add_expected_fail_cmd('-c cancel -r test_rsc -a monitor -i 2s' + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor 
rc:not running op_status:Cancelled\" ") ### action name is wrong, should fail test.add_expected_fail_cmd('-c cancel -r test_rsc -a stop -i 1s' + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ") test.add_cmd("-c unregister_rsc -r \"test_rsc\" " + self.action_timeout + "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") ### Attempt to invoke non-existent rsc id ### test = self.new_test("invoke_non_existent_rsc", "Attempt to perform operations on a non-existent rsc id.") test.add_expected_fail_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+ - "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:unknown error op_status:complete\" ") + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:error op_status:complete\" ") test.add_expected_fail_cmd("-c exec -r test_rsc -a stop "+self.action_timeout+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\" ") test.add_expected_fail_cmd('-c exec -r test_rsc -a monitor -i 6s ' + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" ") test.add_expected_fail_cmd("-c cancel -r test_rsc -a start "+self.action_timeout+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Cancelled\" ") test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") ### Register and start a resource that doesn't exist, systemd ### if "systemd" in self.rsc_classes: test = self.new_test("start_uninstalled_systemd", "Register uninstalled systemd agent, try to start, verify expected failure") test.add_cmd("-c register_rsc -r \"test_rsc\" -C systemd -T this_is_fake1234 "+self.action_timeout+ "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ") test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed\" ") test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") if "upstart" in self.rsc_classes: test = self.new_test("start_uninstalled_upstart", "Register uninstalled upstart agent, try to start, verify expected failure") test.add_cmd("-c register_rsc -r \"test_rsc\" -C upstart -T this_is_fake1234 "+self.action_timeout+ "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ") test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed\" ") test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") ### Register and start a resource that doesn't exist, ocf ### test = self.new_test("start_uninstalled_ocf", "Register uninstalled ocf agent, try to start, verify expected failure.") test.add_cmd("-c register_rsc -r \"test_rsc\" -C ocf -P pacemaker -T this_is_fake1234 "+self.action_timeout+ "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ") test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" 
"+self.action_timeout+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed\" ") test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") ### Register ocf with non-existent provider ### test = self.new_test("start_ocf_bad_provider", "Register ocf agent with a non-existent provider, verify expected failure.") test.add_cmd("-c register_rsc -r \"test_rsc\" -C ocf -P pancakes -T Dummy "+self.action_timeout+ "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ") test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed\" ") test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") ### Register ocf with empty provider field ### test = self.new_test("start_ocf_no_provider", "Register ocf agent with a no provider, verify expected failure.") test.add_expected_fail_cmd("-c register_rsc -r \"test_rsc\" -C ocf -T Dummy "+self.action_timeout+ "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ") test.add_expected_fail_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Error\" ") test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") def build_stress_tests(self): """ Register stress tests """ timeout = "-t 20000" iterations = 25 test = self.new_test("ocf_stress", "Verify OCF agent handling works under load") for i in range(iterations): test.add_cmd("-c register_rsc -r rsc_%s %s -C ocf -P heartbeat -T Dummy -l \"NEW_EVENT event_type:register rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i)) test.add_cmd("-c exec -r rsc_%s -a start %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:start rc:ok op_status:complete\"" % (i, timeout, i)) test.add_cmd('-c exec -r rsc_%s -a monitor %s -i 1s ' '-l "NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:monitor rc:ok op_status:complete"' % (i, timeout, i)) for i in range(iterations): test.add_cmd("-c exec -r rsc_%s -a stop %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:stop rc:ok op_status:complete\"" % (i, timeout, i)) test.add_cmd("-c unregister_rsc -r rsc_%s %s -l \"NEW_EVENT event_type:unregister rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i)) if "systemd" in self.rsc_classes: test = self.new_test("systemd_stress", "Verify systemd dbus connection works under load") for i in range(iterations): test.add_cmd("-c register_rsc -r rsc_%s %s -C systemd -T pacemaker-cts-dummyd@3 -l \"NEW_EVENT event_type:register rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i)) test.add_cmd("-c exec -r rsc_%s -a start %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:start rc:ok op_status:complete\"" % (i, timeout, i)) test.add_cmd('-c exec -r rsc_%s -a monitor %s -i 1s ' '-l "NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:monitor rc:ok op_status:complete"' % (i, timeout, i)) for i in range(iterations): test.add_cmd("-c exec -r rsc_%s -a stop %s -l \"NEW_EVENT 
event_type:exec_complete rsc_id:rsc_%s action:stop rc:ok op_status:complete\"" % (i, timeout, i)) test.add_cmd("-c unregister_rsc -r rsc_%s %s -l \"NEW_EVENT event_type:unregister rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i)) iterations = 9 timeout = "-t 30000" ### Verify recurring op in-flight collision is handled in series properly test = self.new_test("rsc_inflight_collision", "Verify recurring ops do not collide with other operations for the same rsc.") test.add_cmd("-c register_rsc -r test_rsc -P pacemaker -C ocf -T Dummy " "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" "+self.action_timeout) test.add_cmd("-c exec -r test_rsc -a start %s -k op_sleep -v 1 -l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\"" % (timeout)) for i in range(iterations): test.add_cmd('-c exec -r test_rsc -a monitor %s -i 100%dms ' '-k op_sleep -v 2 ' '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"' % (timeout, i)) test.add_cmd("-c exec -r test_rsc -a stop %s -l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\"" % (timeout)) test.add_cmd("-c unregister_rsc -r test_rsc %s -l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\"" % (timeout)) def build_custom_tests(self): """ Register tests that target specific cases """ ### verify resource temporary folder is created and used by OCF agents. ### test = self.new_test("rsc_tmp_dir", "Verify creation and use of rsc temporary state directory") test.add_sys_cmd("ls", "-al @CRM_RSCTMP_DIR@") test.add_cmd("-c register_rsc -r test_rsc -P heartbeat -C ocf -T Dummy " "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" "+self.action_timeout) test.add_cmd("-c exec -r test_rsc -a start -t 4000") test.add_sys_cmd("ls", "-al @CRM_RSCTMP_DIR@") test.add_sys_cmd("ls", "@CRM_RSCTMP_DIR@/Dummy-test_rsc.state") test.add_cmd("-c exec -r test_rsc -a stop -t 4000") test.add_cmd("-c unregister_rsc -r test_rsc "+self.action_timeout+ "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") ### start delay then stop test ### test = self.new_test("start_delay", "Verify start delay works as expected.") test.add_cmd("-c register_rsc -r test_rsc -P pacemaker -C ocf -T Dummy " "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" "+self.action_timeout) test.add_cmd("-c exec -r test_rsc -s 6000 -a start -w -t 6000") test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" -t 2000", CrmExit.TIMEOUT) test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" -t 6000") test.add_cmd("-c exec -r test_rsc -a stop " + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\" ") test.add_cmd("-c unregister_rsc -r test_rsc " + self.action_timeout + "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") ### start delay, but cancel before it gets a chance to start. 
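# [Editor's sketch] Timing reasoning behind the start_delay test above, using
# the same numbers as its add_cmd() calls (illustration only, not harness
# code): the start is submitted with a 6000 ms start_delay, so a 2000 ms watch
# for the completion event is expected to time out (hence CrmExit.TIMEOUT in
# add_expected_fail_cmd), while the following 6000 ms watch is expected to see
# the start complete.
start_delay_ms = 6000
first_watch_ms = 2000
second_watch_ms = 6000
assert first_watch_ms < start_delay_ms                      # too short: completion cannot arrive yet
assert first_watch_ms + second_watch_ms >= start_delay_ms   # combined windows cover the delay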
### test = self.new_test("start_delay_cancel", "Using start_delay, start a rsc, but cancel the start op before execution.") test.add_cmd("-c register_rsc -r test_rsc -P pacemaker -C ocf -T Dummy " "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" "+self.action_timeout) test.add_cmd("-c exec -r test_rsc -s 5000 -a start -w -t 4000") test.add_cmd("-c cancel -r test_rsc -a start " + self.action_timeout + "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Cancelled\" ") test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" -t 5000", CrmExit.TIMEOUT) test.add_cmd("-c unregister_rsc -r test_rsc " + self.action_timeout + "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") ### Register a bunch of resources, verify we can get info on them ### test = self.new_test("verify_get_rsc_info", "Register multiple resources, verify retrieval of rsc info.") if "systemd" in self.rsc_classes: test.add_cmd("-c register_rsc -r rsc1 -C systemd -T pacemaker-cts-dummyd@3 "+self.action_timeout) test.add_cmd("-c get_rsc_info -r rsc1 ") test.add_cmd("-c unregister_rsc -r rsc1 "+self.action_timeout) test.add_expected_fail_cmd("-c get_rsc_info -r rsc1 ") if "upstart" in self.rsc_classes: test.add_cmd("-c register_rsc -r rsc1 -C upstart -T pacemaker-cts-dummyd "+self.action_timeout) test.add_cmd("-c get_rsc_info -r rsc1 ") test.add_cmd("-c unregister_rsc -r rsc1 "+self.action_timeout) test.add_expected_fail_cmd("-c get_rsc_info -r rsc1 ") test.add_cmd("-c register_rsc -r rsc2 -C ocf -T Dummy -P pacemaker "+self.action_timeout) test.add_cmd("-c get_rsc_info -r rsc2 ") test.add_cmd("-c unregister_rsc -r rsc2 "+self.action_timeout) test.add_expected_fail_cmd("-c get_rsc_info -r rsc2 ") ### Register duplicate, verify only one entry exists and can still be removed. test = self.new_test("duplicate_registration", "Register resource multiple times, verify only one entry exists and can be removed.") test.add_cmd("-c register_rsc -r rsc2 -C ocf -T Dummy -P pacemaker "+self.action_timeout) test.add_cmd_check_stdout("-c get_rsc_info -r rsc2 ", "id:rsc2 class:ocf provider:pacemaker type:Dummy") test.add_cmd("-c register_rsc -r rsc2 -C ocf -T Dummy -P pacemaker "+self.action_timeout) test.add_cmd_check_stdout("-c get_rsc_info -r rsc2 ", "id:rsc2 class:ocf provider:pacemaker type:Dummy") test.add_cmd("-c register_rsc -r rsc2 -C ocf -T Stateful -P pacemaker "+self.action_timeout) test.add_cmd_check_stdout("-c get_rsc_info -r rsc2 ", "id:rsc2 class:ocf provider:pacemaker type:Stateful") test.add_cmd("-c unregister_rsc -r rsc2 "+self.action_timeout) test.add_expected_fail_cmd("-c get_rsc_info -r rsc2 ") ### verify the option to only send notification to the original client. 
### test = self.new_test("notify_orig_client_only", "Verify option to only send notifications to the client originating the action.") test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ") test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ") test.add_cmd('-c exec -r \"test_rsc\" -a \"monitor\" -i 1s ' + self.action_timeout + ' -n ' '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"') # this will fail because the monitor notifications should only go to the original caller, which no longer exists. test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT) test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s -t 6000 ') test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") ### get metadata ### test = self.new_test("get_ocf_metadata", "Retrieve metadata for a resource") test.add_cmd_check_stdout("-c metadata -C \"ocf\" -P \"pacemaker\" -T \"Dummy\"", "resource-agent name=\"Dummy\"") test.add_cmd("-c metadata -C \"ocf\" -P \"pacemaker\" -T \"Stateful\"") test.add_expected_fail_cmd("-c metadata -P \"pacemaker\" -T \"Stateful\"") test.add_expected_fail_cmd("-c metadata -C \"ocf\" -P \"pacemaker\" -T \"fake_agent\"") ### get metadata ### test = self.new_test("get_lsb_metadata", "Retrieve metadata for a resource") test.add_cmd_check_stdout("-c metadata -C \"lsb\" -T \"LSBDummy\"", "resource-agent name='LSBDummy'") ### get stonith metadata ### test = self.new_test("get_stonith_metadata", "Retrieve stonith metadata for a resource") test.add_cmd_check_stdout("-c metadata -C \"stonith\" -P \"pacemaker\" -T \"fence_dummy\"", "resource-agent name=\"fence_dummy\"") ### get metadata ### if "systemd" in self.rsc_classes: test = self.new_test("get_systemd_metadata", "Retrieve metadata for a resource") test.add_cmd_check_stdout("-c metadata -C \"systemd\" -T \"pacemaker-cts-dummyd@\"", "resource-agent name=\"pacemaker-cts-dummyd@\"") ### get metadata ### if "upstart" in self.rsc_classes: test = self.new_test("get_upstart_metadata", "Retrieve metadata for a resource") test.add_cmd_check_stdout("-c metadata -C \"upstart\" -T \"pacemaker-cts-dummyd\"", "resource-agent name=\"pacemaker-cts-dummyd\"") ### get ocf providers ### test = self.new_test("list_ocf_providers", "Retrieve list of available resource providers, verifies pacemaker is a provider.") test.add_cmd_check_stdout("-c list_ocf_providers ", "pacemaker") test.add_cmd_check_stdout("-c list_ocf_providers -T ping", "pacemaker") ### Verify agents only exist in their lists ### test = self.new_test("verify_agent_lists", "Verify the agent lists contain the right data.") test.add_cmd_check_stdout("-c list_agents ", "Stateful") ### ocf ### test.add_cmd_check_stdout("-c list_agents -C ocf", "Stateful") test.add_cmd_check_stdout("-c list_agents -C lsb", "", "Stateful") ### should not exist test.add_cmd_check_stdout("-c list_agents -C service", "", "Stateful") ### should not exist test.add_cmd_check_stdout("-c list_agents ", "LSBDummy") ### init.d ### test.add_cmd_check_stdout("-c list_agents -C lsb", "LSBDummy") 
test.add_cmd_check_stdout("-c list_agents -C service", "LSBDummy") test.add_cmd_check_stdout("-c list_agents -C ocf", "", "pacemaker-cts-dummyd@") ### should not exist test.add_cmd_check_stdout("-c list_agents -C ocf", "", "pacemaker-cts-dummyd@") ### should not exist test.add_cmd_check_stdout("-c list_agents -C lsb", "", "fence_dummy") ### should not exist test.add_cmd_check_stdout("-c list_agents -C service", "", "fence_dummy") ### should not exist test.add_cmd_check_stdout("-c list_agents -C ocf", "", "fence_dummy") ### should not exist if "systemd" in self.rsc_classes: test.add_cmd_check_stdout("-c list_agents ", "pacemaker-cts-dummyd@") ### systemd ### test.add_cmd_check_stdout("-c list_agents -C service", "LSBDummy") test.add_cmd_check_stdout("-c list_agents -C systemd", "", "Stateful") ### should not exist test.add_cmd_check_stdout("-c list_agents -C systemd", "pacemaker-cts-dummyd@") test.add_cmd_check_stdout("-c list_agents -C systemd", "", "fence_dummy") ### should not exist if "upstart" in self.rsc_classes: test.add_cmd_check_stdout("-c list_agents ", "pacemaker-cts-dummyd") ### upstart ### test.add_cmd_check_stdout("-c list_agents -C service", "LSBDummy") test.add_cmd_check_stdout("-c list_agents -C upstart", "", "Stateful") ### should not exist test.add_cmd_check_stdout("-c list_agents -C upstart", "pacemaker-cts-dummyd") test.add_cmd_check_stdout("-c list_agents -C upstart", "", "fence_dummy") ### should not exist if "stonith" in self.rsc_classes: test.add_cmd_check_stdout("-c list_agents -C stonith", "fence_dummy") ### stonith ### test.add_cmd_check_stdout("-c list_agents -C stonith", "", "pacemaker-cts-dummyd@") ### should not exist test.add_cmd_check_stdout("-c list_agents -C stonith", "", "Stateful") ### should not exist test.add_cmd_check_stdout("-c list_agents ", "fence_dummy") def print_list(self): """ List all registered tests """ print("\n==== %d TESTS FOUND ====" % (len(self.tests))) print("%35s - %s" % ("TEST NAME", "TEST DESCRIPTION")) print("%35s - %s" % ("--------------------", "--------------------")) for test in self.tests: print("%35s - %s" % (test.name, test.description)) print("==== END OF LIST ====\n") def run_single(self, name): """ Run a single named test """ for test in self.tests: if test.name == name: test.run() break def run_tests_matching(self, pattern): """ Run all tests whose name matches a pattern """ for test in self.tests: if test.name.count(pattern) != 0: test.run() def run_tests(self): """ Run all tests """ for test in self.tests: test.run() def exit(self): """ Exit (with error status code if any test failed) """ for test in self.tests: if test.executed == 0: continue if test.get_exitcode() != CrmExit.OK: sys.exit(CrmExit.ERROR) sys.exit(CrmExit.OK) def print_results(self): """ Print summary of results of executed tests """ failures = 0 success = 0 print("\n\n======= FINAL RESULTS ==========") print("\n--- FAILURE RESULTS:") for test in self.tests: if test.executed == 0: continue if test.get_exitcode() != CrmExit.OK: failures = failures + 1 test.print_result(" ") else: success = success + 1 if failures == 0: print(" None") print("\n--- TOTALS\n Pass:%d\n Fail:%d\n" % (success, failures)) class TestOptions(object): """ Option handler """ def __init__(self): self.options = {} self.options['list-tests'] = 0 self.options['run-all'] = 1 self.options['run-only'] = "" self.options['run-only-pattern'] = "" self.options['verbose'] = 0 self.options['invalid-arg'] = "" self.options['show-usage'] = 0 self.options['pacemaker-remote'] = 0 def 
build_options(self, argv): """ Set options based on command-line arguments """ args = argv[1:] skip = 0 for i in range(0, len(args)): if skip: skip = 0 continue elif args[i] == "-h" or args[i] == "--help": self.options['show-usage'] = 1 elif args[i] == "-l" or args[i] == "--list-tests": self.options['list-tests'] = 1 elif args[i] == "-V" or args[i] == "--verbose": self.options['verbose'] = 1 elif args[i] == "-R" or args[i] == "--pacemaker-remote": self.options['pacemaker-remote'] = 1 elif args[i] == "-r" or args[i] == "--run-only": self.options['run-only'] = args[i+1] skip = 1 elif args[i] == "-p" or args[i] == "--run-only-pattern": self.options['run-only-pattern'] = args[i+1] skip = 1 def show_usage(self): """ Show command usage """ print("usage: " + sys.argv[0] + " [options]") print("If no options are provided, all tests will run") print("Options:") print("\t [--help | -h] Show usage") print("\t [--list-tests | -l] Print out all registered tests.") print("\t [--run-only | -r 'testname'] Run a specific test") print("\t [--verbose | -V] Verbose output") print("\t [--pacemaker-remote | -R Test pacemaker-remoted binary instead of pacemaker-execd") print("\t [--run-only-pattern | -p 'string'] Run only tests containing the string value") print("\n\tExample: Run only the test 'start_stop'") print("\t\t " + sys.argv[0] + " --run-only start_stop") print("\n\tExample: Run only the tests with the string 'systemd' present in them") print("\t\t " + sys.argv[0] + " --run-only-pattern systemd") def main(argv): """ Run pacemaker-execd regression tests as specified by arguments """ update_path() opts = TestOptions() opts.build_options(argv) tests = Tests(opts.options['verbose'], opts.options['pacemaker-remote']) tests.build_generic_tests() tests.build_multi_rsc_tests() tests.build_negative_tests() tests.build_custom_tests() tests.build_stress_tests() tests.setup_test_environment() print("Starting ...") if opts.options['list-tests']: tests.print_list() elif opts.options['show-usage']: opts.show_usage() elif opts.options['run-only-pattern'] != "": tests.run_tests_matching(opts.options['run-only-pattern']) tests.print_results() elif opts.options['run-only'] != "": tests.run_single(opts.options['run-only']) tests.print_results() else: tests.run_tests() tests.print_results() tests.cleanup_test_environment() tests.exit() if __name__ == "__main__": main(sys.argv) diff --git a/cts/patterns.py b/cts/patterns.py index b0b8784a02..96d6471c38 100644 --- a/cts/patterns.py +++ b/cts/patterns.py @@ -1,411 +1,413 @@ """ Pattern-holding classes for Pacemaker's Cluster Test Suite (CTS) """ # Pacemaker targets compatibility with Python 2.7 and 3.2+ from __future__ import print_function, unicode_literals, absolute_import, division __copyright__ = "Copyright 2008-2019 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import sys, os from cts.CTSvars import * patternvariants = {} class BasePatterns(object): def __init__(self, name): self.name = name patternvariants[name] = self self.ignore = [ "avoid confusing Valgrind", # Logging bug in some versions of libvirtd r"libvirtd.*: internal error: Failed to parse PCI config address", ] self.BadNews = [] self.components = {} self.commands = { "StatusCmd" : "crmadmin -t 60000 -S %s 2>/dev/null", "CibQuery" : "cibadmin -Ql", "CibAddXml" : "cibadmin --modify -c --xml-text %s", "CibDelXpath" : "cibadmin --delete --xpath %s", # 300,000 == 5 minutes "RscRunning" : CTSvars.CRM_DAEMON_DIR + "/cts-exec-helper -R -r 
%s", "CIBfile" : "%s:"+CTSvars.CRM_CONFIG_DIR+"/cib.xml", "TmpDir" : "/tmp", "BreakCommCmd" : "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1", "FixCommCmd" : "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1", # tc qdisc add dev lo root handle 1: cbq avpkt 1000 bandwidth 1000mbit # tc class add dev lo parent 1: classid 1:1 cbq rate "$RATE"kbps allot 17000 prio 5 bounded isolated # tc filter add dev lo parent 1: protocol ip prio 16 u32 match ip dst 127.0.0.1 match ip sport $PORT 0xFFFF flowid 1:1 # tc qdisc add dev lo parent 1: netem delay "$LATENCY"msec "$(($LATENCY/4))"msec 10% 2> /dev/null > /dev/null "ReduceCommCmd" : "", "RestoreCommCmd" : "tc qdisc del dev lo root", "MaintenanceModeOn" : "cibadmin --modify -c --xml-text ''", "MaintenanceModeOff" : "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"", "StandbyCmd" : "crm_attribute -Vq -U %s -n standby -l forever -v %s 2>/dev/null", "StandbyQueryCmd" : "crm_attribute -qG -U %s -n standby -l forever -d off 2>/dev/null", } self.search = { "Pat:DC_IDLE" : "pacemaker-controld.*State transition.*-> S_IDLE", # This won't work if we have multiple partitions "Pat:Local_started" : "%s\W.*controller successfully started", "Pat:NonDC_started" : r"%s\W.*State transition.*-> S_NOT_DC", "Pat:DC_started" : r"%s\W.*State transition.*-> S_IDLE", "Pat:We_stopped" : "%s\W.*OVERRIDE THIS PATTERN", "Pat:They_stopped" : "%s\W.*LOST:.* %s ", "Pat:They_dead" : "node %s.*: is dead", "Pat:TransitionComplete" : "Transition status: Complete: complete", "Pat:Fencing_start" : r"Requesting peer fencing .* targeting %s", "Pat:Fencing_ok" : r"pacemaker-fenced.*:\s*Operation .* targeting %s on .* for .*@.*: OK", "Pat:Fencing_recover" : r"pacemaker-schedulerd.*: Recover %s", "Pat:Fencing_active" : r"pacemaker-schedulerd.*: Resource %s is active on .* nodes", "Pat:Fencing_probe" : r"pacemaker-controld.* Result of probe operation for %s on .*: Error", "Pat:RscOpOK" : r"pacemaker-controld.*:\s+Result of %s operation for %s.*: (0 \()?ok", + "Pat:RscOpFail" : r"pacemaker-schedulerd.*:.*Unexpected result .* recorded for %s of %s ", + "Pat:CloneOpFail" : r"pacemaker-schedulerd.*:.*Unexpected result .* recorded for %s of (%s|%s) ", "Pat:RscRemoteOpOK" : r"pacemaker-controld.*:\s+Result of %s operation for %s on %s: (0 \()?ok", "Pat:NodeFenced" : r"pacemaker-controld.*:\s* Peer %s was terminated \(.*\) by .* on behalf of .*: OK", "Pat:FenceOpOK" : "Operation .* for host '%s' with device .* returned: 0", } def get_component(self, key): if key in self.components: return self.components[key] print("Unknown component '%s' for %s" % (key, self.name)) return [] def get_patterns(self, key): if key == "BadNews": return self.BadNews elif key == "BadNewsIgnore": return self.ignore elif key == "Commands": return self.commands elif key == "Search": return self.search elif key == "Components": return self.components def __getitem__(self, key): if key == "Name": return self.name elif key in self.commands: return self.commands[key] elif key in self.search: return self.search[key] else: print("Unknown template '%s' for %s" % (key, self.name)) return None class crm_corosync(BasePatterns): ''' Patterns for Corosync version 2 cluster manager class ''' def __init__(self, name): BasePatterns.__init__(self, name) self.commands.update({ "StartCmd" : "service corosync start && service pacemaker start", "StopCmd" : "service pacemaker stop; [ ! 
-e /usr/sbin/pacemaker-remoted ] || service pacemaker_remote stop; service corosync stop", "EpochCmd" : "crm_node -e", "QuorumCmd" : "crm_node -q", "PartitionCmd" : "crm_node -p", }) self.search.update({ # Close enough ... "Corosync Cluster Engine exiting normally" isn't # printed reliably. "Pat:We_stopped" : "%s\W.*Unloading all Corosync service engines", "Pat:They_stopped" : "%s\W.*pacemaker-controld.*Node %s(\[|\s).*state is now lost", "Pat:They_dead" : "pacemaker-controld.*Node %s(\[|\s).*state is now lost", "Pat:ChildExit" : r"\[[0-9]+\] exited with status [0-9]+ \(", # "with signal 9" == pcmk_child_exit(), "$" == check_active_before_startup_processes() "Pat:ChildKilled" : r"%s\W.*pacemakerd.*%s\[[0-9]+\] terminated( with signal 9|$)", "Pat:ChildRespawn" : "%s\W.*pacemakerd.*Respawning failed child process: %s", "Pat:InfraUp" : "%s\W.*corosync.*Initializing transport", "Pat:PacemakerUp" : "%s\W.*pacemakerd.*Starting Pacemaker", }) self.ignore = self.ignore + [ r"crm_mon:", r"crmadmin:", r"update_trace_data", r"async_notify:.*strange, client not found", r"Parse error: Ignoring unknown option .*nodename", r"error.*: Operation 'reboot' .* with device 'FencingFail' returned:", r"getinfo response error: 1$", r"sbd.* error: inquisitor_child: DEBUG MODE IS ACTIVE", r"sbd.* pcmk:\s*error:.*Connection to cib_ro.* (failed|closed)", ] self.BadNews = [ - r"error:", + r"[^(]error:", r"crit:", r"ERROR:", r"CRIT:", r"Shutting down...NOW", r"Timer I_TERMINATE just popped", r"input=I_ERROR", r"input=I_FAIL", r"input=I_INTEGRATED cause=C_TIMER_POPPED", r"input=I_FINALIZED cause=C_TIMER_POPPED", r"input=I_ERROR", r"(pacemakerd|pacemaker-execd|pacemaker-controld):.*, exiting", r"schedulerd.*Attempting recovery of resource", r"is taking more than 2x its timeout", r"Confirm not received from", r"Welcome reply not received from", r"Attempting to schedule .* after a stop", r"Resource .* was active at shutdown", r"duplicate entries for call_id", r"Search terminated:", r":global_timer_callback", r"Faking parameter digest creation", r"Parameters to .* action changed:", r"Parameters to .* changed", r"pacemakerd.*\[[0-9]+\] terminated( with signal| as IPC server|$)", r"pacemaker-schedulerd.*Recover .*\(.* -\> .*\)", r"rsyslogd.* imuxsock lost .* messages from pid .* due to rate-limiting", r"Peer is not part of our cluster", r"We appear to be in an election loop", r"Unknown node -> we will not deliver message", r"(Blackbox dump requested|Problem detected)", r"pacemakerd.*Could not connect to Cluster Configuration Database API", r"Receiving messages from a node we think is dead", r"share the same cluster nodeid", r"share the same name", #r"crm_ipc_send:.*Request .* failed", #r"crm_ipc_send:.*Sending to .* is disabled until pending reply is received", # Not inherently bad, but worth tracking #r"No need to invoke the TE", #r"ping.*: DEBUG: Updated connected = 0", #r"Digest mis-match:", r"pacemaker-controld:.*Transition failed: terminated", r"Local CIB .* differs from .*:", r"warn.*:\s*Continuing but .* will NOT be used", r"warn.*:\s*Cluster configuration file .* is corrupt", #r"Executing .* fencing operation", r"Election storm", r"stalled the FSA with pending inputs", ] self.components["common-ignore"] = [ r"Pending action:", r"resource( was|s were) active at shutdown", r"pending LRM operations at shutdown", r"Lost connection to the CIB manager", r"pacemaker-controld.*:\s*Action A_RECOVER .* not supported", r"pacemaker-controld.*:\s*Performing A_EXIT_1 - forcefully exiting ", r".*:\s*Executing .* fencing operation 
\(.*\) on ", r".*:\s*Requesting fencing \([^)]+\) of node ", r"(Blackbox dump requested|Problem detected)", # "Resource .*stonith::.* is active on 2 nodes attempting recovery", # "Transition .* ERRORs found during PE processing", ] self.components["corosync-ignore"] = [ r"error:.*Connection to the CPG API failed: Library error", r"\[[0-9]+\] exited with status [0-9]+ \(", r"pacemaker-based.*error:.*Corosync connection lost", r"pacemaker-fenced.*error:.*Corosync connection terminated", r"pacemaker-controld.*State transition .* S_RECOVERY", r"pacemaker-controld.*error:.*Input (I_ERROR|I_TERMINATE ) .*received in state", r"pacemaker-controld.*error:.*Could not recover from internal error", r"error:.*Connection to cib_(shm|rw).* (failed|closed)", r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"crit: Fencing daemon connection failed", # This is overbroad, but we don't have a way to say that only # certain transition errors are acceptable (if the fencer respawns, # fence devices may appear multiply active). We have to rely on # other causes of a transition error logging their own error # message, which is the usual practice. r"pacemaker-schedulerd.* Calculated transition .*/pe-error", ] self.components["corosync"] = [ # We expect each daemon to lose its cluster connection. # However, if the CIB manager loses its connection first, # it's possible for another daemon to lose that connection and # exit before losing the cluster connection. r"pacemakerd.*:\s*(crit|error):.*Lost connection to cluster layer", r"pacemaker-attrd.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", r"pacemaker-based.*:\s*(crit|error):.*Lost connection to cluster layer", r"pacemaker-controld.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", r"pacemaker-fenced.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", r"schedulerd.*Scheduling Node .* for STONITH", r"pacemaker-controld.*:\s*Peer .* was terminated \(.*\) by .* on behalf of .*:\s*OK", ] self.components["pacemaker-based"] = [ r"pacemakerd.* pacemaker-attrd\[[0-9]+\] exited with status 102", r"pacemakerd.* pacemaker-controld\[[0-9]+\] exited with status 1", r"pacemakerd.* Respawning failed child process: pacemaker-attrd", r"pacemakerd.* Respawning failed child process: pacemaker-based", r"pacemakerd.* Respawning failed child process: pacemaker-controld", r"pacemakerd.* Respawning failed child process: pacemaker-fenced", r"pacemaker-.* Connection to cib_.* (failed|closed)", r"pacemaker-attrd.*:.*Lost connection to the CIB manager", r"pacemaker-controld.*:.*Lost connection to the CIB manager", r"pacemaker-controld.*I_ERROR.*crmd_cib_connection_destroy", r"pacemaker-controld.* State transition .* S_RECOVERY", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*Could not recover from internal error", ] self.components["pacemaker-based-ignore"] = [ r"pacemaker-execd.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", # This is overbroad, but we don't have a way to say that only # certain transition errors are acceptable (if the fencer respawns, # fence devices may appear multiply active). We have to rely on # other causes of a transition error logging their own error # message, which is the usual practice. 
r"pacemaker-schedulerd.* Calculated transition .*/pe-error", ] self.components["pacemaker-execd"] = [ r"pacemaker-controld.*Connection to (pacemaker-execd|lrmd|executor) (failed|closed)", r"pacemaker-controld.*I_ERROR.*lrm_connection_destroy", r"pacemaker-controld.*State transition .* S_RECOVERY", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*Could not recover from internal error", r"pacemakerd.*pacemaker-controld\[[0-9]+\] exited with status 1", r"pacemakerd.*Respawning failed child process: pacemaker-execd", r"pacemakerd.*Respawning failed child process: pacemaker-controld", ] self.components["pacemaker-execd-ignore"] = [ r"pacemaker-attrd.*Connection to lrmd (failed|closed)", r"pacemaker-(attrd|controld).*Could not execute alert", ] self.components["pacemaker-controld"] = [ # "WARN: determine_online_status: Node .* is unclean", # "Scheduling Node .* for STONITH", # "Executing .* fencing operation", # Only if the node wasn't the DC: "State transition S_IDLE", "State transition .* -> S_IDLE", ] self.components["pacemaker-controld-ignore"] = [] self.components["pacemaker-attrd"] = [] self.components["pacemaker-attrd-ignore"] = [] self.components["pacemaker-schedulerd"] = [ "State transition .* S_RECOVERY", r"Respawning failed child process: pacemaker-controld", r"pacemaker-controld\[[0-9]+\] exited with status 1 \(", "Connection to pengine failed", "Connection to pengine.* closed", r"Connection to the scheduler failed", "pacemaker-controld.*I_ERROR.*save_cib_contents", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", "pacemaker-controld.*Could not recover from internal error", ] self.components["pacemaker-schedulerd-ignore"] = [] self.components["pacemaker-fenced"] = [ r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"Fencing daemon connection failed", r"pacemaker-controld.*Fencer successfully connected", ] self.components["pacemaker-fenced-ignore"] = [ r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"crit:.*Fencing daemon connection failed", r"error:.*Fencer connection failed \(will retry\)", r"Connection to (fencer|stonith-ng) failed, finalizing .* pending operations", r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error", # This is overbroad, but we don't have a way to say that only # certain transition errors are acceptable (if the fencer respawns, # fence devices may appear multiply active). We have to rely on # other causes of a transition error logging their own error # message, which is the usual practice. 
r"pacemaker-schedulerd.* Calculated transition .*/pe-error", ] self.components["pacemaker-fenced-ignore"].extend(self.components["common-ignore"]) class crm_corosync_docker(crm_corosync): ''' Patterns for Corosync version 2 cluster manager class ''' def __init__(self, name): crm_corosync.__init__(self, name) self.commands.update({ "StartCmd" : "pcmk_start", "StopCmd" : "pcmk_stop", }) class PatternSelector(object): def __init__(self, name=None): self.name = name self.base = BasePatterns("crm-base") if not name: crm_corosync("crm-corosync") elif name == "crm-corosync": crm_corosync(name) elif name == "crm-corosync-docker": crm_corosync_docker(name) def get_variant(self, variant): if variant in patternvariants: return patternvariants[variant] print("defaulting to crm-base for %s" % variant) return self.base def get_patterns(self, variant, kind): return self.get_variant(variant).get_patterns(kind) def get_template(self, variant, key): v = self.get_variant(variant) return v[key] def get_component(self, variant, kind): return self.get_variant(variant).get_component(kind) def __getitem__(self, key): return self.get_template(self.name, key) # python cts/CTSpatt.py -k crm-corosync -t StartCmd if __name__ == '__main__': pdir=os.path.dirname(sys.path[0]) sys.path.insert(0, pdir) # So that things work from the source directory kind=None template=None skipthis=None args=sys.argv[1:] for i in range(0, len(args)): if skipthis: skipthis=None continue elif args[i] == "-k" or args[i] == "--kind": skipthis=1 kind = args[i+1] elif args[i] == "-t" or args[i] == "--template": skipthis=1 template = args[i+1] else: print("Illegal argument " + args[i]) print(PatternSelector(kind)[template]) diff --git a/doc/Clusters_from_Scratch/en-US/Author_Group.xml b/doc/Clusters_from_Scratch/en-US/Author_Group.xml index 898da71261..ff907feee5 100644 --- a/doc/Clusters_from_Scratch/en-US/Author_Group.xml +++ b/doc/Clusters_from_Scratch/en-US/Author_Group.xml @@ -1,11 +1,10 @@ - AndrewBeekhof - Red Hat - Primary author - andrew@beekhof.net + + Written by the Pacemaker project contributors + diff --git a/doc/Clusters_from_Scratch/en-US/Book_Info.xml b/doc/Clusters_from_Scratch/en-US/Book_Info.xml index 68be4424c1..9fcbdf0a10 100644 --- a/doc/Clusters_from_Scratch/en-US/Book_Info.xml +++ b/doc/Clusters_from_Scratch/en-US/Book_Info.xml @@ -1,69 +1,71 @@ %BOOK_ENTITIES; ]> - Clusters from Scratch - Step-by-Step Instructions for Building Your First High-Availability Cluster - Pacemaker - 2.0 - - 10 - 2 - - - This document provides a step-by-step guide to building a simple high-availability cluster using Pacemaker. - - - The example cluster will use: - - - - &DISTRO; &DISTRO_VERSION; as the host operating system - - - - - Corosync to provide messaging and membership services, - - - - - Pacemaker 1.1.18 - While this guide is part of the document set for - Pacemaker 2.0, it demonstrates the version available in - the standard &DISTRO; repositories. - - - - - DRBD as a cost-effective alternative to shared storage, - - - - - GFS2 as the cluster filesystem (in active/active mode) - - - - - - Given the graphical nature of the install process, a number of screenshots are included. However the guide is primarily composed of commands, the reasons for executing them and their expected outputs. 
- - - - - - - - - - - + Clusters from Scratch + Step-by-Step Instructions for Building Your First High-Availability Cluster + Pacemaker + 2.0 + + 11 + 0 + + + This document provides a step-by-step guide to building a simple high-availability cluster using Pacemaker. + + + The example cluster will use: + + + + &DISTRO; &DISTRO_VERSION; as the host operating system + + + + + Corosync to provide messaging and membership services, + + + + + Pacemaker 1.1.18 + While this guide is part of the document set for + Pacemaker 2.0, it demonstrates the version available in + the standard &DISTRO; repositories. + + + + + DRBD as a cost-effective alternative to shared storage, + + + + + GFS2 as the cluster filesystem (in active/active mode) + + + + + + Given the graphical nature of the install process, a number of + screenshots are included. However the guide is primarily composed of + commands, the reasons for executing them and their expected outputs. + + + + + + + + + + + diff --git a/doc/Clusters_from_Scratch/en-US/Revision_History.xml b/doc/Clusters_from_Scratch/en-US/Revision_History.xml index 05ef1be65a..77a3b2cdde 100644 --- a/doc/Clusters_from_Scratch/en-US/Revision_History.xml +++ b/doc/Clusters_from_Scratch/en-US/Revision_History.xml @@ -1,93 +1,205 @@ %BOOK_ENTITIES; ]> - - Revision History - - - - 1-0 - Mon May 17 2010 - AndrewBeekhofandrew@beekhof.net - Import from Pages.app - - - 2-0 - Wed Sep 22 2010 - RaoulScarazzinirasca@miamammausalinux.org - Italian translation - - - 3-0 - Wed Feb 9 2011 - AndrewBeekhofandrew@beekhof.net - Updated for Fedora 13 - - - 4-0 - Wed Oct 5 2011 - AndrewBeekhofandrew@beekhof.net - Update the GFS2 section to use CMAN - - - 5-0 - Fri Feb 10 2012 - AndrewBeekhofandrew@beekhof.net - Generate docbook content from asciidoc sources - - - 6-0 - Tues July 3 2012 - AndrewBeekhofandrew@beekhof.net - Updated for Fedora 17 - - - 7-0 - Fri Sept 14 2012 - DavidVosseldavidvossel@gmail.com - Updated for pcs - - - 8-0 - Mon Jan 05 2015 - KenGaillotkgaillot@redhat.com - Updated for Fedora 21 - - - 8-1 - Thu Jan 08 2015 - KenGaillotkgaillot@redhat.com - Minor corrections, plus use include file for intro - - - 9-0 - Fri Aug 14 2015 - KenGaillotkgaillot@redhat.com - Update for CentOS 7.1 and leaving firewalld/SELinux enabled - - - 10-0 - Fri Jan 12 2018 - KenGaillotkgaillot@redhat.com - Update banner for Pacemaker 2.0 and content for CentOS 7.4 with Pacemaker 1.1.16 - - - 10-1 - Wed Sep 5 2018 - KenGaillotkgaillot@redhat.com - Update for CentOS 7.5 with Pacemaker 1.1.18 - - - 10-2 - Fri Dec 7 2018 - KenGaillotkgaillot@redhat.com - JanPokornýjpokorny@redhat.com - ChrisLumensclumens@redhat.com - Minor clarifications and formatting changes - - - + + Revision History + + + + 1-0 + Mon May 17 2010 + + AndrewBeekhof + andrew@beekhof.net + + + + Import from Pages.app + + + + + 2-0 + Wed Sep 22 2010 + + RaoulScarazzini + rasca@miamammausalinux.org + + + + Italian translation + + + + + 3-0 + Wed Feb 9 2011 + + Andrew + Beekhof + andrew@beekhof.net + + + + Updated for Fedora 13 + + + + + 4-0 + Wed Oct 5 2011 + + Andrew + Beekhof + andrew@beekhof.net + + + + Update the GFS2 section to use CMAN + + + + + 5-0 + Fri Feb 10 2012 + + Andrew + Beekhof + andrew@beekhof.net + + + + Generate docbook content from asciidoc sources + + + + + 6-0 + Tues July 3 2012 + + AndrewBeekhof + andrew@beekhof.net + + + + Updated for Fedora 17 + + + + + 7-0 + Fri Sept 14 2012 + + DavidVossel + davidvossel@gmail.com + + + + Updated for pcs + + + + + 8-0 + Mon Jan 05 2015 + + KenGaillot + kgaillot@redhat.com + + + + 
Updated for Fedora 21 + + + + 8-1 + Thu Jan 08 2015 + + KenGaillot + kgaillot@redhat.com + + + + Minor corrections, plus use include file for intro + + + + + 9-0 + Fri Aug 14 2015 + + KenGaillot + kgaillot@redhat.com + + + + Update for CentOS 7.1 and leaving firewalld/SELinux enabled + + + + + 10-0 + Fri Jan 12 2018 + + KenGaillot + kgaillot@redhat.com + + + + Update banner for Pacemaker 2.0 and content for CentOS 7.4 with Pacemaker 1.1.16 + + + + + 10-1 + Wed Sep 5 2018 + + KenGaillot + kgaillot@redhat.com + + + + Update for CentOS 7.5 with Pacemaker 1.1.18 + + + + + 10-2 + Fri Dec 7 2018 + + KenGaillot + kgaillot@redhat.com + + + JanPokorný + jpokorny@redhat.com + + + ChrisLumens + clumens@redhat.com + + + + Minor clarifications and formatting changes + + + + + 11-0 + Thu Jul 18 2019 + + TomasJelinek + tojeline@redhat.com + + + + Note differences in pcs 0.10 + + + + + diff --git a/doc/Pacemaker_Administration/en-US/Author_Group.xml b/doc/Pacemaker_Administration/en-US/Author_Group.xml index a40b3adcc6..ff907feee5 100644 --- a/doc/Pacemaker_Administration/en-US/Author_Group.xml +++ b/doc/Pacemaker_Administration/en-US/Author_Group.xml @@ -1,17 +1,10 @@ - AndrewBeekhof - Red Hat - Co-author - andrew@beekhof.net - - - KenGaillot - Red Hat - Co-author - kgaillot@redhat.com + + Written by the Pacemaker project contributors + diff --git a/doc/Pacemaker_Administration/en-US/Book_Info.xml b/doc/Pacemaker_Administration/en-US/Book_Info.xml index 5dcd86bb2b..5acc7868e7 100644 --- a/doc/Pacemaker_Administration/en-US/Book_Info.xml +++ b/doc/Pacemaker_Administration/en-US/Book_Info.xml @@ -1,36 +1,34 @@ %BOOK_ENTITIES; ]> - Pacemaker Administration - Managing Pacemaker Clusters - - 1 - 3 - - - This document has instructions and tips for system - administrators who need to manage high-availability - clusters using Pacemaker. - - - - - - - - - - - + Pacemaker Administration + Managing Pacemaker Clusters + + 2 + 0 + + + This document has instructions and tips for system administrators who + need to manage high-availability clusters using Pacemaker. 
+ + + + + + + + + + + - diff --git a/doc/Pacemaker_Administration/en-US/Revision_History.xml b/doc/Pacemaker_Administration/en-US/Revision_History.xml index 6276ccd69b..3d8a310735 100644 --- a/doc/Pacemaker_Administration/en-US/Revision_History.xml +++ b/doc/Pacemaker_Administration/en-US/Revision_History.xml @@ -1,69 +1,87 @@ %BOOK_ENTITIES; ]> Revision History 1-0 Tue Jan 23 2018 + + AndrewBeekhof + andrew@beekhof.net + KenGaillot kgaillot@redhat.com Move administration-oriented information from Pacemaker Explained into its own book 1-1 Mon Jan 28 2019 KenGaillot kgaillot@redhat.com JanPokorný jpokorny@redhat.com Add "Troubleshooting" chapter, minor clarifications and reformatting 1-2 Mon May 13 2019 KenGaillot kgaillot@redhat.com Overhaul configuration how-to 1-3 Tue Oct 15 2019 KenGaillot kgaillot@redhat.com Add Tools chapter + + 2-0 + Tue Nov 5 2019 + + ChrisLumens + clumens@redhat.com + + + + Update for crm_mon changes in 2.0.3 + + + + diff --git a/doc/Pacemaker_Development/en-US/Author_Group.xml b/doc/Pacemaker_Development/en-US/Author_Group.xml index 4f315d6ad0..ff907feee5 100644 --- a/doc/Pacemaker_Development/en-US/Author_Group.xml +++ b/doc/Pacemaker_Development/en-US/Author_Group.xml @@ -1,23 +1,10 @@ - AndrewBeekhof - Red Hat - Co-author - andrew@beekhof.net - - - KenGaillot - Red Hat - Co-author - kgaillot@redhat.com - - - JanPokorný - Red Hat - Co-author - poki@redhat.com + + Written by the Pacemaker project contributors + diff --git a/doc/Pacemaker_Development/en-US/Revision_History.xml b/doc/Pacemaker_Development/en-US/Revision_History.xml index 32b37341ce..163867f785 100644 --- a/doc/Pacemaker_Development/en-US/Revision_History.xml +++ b/doc/Pacemaker_Development/en-US/Revision_History.xml @@ -1,108 +1,112 @@ %BOOK_ENTITIES; ]> Revision History 1-0 Tue Jul 26 2016 + + AndrewBeekhof + andrew@beekhof.net + KenGaillot kgaillot@redhat.com Convert coding guidelines and developer FAQ to Publican document 1-1 Mon Aug 29 2016 KenGaillot kgaillot@redhat.com Add Python coding guidelines, and more about licensing 2-0 Fri Jan 12 2018 KenGaillot kgaillot@redhat.com Drop support for Python 2.6 2-1 Tue Sep 18 2018 JanPokorný poki@redhat.com Start documenting notable evolutionary points 2-2 Fri Dec 7 2018 KenGaillot kgaillot@redhat.com Update FAQ and C guidelines 2-3 Mon May 13 2019 KenGaillot kgaillot@redhat.com JanPokorný poki@redhat.com Update copyright notice policy, and some external references 2-4 Fri 17 May 2019 JanPokorný poki@redhat.com Start capturing hacking howto for advanced contributors diff --git a/doc/Pacemaker_Explained/en-US/Author_Group.xml b/doc/Pacemaker_Explained/en-US/Author_Group.xml index 08fc1f72ed..ff907feee5 100644 --- a/doc/Pacemaker_Explained/en-US/Author_Group.xml +++ b/doc/Pacemaker_Explained/en-US/Author_Group.xml @@ -1,59 +1,10 @@ - AndrewBeekhof - Red Hat - Primary author - andrew@beekhof.net + + Written by the Pacemaker project contributors + - - KenGaillot - Red Hat - Co-author - kgaillot@redhat.com - - - PhilippMarek - LINBit - Style and formatting updates. Indexing. 
- philipp.marek@linbit.com - - - TanjaRoth - SUSE - Utilization chapter - Resource Templates chapter - Multi-Site Clusters chapter - taroth@suse.com - - - LarsMarowsky-Bree - SUSE - Multi-Site Clusters chapter - lmb@suse.com - - - YanGao - SUSE - Utilization chapter - Resource Templates chapter - Multi-Site Clusters chapter - ygao@suse.com - - - ThomasSchraitle - SUSE - Utilization chapter - Resource Templates chapter - Multi-Site Clusters chapter - toms@suse.com - - - DejanMuhamedagic - SUSE - Resource Templates chapter - dmuhamedagic@suse.com - diff --git a/doc/Pacemaker_Explained/en-US/Ch-Fencing.txt b/doc/Pacemaker_Explained/en-US/Ch-Fencing.txt index 53d8793024..31b5044274 100644 --- a/doc/Pacemaker_Explained/en-US/Ch-Fencing.txt +++ b/doc/Pacemaker_Explained/en-US/Ch-Fencing.txt @@ -1,1021 +1,1028 @@ :compat-mode: legacy = Fencing = //// We prefer [[ch-fencing]], but older versions of asciidoc don't deal well with that construct for chapter headings //// anchor:ch-fencing[Chapter 6, Fencing] indexterm:[Fencing, Configuration] indexterm:[STONITH, Configuration] == What Is Fencing? == 'Fencing' is the ability to make a node unable to run resources, even when that node is unresponsive to cluster commands. Fencing is also known as 'STONITH', an acronym for "Shoot The Other Node In The Head", since the most common fencing method is cutting power to the node. Another method is "fabric fencing", cutting the node's access to some capability required to run resources (such as network access or a shared disk). == Why Is Fencing Necessary? == Fencing protects your data from being corrupted by malfunctioning nodes or unintentional concurrent access to shared resources. Fencing protects against the "split brain" failure scenario, where cluster nodes have lost the ability to reliably communicate with each other but are still able to run resources. If the cluster just assumed that uncommunicative nodes were down, then multiple instances of a resource could be started on different nodes. The effect of split brain depends on the resource type. For example, an IP address brought up on two hosts on a network will cause packets to randomly be sent to one or the other host, rendering the IP useless. For a database or clustered file system, the effect could be much more severe, causing data corruption or divergence. Fencing also is used when a resource cannot otherwise be stopped. If a failed resource fails to stop, it cannot be recovered elsewhere. Fencing the resource's node is the only way to ensure the resource is recoverable. Users may also configure the +on-fail+ property of any resource operation to +fencing+, in which case the cluster will fence the resource's node if the operation fails. == Fence Devices == A 'fence device' (or 'fencing device') is a special type of resource that provides the means to fence a node. Examples of fencing devices include intelligent power switches and IPMI devices that accept SNMP commands to cut power to a node, and iSCSI controllers that allow SCSI reservations to be used to cut a node's access to a shared disk. Since fencing devices will be used to recover from loss of networking connectivity to other nodes, it is essential that they do not rely on the same network as the cluster itself, otherwise that network becomes a single point of failure. Since loss of a node due to power outage is indistinguishable from loss of network connectivity to that node, it is also essential that at least one fence device for a node does not share power with that node. 
For example, an on-board IPMI controller that shares power with its host should not be used as the sole fencing device for that host. Since fencing is used to isolate malfunctioning nodes, no fence device should rely on its target functioning properly. This includes, for example, devices that ssh into a node and issue a shutdown command (such devices might be suitable for testing, but never for production). == Fence Agents == A 'fence agent' (or 'fencing agent') is a +stonith+-class resource agent. The fence agent standard provides commands (such as +off+ and +reboot+) that the cluster can use to fence nodes. As with other resource agent classes, this allows a layer of abstraction so that Pacemaker doesn't need any knowledge about specific fencing technologies -- that knowledge is isolated in the agent. == When a Fence Device Can Be Used == Fencing devices do not actually "run" like most services. Typically, they just provide an interface for sending commands to an external device. Additionally, fencing may be initiated by Pacemaker, by other cluster-aware software such as DRBD or DLM, or manually by an administrator, at any point in the cluster life cycle, including before any resources have been started. To accommodate this, Pacemaker does not require the fence device resource to be "started" in order to be used. Whether a fence device is started or not determines whether a node runs any recurring monitor for the device, and gives the node a slight preference for being chosen to execute fencing using that device. By default, any node can execute any fencing device. If a fence device is disabled by setting its +target-role+ to Stopped, then no node can use that device. If mandatory location constraints prevent a specific node from "running" a fence device, then that node will never be chosen to execute fencing using the device. A node may fence itself, but the cluster will choose that only if no other nodes can do the fencing. A common configuration scenario is to have one fence device per target node. In such a case, users often configure anti-location constraints so that the target node does not monitor its own device. The best practice is to make the constraint optional (i.e. a finite negative score rather than +-INFINITY+), so that the node can fence itself if no other nodes can. == Limitations of Fencing Resources == Fencing resources have certain limitations that other resource classes don't: * They may have only one set of meta-attributes and one set of instance attributes. * If <> are used to determine fencing resource options, these may only be evaluated when first read, meaning that later changes to the rules will have no effect. Therefore, it is better to avoid confusion and not use rules at all with fencing resources. These limitations could be revisited if there is significant user demand. == Special Options for Fencing Resources == The table below lists special instance attributes that may be set for any fencing resource ('not' meta-attributes, even though they are interpreted by pacemaker rather than the fence agent). These are also listed in the man page for +pacemaker-fenced+. .Additional Properties of Fencing Resources [width="95%",cols="8m,3,6,<12",options="header",align="center"] |========================================================= |Field |Type |Default |Description |stonith-timeout |NA |NA a|Older versions used this to override the default period to wait for a STONITH (reboot, on, off) action to complete for this device. 
It has been replaced by the +pcmk_reboot_timeout+ and +pcmk_off_timeout+ properties. indexterm:[stonith-timeout,Fencing] indexterm:[Fencing,Property,stonith-timeout] //// (not yet implemented) priority integer 0 The priority of the STONITH resource. Devices are tried in order of highest priority to lowest. indexterm priority,Fencing indexterm Fencing,Property,priority ////
|provides |string | |Any special capability provided by the fence device. Currently, only one such capability is meaningful: +unfencing+ (see <>). indexterm:[provides,Fencing] indexterm:[Fencing,Property,provides]
|pcmk_host_map |string | |A mapping of host names to port numbers for devices that do not support host names. Example: +node1:1;node2:2,3+ tells the cluster to use port 1 for *node1* and ports 2 and 3 for *node2*. If +pcmk_host_check+ is explicitly set to +static-list+, either this or +pcmk_host_list+ must be set. indexterm:[pcmk_host_map,Fencing] indexterm:[Fencing,Property,pcmk_host_map]
|pcmk_host_list |string | |A list of machines controlled by this device. If +pcmk_host_check+ is explicitly set to +static-list+, either this or +pcmk_host_map+ must be set. indexterm:[pcmk_host_list,Fencing] indexterm:[Fencing,Property,pcmk_host_list]
|pcmk_host_check |string -|static-list if either +pcmk_host_list+ or +pcmk_host_map+ is set, - otherwise dynamic-list if the fence device supports the list action, - otherwise status if the fence device supports the status action, - otherwise none +|A value appropriate to other configuration options and + device capabilities (see note below) a|How to determine which machines are controlled by the device. Allowed values: * +dynamic-list:+ query the device via the "list" command * +static-list:+ check the +pcmk_host_list+ or +pcmk_host_map+ attribute * +status:+ query the device via the "status" command * +none:+ assume every device can fence every machine indexterm:[pcmk_host_check,Fencing] indexterm:[Fencing,Property,pcmk_host_check]
|pcmk_delay_max |time |0s |Enable a random delay of up to the time specified before executing fencing actions. This is sometimes used in two-node clusters to ensure that the nodes don't fence each other at the same time. The overall delay introduced by pacemaker is derived from this random delay value plus a static delay so that the sum is kept below the maximum delay. indexterm:[pcmk_delay_max,Fencing] indexterm:[Fencing,Property,pcmk_delay_max]
|pcmk_delay_base |time |0s |Enable a static delay before executing fencing actions. This can be used e.g. in two-node clusters to ensure that the nodes don't fence each other, by having separate fencing resources with different values. The node that is fenced with the shorter delay will lose a fencing race. The overall delay introduced by pacemaker is derived from this value plus a random delay such that the sum is kept below the maximum delay. indexterm:[pcmk_delay_base,Fencing] indexterm:[Fencing,Property,pcmk_delay_base]
|pcmk_action_limit |integer |1 |The maximum number of actions that can be performed in parallel on this device, if the cluster option +concurrent-fencing+ is +true+. -1 is unlimited. indexterm:[pcmk_action_limit,Fencing] indexterm:[Fencing,Property,pcmk_action_limit]
|pcmk_host_argument |string |port |'Advanced use only.' Which parameter should be supplied to the resource agent to identify the node to be fenced. Some devices do not support the standard +port+ parameter or may provide additional ones. Use this to specify an alternate, device-specific parameter.
A value of +none+ tells the cluster not to supply any additional parameters. indexterm:[pcmk_host_argument,Fencing] indexterm:[Fencing,Property,pcmk_host_argument] |pcmk_reboot_action |string |reboot |'Advanced use only.' The command to send to the resource agent in order to reboot a node. Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. indexterm:[pcmk_reboot_action,Fencing] indexterm:[Fencing,Property,pcmk_reboot_action] |pcmk_reboot_timeout |time |60s |'Advanced use only.' Specify an alternate timeout to use for `reboot` actions instead of the value of +stonith-timeout+. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. indexterm:[pcmk_reboot_timeout,Fencing] indexterm:[Fencing,Property,pcmk_reboot_timeout] indexterm:[stonith-timeout,Fencing] indexterm:[Fencing,Property,stonith-timeout] |pcmk_reboot_retries |integer |2 |'Advanced use only.' The maximum number of times to retry the `reboot` command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. indexterm:[pcmk_reboot_retries,Fencing] indexterm:[Fencing,Property,pcmk_reboot_retries] |pcmk_off_action |string |off |'Advanced use only.' The command to send to the resource agent in order to shut down a node. Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. indexterm:[pcmk_off_action,Fencing] indexterm:[Fencing,Property,pcmk_off_action] |pcmk_off_timeout |time |60s |'Advanced use only.' Specify an alternate timeout to use for `off` actions instead of the value of +stonith-timeout+. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. indexterm:[pcmk_off_timeout,Fencing] indexterm:[Fencing,Property,pcmk_off_timeout] indexterm:[stonith-timeout,Fencing] indexterm:[Fencing,Property,stonith-timeout] |pcmk_off_retries |integer |2 |'Advanced use only.' The maximum number of times to retry the `off` command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. indexterm:[pcmk_off_retries,Fencing] indexterm:[Fencing,Property,pcmk_off_retries] |pcmk_list_action |string |list |'Advanced use only.' The command to send to the resource agent in order to list nodes. Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. indexterm:[pcmk_list_action,Fencing] indexterm:[Fencing,Property,pcmk_list_action] |pcmk_list_timeout |time |60s |'Advanced use only.' Specify an alternate timeout to use for `list` actions instead of the value of +stonith-timeout+. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. indexterm:[pcmk_list_timeout,Fencing] indexterm:[Fencing,Property,pcmk_list_timeout] |pcmk_list_retries |integer |2 |'Advanced use only.' 
The maximum number of times to retry the `list` command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. indexterm:[pcmk_list_retries,Fencing] indexterm:[Fencing,Property,pcmk_list_retries] |pcmk_monitor_action |string |monitor |'Advanced use only.' The command to send to the resource agent in order to report extended status. Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. indexterm:[pcmk_monitor_action,Fencing] indexterm:[Fencing,Property,pcmk_monitor_action] |pcmk_monitor_timeout |time |60s |'Advanced use only.' Specify an alternate timeout to use for `monitor` actions instead of the value of +stonith-timeout+. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. indexterm:[pcmk_monitor_timeout,Fencing] indexterm:[Fencing,Property,pcmk_monitor_timeout] |pcmk_monitor_retries |integer |2 |'Advanced use only.' The maximum number of times to retry the `monitor` command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. indexterm:[pcmk_monitor_retries,Fencing] indexterm:[Fencing,Property,pcmk_monitor_retries] |pcmk_status_action |string |status |'Advanced use only.' The command to send to the resource agent in order to report status. Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. indexterm:[pcmk_status_action,Fencing] indexterm:[Fencing,Property,pcmk_status_action] |pcmk_status_timeout |time |60s |'Advanced use only.' Specify an alternate timeout to use for `status` actions instead of the value of +stonith-timeout+. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. indexterm:[pcmk_status_timeout,Fencing] indexterm:[Fencing,Property,pcmk_status_timeout] |pcmk_status_retries |integer |2 |'Advanced use only.' The maximum number of times to retry the `status` command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. indexterm:[pcmk_status_retries,Fencing] indexterm:[Fencing,Property,pcmk_status_retries] |========================================================= +[NOTE] +==== +The default value for +pcmk_host_check+ is +static-list+ if either ++pcmk_host_list+ or +pcmk_host_map+ is configured. If neither of those are +configured, the default is +dynamic-list+ if the fence device supports the list +action, or +status+ if the fence device supports the status action but not the +list action. If none of those conditions apply, the default is +none+. 
+==== + [[s-unfencing]] == Unfencing == With fabric fencing (such as cutting network or shared disk access rather than power), it is expected that the cluster will fence the node, and then a system administrator must manually investigate what went wrong, correct any issues found, then reboot (or restart the cluster services on) the node. Once the node reboots and rejoins the cluster, some fabric fencing devices require an explicit command to restore the node's access. This capability is called 'unfencing' and is typically implemented as the fence agent's +on+ command. If any cluster resource has +requires+ set to +unfencing+, then that resource will not be probed or started on a node until that node has been unfenced. == Fence Devices Dependent on Other Resources == In some cases, a fence device may require some other cluster resource (such as an IP address) to be active in order to function properly. This is obviously undesirable in general: fencing may be required when the depended-on resource is not active, or fencing may be required because the node running the depended-on resource is no longer responding. However, this may be acceptable under certain conditions: * The dependent fence device should not be able to target any node that is allowed to run the depended-on resource. * The depended-on resource should not be disabled during production operation. * The +concurrent-fencing+ cluster property should be set to +true+. Otherwise, if both the node running the depended-on resource and some node targeted by the dependent fence device need to be fenced, the fencing of the node running the depended-on resource might be ordered first, making the second fencing impossible and blocking further recovery. With concurrent fencing, the dependent fence device might fail at first due to the depended-on resource being unavailable, but it will be retried and eventually succeed once the resource is brought back up. Even under those conditions, there is one unlikely problem scenario. The DC always schedules fencing of itself after any other fencing needed, to avoid unnecessary repeated DC elections. If the dependent fence device targets the DC, and both the DC and a different node running the depended-on resource need to be fenced, the DC fencing will always fail and block further recovery. Note, however, that losing a DC node entirely causes some other node to become DC and schedule the fencing, so this is only a risk when a stop or other operation with +on-fail+ set to +fencing+ fails on the DC. == Configuring Fencing == . Find the correct driver: + ---- # stonith_admin --list-installed ---- . Find the required parameters associated with the device (replacing $AGENT_NAME with the name obtained from the previous step): + ---- # stonith_admin --metadata --agent $AGENT_NAME ---- . Create a file called +stonith.xml+ containing a primitive resource with a class of +stonith+, a type equal to the agent name obtained earlier, and a parameter for each of the values returned in the previous step. . If the device does not know how to fence nodes based on their uname, you may also need to set the special +pcmk_host_map+ parameter. See `man pacemaker-fenced` for details. . If the device does not support the `list` command, you may also need to set the special +pcmk_host_list+ and/or +pcmk_host_check+ parameters. See `man pacemaker-fenced` for details. . If the device does not expect the victim to be specified with the `port` parameter, you may also need to set the special +pcmk_host_argument+ parameter. 
See `man pacemaker-fenced` for details. . Upload it into the CIB using cibadmin: + ---- # cibadmin -C -o resources --xml-file stonith.xml ---- . Set +stonith-enabled+ to true: + ---- # crm_attribute -t crm_config -n stonith-enabled -v true ---- . Once the stonith resource is running, you can test it by executing the following (although you might want to stop the cluster on that machine first): + ---- # stonith_admin --reboot nodename ---- === Example Fencing Configuration === Assume we have a chassis containing four nodes and an IPMI device active on 192.0.2.1. We would choose the `fence_ipmilan` driver, and obtain the following list of parameters: .Obtaining a list of Fence Agent Parameters ==== ---- # stonith_admin --metadata -a fence_ipmilan ---- [source,XML] ---- ---- ==== Based on that, we would create a fencing resource fragment that might look like this: .An IPMI-based Fencing Resource ==== [source,XML] ---- ---- ==== Finally, we need to enable fencing: ---- # crm_attribute -t crm_config -n stonith-enabled -v true ---- == Fencing Topologies == Pacemaker supports fencing nodes with multiple devices through a feature called 'fencing topologies'. Fencing topologies may be used to provide alternative devices in case one fails, or to require multiple devices to all be executed successfully in order to consider the node successfully fenced, or even a combination of the two. Create the individual devices as you normally would, then define one or more +fencing-level+ entries in the +fencing-topology+ section of the configuration. * Each fencing level is attempted in order of ascending +index+. Allowed values are 1 through 9. * If a device fails, processing terminates for the current level. No further devices in that level are exercised, and the next level is attempted instead. * If the operation succeeds for all the listed devices in a level, the level is deemed to have passed. * The operation is finished when a level has passed (success), or all levels have been attempted (failed). * If the operation failed, the next step is determined by the scheduler and/or the controller. 
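To make the processing rules above concrete, here is a minimal sketch of a +fencing-topology+ section with two levels for a single node (the IDs, node name, and device names are purely illustrative; the +fencing-level+ attributes are described in the table below):

[source,XML]
----
<fencing-topology>
  <!-- Level 1 is attempted first; if its device fails, level 2 is attempted -->
  <fencing-level id="fl-node1-ipmi" target="node1" index="1"
                 devices="ipmi-node1"/>
  <!-- Level 2 succeeds only if every listed device succeeds -->
  <fencing-level id="fl-node1-pdus" target="node1" index="2"
                 devices="pdu1-node1,pdu2-node1"/>
</fencing-topology>
----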
Some possible uses of topologies include: * Try on-board IPMI, then an intelligent power switch if that fails * Try fabric fencing of both disk and network, then fall back to power fencing if either fails * Wait up to a certain time for a kernel dump to complete, then cut power to the node
.Properties of Fencing Levels [width="95%",cols="1m,<3",options="header",align="center"] |========================================================= |Field |Description
|id |A unique name for the level indexterm:[id,fencing-level] indexterm:[Fencing,fencing-level,id]
|target |The name of a single node to which this level applies indexterm:[target,fencing-level] indexterm:[Fencing,fencing-level,target]
|target-pattern |An extended regular expression (as defined in http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04[POSIX]) matching the names of nodes to which this level applies indexterm:[target-pattern,fencing-level] indexterm:[Fencing,fencing-level,target-pattern]
|target-attribute |The name of a node attribute that is set (to +target-value+) for nodes to which this level applies indexterm:[target-attribute,fencing-level] indexterm:[Fencing,fencing-level,target-attribute]
|target-value |The node attribute value (of +target-attribute+) that is set for nodes to which this level applies indexterm:[target-value,fencing-level] indexterm:[Fencing,fencing-level,target-value]
|index |The order in which to attempt the levels. Levels are attempted in ascending order 'until one succeeds'. Valid values are 1 through 9. indexterm:[index,fencing-level] indexterm:[Fencing,fencing-level,index]
|devices |A comma-separated list of devices that must all be tried for this level indexterm:[devices,fencing-level] indexterm:[Fencing,fencing-level,devices] |=========================================================
.Fencing topology with different devices for different nodes ==== [source,XML] ---- ... ... ---- ====
=== Example Dual-Layer, Dual-Device Fencing Topologies === The following example illustrates an advanced use of +fencing-topology+ in a cluster with the following properties: * 3 nodes (2 active prod-mysql nodes, 1 prod_mysql-rep in standby for quorum purposes) * the active nodes have an IPMI-controlled power board reached at 192.0.2.1 and 192.0.2.2 * the active nodes also have two independent PSUs (Power Supply Units) connected to two independent PDUs (Power Distribution Units) reached at 198.51.100.1 (port 10 and port 11) and 203.0.113.1 (port 10 and port 11) * the first fencing method uses the `fence_ipmi` agent * the second fencing method uses the `fence_apc_snmp` agent targeting two fencing devices (one per PSU, either port 10 or 11) * fencing is only implemented for the active nodes and has location constraints * fencing topology is set to try IPMI fencing first, then fall back to a "sure-kill" dual-PDU fencing
In a normal failure scenario, STONITH will first select +fence_ipmi+ to try to kill the faulty node. Using a fencing topology, if that first method fails, STONITH will then move on to selecting +fence_apc_snmp+ twice: * once for the first PDU * again for the second PDU
The fence action is considered successful only if both PDUs report the required status. If either of them fails, STONITH loops back to the first fencing method, +fence_ipmi+, and so on, until the node is fenced or the fencing action is cancelled.
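Before stepping through each piece of the configuration, here is a rough sketch of what one of the per-node IPMI fencing primitives might look like (the resource ID and node name are illustrative; a real deployment would add the device-specific parameters reported by `stonith_admin --metadata --agent fence_ipmi`):

[source,XML]
----
<primitive id="fence_prod-mysql1_ipmi" class="stonith" type="fence_ipmi">
  <instance_attributes id="fence_prod-mysql1_ipmi-attrs">
    <!-- agent-specific parameters (address, credentials, etc.) go here -->
    <nvpair id="fence_prod-mysql1_ipmi-pcmk_host_list"
            name="pcmk_host_list" value="prod-mysql1"/>
  </instance_attributes>
</primitive>
----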
.First fencing method: single IPMI device
Each cluster node has its own dedicated IPMI channel that can be called for fencing using the following primitives: [source,XML] ---- ----
.Second fencing method: dual PDU devices
Each cluster node also has two distinct power channels controlled by two distinct PDUs. That means a total of 4 fencing devices configured as follows: - Node 1, PDU 1, PSU 1 @ port 10 - Node 1, PDU 2, PSU 2 @ port 10 - Node 2, PDU 1, PSU 1 @ port 11 - Node 2, PDU 2, PSU 2 @ port 11
The matching fencing agents are configured as follows: [source,XML] ---- ----
.Location Constraints
To prevent STONITH from trying to run a fencing agent on the same node it is supposed to fence, constraints are placed on all the fencing primitives: [source,XML] ---- ----
.Fencing topology
Now that all the fencing resources are defined, it's time to create the right topology. We want to first fence using IPMI and, if that does not work, fence both PDUs to effectively and surely kill the node. [source,XML] ---- ----
Please note that in +fencing-topology+, the level with the lowest +index+ value is attempted first.
.Final configuration
Put together, the configuration looks like this: [source,XML] ---- ... ... ----
== Remapping Reboots ==
When the cluster needs to reboot a node, whether because +stonith-action+ is +reboot+ or because a reboot was manually requested (such as by `stonith_admin --reboot`), it will remap that to other commands in two cases: . If the chosen fencing device does not support the +reboot+ command, the cluster will ask it to perform +off+ instead. . If a fencing topology level with multiple devices must be executed, the cluster will ask all the devices to perform +off+, then ask all the devices to perform +on+.
To understand the second case, consider the example of a node with redundant power supplies connected to intelligent power switches. Rebooting one switch and then the other would have no effect on the node. Turning both switches off, and then on, actually reboots the node.
In such a case, the fencing operation will be treated as successful as long as the +off+ commands succeed, because then it is safe for the cluster to recover any resources that were on the node. Timeouts and errors in the +on+ phase will be logged but ignored.
When a reboot operation is remapped, any action-specific timeout for the remapped action will be used (for example, +pcmk_off_timeout+ will be used when executing the +off+ command, not +pcmk_reboot_timeout+).
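If a device needs more (or less) time than +stonith-timeout+ allows to power a node off, +pcmk_off_timeout+ (described earlier in this chapter) can be set on that device; when a reboot is remapped to +off+, it is this timeout that applies. A minimal sketch, with an illustrative resource name and timeout value:

[source,XML]
----
<primitive id="fence_pdu1" class="stonith" type="fence_apc_snmp">
  <instance_attributes id="fence_pdu1-attrs">
    <!-- applies to the remapped "off" action instead of pcmk_reboot_timeout -->
    <nvpair id="fence_pdu1-pcmk_off_timeout"
            name="pcmk_off_timeout" value="120s"/>
  </instance_attributes>
</primitive>
----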
diff --git a/doc/Pacemaker_Explained/en-US/Revision_History.xml b/doc/Pacemaker_Explained/en-US/Revision_History.xml index 93fa71334f..4a80cb4fb6 100644 --- a/doc/Pacemaker_Explained/en-US/Revision_History.xml +++ b/doc/Pacemaker_Explained/en-US/Revision_History.xml @@ -1,199 +1,249 @@ Revision History 1-0 19 Oct 2009 AndrewBeekhofandrew@beekhof.net Import from Pages.app 2-0 26 Oct 2009 AndrewBeekhofandrew@beekhof.net Cleanup and reformatting of docbook xml complete 3-0 Tue Nov 12 2009 AndrewBeekhofandrew@beekhof.net Split book into chapters and pass validation Re-organize book for use with Publican + + 3-1 + Tue Nov 12 2009 + + TanjaRoth + SUSE + taroth@suse.com + + + LarsMarowsky-Bree + SUSE + lmb@suse.com + + + YanGao + SUSE + ygao@suse.com + + + ThomasSchraitle + SUSE + toms@suse.com + + + DejanMuhamedagic + SUSE + dmuhamedagic@suse.com + + + + Utilization chapter + Resource Templates chapter + Multi-Site Clusters chapter + + + + + 3-2 + Fri Nov 4 2011 + + PhilippMarek + LINBit + philipp.marek@linbit.com + + + + Extensive style, formatting, and indexing updates + + + 4-0 Mon Oct 8 2012 AndrewBeekhofandrew@beekhof.net Converted to asciidoc (which is converted to docbook for use with Publican) 5-0 Mon Feb 23 2015 KenGaillotkgaillot@redhat.com Update for clarity, stylistic consistency and current command-line syntax 6-0 Tue Dec 8 2015 KenGaillotkgaillot@redhat.com Update for Pacemaker 1.1.14 7-0 Tue May 3 2016 KenGaillotkgaillot@redhat.com Update for Pacemaker 1.1.15 7-1 Fri Oct 28 2016 KenGaillotkgaillot@redhat.com Overhaul upgrade documentation, and document node health strategies 8-0 Tue Oct 25 2016 KenGaillotkgaillot@redhat.com Update for Pacemaker 1.1.16 9-0 Tue Jul 11 2017 KenGaillotkgaillot@redhat.com Update for Pacemaker 1.1.17 10-0 Fri Oct 6 2017 KenGaillotkgaillot@redhat.com Update for Pacemaker 1.1.18 11-0 Fri Jan 12 2018 KenGaillotkgaillot@redhat.com Update for Pacemaker 2.0.0 12-0 Mon Jan 28 2019 KenGaillotkgaillot@redhat.com ReidWahlnwahl@redhat.com JanPokornýjpokorny@redhat.com Update for Pacemaker 2.0.1, remove "Further Reading" and "FAQ" sections, and add minor clarifications and reformatting 12-1 Mon May 13 2019 KenGaillotkgaillot@redhat.com MaciejSobkowiakmsobkowiak@egnyte.com Document podman support, cluster-name cluster option, and new HealthIOWait agent, with other minor clarifications and corrections 12-2 Wed Jun 19 2019 KenGaillotkgaillot@redhat.com Add chapter for ACLs 13-0 Tue Oct 15 2019 KenGaillot kgaillot@redhat.com Overhaul fencing, rules, and constraints chapters, elaborate on various options, and update for Pacemaker 2.0.3 diff --git a/doc/Pacemaker_Remote/en-US/Author_Group.xml b/doc/Pacemaker_Remote/en-US/Author_Group.xml index 1de3082be1..ff907feee5 100644 --- a/doc/Pacemaker_Remote/en-US/Author_Group.xml +++ b/doc/Pacemaker_Remote/en-US/Author_Group.xml @@ -1,11 +1,10 @@ - DavidVossel - Red Hat - Primary author - davidvossel@gmail.com + + Written by the Pacemaker project contributors + diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 0000000000..594d9e5717 --- /dev/null +++ b/doc/README.md @@ -0,0 +1,80 @@ +# Pacemaker Documentation + +Pacemaker has multiple forms of documentation: + +* The primary end-user documentation is a series of "books": + + * Clusters From Scratch: Simplified walk-through of setting up a + cluster for the first time + * Pacemaker Administration: Tips for managing a cluster + * Pacemaker Development: How to work on the Pacemaker code base + * Pacemaker Explained: Configuration reference guide + * Pacemaker 
Remote: Configuration and walk-throughs for extended + clusters + + The source for these is kept beneath this directory. Generated versions + are available online in epub, PDF, and HTML format at: + + https://clusterlabs.org/pacemaker/doc/ + +* Annotated source code in HTML format can be generated by running + "make global" in this directory, which requires the gtags and htags tools. + This is generated for releases and can be viewed online at: + + https://clusterlabs.org/pacemaker/global/ + +* Pacemaker manual pages are generated and installed automatically when + building the software. HTML versions can be generated by running + "make manhtml" in this directory, which requires the groff tool. + This is generated for releases and can be viewed online at: + + https://clusterlabs.org/pacemaker/man/ + +* For developers, documentation for Pacemaker's public C API is generated + by running "make doxygen" in this directory, which requires the doxygen tool. + This is generated for releases and can be viewed online at: + + https://clusterlabs.org/pacemaker/doxygen/ + +* Also for developers, a report of Pacemaker ABI compatibility between any two + commits can be generated by running in this directory: + + make LAST_RELEASE=$EARLIER_COMMIT TAG=$NEWER_COMMIT abi-www + + which requires the abi-compliance-checker tool. This is generated for each + release compared to the previous release and can be viewed online at: + + https://clusterlabs.org/pacemaker/abi/ + +* In addition, there are a few old text files in this directory focusing on + particular characteristics of Pacemaker clusters. These are mostly outdated + but do still have some useful information. The plan is to incorporate an + updated version of them into the books. + +## Editing the Books + +Each book's sources are kept in a subdirectory by title. Each book subdirectory +has an en-US subdirectory with the master sources, and potentially other +subdirectories for translations. However, the translations are not currently +actively maintained and so are disabled. + +Each book's en-US subdirectory has text files with the chapter sources in +asciidoc format. The file asciidoc.reference in this directory has a quick +guide to asciidoc; search online for more detailed help. + +Once you have edited the asciidoc as desired, run "make" in this directory +to generate all the books locally. You view the results by pointing your +web browser to (replacing BOOK\_TITLE appropriately): + + file:///path/to/checkout/doc/BOOK\_TITLE/publish/desktop/en-US/index.html + +Each en-US subdirectory also contains some raw XML files of lesser interest: + +* Author\_Group.xml: This contains the author listing at the top of the + generated book. To avoid clutter, we just put "Pacemaker project + contributors" here and list individual authors in the revision history, so + this typically does not need to be edited. +* Book\_Info.xml: Revision numbers, etc. This is generally updated only for + official releases. +* Revision\_History.xml: This can be updated for any change, listing the + individual authors. diff --git a/include/crm/services.h b/include/crm/services.h index 8fe9bc918c..c35aa2454d 100644 --- a/include/crm/services.h +++ b/include/crm/services.h @@ -1,431 +1,431 @@ /* * Copyright 2010-2019 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
*/ #ifndef __PCMK_SERVICES__ # define __PCMK_SERVICES__ #ifdef __cplusplus extern "C" { #endif /** * \file * \brief Services API * \ingroup core */ # include # include # include # include # include # ifndef OCF_ROOT_DIR # define OCF_ROOT_DIR "/usr/lib/ocf" # endif # ifndef LSB_ROOT_DIR # define LSB_ROOT_DIR "/etc/init.d" # endif /* TODO: Autodetect these two ?*/ # ifndef SYSTEMCTL # define SYSTEMCTL "/bin/systemctl" # endif /* Known resource classes */ #define PCMK_RESOURCE_CLASS_OCF "ocf" #define PCMK_RESOURCE_CLASS_SERVICE "service" #define PCMK_RESOURCE_CLASS_LSB "lsb" #define PCMK_RESOURCE_CLASS_SYSTEMD "systemd" #define PCMK_RESOURCE_CLASS_UPSTART "upstart" #define PCMK_RESOURCE_CLASS_NAGIOS "nagios" #define PCMK_RESOURCE_CLASS_STONITH "stonith" /* This is the string passed in the OCF_EXIT_REASON_PREFIX environment variable. * The stderr output that occurs after this prefix is encountered is considered * the exit reason for a completed operation. */ #define PCMK_OCF_REASON_PREFIX "ocf-exit-reason:" // Agent version to use if agent doesn't specify one #define PCMK_DEFAULT_AGENT_VERSION "0.1" enum lsb_exitcode { PCMK_LSB_OK = 0, PCMK_LSB_UNKNOWN_ERROR = 1, PCMK_LSB_INVALID_PARAM = 2, PCMK_LSB_UNIMPLEMENT_FEATURE = 3, PCMK_LSB_INSUFFICIENT_PRIV = 4, PCMK_LSB_NOT_INSTALLED = 5, PCMK_LSB_NOT_CONFIGURED = 6, PCMK_LSB_NOT_RUNNING = 7, }; /* The return codes for the status operation are not the same for other * operatios - go figure */ enum lsb_status_exitcode { PCMK_LSB_STATUS_OK = 0, PCMK_LSB_STATUS_VAR_PID = 1, PCMK_LSB_STATUS_VAR_LOCK = 2, PCMK_LSB_STATUS_NOT_RUNNING = 3, PCMK_LSB_STATUS_UNKNOWN = 4, /* custom codes should be in the 150-199 range reserved for application use */ PCMK_LSB_STATUS_NOT_INSTALLED = 150, PCMK_LSB_STATUS_INSUFFICIENT_PRIV = 151, }; /* Uniform exit codes * Everything is mapped to its OCF equivalent so that Pacemaker only deals with one set of codes */ enum ocf_exitcode { PCMK_OCF_OK = 0, PCMK_OCF_UNKNOWN_ERROR = 1, PCMK_OCF_INVALID_PARAM = 2, PCMK_OCF_UNIMPLEMENT_FEATURE = 3, PCMK_OCF_INSUFFICIENT_PRIV = 4, PCMK_OCF_NOT_INSTALLED = 5, PCMK_OCF_NOT_CONFIGURED = 6, PCMK_OCF_NOT_RUNNING = 7, /* End of overlap with LSB */ PCMK_OCF_RUNNING_MASTER = 8, PCMK_OCF_FAILED_MASTER = 9, /* 150-199 reserved for application use */ PCMK_OCF_CONNECTION_DIED = 189, // Deprecated (see PCMK_LRM_OP_NOT_CONNECTED) PCMK_OCF_DEGRADED = 190, /* Active resource that is no longer 100% functional */ PCMK_OCF_DEGRADED_MASTER = 191, /* Promoted resource that is no longer 100% functional */ PCMK_OCF_EXEC_ERROR = 192, /* Generic problem invoking the agent */ PCMK_OCF_UNKNOWN = 193, /* State of the service is unknown - used for recording in-flight operations */ PCMK_OCF_SIGNAL = 194, PCMK_OCF_NOT_SUPPORTED = 195, PCMK_OCF_PENDING = 196, PCMK_OCF_CANCELLED = 197, PCMK_OCF_TIMEOUT = 198, PCMK_OCF_OTHER_ERROR = 199, /* Keep the same codes as PCMK_LSB */ }; enum op_status { PCMK_LRM_OP_UNKNOWN = -2, PCMK_LRM_OP_PENDING = -1, PCMK_LRM_OP_DONE, PCMK_LRM_OP_CANCELLED, PCMK_LRM_OP_TIMEOUT, PCMK_LRM_OP_NOTSUPPORTED, PCMK_LRM_OP_ERROR, PCMK_LRM_OP_ERROR_HARD, PCMK_LRM_OP_ERROR_FATAL, PCMK_LRM_OP_NOT_INSTALLED, PCMK_LRM_OP_NOT_CONNECTED, PCMK_LRM_OP_INVALID, }; enum nagios_exitcode { NAGIOS_STATE_OK = 0, NAGIOS_STATE_WARNING = 1, NAGIOS_STATE_CRITICAL = 2, NAGIOS_STATE_UNKNOWN = 3, NAGIOS_STATE_DEPENDENT = 4, NAGIOS_INSUFFICIENT_PRIV = 100, NAGIOS_NOT_INSTALLED = 101, }; enum svc_action_flags { /* On timeout, only kill pid, do not kill entire pid group */ SVC_ACTION_LEAVE_GROUP = 0x01, 
SVC_ACTION_NON_BLOCKED = 0x02, }; typedef struct svc_action_private_s svc_action_private_t; typedef struct svc_action_s { char *id; char *rsc; char *action; guint interval_ms; char *standard; char *provider; char *agent; int timeout; GHashTable *params; /* used for setting up environment for ocf-ra & alert agents and to be sent via stdin for fence-agents */ int rc; int pid; int cancel; int status; int sequence; int expected_rc; int synchronous; enum svc_action_flags flags; char *stderr_data; char *stdout_data; /*! * Data stored by the creator of the action. * * This may be used to hold data that is needed later on by a callback, * for example. */ void *cb_data; svc_action_private_t *opaque; } svc_action_t; /** * \brief Get a list of files or directories in a given path * * \param[in] root full path to a directory to read * \param[in] files return list of files if TRUE or directories if FALSE * \param[in] executable if TRUE and files is TRUE, only return executable files * * \return a list of what was found. The list items are char *. * \note It is the caller's responsibility to free the result with g_list_free_full(list, free). */ GList *get_directory_list(const char *root, gboolean files, gboolean executable); /** * Get a list of services * * \return a list of services. The list items are gchar *. This list _must_ * be destroyed using g_list_free_full(list, free). */ GList *services_list(void); /** * \brief Get a list of providers * * \param[in] standard list providers of this standard (e.g. ocf, lsb, etc.) * * \return a list of providers as char * list items (or NULL if standard does not support providers) * \note The caller is responsible for freeing the result using g_list_free_full(list, free). */ GList *resources_list_providers(const char *standard); /** * \brief Get a list of resource agents * * \param[in] standard list agents using this standard (e.g. ocf, lsb, etc.) (or NULL for all) * \param[in] provider list agents from this provider (or NULL for all) * * \return a list of resource agents. The list items are char *. * \note The caller is responsible for freeing the result using g_list_free_full(list, free). */ GList *resources_list_agents(const char *standard, const char *provider); /** * Get list of available standards * * \return a list of resource standards. The list items are char *. This list _must_ * be destroyed using g_list_free_full(list, free). */ GList *resources_list_standards(void); /** * Does the given standard, provider, and agent describe a resource that can exist? * * \param[in] standard Which class of agent does the resource belong to? * \param[in] provider What provides the agent (NULL for most standards)? * \param[in] agent What is the name of the agent? * * \return A boolean */ gboolean resources_agent_exists(const char *standard, const char *provider, const char *agent); svc_action_t *services_action_create(const char *name, const char *action, guint interval_ms, int timeout /* ms */); /** * \brief Create a new resource action * * \param[in] name Name of resource * \param[in] standard Resource agent standard (ocf, lsb, etc.) * \param[in] provider Resource agent provider * \param[in] agent Resource agent name * \param[in] action action (start, stop, monitor, etc.) 
* \param[in] interval_ms How often to repeat this action (if 0, execute once) * \param[in] timeout Consider action failed if it does not complete in this many milliseconds * \param[in] params Action parameters * * \return newly allocated action instance * * \post After the call, 'params' is owned, and later free'd by the svc_action_t result * \note The caller is responsible for freeing the return value using * services_action_free(). */ svc_action_t *resources_action_create(const char *name, const char *standard, const char *provider, const char *agent, const char *action, guint interval_ms, int timeout /* ms */, GHashTable *params, enum svc_action_flags flags); /** * Kick a recurring action so it is scheduled immediately for re-execution */ gboolean services_action_kick(const char *name, const char *action, guint interval_ms); const char *resources_find_service_class(const char *agent); /** * Utilize services API to execute an arbitrary command. * * This API has useful infrastructure in place to be able to run a command * in the background and get notified via a callback when the command finishes. * * \param[in] exec command to execute * \param[in] args arguments to the command, NULL terminated * * \return a svc_action_t object, used to pass to the execute function * (services_action_sync() or services_action_async()) and is * provided to the callback. */ svc_action_t *services_action_create_generic(const char *exec, const char *args[]); void services_action_cleanup(svc_action_t * op); void services_action_free(svc_action_t * op); int services_action_user(svc_action_t *op, const char *user); gboolean services_action_sync(svc_action_t * op); /** * Run an action asynchronously. * * \param[in] op services action data * \param[in] action_callback callback for when the action completes * \param[in] action_fork_callback callback for when action forked successfully * * \retval TRUE succesfully started execution * \retval FALSE failed to start execution, no callback will be received */ gboolean services_action_async_fork_notify(svc_action_t * op, void (*action_callback) (svc_action_t *), void (*action_fork_callback) (svc_action_t *)); gboolean services_action_async(svc_action_t * op, void (*action_callback) (svc_action_t *)); gboolean services_action_cancel(const char *name, const char *action, guint interval_ms); /* functions for alert agents */ svc_action_t *services_alert_create(const char *id, const char *exec, int timeout, GHashTable *params, int sequence, void *cb_data); gboolean services_alert_async(svc_action_t *action, void (*cb)(svc_action_t *op)); static inline const char *services_lrm_status_str(enum op_status status) { switch (status) { case PCMK_LRM_OP_PENDING: return "pending"; case PCMK_LRM_OP_DONE:return "complete"; case PCMK_LRM_OP_CANCELLED:return "Cancelled"; case PCMK_LRM_OP_TIMEOUT:return "Timed Out"; case PCMK_LRM_OP_NOTSUPPORTED:return "NOT SUPPORTED"; case PCMK_LRM_OP_ERROR:return "Error"; case PCMK_LRM_OP_NOT_INSTALLED:return "Not installed"; case PCMK_LRM_OP_NOT_CONNECTED:return "No executor connection"; case PCMK_LRM_OP_INVALID:return "Cannot execute now"; default:return "UNKNOWN!"; } } static inline const char *services_ocf_exitcode_str(enum ocf_exitcode code) { switch (code) { case PCMK_OCF_OK: return "ok"; case PCMK_OCF_UNKNOWN_ERROR: - return "unknown error"; + return "error"; case PCMK_OCF_INVALID_PARAM: return "invalid parameter"; case PCMK_OCF_UNIMPLEMENT_FEATURE: return "unimplemented feature"; case PCMK_OCF_INSUFFICIENT_PRIV: return "insufficient privileges"; 
case PCMK_OCF_NOT_INSTALLED: return "not installed"; case PCMK_OCF_NOT_CONFIGURED: return "not configured"; case PCMK_OCF_NOT_RUNNING: return "not running"; case PCMK_OCF_RUNNING_MASTER: return "master"; case PCMK_OCF_FAILED_MASTER: return "master (failed)"; case PCMK_OCF_SIGNAL: return "OCF_SIGNAL"; case PCMK_OCF_NOT_SUPPORTED: return "OCF_NOT_SUPPORTED"; case PCMK_OCF_PENDING: return "OCF_PENDING"; case PCMK_OCF_CANCELLED: return "OCF_CANCELLED"; case PCMK_OCF_TIMEOUT: return "OCF_TIMEOUT"; case PCMK_OCF_OTHER_ERROR: return "OCF_OTHER_ERROR"; case PCMK_OCF_DEGRADED: return "OCF_DEGRADED"; case PCMK_OCF_DEGRADED_MASTER: return "OCF_DEGRADED_MASTER"; default: return "unknown"; } } /** * \brief Get OCF equivalent of LSB exit code * * \param[in] action LSB action that produced exit code * \param[in] lsb_exitcode Exit code of LSB action * * \return PCMK_OCF_* constant that corresponds to LSB exit code */ static inline enum ocf_exitcode services_get_ocf_exitcode(const char *action, int lsb_exitcode) { /* For non-status actions, LSB and OCF share error code meaning <= 7 */ if (action && strcmp(action, "status") && strcmp(action, "monitor")) { if ((lsb_exitcode < 0) || (lsb_exitcode > PCMK_LSB_NOT_RUNNING)) { return PCMK_OCF_UNKNOWN_ERROR; } return (enum ocf_exitcode)lsb_exitcode; } /* status has different return codes */ switch (lsb_exitcode) { case PCMK_LSB_STATUS_OK: return PCMK_OCF_OK; case PCMK_LSB_STATUS_NOT_INSTALLED: return PCMK_OCF_NOT_INSTALLED; case PCMK_LSB_STATUS_INSUFFICIENT_PRIV: return PCMK_OCF_INSUFFICIENT_PRIV; case PCMK_LSB_STATUS_VAR_PID: case PCMK_LSB_STATUS_VAR_LOCK: case PCMK_LSB_STATUS_NOT_RUNNING: return PCMK_OCF_NOT_RUNNING; } return PCMK_OCF_UNKNOWN_ERROR; } # ifdef __cplusplus } # endif #endif /* __PCMK_SERVICES__ */ diff --git a/lib/common/iso8601.c b/lib/common/iso8601.c index 7f64edd020..9ed0027520 100644 --- a/lib/common/iso8601.c +++ b/lib/common/iso8601.c @@ -1,1718 +1,1720 @@ /* * Copyright 2005-2019 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ /* * References: * https://en.wikipedia.org/wiki/ISO_8601 * http://www.staff.science.uu.nl/~gent0113/calendar/isocalendar.htm */ #include #include #include #include #include #include #include #include /* * Andrew's code was originally written for OSes whose "struct tm" contains: * long tm_gmtoff; :: Seconds east of UTC * const char *tm_zone; :: Timezone abbreviation * Some OSes lack these, instead having: * time_t (or long) timezone; :: "difference between UTC and local standard time" * char *tzname[2] = { "...", "..." }; * I (David Lee) confess to not understanding the details. So my attempted * generalisations for where their use is necessary may be flawed. * * 1. Does "difference between ..." subtract the same or opposite way? * 2. Should it use "altzone" instead of "timezone"? * 3. Should it use tzname[0] or tzname[1]? Interaction with timezone/altzone? */ #if defined(HAVE_STRUCT_TM_TM_GMTOFF) # define GMTOFF(tm) ((tm)->tm_gmtoff) #else /* Note: extern variable; macro argument not actually used. 
*/ # define GMTOFF(tm) (-timezone+daylight) #endif #define HOUR_SECONDS (60 * 60) #define DAY_SECONDS (HOUR_SECONDS * 24) // A date/time or duration struct crm_time_s { int years; // Calendar year (date/time) or number of years (duration) int months; // Number of months (duration only) int days; // Ordinal day of year (date/time) or number of days (duration) int seconds; // Seconds of day (date/time) or number of seconds (duration) int offset; // Seconds offset from UTC (date/time only) bool duration; // True if duration }; char *crm_time_as_string(crm_time_t * date_time, int flags); static crm_time_t *parse_date(const char *date_str); gboolean check_for_ordinal(const char *str); static crm_time_t * crm_get_utc_time(crm_time_t *dt) { crm_time_t *utc = NULL; if (dt == NULL) { errno = EINVAL; return NULL; } utc = crm_time_new_undefined(); utc->years = dt->years; utc->days = dt->days; utc->seconds = dt->seconds; utc->offset = 0; if (dt->offset) { crm_time_add_seconds(utc, -dt->offset); } else { /* Durations (which are the only things that can include months, never have a timezone */ utc->months = dt->months; } crm_time_log(LOG_TRACE, "utc-source", dt, crm_time_log_date | crm_time_log_timeofday | crm_time_log_with_timezone); crm_time_log(LOG_TRACE, "utc-target", utc, crm_time_log_date | crm_time_log_timeofday | crm_time_log_with_timezone); return utc; } crm_time_t * crm_time_new(const char *date_time) { time_t tm_now; crm_time_t *dt = NULL; tzset(); if (date_time == NULL) { tm_now = time(NULL); dt = crm_time_new_undefined(); crm_time_set_timet(dt, &tm_now); } else { dt = parse_date(date_time); } return dt; } /*! * \brief Allocate memory for an uninitialized time object * * \return Newly allocated time object * \note The caller is responsible for freeing the return value using * crm_time_free(). */ crm_time_t * crm_time_new_undefined() { crm_time_t *result = calloc(1, sizeof(crm_time_t)); CRM_ASSERT(result != NULL); return result; } /*! * \brief Check whether a time object has been initialized yet * * \param[in] t Time object to check * * \return TRUE if time object has been initialized, FALSE otherwise */ bool crm_time_is_defined(const crm_time_t *t) { // Any nonzero member indicates something has been done to t return (t != NULL) && (t->years || t->months || t->days || t->seconds || t->offset || t->duration); } void crm_time_free(crm_time_t * dt) { if (dt == NULL) { return; } free(dt); } static int year_days(int year) { int d = 365; if (crm_time_leapyear(year)) { d++; } return d; } /* From http://myweb.ecu.edu/mccartyr/ISOwdALG.txt : * * 5. Find the Jan1Weekday for Y (Monday=1, Sunday=7) * YY = (Y-1) % 100 * C = (Y-1) - YY * G = YY + YY/4 * Jan1Weekday = 1 + (((((C / 100) % 4) x 5) + G) % 7) */ int crm_time_january1_weekday(int year) { int YY = (year - 1) % 100; int C = (year - 1) - YY; int G = YY + YY / 4; int jan1 = 1 + (((((C / 100) % 4) * 5) + G) % 7); crm_trace("YY=%d, C=%d, G=%d", YY, C, G); crm_trace("January 1 %.4d: %d", year, jan1); return jan1; } int crm_time_weeks_in_year(int year) { int weeks = 52; int jan1 = crm_time_january1_weekday(year); /* if jan1 == thursday */ if (jan1 == 4) { weeks++; } else { jan1 = crm_time_january1_weekday(year + 1); /* if dec31 == thursday aka. jan1 of next year is a friday */ if (jan1 == 5) { weeks++; } } return weeks; } // Jan-Dec plus Feb of leap years static int month_days[13] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31, 29 }; /*! 
* \brief Return number of days in given month of given year * * \param[in] Ordinal month (1-12) * \param[in] Gregorian year * * \return Number of days in given month (0 if given month is invalid) */ int crm_time_days_in_month(int month, int year) { if ((month < 1) || (month > 12)) { return 0; } if ((month == 2) && crm_time_leapyear(year)) { month = 13; } return month_days[month - 1]; } bool crm_time_leapyear(int year) { gboolean is_leap = FALSE; if (year % 4 == 0) { is_leap = TRUE; } if (year % 100 == 0 && year % 400 != 0) { is_leap = FALSE; } return is_leap; } static uint32_t get_ordinal_days(uint32_t y, uint32_t m, uint32_t d) { int lpc; for (lpc = 1; lpc < m; lpc++) { d += crm_time_days_in_month(lpc, y); } return d; } void crm_time_log_alias(int log_level, const char *file, const char *function, int line, const char *prefix, crm_time_t * date_time, int flags) { char *date_s = crm_time_as_string(date_time, flags); if (log_level < LOG_CRIT) { printf("%s%s%s\n", (prefix? prefix : ""), (prefix? ": " : ""), date_s); } else { do_crm_log_alias(log_level, file, function, line, "%s%s%s", (prefix? prefix : ""), (prefix? ": " : ""), date_s); } free(date_s); } static int crm_time_get_sec(int sec, uint * h, uint * m, uint * s) { uint hours, minutes, seconds; if (sec < 0) { seconds = 0 - sec; } else { seconds = sec; } hours = seconds / HOUR_SECONDS; seconds -= HOUR_SECONDS * hours; minutes = seconds / 60; seconds -= 60 * minutes; crm_trace("%d == %.2d:%.2d:%.2d", sec, hours, minutes, seconds); *h = hours; *m = minutes; *s = seconds; return TRUE; } int crm_time_get_timeofday(crm_time_t * dt, uint * h, uint * m, uint * s) { return crm_time_get_sec(dt->seconds, h, m, s); } int crm_time_get_timezone(crm_time_t * dt, uint * h, uint * m) { uint s; return crm_time_get_sec(dt->seconds, h, m, &s); } long long crm_time_get_seconds(crm_time_t * dt) { int lpc; crm_time_t *utc = NULL; long long in_seconds = 0; if (dt == NULL) { return 0; } utc = crm_get_utc_time(dt); if (utc == NULL) { return 0; } for (lpc = 1; lpc < utc->years; lpc++) { int dmax = year_days(lpc); in_seconds += DAY_SECONDS * dmax; } /* utc->months is an offset that can only be set for a duration. * By definition, the value is variable depending on the date to * which it is applied. * * Force 30-day months so that something vaguely sane happens * for anyone that tries to use a month in this way. */ if (utc->months > 0) { in_seconds += DAY_SECONDS * 30 * utc->months; } if (utc->days > 0) { in_seconds += DAY_SECONDS * (utc->days - 1); } in_seconds += utc->seconds; crm_time_free(utc); return in_seconds; } #define EPOCH_SECONDS 62135596800ULL /* Calculated using crm_time_get_seconds() */ long long crm_time_get_seconds_since_epoch(crm_time_t * dt) { return (dt == NULL)? 
0 : (crm_time_get_seconds(dt) - EPOCH_SECONDS); } int crm_time_get_gregorian(crm_time_t * dt, uint * y, uint * m, uint * d) { int months = 0; int days = dt->days; if(dt->years != 0) { for (months = 1; months <= 12 && days > 0; months++) { int mdays = crm_time_days_in_month(months, dt->years); if (mdays >= days) { break; } else { days -= mdays; } } } else if (dt->months) { /* This is a duration including months, don't convert the days field */ months = dt->months; } else { /* This is a duration not including months, still don't convert the days field */ } *y = dt->years; *m = months; *d = days; crm_trace("%.4d-%.3d -> %.4d-%.2d-%.2d", dt->years, dt->days, dt->years, months, days); return TRUE; } int crm_time_get_ordinal(crm_time_t * dt, uint * y, uint * d) { *y = dt->years; *d = dt->days; return TRUE; } int crm_time_get_isoweek(crm_time_t * dt, uint * y, uint * w, uint * d) { /* * Monday 29 December 2008 is written "2009-W01-1" * Sunday 3 January 2010 is written "2009-W53-7" */ int year_num = 0; int jan1 = crm_time_january1_weekday(dt->years); int h = -1; CRM_CHECK(dt->days > 0, return FALSE); /* 6. Find the Weekday for Y M D */ h = dt->days + jan1 - 1; *d = 1 + ((h - 1) % 7); /* 7. Find if Y M D falls in YearNumber Y-1, WeekNumber 52 or 53 */ if (dt->days <= (8 - jan1) && jan1 > 4) { crm_trace("year--, jan1=%d", jan1); year_num = dt->years - 1; *w = crm_time_weeks_in_year(year_num); } else { year_num = dt->years; } /* 8. Find if Y M D falls in YearNumber Y+1, WeekNumber 1 */ if (year_num == dt->years) { int dmax = year_days(year_num); int correction = 4 - *d; if ((dmax - dt->days) < correction) { crm_trace("year++, jan1=%d, i=%d vs. %d", jan1, dmax - dt->days, correction); year_num = dt->years + 1; *w = 1; } } /* 9. Find if Y M D falls in YearNumber Y, WeekNumber 1 through 53 */ if (year_num == dt->years) { int j = dt->days + (7 - *d) + (jan1 - 1); *w = j / 7; if (jan1 > 4) { *w -= 1; } } *y = year_num; crm_trace("Converted %.4d-%.3d to %.4d-W%.2d-%d", dt->years, dt->days, *y, *w, *d); return TRUE; } #define DATE_MAX 128 #define s_if_plural(i) (((i) == 1)? "" : "s") static void crm_duration_as_string(crm_time_t *dt, char *result) { size_t offset = 0; if (dt->years) { offset += snprintf(result + offset, DATE_MAX - offset, "%4d year%s ", dt->years, s_if_plural(dt->years)); } if (dt->months) { offset += snprintf(result + offset, DATE_MAX - offset, "%2d month%s ", dt->months, s_if_plural(dt->months)); } if (dt->days) { offset += snprintf(result + offset, DATE_MAX - offset, "%2d day%s ", dt->days, s_if_plural(dt->days)); } if (((offset == 0) || (dt->seconds != 0)) && (dt->seconds > -60) && (dt->seconds < 60)) { offset += snprintf(result + offset, DATE_MAX - offset, "%d second%s", dt->seconds, s_if_plural(dt->seconds)); } else if (dt->seconds) { uint h = 0, m = 0, s = 0; offset += snprintf(result + offset, DATE_MAX - offset, "%d seconds (", dt->seconds); crm_time_get_sec(dt->seconds, &h, &m, &s); if (h) { offset += snprintf(result + offset, DATE_MAX - offset, "%u hour%s%s", h, s_if_plural(h), ((m || s)? " " : "")); } if (m) { offset += snprintf(result + offset, DATE_MAX - offset, "%u minute%s%s", m, s_if_plural(m), (s? 
" " : "")); } if (s) { offset += snprintf(result + offset, DATE_MAX - offset, "%u second%s", s, s_if_plural(s)); } offset += snprintf(result + offset, DATE_MAX - offset, ")"); } } char * crm_time_as_string(crm_time_t * date_time, int flags) { crm_time_t *dt = NULL; crm_time_t *utc = NULL; char result[DATE_MAX] = { '\0', }; char *result_copy = NULL; size_t offset = 0; // Convert to UTC if local timezone was not requested if (date_time && date_time->offset && is_not_set(flags, crm_time_log_with_timezone)) { crm_trace("UTC conversion"); utc = crm_get_utc_time(date_time); dt = utc; } else { dt = date_time; } if (!crm_time_is_defined(dt)) { strcpy(result, ""); goto done; } // Simple cases: as duration, seconds, or seconds since epoch if (flags & crm_time_log_duration) { crm_duration_as_string(date_time, result); goto done; } if (flags & crm_time_seconds) { snprintf(result, DATE_MAX, "%lld", crm_time_get_seconds(date_time)); goto done; } if (flags & crm_time_epoch) { snprintf(result, DATE_MAX, "%lld", crm_time_get_seconds_since_epoch(date_time)); goto done; } // As readable string if (flags & crm_time_log_date) { if (flags & crm_time_weeks) { // YYYY-WW-D uint y, w, d; if (crm_time_get_isoweek(dt, &y, &w, &d)) { offset += snprintf(result + offset, DATE_MAX - offset, "%u-W%.2u-%u", y, w, d); } } else if (flags & crm_time_ordinal) { // YYYY-DDD uint y, d; if (crm_time_get_ordinal(dt, &y, &d)) { offset += snprintf(result + offset, DATE_MAX - offset, "%u-%.3u", y, d); } } else { // YYYY-MM-DD uint y, m, d; if (crm_time_get_gregorian(dt, &y, &m, &d)) { offset += snprintf(result + offset, DATE_MAX - offset, "%.4u-%.2u-%.2u", y, m, d); } } } if (flags & crm_time_log_timeofday) { uint h = 0, m = 0, s = 0; if (offset > 0) { offset += snprintf(result + offset, DATE_MAX - offset, " "); } if (crm_time_get_timeofday(dt, &h, &m, &s)) { offset += snprintf(result + offset, DATE_MAX - offset, "%.2u:%.2u:%.2u", h, m, s); } if ((flags & crm_time_log_with_timezone) && (dt->offset != 0)) { crm_time_get_sec(dt->offset, &h, &m, &s); offset += snprintf(result + offset, DATE_MAX - offset, " %c%.2u:%.2u", ((dt->offset < 0)? '-' : '+'), h, m); } else { offset += snprintf(result + offset, DATE_MAX - offset, "Z"); } } done: crm_time_free(utc); result_copy = strdup(result); CRM_ASSERT(result_copy != NULL); return result_copy; } /*! * \internal * \brief Determine number of seconds from an hour:minute:second string * * \param[in] time_str Time specification string * \param[out] result Number of seconds equivalent to time_str * * \return TRUE if specification was valid, FALSE (and set errno) otherwise * \note This may return the number of seconds in a day (which is out of bounds * for a time object) if given 24:00:00. 
*/ static bool crm_time_parse_sec(const char *time_str, int *result) { int rc; uint hour = 0; uint minute = 0; uint second = 0; *result = 0; // Must have at least hour, but minutes and seconds are optional rc = sscanf(time_str, "%d:%d:%d", &hour, &minute, &second); if (rc == 1) { rc = sscanf(time_str, "%2d%2d%2d", &hour, &minute, &second); } if (rc == 0) { crm_err("%s is not a valid ISO 8601 time specification", time_str); errno = EINVAL; return FALSE; } crm_trace("Got valid time: %.2d:%.2d:%.2d", hour, minute, second); if ((hour == 24) && (minute == 0) && (second == 0)) { // Equivalent to 00:00:00 of next day, return number of seconds in day } else if (hour >= 24) { crm_err("%s is not a valid ISO 8601 time specification " "because %d is not a valid hour", time_str, hour); errno = EINVAL; return FALSE; } if (minute >= 60) { crm_err("%s is not a valid ISO 8601 time specification " "because %d is not a valid minute", time_str, minute); errno = EINVAL; return FALSE; } if (second >= 60) { crm_err("%s is not a valid ISO 8601 time specification " "because %d is not a valid second", time_str, second); errno = EINVAL; return FALSE; } *result = (hour * HOUR_SECONDS) + (minute * 60) + second; return TRUE; } static bool crm_time_parse_offset(const char *offset_str, int *offset) { tzset(); if (offset_str == NULL) { // Use local offset #if defined(HAVE_STRUCT_TM_TM_GMTOFF) time_t now = time(NULL); struct tm *now_tm = localtime(&now); #endif int h_offset = GMTOFF(now_tm) / HOUR_SECONDS; int m_offset = (GMTOFF(now_tm) - (HOUR_SECONDS * h_offset)) / 60; if (h_offset < 0 && m_offset < 0) { m_offset = 0 - m_offset; } *offset = (HOUR_SECONDS * h_offset) + (60 * m_offset); return TRUE; } if (offset_str[0] == 'Z') { // @TODO invalid if anything after? *offset = 0; return TRUE; } *offset = 0; if ((offset_str[0] == '+') || (offset_str[0] == '-') || isdigit((int)offset_str[0])) { gboolean negate = FALSE; if (offset_str[0] == '-') { negate = TRUE; offset_str++; } if (crm_time_parse_sec(offset_str, offset) == FALSE) { return FALSE; } if (negate) { *offset = 0 - *offset; } } // @TODO else invalid? return TRUE; } /*! * \internal * \brief Parse the time portion of an ISO 8601 date/time string * * \param[in] time_str Time portion of specification (after any 'T') * \param[in,out] a_time Time object to parse into * * \return TRUE if valid time was parsed, FALSE (and set errno) otherwise * \note This may add a day to a_time (if the time is 24:00:00). */ static bool crm_time_parse(const char *time_str, crm_time_t *a_time) { uint h, m, s; char *offset_s = NULL; tzset(); if (time_str) { if (crm_time_parse_sec(time_str, &(a_time->seconds)) == FALSE) { return FALSE; } offset_s = strstr(time_str, "Z"); if (offset_s == NULL) { offset_s = strstr(time_str, " "); if (offset_s) { while (isspace(offset_s[0])) { offset_s++; } } } } if (crm_time_parse_offset(offset_s, &(a_time->offset)) == FALSE) { return FALSE; } crm_time_get_sec(a_time->offset, &h, &m, &s); crm_trace("Got tz: %c%2.d:%.2d", ((a_time->offset < 0)? 
'-' : '+'), h, m); if (a_time->seconds == DAY_SECONDS) { // 24:00:00 == 00:00:00 of next day a_time->seconds = 0; crm_time_add_days(a_time, 1); } return TRUE; } /* * \internal * \brief Parse a time object from an ISO 8601 date/time specification * * \param[in] date_str ISO 8601 date/time specification (or "epoch") * * \return New time object on success, NULL (and set errno) otherwise */ static crm_time_t * parse_date(const char *date_str) { const char *time_s = NULL; crm_time_t *dt = NULL; int year = 0; int month = 0; int week = 0; int day = 0; int rc = 0; if ((date_str == NULL) || (date_str[0] == '\0')) { crm_err("No ISO 8601 date/time specification given"); goto invalid; } if ((date_str[0] == 'T') || (date_str[2] == ':')) { /* Just a time supplied - Infer current date */ dt = crm_time_new(NULL); if (date_str[0] == 'T') { time_s = date_str + 1; } else { time_s = date_str; } goto parse_time; } dt = crm_time_new_undefined(); if (!strncasecmp("epoch", date_str, 5) && ((date_str[5] == '\0') || (date_str[5] == '/') || isspace(date_str[5]))) { dt->days = 1; dt->years = 1970; crm_time_log(LOG_TRACE, "Unpacked", dt, crm_time_log_date | crm_time_log_timeofday); return dt; } /* YYYY-MM-DD */ rc = sscanf(date_str, "%d-%d-%d", &year, &month, &day); if (rc == 1) { /* YYYYMMDD */ rc = sscanf(date_str, "%4d%2d%2d", &year, &month, &day); } if (rc == 3) { if (month > 12) { crm_err("'%s' is not a valid ISO 8601 date/time specification " "because '%d' is not a valid month", date_str, month); goto invalid; } else if (day > crm_time_days_in_month(month, year)) { crm_err("'%s' is not a valid ISO 8601 date/time specification " "because '%d' is not a valid day of the month", date_str, day); goto invalid; } else { dt->years = year; dt->days = get_ordinal_days(year, month, day); crm_trace("Parsed Gregorian date '%.4d-%.3d' from date string '%s'", year, dt->days, date_str); } goto parse_time; } /* YYYY-DDD */ rc = sscanf(date_str, "%d-%d", &year, &day); if (rc == 2) { if (day > year_days(year)) { crm_err("'%s' is not a valid ISO 8601 date/time specification " "because '%d' is not a valid day of the year (max %d)", date_str, day, year_days(year)); goto invalid; } crm_trace("Parsed ordinal year %d and days %d from date string '%s'", year, day, date_str); dt->days = day; dt->years = year; goto parse_time; } /* YYYY-Www-D */ rc = sscanf(date_str, "%d-W%d-%d", &year, &week, &day); if (rc == 3) { if (week > crm_time_weeks_in_year(year)) { crm_err("'%s' is not a valid ISO 8601 date/time specification " "because '%d' is not a valid week of the year (max %d)", date_str, week, crm_time_weeks_in_year(year)); goto invalid; } else if (day < 1 || day > 7) { crm_err("'%s' is not a valid ISO 8601 date/time specification " "because '%d' is not a valid day of the week", date_str, day); goto invalid; } else { /* * See https://en.wikipedia.org/wiki/ISO_week_date * * Monday 29 December 2008 is written "2009-W01-1" * Sunday 3 January 2010 is written "2009-W53-7" * Saturday 27 September 2008 is written "2008-W37-6" * * If 1 January is on a Monday, Tuesday, Wednesday or Thursday, it is in week 01. * If 1 January is on a Friday, Saturday or Sunday, it is in week 52 or 53 of the previous year. 
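 *
 * Worked example (illustrative, assuming crm_time_january1_weekday() uses
 * ISO numbering with Monday = 1): for "2009-W01-1", 1 January 2009 is a
 * Thursday, so jan1 = 4. Starting from ordinal day 0 of 2009, the code below
 * adds (1 - 1) * 7, then (1 - jan1) = -3, then day = 1, a net offset of -2,
 * which crm_time_add_days() normalizes to 2008-364, i.e. Monday
 * 29 December 2008. The same arithmetic turns "2009-W53-7" into 2010-003,
 * i.e. Sunday 3 January 2010.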
*/ int jan1 = crm_time_january1_weekday(year); crm_trace("Got year %d (Jan 1 = %d), week %d, and day %d from date string '%s'", year, jan1, week, day, date_str); dt->years = year; crm_time_add_days(dt, (week - 1) * 7); if (jan1 <= 4) { crm_time_add_days(dt, 1 - jan1); } else { crm_time_add_days(dt, 8 - jan1); } crm_time_add_days(dt, day); } goto parse_time; } crm_err("'%s' is not a valid ISO 8601 date/time specification", date_str); goto invalid; parse_time: if (time_s == NULL) { time_s = date_str + strspn(date_str, "0123456789-W"); if ((time_s[0] == ' ') || (time_s[0] == 'T')) { ++time_s; } else { time_s = NULL; } } if ((time_s != NULL) && (crm_time_parse(time_s, dt) == FALSE)) { goto invalid; } crm_time_log(LOG_TRACE, "Unpacked", dt, crm_time_log_date | crm_time_log_timeofday); if (crm_time_check(dt) == FALSE) { crm_err("'%s' is not a valid ISO 8601 date/time specification", date_str); goto invalid; } return dt; invalid: crm_time_free(dt); errno = EINVAL; return NULL; } // Parse an ISO 8601 numeric value and return number of characters consumed // @TODO This cannot handle >INT_MAX int values // @TODO Fractions appear to be not working // @TODO Error out on invalid specifications static int parse_int(const char *str, int field_width, int upper_bound, int *result) { int lpc = 0; int offset = 0; int intermediate = 0; gboolean fraction = FALSE; gboolean negate = FALSE; *result = 0; if (*str == '\0') { return 0; } if (str[offset] == 'T') { offset++; } if (str[offset] == '.' || str[offset] == ',') { fraction = TRUE; field_width = -1; offset++; } else if (str[offset] == '-') { negate = TRUE; offset++; } else if (str[offset] == '+' || str[offset] == ':') { offset++; } for (; (fraction || lpc < field_width) && isdigit((int)str[offset]); lpc++) { if (fraction) { intermediate = (str[offset] - '0') / (10 ^ lpc); } else { *result *= 10; intermediate = str[offset] - '0'; } *result += intermediate; offset++; } if (fraction) { *result = (int)(*result * upper_bound); } else if (upper_bound > 0 && *result > upper_bound) { *result = upper_bound; } if (negate) { *result = 0 - *result; } if (lpc > 0) { crm_trace("Found int: %d. Stopped at str[%d]='%c'", *result, lpc, str[lpc]); return offset; } return 0; } /*! * \brief Parse a time duration from an ISO 8601 duration specification * * \param[in] period_s ISO 8601 duration specification (optionally followed by * whitespace, after which the rest of the string will be * ignored) * * \return New time object on success, NULL (and set errno) otherwise * \note It is the caller's responsibility to return the result using * crm_time_free(). */ crm_time_t * crm_time_parse_duration(const char *period_s) { gboolean is_time = FALSE; crm_time_t *diff = NULL; if ((period_s == NULL) || (period_s[0] == '\0')) { crm_err("No ISO 8601 time duration given"); goto invalid; } if (period_s[0] != 'P') { crm_err("'%s' is not a valid ISO 8601 time duration " "because it does not start with a 'P'", period_s); goto invalid; } if ((period_s[1] == '\0') || isspace(period_s[1])) { crm_err("'%s' is not a valid ISO 8601 time duration " "because nothing follows 'P'", period_s); goto invalid; } diff = crm_time_new_undefined(); diff->duration = TRUE; for (const char *current = period_s + 1; current[0] && (current[0] != '/') && !isspace(current[0]); ++current) { int an_int = 0, rc; if (current[0] == 'T') { /* A 'T' separates year/month/day from hour/minute/seconds. We don't * require it strictly, but just use it to differentiate month from * minutes. 
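 *
 * Illustrative decomposition (not from this patch), following the unit
 * switch further below: "P1M" is one month while "PT1M" is one minute, and
 *
 *     crm_time_t *d = crm_time_parse_duration("P1Y2M3DT4H5M6S");
 *     // d->years == 1, d->months == 2, d->days == 3,
 *     // d->seconds == 4*3600 + 5*60 + 6 == 14706
 *     crm_time_free(d);
 *
 * "P2W" likewise ends up with days == 14.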
*/ is_time = TRUE; continue; } // An integer must be next rc = parse_int(current, 10, 0, &an_int); if (rc == 0) { crm_err("'%s' is not a valid ISO 8601 time duration " "because no integer at '%s'", period_s, current); goto invalid; } current += rc; // A time unit must be next (we're not strict about the order) switch (current[0]) { case 'Y': diff->years = an_int; break; case 'M': if (is_time) { /* Minutes */ diff->seconds += an_int * 60; } else { diff->months = an_int; } break; case 'W': diff->days += an_int * 7; break; case 'D': diff->days += an_int; break; case 'H': diff->seconds += an_int * HOUR_SECONDS; break; case 'S': diff->seconds += an_int; break; case '\0': crm_err("'%s' is not a valid ISO 8601 time duration " "because no units after %d", period_s, an_int); goto invalid; default: crm_err("'%s' is not a valid ISO 8601 time duration " "because '%c' is not a valid time unit", period_s, current[0]); goto invalid; } } if (!crm_time_is_defined(diff)) { crm_err("'%s' is not a valid ISO 8601 time duration " "because no amounts and units given", period_s); goto invalid; } return diff; invalid: crm_time_free(diff); errno = EINVAL; return NULL; } /*! * \brief Parse a time period from an ISO 8601 interval specification * * \param[in] period_str ISO 8601 interval specification (start/end, * start/duration, or duration/end) * * \return New time period object on success, NULL (and set errno) otherwise * \note The caller is responsible for freeing the result using * crm_time_free_period(). */ crm_time_period_t * crm_time_parse_period(const char *period_str) { const char *original = period_str; crm_time_period_t *period = NULL; if ((period_str == NULL) || (period_str[0] == '\0')) { crm_err("No ISO 8601 time period given"); goto invalid; } tzset(); period = calloc(1, sizeof(crm_time_period_t)); CRM_ASSERT(period != NULL); if (period_str[0] == 'P') { period->diff = crm_time_parse_duration(period_str); if (period->diff == NULL) { goto error; } } else { period->start = parse_date(period_str); if (period->start == NULL) { goto error; } } period_str = strstr(original, "/"); if (period_str) { ++period_str; if (period_str[0] == 'P') { if (period->diff != NULL) { crm_err("'%s' is not a valid ISO 8601 time period " "because it has two durations", original); goto invalid; } period->diff = crm_time_parse_duration(period_str); if (period->diff == NULL) { goto error; } } else { period->end = parse_date(period_str); if (period->end == NULL) { goto error; } } } else if (period->diff != NULL) { // Only duration given, assume start is now period->start = crm_time_new(NULL); } else { // Only start given crm_err("'%s' is not a valid ISO 8601 time period " "because it has no duration or ending time", original); goto invalid; } if (period->start == NULL) { period->start = crm_time_subtract(period->end, period->diff); } else if (period->end == NULL) { period->end = crm_time_add(period->start, period->diff); } if (crm_time_check(period->start) == FALSE) { crm_err("'%s' is not a valid ISO 8601 time period " "because the start is invalid", period_str); goto invalid; } if (crm_time_check(period->end) == FALSE) { crm_err("'%s' is not a valid ISO 8601 time period " "because the end is invalid", period_str); goto invalid; } return period; invalid: errno = EINVAL; error: crm_time_free_period(period); return NULL; } /*! 
* \brief Free a dynamically allocated time period object * * \param[in] period Time period to free */ void crm_time_free_period(crm_time_period_t *period) { if (period) { crm_time_free(period->start); crm_time_free(period->end); crm_time_free(period->diff); free(period); } } void crm_time_set(crm_time_t * target, crm_time_t * source) { crm_trace("target=%p, source=%p", target, source); CRM_CHECK(target != NULL && source != NULL, return); target->years = source->years; target->days = source->days; target->months = source->months; /* Only for durations */ target->seconds = source->seconds; target->offset = source->offset; crm_time_log(LOG_TRACE, "source", source, crm_time_log_date | crm_time_log_timeofday | crm_time_log_with_timezone); crm_time_log(LOG_TRACE, "target", target, crm_time_log_date | crm_time_log_timeofday | crm_time_log_with_timezone); } static void ha_set_tm_time(crm_time_t * target, struct tm *source) { int h_offset = 0; int m_offset = 0; /* Ensure target is fully initialized */ target->years = 0; target->months = 0; target->days = 0; target->seconds = 0; target->offset = 0; target->duration = FALSE; if (source->tm_year > 0) { /* years since 1900 */ target->years = 1900 + source->tm_year; } if (source->tm_yday >= 0) { /* days since January 1 [0-365] */ target->days = 1 + source->tm_yday; } if (source->tm_hour >= 0) { target->seconds += HOUR_SECONDS * source->tm_hour; } if (source->tm_min >= 0) { target->seconds += 60 * source->tm_min; } if (source->tm_sec >= 0) { target->seconds += source->tm_sec; } /* tm_gmtoff == offset from UTC in seconds */ h_offset = GMTOFF(source) / HOUR_SECONDS; m_offset = (GMTOFF(source) - (HOUR_SECONDS * h_offset)) / 60; crm_trace("Offset (s): %ld, offset (hh:mm): %.2d:%.2d", GMTOFF(source), h_offset, m_offset); target->offset += HOUR_SECONDS * h_offset; target->offset += 60 * m_offset; } void crm_time_set_timet(crm_time_t * target, time_t * source) { ha_set_tm_time(target, localtime(source)); } crm_time_t * crm_time_add(crm_time_t * dt, crm_time_t * value) { crm_time_t *utc = NULL; crm_time_t *answer = NULL; if ((dt == NULL) || (value == NULL)) { errno = EINVAL; return NULL; } answer = crm_time_new_undefined(); crm_time_set(answer, dt); utc = crm_get_utc_time(value); if (utc == NULL) { crm_time_free(answer); return NULL; } answer->years += utc->years; crm_time_add_months(answer, utc->months); crm_time_add_days(answer, utc->days); crm_time_add_seconds(answer, utc->seconds); crm_time_free(utc); return answer; } crm_time_t * crm_time_calculate_duration(crm_time_t * dt, crm_time_t * value) { crm_time_t *utc = NULL; crm_time_t *answer = NULL; if ((dt == NULL) || (value == NULL)) { errno = EINVAL; return NULL; } utc = crm_get_utc_time(value); if (utc == NULL) { return NULL; } answer = crm_get_utc_time(dt); if (answer == NULL) { crm_time_free(utc); return NULL; } answer->duration = TRUE; answer->years -= utc->years; if(utc->months != 0) { crm_time_add_months(answer, -utc->months); } crm_time_add_days(answer, -utc->days); crm_time_add_seconds(answer, -utc->seconds); crm_time_free(utc); return answer; } crm_time_t * crm_time_subtract(crm_time_t * dt, crm_time_t * value) { crm_time_t *utc = NULL; crm_time_t *answer = NULL; if ((dt == NULL) || (value == NULL)) { errno = EINVAL; return NULL; } utc = crm_get_utc_time(value); if (utc == NULL) { return NULL; } answer = crm_time_new_undefined(); crm_time_set(answer, dt); answer->years -= utc->years; if(utc->months != 0) { crm_time_add_months(answer, -utc->months); } crm_time_add_days(answer, -utc->days); 
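    /* Illustrative note: both crm_time_add_days() above and
     * crm_time_add_seconds() below normalize their result, so borrows
     * cascade. For example, subtracting a PT1S duration from
     * 2019-001 00:00:00 leaves seconds at -1, which crm_time_add_seconds()
     * rewrites as +86399 seconds minus one day, and crm_time_add_days()
     * then rolls the date back to 2018-365 23:59:59. */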
crm_time_add_seconds(answer, -utc->seconds); return answer; } /*! * \brief Check whether a time object represents a sensible date/time * * \param[in] dt Date/time object to check * * \return TRUE if years, days, and seconds are sensible, FALSE otherwise */ bool crm_time_check(crm_time_t * dt) { return (dt != NULL) && (dt->days > 0) && (dt->days <= year_days(dt->years)) && (dt->seconds >= 0) && (dt->seconds < DAY_SECONDS); } #define do_cmp_field(l, r, field) \ if(rc == 0) { \ if(l->field > r->field) { \ crm_trace("%s: %d > %d", \ #field, l->field, r->field); \ rc = 1; \ } else if(l->field < r->field) { \ crm_trace("%s: %d < %d", \ #field, l->field, r->field); \ rc = -1; \ } \ } int crm_time_compare(crm_time_t *a, crm_time_t *b) { int rc = 0; crm_time_t *t1 = crm_get_utc_time(a); crm_time_t *t2 = crm_get_utc_time(b); if ((t1 == NULL) && (t2 == NULL)) { return 0; } else if (t1 == NULL) { return -1; } else if (t2 == NULL) { return 1; } do_cmp_field(t1, t2, years); do_cmp_field(t1, t2, days); do_cmp_field(t1, t2, seconds); crm_time_free(t1); crm_time_free(t2); return rc; } /*! * \brief Add a given number of seconds to a date/time or duration * * \param[in] a_time Date/time or duration to add seconds to * \param[in] extra Number of seconds to add */ void crm_time_add_seconds(crm_time_t *a_time, int extra) { int days = 0; crm_trace("Adding %d seconds to %d (max=%d)", extra, a_time->seconds, DAY_SECONDS); a_time->seconds += extra; days = a_time->seconds / DAY_SECONDS; a_time->seconds %= DAY_SECONDS; // Don't have negative seconds if (a_time->seconds < 0) { a_time->seconds += DAY_SECONDS; --days; } crm_time_add_days(a_time, days); } void crm_time_add_days(crm_time_t * a_time, int extra) { int lower_bound = 1; int ydays = crm_time_leapyear(a_time->years) ? 366 : 365; crm_trace("Adding %d days to %.4d-%.3d", extra, a_time->years, a_time->days); a_time->days += extra; while (a_time->days > ydays) { a_time->years++; a_time->days -= ydays; ydays = crm_time_leapyear(a_time->years) ? 366 : 365; } if(a_time->duration) { lower_bound = 0; } while (a_time->days < lower_bound) { a_time->years--; a_time->days += crm_time_leapyear(a_time->years) ? 
366 : 365; } } void crm_time_add_months(crm_time_t * a_time, int extra) { int lpc; uint32_t y, m, d, dmax; crm_time_get_gregorian(a_time, &y, &m, &d); crm_trace("Adding %d months to %.4d-%.2d-%.2d", extra, y, m, d); if (extra > 0) { for (lpc = extra; lpc > 0; lpc--) { m++; if (m == 13) { m = 1; y++; } } } else { for (lpc = -extra; lpc > 0; lpc--) { m--; if (m == 0) { m = 12; y--; } } } dmax = crm_time_days_in_month(m, y); if (dmax < d) { /* Preserve day-of-month unless the month doesn't have enough days */ d = dmax; } crm_trace("Calculated %.4d-%.2d-%.2d", y, m, d); a_time->years = y; a_time->days = get_ordinal_days(y, m, d); crm_time_get_gregorian(a_time, &y, &m, &d); crm_trace("Got %.4d-%.2d-%.2d", y, m, d); } void crm_time_add_minutes(crm_time_t * a_time, int extra) { crm_time_add_seconds(a_time, extra * 60); } void crm_time_add_hours(crm_time_t * a_time, int extra) { crm_time_add_seconds(a_time, extra * HOUR_SECONDS); } void crm_time_add_weeks(crm_time_t * a_time, int extra) { crm_time_add_days(a_time, extra * 7); } void crm_time_add_years(crm_time_t * a_time, int extra) { a_time->years += extra; } static void ha_get_tm_time( struct tm *target, crm_time_t *source) { *target = (struct tm) { .tm_year = source->years - 1900, .tm_mday = source->days, .tm_sec = source->seconds % 60, .tm_min = ( source->seconds / 60 ) % 60, .tm_hour = source->seconds / HOUR_SECONDS, .tm_isdst = -1, /* don't adjust */ #if defined(HAVE_STRUCT_TM_TM_GMTOFF) .tm_gmtoff = source->offset #endif }; mktime(target); } crm_time_hr_t * crm_time_hr_convert(crm_time_hr_t *target, crm_time_t *dt) { crm_time_hr_t *hr_dt = NULL; if (dt) { hr_dt = target?target:calloc(1, sizeof(crm_time_hr_t)); CRM_ASSERT(hr_dt != NULL); *hr_dt = (crm_time_hr_t) { .years = dt->years, .months = dt->months, .days = dt->days, .seconds = dt->seconds, .offset = dt->offset, .duration = dt->duration }; } return hr_dt; } void crm_time_set_hr_dt(crm_time_t *target, crm_time_hr_t *hr_dt) { CRM_ASSERT((hr_dt) && (target)); *target = (crm_time_t) { .years = hr_dt->years, .months = hr_dt->months, .days = hr_dt->days, .seconds = hr_dt->seconds, .offset = hr_dt->offset, .duration = hr_dt->duration }; } crm_time_hr_t * crm_time_timeval_hr_convert(crm_time_hr_t *target, struct timeval *tv) { crm_time_t dt; crm_time_hr_t *ret; crm_time_set_timet(&dt, &tv->tv_sec); ret = crm_time_hr_convert(target, &dt); if (ret) { ret->useconds = tv->tv_usec; } return ret; } crm_time_hr_t * crm_time_hr_new(const char *date_time) { crm_time_hr_t *hr_dt = NULL; struct timeval tv_now; if (!date_time) { if (gettimeofday(&tv_now, NULL) == 0) { hr_dt = crm_time_timeval_hr_convert(NULL, &tv_now); } } else { crm_time_t *dt; dt = parse_date(date_time); hr_dt = crm_time_hr_convert(NULL, dt); crm_time_free(dt); } return hr_dt; } void crm_time_hr_free(crm_time_hr_t * hr_dt) { free(hr_dt); } char * crm_time_format_hr(const char *format, crm_time_hr_t * hr_dt) { const char *mark_s; int max = 128, scanned_pos = 0, printed_pos = 0, fmt_pos = 0, date_len = 0, nano_digits = 0; char nano_s[10], date_s[max+1], nanofmt_s[5] = "%", *tmp_fmt_s; struct tm tm; crm_time_t dt; if (!format) { return NULL; } crm_time_set_hr_dt(&dt, hr_dt); ha_get_tm_time(&tm, &dt); sprintf(nano_s, "%06d000", hr_dt->useconds); while ((format[scanned_pos]) != '\0') { mark_s = strchr(&format[scanned_pos], '%'); if (mark_s) { int fmt_len = 1; fmt_pos = mark_s - format; while ((format[fmt_pos+fmt_len] != '\0') && (format[fmt_pos+fmt_len] >= '0') && (format[fmt_pos+fmt_len] <= '9')) { fmt_len++; } scanned_pos = fmt_pos + 
fmt_len + 1; if (format[fmt_pos+fmt_len] == 'N') { nano_digits = atoi(&format[fmt_pos+1]); nano_digits = (nano_digits > 6)?6:nano_digits; nano_digits = (nano_digits < 0)?0:nano_digits; sprintf(&nanofmt_s[1], ".%ds", nano_digits); } else { if (format[scanned_pos] != '\0') { continue; } fmt_pos = scanned_pos; /* print till end */ } } else { scanned_pos = strlen(format); fmt_pos = scanned_pos; /* print till end */ } tmp_fmt_s = strndup(&format[printed_pos], fmt_pos - printed_pos); #ifdef GCC_FORMAT_NONLITERAL_CHECKING_ENABLED #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wformat-nonliteral" #endif date_len += strftime(&date_s[date_len], max-date_len, tmp_fmt_s, &tm); #ifdef GCC_FORMAT_NONLITERAL_CHECKING_ENABLED #pragma GCC diagnostic pop #endif printed_pos = scanned_pos; free(tmp_fmt_s); if (nano_digits) { #ifdef GCC_FORMAT_NONLITERAL_CHECKING_ENABLED #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wformat-nonliteral" #endif date_len += snprintf(&date_s[date_len], max-date_len, nanofmt_s, nano_s); #ifdef GCC_FORMAT_NONLITERAL_CHECKING_ENABLED #pragma GCC diagnostic pop #endif nano_digits = 0; } } return (date_len == 0)?NULL:strdup(date_s); } /*! * \internal - * \brief Return human-friendly string representing current time + * \brief Return human-friendly string corresponding to a time * - * \return Current time as string (as by ctime() but without newline) on success - * or "Could not determine current time" on error + * \param[in] when Pointer to epoch time value (or NULL for current time) + * + * \return Current time as string (as by ctime() but without newline) on + * success, NULL otherwise * \note The return value points to a statically allocated string which might be * overwritten by subsequent calls to any of the C library date and time functions. */ const char * crm_now_string(time_t *when) { char *since_epoch = NULL; if (when == NULL) { time_t a_time = time(NULL); if (a_time == (time_t) -1) { return NULL; } else { since_epoch = ctime(&a_time); } } else { since_epoch = ctime(when); } if (since_epoch == NULL) { return NULL; } else { return crm_strip_trailing_newline(since_epoch); } } diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index 64c1979bf8..d33775838c 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -1,3774 +1,3842 @@ /* * Copyright 2004-2019 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
*/ #include +#include +#include #include #include #include #include #include #include #include #include +#include #include #include CRM_TRACE_INIT_DATA(pe_status); #define set_config_flag(data_set, option, flag) do { \ const char *tmp = pe_pref(data_set->config_hash, option); \ if(tmp) { \ if(crm_is_true(tmp)) { \ set_bit(data_set->flags, flag); \ } else { \ clear_bit(data_set->flags, flag); \ } \ } \ } while(0) static void unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, xmlNode **last_failure, enum action_fail_response *failed, pe_working_set_t *data_set); static gboolean determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node); static void add_node_attrs(xmlNode *attrs, pe_node_t *node, bool overwrite, pe_working_set_t *data_set); // Bitmask for warnings we only want to print once uint32_t pe_wo = 0; static gboolean is_dangling_guest_node(node_t *node) { /* we are looking for a remote-node that was supposed to be mapped to a * container resource, but all traces of that container have disappeared * from both the config and the status section. */ if (pe__is_guest_or_remote_node(node) && node->details->remote_rsc && node->details->remote_rsc->container == NULL && is_set(node->details->remote_rsc->flags, pe_rsc_orphan_container_filler)) { return TRUE; } return FALSE; } /*! * \brief Schedule a fence action for a node * * \param[in,out] data_set Current working set of cluster * \param[in,out] node Node to fence * \param[in] reason Text description of why fencing is needed */ void pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason) { CRM_CHECK(node, return); /* A guest node is fenced by marking its container as failed */ if (pe__is_guest_node(node)) { resource_t *rsc = node->details->remote_rsc->container; if (is_set(rsc->flags, pe_rsc_failed) == FALSE) { if (!is_set(rsc->flags, pe_rsc_managed)) { crm_notice("Not fencing guest node %s " "(otherwise would because %s): " "its guest resource %s is unmanaged", node->details->uname, reason, rsc->id); } else { crm_warn("Guest node %s will be fenced " "(by recovering its guest resource %s): %s", node->details->uname, rsc->id, reason); /* We don't mark the node as unclean because that would prevent the * node from running resources. We want to allow it to run resources * in this transition if the recovery succeeds. */ node->details->remote_requires_reset = TRUE; set_bit(rsc->flags, pe_rsc_failed); } } } else if (is_dangling_guest_node(node)) { crm_info("Cleaning up dangling connection for guest node %s: " "fencing was already done because %s, " "and guest resource no longer exists", node->details->uname, reason); set_bit(node->details->remote_rsc->flags, pe_rsc_failed); } else if (pe__is_remote_node(node)) { resource_t *rsc = node->details->remote_rsc; if (rsc && (!is_set(rsc->flags, pe_rsc_managed))) { crm_notice("Not fencing remote node %s " "(otherwise would because %s): connection is unmanaged", node->details->uname, reason); } else if(node->details->remote_requires_reset == FALSE) { node->details->remote_requires_reset = TRUE; crm_warn("Remote node %s %s: %s", node->details->uname, pe_can_fence(data_set, node)? "will be fenced" : "is unclean", reason); } node->details->unclean = TRUE; pe_fence_op(node, NULL, TRUE, reason, data_set); } else if (node->details->unclean) { crm_trace("Cluster node %s %s because %s", node->details->uname, pe_can_fence(data_set, node)? 
"would also be fenced" : "also is unclean", reason); } else { crm_warn("Cluster node %s %s: %s", node->details->uname, pe_can_fence(data_set, node)? "will be fenced" : "is unclean", reason); node->details->unclean = TRUE; pe_fence_op(node, NULL, TRUE, reason, data_set); } } // @TODO xpaths can't handle templates, rules, or id-refs // nvpair with provides or requires set to unfencing #define XPATH_UNFENCING_NVPAIR XML_CIB_TAG_NVPAIR \ "[(@" XML_NVPAIR_ATTR_NAME "='" XML_RSC_ATTR_PROVIDES "'" \ "or @" XML_NVPAIR_ATTR_NAME "='" XML_RSC_ATTR_REQUIRES "') " \ "and @" XML_NVPAIR_ATTR_VALUE "='unfencing']" // unfencing in rsc_defaults or any resource #define XPATH_ENABLE_UNFENCING \ "/" XML_TAG_CIB "/" XML_CIB_TAG_CONFIGURATION "/" XML_CIB_TAG_RESOURCES \ "//" XML_TAG_META_SETS "/" XPATH_UNFENCING_NVPAIR \ "|/" XML_TAG_CIB "/" XML_CIB_TAG_CONFIGURATION "/" XML_CIB_TAG_RSCCONFIG \ "/" XML_TAG_META_SETS "/" XPATH_UNFENCING_NVPAIR static void set_if_xpath(unsigned long long flag, const char *xpath, pe_working_set_t *data_set) { xmlXPathObjectPtr result = NULL; if (is_not_set(data_set->flags, flag)) { result = xpath_search(data_set->input, xpath); if (result && (numXpathResults(result) > 0)) { set_bit(data_set->flags, flag); } freeXpathObject(result); } } gboolean unpack_config(xmlNode * config, pe_working_set_t * data_set) { const char *value = NULL; GHashTable *config_hash = crm_str_table_new(); data_set->config_hash = config_hash; pe__unpack_dataset_nvpairs(config, XML_CIB_TAG_PROPSET, NULL, config_hash, CIB_OPTIONS_FIRST, FALSE, data_set); verify_pe_options(data_set->config_hash); set_config_flag(data_set, "enable-startup-probes", pe_flag_startup_probes); if(is_not_set(data_set->flags, pe_flag_startup_probes)) { crm_info("Startup probes: disabled (dangerous)"); } value = pe_pref(data_set->config_hash, XML_ATTR_HAVE_WATCHDOG); if (value && crm_is_true(value)) { crm_notice("Watchdog will be used via SBD if fencing is required " "and stonith-watchdog-timeout is nonzero"); set_bit(data_set->flags, pe_flag_have_stonith_resource); } /* Set certain flags via xpath here, so they can be used before the relevant * configuration sections are unpacked. */ set_if_xpath(pe_flag_enable_unfencing, XPATH_ENABLE_UNFENCING, data_set); value = pe_pref(data_set->config_hash, "stonith-timeout"); data_set->stonith_timeout = (int) crm_parse_interval_spec(value); crm_debug("STONITH timeout: %d", data_set->stonith_timeout); set_config_flag(data_set, "stonith-enabled", pe_flag_stonith_enabled); crm_debug("STONITH of failed nodes is %s", is_set(data_set->flags, pe_flag_stonith_enabled) ? "enabled" : "disabled"); data_set->stonith_action = pe_pref(data_set->config_hash, "stonith-action"); if (!strcmp(data_set->stonith_action, "poweroff")) { pe_warn_once(pe_wo_poweroff, "Support for stonith-action of 'poweroff' is deprecated " "and will be removed in a future release (use 'off' instead)"); data_set->stonith_action = "off"; } crm_trace("STONITH will %s nodes", data_set->stonith_action); set_config_flag(data_set, "concurrent-fencing", pe_flag_concurrent_fencing); crm_debug("Concurrent fencing is %s", is_set(data_set->flags, pe_flag_concurrent_fencing) ? "enabled" : "disabled"); set_config_flag(data_set, "stop-all-resources", pe_flag_stop_everything); crm_debug("Stop all active resources: %s", is_set(data_set->flags, pe_flag_stop_everything) ? 
"true" : "false"); set_config_flag(data_set, "symmetric-cluster", pe_flag_symmetric_cluster); if (is_set(data_set->flags, pe_flag_symmetric_cluster)) { crm_debug("Cluster is symmetric" " - resources can run anywhere by default"); } value = pe_pref(data_set->config_hash, "no-quorum-policy"); if (safe_str_eq(value, "ignore")) { data_set->no_quorum_policy = no_quorum_ignore; } else if (safe_str_eq(value, "freeze")) { data_set->no_quorum_policy = no_quorum_freeze; } else if (safe_str_eq(value, "suicide")) { if (is_set(data_set->flags, pe_flag_stonith_enabled)) { int do_panic = 0; crm_element_value_int(data_set->input, XML_ATTR_QUORUM_PANIC, &do_panic); if (do_panic || is_set(data_set->flags, pe_flag_have_quorum)) { data_set->no_quorum_policy = no_quorum_suicide; } else { crm_notice("Resetting no-quorum-policy to 'stop': cluster has never had quorum"); data_set->no_quorum_policy = no_quorum_stop; } } else { crm_config_err("Resetting no-quorum-policy to 'stop': stonith is not configured"); data_set->no_quorum_policy = no_quorum_stop; } } else { data_set->no_quorum_policy = no_quorum_stop; } switch (data_set->no_quorum_policy) { case no_quorum_freeze: crm_debug("On loss of quorum: Freeze resources"); break; case no_quorum_stop: crm_debug("On loss of quorum: Stop ALL resources"); break; case no_quorum_suicide: crm_notice("On loss of quorum: Fence all remaining nodes"); break; case no_quorum_ignore: crm_notice("On loss of quorum: Ignore"); break; } set_config_flag(data_set, "stop-orphan-resources", pe_flag_stop_rsc_orphans); crm_trace("Orphan resources are %s", is_set(data_set->flags, pe_flag_stop_rsc_orphans) ? "stopped" : "ignored"); set_config_flag(data_set, "stop-orphan-actions", pe_flag_stop_action_orphans); crm_trace("Orphan resource actions are %s", is_set(data_set->flags, pe_flag_stop_action_orphans) ? "stopped" : "ignored"); set_config_flag(data_set, "remove-after-stop", pe_flag_remove_after_stop); crm_trace("Stopped resources are removed from the status section: %s", is_set(data_set->flags, pe_flag_remove_after_stop) ? "true" : "false"); set_config_flag(data_set, "maintenance-mode", pe_flag_maintenance_mode); crm_trace("Maintenance mode: %s", is_set(data_set->flags, pe_flag_maintenance_mode) ? "true" : "false"); set_config_flag(data_set, "start-failure-is-fatal", pe_flag_start_failure_fatal); crm_trace("Start failures are %s", is_set(data_set->flags, pe_flag_start_failure_fatal) ? 
"always fatal" : "handled by failcount"); if (is_set(data_set->flags, pe_flag_stonith_enabled)) { set_config_flag(data_set, "startup-fencing", pe_flag_startup_fencing); } if (is_set(data_set->flags, pe_flag_startup_fencing)) { crm_trace("Unseen nodes will be fenced"); } else { pe_warn_once(pe_wo_blind, "Blind faith: not fencing unseen nodes"); } node_score_red = char2score(pe_pref(data_set->config_hash, "node-health-red")); node_score_green = char2score(pe_pref(data_set->config_hash, "node-health-green")); node_score_yellow = char2score(pe_pref(data_set->config_hash, "node-health-yellow")); crm_debug("Node scores: 'red' = %s, 'yellow' = %s, 'green' = %s", pe_pref(data_set->config_hash, "node-health-red"), pe_pref(data_set->config_hash, "node-health-yellow"), pe_pref(data_set->config_hash, "node-health-green")); data_set->placement_strategy = pe_pref(data_set->config_hash, "placement-strategy"); crm_trace("Placement strategy: %s", data_set->placement_strategy); return TRUE; } static void destroy_digest_cache(gpointer ptr) { op_digest_cache_t *data = ptr; free_xml(data->params_all); free_xml(data->params_secure); free_xml(data->params_restart); free(data->digest_all_calc); free(data->digest_restart_calc); free(data->digest_secure_calc); free(data); } node_t * pe_create_node(const char *id, const char *uname, const char *type, const char *score, pe_working_set_t * data_set) { node_t *new_node = NULL; if (pe_find_node(data_set->nodes, uname) != NULL) { crm_config_warn("Detected multiple node entries with uname=%s" " - this is rarely intended", uname); } new_node = calloc(1, sizeof(node_t)); if (new_node == NULL) { return NULL; } new_node->weight = char2score(score); new_node->fixed = FALSE; new_node->details = calloc(1, sizeof(struct pe_node_shared_s)); if (new_node->details == NULL) { free(new_node); return NULL; } crm_trace("Creating node for entry %s/%s", uname, id); new_node->details->id = id; new_node->details->uname = uname; new_node->details->online = FALSE; new_node->details->shutdown = FALSE; new_node->details->rsc_discovery_enabled = TRUE; new_node->details->running_rsc = NULL; new_node->details->type = node_ping; if (safe_str_eq(type, "remote")) { new_node->details->type = node_remote; set_bit(data_set->flags, pe_flag_have_remote_nodes); } else if ((type == NULL) || safe_str_eq(type, "member")) { new_node->details->type = node_member; } new_node->details->attrs = crm_str_table_new(); if (pe__is_guest_or_remote_node(new_node)) { g_hash_table_insert(new_node->details->attrs, strdup(CRM_ATTR_KIND), strdup("remote")); } else { g_hash_table_insert(new_node->details->attrs, strdup(CRM_ATTR_KIND), strdup("cluster")); } new_node->details->utilization = crm_str_table_new(); new_node->details->digest_cache = g_hash_table_new_full(crm_str_hash, g_str_equal, free, destroy_digest_cache); data_set->nodes = g_list_insert_sorted(data_set->nodes, new_node, sort_node_uname); return new_node; } bool remote_id_conflict(const char *remote_name, pe_working_set_t *data) { bool match = FALSE; #if 1 pe_find_resource(data->resources, remote_name); #else if (data->name_check == NULL) { data->name_check = g_hash_table_new(crm_str_hash, g_str_equal); for (xml_rsc = __xml_first_child_element(parent); xml_rsc != NULL; xml_rsc = __xml_next_element(xml_rsc)) { const char *id = ID(xml_rsc); /* avoiding heap allocation here because we know the duration of this hashtable allows us to */ g_hash_table_insert(data->name_check, (char *) id, (char *) id); } } if (g_hash_table_lookup(data->name_check, remote_name)) { match 
= TRUE; } #endif if (match) { crm_err("Invalid remote-node name, a resource called '%s' already exists.", remote_name); return NULL; } return match; } static const char * expand_remote_rsc_meta(xmlNode *xml_obj, xmlNode *parent, pe_working_set_t *data) { xmlNode *attr_set = NULL; xmlNode *attr = NULL; const char *container_id = ID(xml_obj); const char *remote_name = NULL; const char *remote_server = NULL; const char *remote_port = NULL; const char *connect_timeout = "60s"; const char *remote_allow_migrate=NULL; const char *is_managed = NULL; for (attr_set = __xml_first_child_element(xml_obj); attr_set != NULL; attr_set = __xml_next_element(attr_set)) { if (safe_str_neq((const char *)attr_set->name, XML_TAG_META_SETS)) { continue; } for (attr = __xml_first_child_element(attr_set); attr != NULL; attr = __xml_next_element(attr)) { const char *value = crm_element_value(attr, XML_NVPAIR_ATTR_VALUE); const char *name = crm_element_value(attr, XML_NVPAIR_ATTR_NAME); if (safe_str_eq(name, XML_RSC_ATTR_REMOTE_NODE)) { remote_name = value; } else if (safe_str_eq(name, "remote-addr")) { remote_server = value; } else if (safe_str_eq(name, "remote-port")) { remote_port = value; } else if (safe_str_eq(name, "remote-connect-timeout")) { connect_timeout = value; } else if (safe_str_eq(name, "remote-allow-migrate")) { remote_allow_migrate=value; } else if (safe_str_eq(name, XML_RSC_ATTR_MANAGED)) { is_managed = value; } } } if (remote_name == NULL) { return NULL; } if (remote_id_conflict(remote_name, data)) { return NULL; } pe_create_remote_xml(parent, remote_name, container_id, remote_allow_migrate, is_managed, connect_timeout, remote_server, remote_port); return remote_name; } static void handle_startup_fencing(pe_working_set_t *data_set, node_t *new_node) { if ((new_node->details->type == node_remote) && (new_node->details->remote_rsc == NULL)) { /* Ignore fencing for remote nodes that don't have a connection resource * associated with them. This happens when remote node entries get left * in the nodes section after the connection resource is removed. */ return; } if (is_set(data_set->flags, pe_flag_startup_fencing)) { // All nodes are unclean until we've seen their status entry new_node->details->unclean = TRUE; } else { // Blind faith ... new_node->details->unclean = FALSE; } /* We need to be able to determine if a node's status section * exists or not separate from whether the node is unclean. 
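 *
 * Illustrative summary (not from this patch): every node starts out
 * "unseen", and that flag is only cleared by finding the node's node_state
 * entry while unpacking the status section. Whether the node also starts
 * out "unclean" depends solely on startup-fencing:
 *
 *     startup-fencing=true  -> unseen=TRUE, unclean=TRUE  (fence nodes never seen)
 *     startup-fencing=false -> unseen=TRUE, unclean=FALSE ("blind faith")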
*/ new_node->details->unseen = TRUE; } gboolean unpack_nodes(xmlNode * xml_nodes, pe_working_set_t * data_set) { xmlNode *xml_obj = NULL; node_t *new_node = NULL; const char *id = NULL; const char *uname = NULL; const char *type = NULL; const char *score = NULL; for (xml_obj = __xml_first_child_element(xml_nodes); xml_obj != NULL; xml_obj = __xml_next_element(xml_obj)) { if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_NODE, TRUE)) { new_node = NULL; id = crm_element_value(xml_obj, XML_ATTR_ID); uname = crm_element_value(xml_obj, XML_ATTR_UNAME); type = crm_element_value(xml_obj, XML_ATTR_TYPE); score = crm_element_value(xml_obj, XML_RULE_ATTR_SCORE); crm_trace("Processing node %s/%s", uname, id); if (id == NULL) { crm_config_err("Must specify id tag in "); continue; } new_node = pe_create_node(id, uname, type, score, data_set); if (new_node == NULL) { return FALSE; } /* if(data_set->have_quorum == FALSE */ /* && data_set->no_quorum_policy == no_quorum_stop) { */ /* /\* start shutting resources down *\/ */ /* new_node->weight = -INFINITY; */ /* } */ handle_startup_fencing(data_set, new_node); add_node_attrs(xml_obj, new_node, FALSE, data_set); pe__unpack_dataset_nvpairs(xml_obj, XML_TAG_UTILIZATION, NULL, new_node->details->utilization, NULL, FALSE, data_set); crm_trace("Done with node %s", crm_element_value(xml_obj, XML_ATTR_UNAME)); } } if (data_set->localhost && pe_find_node(data_set->nodes, data_set->localhost) == NULL) { crm_info("Creating a fake local node"); pe_create_node(data_set->localhost, data_set->localhost, NULL, 0, data_set); } return TRUE; } static void setup_container(resource_t * rsc, pe_working_set_t * data_set) { const char *container_id = NULL; if (rsc->children) { GListPtr gIter = rsc->children; for (; gIter != NULL; gIter = gIter->next) { resource_t *child_rsc = (resource_t *) gIter->data; setup_container(child_rsc, data_set); } return; } container_id = g_hash_table_lookup(rsc->meta, XML_RSC_ATTR_CONTAINER); if (container_id && safe_str_neq(container_id, rsc->id)) { resource_t *container = pe_find_resource(data_set->resources, container_id); if (container) { rsc->container = container; set_bit(container->flags, pe_rsc_is_container); container->fillers = g_list_append(container->fillers, rsc); pe_rsc_trace(rsc, "Resource %s's container is %s", rsc->id, container_id); } else { pe_err("Resource %s: Unknown resource container (%s)", rsc->id, container_id); } } } gboolean unpack_remote_nodes(xmlNode * xml_resources, pe_working_set_t * data_set) { xmlNode *xml_obj = NULL; /* Create remote nodes and guest nodes from the resource configuration * before unpacking resources. */ for (xml_obj = __xml_first_child_element(xml_resources); xml_obj != NULL; xml_obj = __xml_next_element(xml_obj)) { const char *new_node_id = NULL; /* Check for remote nodes, which are defined by ocf:pacemaker:remote * primitives. */ if (xml_contains_remote_node(xml_obj)) { new_node_id = ID(xml_obj); /* The "pe_find_node" check is here to make sure we don't iterate over * an expanded node that has already been added to the node list. */ if (new_node_id && pe_find_node(data_set->nodes, new_node_id) == NULL) { crm_trace("Found remote node %s defined by resource %s", new_node_id, ID(xml_obj)); pe_create_node(new_node_id, new_node_id, "remote", NULL, data_set); } continue; } /* Check for guest nodes, which are defined by special meta-attributes * of a primitive of any type (for example, VirtualDomain or Xen). 
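 *
 * For example (illustrative only; the attribute names are those read by
 * expand_remote_rsc_meta() above, with XML_RSC_ATTR_REMOTE_NODE assumed to
 * be "remote-node", and the values made up), a primitive such as
 *
 *     <primitive id="vm1" class="ocf" provider="heartbeat" type="VirtualDomain">
 *       <meta_attributes id="vm1-meta">
 *         <nvpair id="vm1-remote-node" name="remote-node" value="guest1"/>
 *         <nvpair id="vm1-remote-addr" name="remote-addr" value="192.168.122.10"/>
 *       </meta_attributes>
 *     </primitive>
 *
 * causes a guest node named "guest1", plus an implicit connection resource
 * for it, to be created here before the resources themselves are unpacked.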
*/ if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_RESOURCE, TRUE)) { /* This will add an ocf:pacemaker:remote primitive to the * configuration for the guest node's connection, to be unpacked * later. */ new_node_id = expand_remote_rsc_meta(xml_obj, xml_resources, data_set); if (new_node_id && pe_find_node(data_set->nodes, new_node_id) == NULL) { crm_trace("Found guest node %s in resource %s", new_node_id, ID(xml_obj)); pe_create_node(new_node_id, new_node_id, "remote", NULL, data_set); } continue; } /* Check for guest nodes inside a group. Clones are currently not * supported as guest nodes. */ if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_GROUP, TRUE)) { xmlNode *xml_obj2 = NULL; for (xml_obj2 = __xml_first_child_element(xml_obj); xml_obj2 != NULL; xml_obj2 = __xml_next_element(xml_obj2)) { new_node_id = expand_remote_rsc_meta(xml_obj2, xml_resources, data_set); if (new_node_id && pe_find_node(data_set->nodes, new_node_id) == NULL) { crm_trace("Found guest node %s in resource %s inside group %s", new_node_id, ID(xml_obj2), ID(xml_obj)); pe_create_node(new_node_id, new_node_id, "remote", NULL, data_set); } } } } return TRUE; } /* Call this after all the nodes and resources have been * unpacked, but before the status section is read. * * A remote node's online status is reflected by the state * of the remote node's connection resource. We need to link * the remote node to this connection resource so we can have * easy access to the connection resource during the PE calculations. */ static void link_rsc2remotenode(pe_working_set_t *data_set, resource_t *new_rsc) { node_t *remote_node = NULL; if (new_rsc->is_remote_node == FALSE) { return; } if (is_set(data_set->flags, pe_flag_quick_location)) { /* remote_nodes and remote_resources are not linked in quick location calculations */ return; } remote_node = pe_find_node(data_set->nodes, new_rsc->id); CRM_CHECK(remote_node != NULL, return;); pe_rsc_trace(new_rsc, "Linking remote connection resource %s to node %s", new_rsc->id, remote_node->details->uname); remote_node->details->remote_rsc = new_rsc; if (new_rsc->container == NULL) { /* Handle start-up fencing for remote nodes (as opposed to guest nodes) * the same as is done for cluster nodes. */ handle_startup_fencing(data_set, remote_node); } else { /* pe_create_node() marks the new node as "remote" or "cluster"; now * that we know the node is a guest node, update it correctly. */ g_hash_table_replace(remote_node->details->attrs, strdup(CRM_ATTR_KIND), strdup("container")); } } static void destroy_tag(gpointer data) { tag_t *tag = data; if (tag) { free(tag->id); g_list_free_full(tag->refs, free); free(tag); } } /*! 
* \internal * \brief Parse configuration XML for resource information * * \param[in] xml_resources Top of resource configuration XML * \param[in,out] data_set Where to put resource information * * \return TRUE * * \note unpack_remote_nodes() MUST be called before this, so that the nodes can * be used when common_unpack() calls resource_location() */ gboolean unpack_resources(xmlNode * xml_resources, pe_working_set_t * data_set) { xmlNode *xml_obj = NULL; GListPtr gIter = NULL; data_set->template_rsc_sets = g_hash_table_new_full(crm_str_hash, g_str_equal, free, destroy_tag); for (xml_obj = __xml_first_child_element(xml_resources); xml_obj != NULL; xml_obj = __xml_next_element(xml_obj)) { resource_t *new_rsc = NULL; if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_RSC_TEMPLATE, TRUE)) { const char *template_id = ID(xml_obj); if (template_id && g_hash_table_lookup_extended(data_set->template_rsc_sets, template_id, NULL, NULL) == FALSE) { /* Record the template's ID for the knowledge of its existence anyway. */ g_hash_table_insert(data_set->template_rsc_sets, strdup(template_id), NULL); } continue; } crm_trace("Beginning unpack... <%s id=%s... >", crm_element_name(xml_obj), ID(xml_obj)); if (common_unpack(xml_obj, &new_rsc, NULL, data_set)) { data_set->resources = g_list_append(data_set->resources, new_rsc); pe_rsc_trace(new_rsc, "Added resource %s", new_rsc->id); } else { crm_config_err("Failed unpacking %s %s", crm_element_name(xml_obj), crm_element_value(xml_obj, XML_ATTR_ID)); if (new_rsc != NULL && new_rsc->fns != NULL) { new_rsc->fns->free(new_rsc); } } } for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { resource_t *rsc = (resource_t *) gIter->data; setup_container(rsc, data_set); link_rsc2remotenode(data_set, rsc); } data_set->resources = g_list_sort(data_set->resources, sort_rsc_priority); if (is_set(data_set->flags, pe_flag_quick_location)) { /* Ignore */ } else if (is_set(data_set->flags, pe_flag_stonith_enabled) && is_set(data_set->flags, pe_flag_have_stonith_resource) == FALSE) { crm_config_err("Resource start-up disabled since no STONITH resources have been defined"); crm_config_err("Either configure some or disable STONITH with the stonith-enabled option"); crm_config_err("NOTE: Clusters with shared data need STONITH to ensure data integrity"); } return TRUE; } gboolean unpack_tags(xmlNode * xml_tags, pe_working_set_t * data_set) { xmlNode *xml_tag = NULL; data_set->tags = g_hash_table_new_full(crm_str_hash, g_str_equal, free, destroy_tag); for (xml_tag = __xml_first_child_element(xml_tags); xml_tag != NULL; xml_tag = __xml_next_element(xml_tag)) { xmlNode *xml_obj_ref = NULL; const char *tag_id = ID(xml_tag); if (crm_str_eq((const char *)xml_tag->name, XML_CIB_TAG_TAG, TRUE) == FALSE) { continue; } if (tag_id == NULL) { crm_config_err("Failed unpacking %s: %s should be specified", crm_element_name(xml_tag), XML_ATTR_ID); continue; } for (xml_obj_ref = __xml_first_child_element(xml_tag); xml_obj_ref != NULL; xml_obj_ref = __xml_next_element(xml_obj_ref)) { const char *obj_ref = ID(xml_obj_ref); if (crm_str_eq((const char *)xml_obj_ref->name, XML_CIB_TAG_OBJ_REF, TRUE) == FALSE) { continue; } if (obj_ref == NULL) { crm_config_err("Failed unpacking %s for tag %s: %s should be specified", crm_element_name(xml_obj_ref), tag_id, XML_ATTR_ID); continue; } if (add_tag_ref(data_set->tags, tag_id, obj_ref) == FALSE) { return FALSE; } } } return TRUE; } /* The ticket state section: * "/cib/status/tickets/ticket_state" */ static gboolean 
unpack_ticket_state(xmlNode * xml_ticket, pe_working_set_t * data_set) { const char *ticket_id = NULL; const char *granted = NULL; const char *last_granted = NULL; const char *standby = NULL; xmlAttrPtr xIter = NULL; ticket_t *ticket = NULL; ticket_id = ID(xml_ticket); if (ticket_id == NULL || strlen(ticket_id) == 0) { return FALSE; } crm_trace("Processing ticket state for %s", ticket_id); ticket = g_hash_table_lookup(data_set->tickets, ticket_id); if (ticket == NULL) { ticket = ticket_new(ticket_id, data_set); if (ticket == NULL) { return FALSE; } } for (xIter = xml_ticket->properties; xIter; xIter = xIter->next) { const char *prop_name = (const char *)xIter->name; const char *prop_value = crm_element_value(xml_ticket, prop_name); if (crm_str_eq(prop_name, XML_ATTR_ID, TRUE)) { continue; } g_hash_table_replace(ticket->state, strdup(prop_name), strdup(prop_value)); } granted = g_hash_table_lookup(ticket->state, "granted"); if (granted && crm_is_true(granted)) { ticket->granted = TRUE; crm_info("We have ticket '%s'", ticket->id); } else { ticket->granted = FALSE; crm_info("We do not have ticket '%s'", ticket->id); } last_granted = g_hash_table_lookup(ticket->state, "last-granted"); if (last_granted) { ticket->last_granted = crm_parse_int(last_granted, 0); } standby = g_hash_table_lookup(ticket->state, "standby"); if (standby && crm_is_true(standby)) { ticket->standby = TRUE; if (ticket->granted) { crm_info("Granted ticket '%s' is in standby-mode", ticket->id); } } else { ticket->standby = FALSE; } crm_trace("Done with ticket state for %s", ticket_id); return TRUE; } static gboolean unpack_tickets_state(xmlNode * xml_tickets, pe_working_set_t * data_set) { xmlNode *xml_obj = NULL; for (xml_obj = __xml_first_child_element(xml_tickets); xml_obj != NULL; xml_obj = __xml_next_element(xml_obj)) { if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_TICKET_STATE, TRUE) == FALSE) { continue; } unpack_ticket_state(xml_obj, data_set); } return TRUE; } static void unpack_handle_remote_attrs(node_t *this_node, xmlNode *state, pe_working_set_t * data_set) { const char *resource_discovery_enabled = NULL; xmlNode *attrs = NULL; resource_t *rsc = NULL; if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE) == FALSE) { return; } if ((this_node == NULL) || !pe__is_guest_or_remote_node(this_node)) { return; } crm_trace("Processing remote node id=%s, uname=%s", this_node->details->id, this_node->details->uname); this_node->details->remote_maintenance = crm_atoi(crm_element_value(state, XML_NODE_IS_MAINTENANCE), "0"); rsc = this_node->details->remote_rsc; if (this_node->details->remote_requires_reset == FALSE) { this_node->details->unclean = FALSE; this_node->details->unseen = FALSE; } attrs = find_xml_node(state, XML_TAG_TRANSIENT_NODEATTRS, FALSE); add_node_attrs(attrs, this_node, TRUE, data_set); if (pe__shutdown_requested(this_node)) { crm_info("Node %s is shutting down", this_node->details->uname); this_node->details->shutdown = TRUE; if (rsc) { rsc->next_role = RSC_ROLE_STOPPED; } } if (crm_is_true(pe_node_attribute_raw(this_node, "standby"))) { crm_info("Node %s is in standby-mode", this_node->details->uname); this_node->details->standby = TRUE; } if (crm_is_true(pe_node_attribute_raw(this_node, "maintenance")) || (rsc && !is_set(rsc->flags, pe_rsc_managed))) { crm_info("Node %s is in maintenance-mode", this_node->details->uname); this_node->details->maintenance = TRUE; } resource_discovery_enabled = pe_node_attribute_raw(this_node, XML_NODE_ATTR_RSC_DISCOVERY); if 
(resource_discovery_enabled && !crm_is_true(resource_discovery_enabled)) { if (pe__is_remote_node(this_node) && is_not_set(data_set->flags, pe_flag_stonith_enabled)) { crm_warn("Ignoring %s attribute on remote node %s because stonith is disabled", XML_NODE_ATTR_RSC_DISCOVERY, this_node->details->uname); } else { /* This is either a remote node with fencing enabled, or a guest * node. We don't care whether fencing is enabled when fencing guest * nodes, because they are "fenced" by recovering their containing * resource. */ crm_info("Node %s has resource discovery disabled", this_node->details->uname); this_node->details->rsc_discovery_enabled = FALSE; } } } static bool unpack_node_loop(xmlNode * status, bool fence, pe_working_set_t * data_set) { bool changed = false; xmlNode *lrm_rsc = NULL; for (xmlNode *state = __xml_first_child_element(status); state != NULL; state = __xml_next_element(state)) { const char *id = NULL; const char *uname = NULL; node_t *this_node = NULL; bool process = FALSE; if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE) == FALSE) { continue; } id = crm_element_value(state, XML_ATTR_ID); uname = crm_element_value(state, XML_ATTR_UNAME); this_node = pe_find_node_any(data_set->nodes, id, uname); if (this_node == NULL) { crm_info("Node %s is unknown", id); continue; } else if (this_node->details->unpacked) { crm_info("Node %s is already processed", id); continue; } else if (!pe__is_guest_or_remote_node(this_node) && is_set(data_set->flags, pe_flag_stonith_enabled)) { // A redundant test, but preserves the order for regression tests process = TRUE; } else if (pe__is_guest_or_remote_node(this_node)) { bool check = FALSE; resource_t *rsc = this_node->details->remote_rsc; if(fence) { check = TRUE; } else if(rsc == NULL) { /* Not ready yet */ } else if (pe__is_guest_node(this_node) && rsc->role == RSC_ROLE_STARTED && rsc->container->role == RSC_ROLE_STARTED) { /* Both the connection and its containing resource need to be * known to be up before we process resources running in it. */ check = TRUE; crm_trace("Checking node %s/%s/%s status %d/%d/%d", id, rsc->id, rsc->container->id, fence, rsc->role, RSC_ROLE_STARTED); } else if (!pe__is_guest_node(this_node) && rsc->role == RSC_ROLE_STARTED) { check = TRUE; crm_trace("Checking node %s/%s status %d/%d/%d", id, rsc->id, fence, rsc->role, RSC_ROLE_STARTED); } if (check) { determine_remote_online_status(data_set, this_node); unpack_handle_remote_attrs(this_node, state, data_set); process = TRUE; } } else if (this_node->details->online) { process = TRUE; } else if (fence) { process = TRUE; } if(process) { crm_trace("Processing lrm resource entries on %shealthy%s node: %s", fence?"un":"", (pe__is_guest_or_remote_node(this_node)? " remote" : ""), this_node->details->uname); changed = TRUE; this_node->details->unpacked = TRUE; lrm_rsc = find_xml_node(state, XML_CIB_TAG_LRM, FALSE); lrm_rsc = find_xml_node(lrm_rsc, XML_LRM_TAG_RESOURCES, FALSE); unpack_lrm_resources(this_node, lrm_rsc, data_set); } } return changed; } /* remove nodes that are down, stopping */ /* create positive rsc_to_node constraints between resources and the nodes they are running on */ /* anything else? 
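 *
 * Rough outline of unpack_status() below (illustrative, not from this patch):
 *
 *   1. Unpack ticket state and the node_state entries of cluster nodes.
 *   2. Repeat unpack_node_loop(status, FALSE, ...) until it stops making
 *      progress; each pass can bring another remote or guest connection
 *      resource to the "started" state required before that node's resource
 *      history can be trusted.
 *   3. Run one final pass with fence set from stonith-enabled to pick up any
 *      remote or guest nodes that never became processable.
 *   4. Schedule stops for containers with failed bundle connections and
 *      compute online status for any remaining remote nodes.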
*/ gboolean unpack_status(xmlNode * status, pe_working_set_t * data_set) { const char *id = NULL; const char *uname = NULL; xmlNode *state = NULL; node_t *this_node = NULL; crm_trace("Beginning unpack"); if (data_set->tickets == NULL) { data_set->tickets = g_hash_table_new_full(crm_str_hash, g_str_equal, free, destroy_ticket); } for (state = __xml_first_child_element(status); state != NULL; state = __xml_next_element(state)) { if (crm_str_eq((const char *)state->name, XML_CIB_TAG_TICKETS, TRUE)) { unpack_tickets_state((xmlNode *) state, data_set); } else if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE)) { xmlNode *attrs = NULL; const char *resource_discovery_enabled = NULL; id = crm_element_value(state, XML_ATTR_ID); uname = crm_element_value(state, XML_ATTR_UNAME); this_node = pe_find_node_any(data_set->nodes, id, uname); if (uname == NULL) { /* error */ continue; } else if (this_node == NULL) { crm_config_warn("Node %s in status section no longer exists", uname); continue; } else if (pe__is_guest_or_remote_node(this_node)) { /* online state for remote nodes is determined by the * rsc state after all the unpacking is done. we do however * need to mark whether or not the node has been fenced as this plays * a role during unpacking cluster node resource state */ this_node->details->remote_was_fenced = crm_atoi(crm_element_value(state, XML_NODE_IS_FENCED), "0"); continue; } crm_trace("Processing node id=%s, uname=%s", id, uname); /* Mark the node as provisionally clean * - at least we have seen it in the current cluster's lifetime */ this_node->details->unclean = FALSE; this_node->details->unseen = FALSE; attrs = find_xml_node(state, XML_TAG_TRANSIENT_NODEATTRS, FALSE); add_node_attrs(attrs, this_node, TRUE, data_set); if (crm_is_true(pe_node_attribute_raw(this_node, "standby"))) { crm_info("Node %s is in standby-mode", this_node->details->uname); this_node->details->standby = TRUE; } if (crm_is_true(pe_node_attribute_raw(this_node, "maintenance"))) { crm_info("Node %s is in maintenance-mode", this_node->details->uname); this_node->details->maintenance = TRUE; } resource_discovery_enabled = pe_node_attribute_raw(this_node, XML_NODE_ATTR_RSC_DISCOVERY); if (resource_discovery_enabled && !crm_is_true(resource_discovery_enabled)) { crm_warn("ignoring %s attribute on node %s, disabling resource discovery is not allowed on cluster nodes", XML_NODE_ATTR_RSC_DISCOVERY, this_node->details->uname); } crm_trace("determining node state"); determine_online_status(state, this_node, data_set); if (is_not_set(data_set->flags, pe_flag_have_quorum) && this_node->details->online && (data_set->no_quorum_policy == no_quorum_suicide)) { /* Everything else should flow from this automatically * At least until the PE becomes able to migrate off healthy resources */ pe_fence_node(data_set, this_node, "cluster does not have quorum"); } } } while(unpack_node_loop(status, FALSE, data_set)) { crm_trace("Start another loop"); } // Now catch any nodes we didn't see unpack_node_loop(status, is_set(data_set->flags, pe_flag_stonith_enabled), data_set); /* Now that we know where resources are, we can schedule stops of containers * with failed bundle connections */ if (data_set->stop_needed != NULL) { for (GList *item = data_set->stop_needed; item; item = item->next) { pe_resource_t *container = item->data; pe_node_t *node = pe__current_node(container); if (node) { stop_action(container, node, FALSE); } } g_list_free(data_set->stop_needed); data_set->stop_needed = NULL; } for (GListPtr gIter = 
data_set->nodes; gIter != NULL; gIter = gIter->next) { node_t *this_node = gIter->data; if (this_node == NULL) { continue; } else if (!pe__is_guest_or_remote_node(this_node)) { continue; } else if(this_node->details->unpacked) { continue; } determine_remote_online_status(data_set, this_node); } return TRUE; } static gboolean determine_online_status_no_fencing(pe_working_set_t * data_set, xmlNode * node_state, node_t * this_node) { gboolean online = FALSE; const char *join = crm_element_value(node_state, XML_NODE_JOIN_STATE); const char *is_peer = crm_element_value(node_state, XML_NODE_IS_PEER); const char *in_cluster = crm_element_value(node_state, XML_NODE_IN_CLUSTER); const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED); if (!crm_is_true(in_cluster)) { crm_trace("Node is down: in_cluster=%s", crm_str(in_cluster)); } else if (safe_str_eq(is_peer, ONLINESTATUS)) { if (safe_str_eq(join, CRMD_JOINSTATE_MEMBER)) { online = TRUE; } else { crm_debug("Node is not ready to run resources: %s", join); } } else if (this_node->details->expected_up == FALSE) { crm_trace("Controller is down: in_cluster=%s", crm_str(in_cluster)); crm_trace("\tis_peer=%s, join=%s, expected=%s", crm_str(is_peer), crm_str(join), crm_str(exp_state)); } else { /* mark it unclean */ pe_fence_node(data_set, this_node, "peer is unexpectedly down"); crm_info("\tin_cluster=%s, is_peer=%s, join=%s, expected=%s", crm_str(in_cluster), crm_str(is_peer), crm_str(join), crm_str(exp_state)); } return online; } static gboolean determine_online_status_fencing(pe_working_set_t * data_set, xmlNode * node_state, node_t * this_node) { gboolean online = FALSE; gboolean do_terminate = FALSE; bool crmd_online = FALSE; const char *join = crm_element_value(node_state, XML_NODE_JOIN_STATE); const char *is_peer = crm_element_value(node_state, XML_NODE_IS_PEER); const char *in_cluster = crm_element_value(node_state, XML_NODE_IN_CLUSTER); const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED); const char *terminate = pe_node_attribute_raw(this_node, "terminate"); /* - XML_NODE_IN_CLUSTER ::= true|false - XML_NODE_IS_PEER ::= online|offline - XML_NODE_JOIN_STATE ::= member|down|pending|banned - XML_NODE_EXPECTED ::= member|down */ if (crm_is_true(terminate)) { do_terminate = TRUE; } else if (terminate != NULL && strlen(terminate) > 0) { /* could be a time() value */ char t = terminate[0]; if (t != '0' && isdigit(t)) { do_terminate = TRUE; } } crm_trace("%s: in_cluster=%s, is_peer=%s, join=%s, expected=%s, term=%d", this_node->details->uname, crm_str(in_cluster), crm_str(is_peer), crm_str(join), crm_str(exp_state), do_terminate); online = crm_is_true(in_cluster); crmd_online = safe_str_eq(is_peer, ONLINESTATUS); if (exp_state == NULL) { exp_state = CRMD_JOINSTATE_DOWN; } if (this_node->details->shutdown) { crm_debug("%s is shutting down", this_node->details->uname); /* Slightly different criteria since we can't shut down a dead peer */ online = crmd_online; } else if (in_cluster == NULL) { pe_fence_node(data_set, this_node, "peer has not been seen by the cluster"); } else if (safe_str_eq(join, CRMD_JOINSTATE_NACK)) { pe_fence_node(data_set, this_node, "peer failed the pacemaker membership criteria"); } else if (do_terminate == FALSE && safe_str_eq(exp_state, CRMD_JOINSTATE_DOWN)) { if (crm_is_true(in_cluster) || crmd_online) { crm_info("- Node %s is not ready to run resources", this_node->details->uname); this_node->details->standby = TRUE; this_node->details->pending = TRUE; } else { crm_trace("%s is down or still 
coming up", this_node->details->uname); } } else if (do_terminate && safe_str_eq(join, CRMD_JOINSTATE_DOWN) && crm_is_true(in_cluster) == FALSE && !crmd_online) { crm_info("Node %s was just shot", this_node->details->uname); online = FALSE; } else if (crm_is_true(in_cluster) == FALSE) { pe_fence_node(data_set, this_node, "peer is no longer part of the cluster"); } else if (!crmd_online) { pe_fence_node(data_set, this_node, "peer process is no longer available"); /* Everything is running at this point, now check join state */ } else if (do_terminate) { pe_fence_node(data_set, this_node, "termination was requested"); } else if (safe_str_eq(join, CRMD_JOINSTATE_MEMBER)) { crm_info("Node %s is active", this_node->details->uname); } else if (safe_str_eq(join, CRMD_JOINSTATE_PENDING) || safe_str_eq(join, CRMD_JOINSTATE_DOWN)) { crm_info("Node %s is not ready to run resources", this_node->details->uname); this_node->details->standby = TRUE; this_node->details->pending = TRUE; } else { pe_fence_node(data_set, this_node, "peer was in an unknown state"); crm_warn("%s: in-cluster=%s, is-peer=%s, join=%s, expected=%s, term=%d, shutdown=%d", this_node->details->uname, crm_str(in_cluster), crm_str(is_peer), crm_str(join), crm_str(exp_state), do_terminate, this_node->details->shutdown); } return online; } static gboolean determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node) { resource_t *rsc = this_node->details->remote_rsc; resource_t *container = NULL; pe_node_t *host = NULL; /* If there is a node state entry for a (former) Pacemaker Remote node * but no resource creating that node, the node's connection resource will * be NULL. Consider it an offline remote node in that case. */ if (rsc == NULL) { this_node->details->online = FALSE; goto remote_online_done; } container = rsc->container; if (container && (g_list_length(rsc->running_on) == 1)) { host = rsc->running_on->data; } /* If the resource is currently started, mark it online. */ if (rsc->role == RSC_ROLE_STARTED) { crm_trace("%s node %s presumed ONLINE because connection resource is started", (container? "Guest" : "Remote"), this_node->details->id); this_node->details->online = TRUE; } /* consider this node shutting down if transitioning start->stop */ if (rsc->role == RSC_ROLE_STARTED && rsc->next_role == RSC_ROLE_STOPPED) { crm_trace("%s node %s shutting down because connection resource is stopping", (container? "Guest" : "Remote"), this_node->details->id); this_node->details->shutdown = TRUE; } /* Now check all the failure conditions. */ if(container && is_set(container->flags, pe_rsc_failed)) { crm_trace("Guest node %s UNCLEAN because guest resource failed", this_node->details->id); this_node->details->online = FALSE; this_node->details->remote_requires_reset = TRUE; } else if(is_set(rsc->flags, pe_rsc_failed)) { crm_trace("%s node %s OFFLINE because connection resource failed", (container? "Guest" : "Remote"), this_node->details->id); this_node->details->online = FALSE; } else if (rsc->role == RSC_ROLE_STOPPED || (container && container->role == RSC_ROLE_STOPPED)) { crm_trace("%s node %s OFFLINE because its resource is stopped", (container? 
"Guest" : "Remote"), this_node->details->id); this_node->details->online = FALSE; this_node->details->remote_requires_reset = FALSE; } else if (host && (host->details->online == FALSE) && host->details->unclean) { crm_trace("Guest node %s UNCLEAN because host is unclean", this_node->details->id); this_node->details->online = FALSE; this_node->details->remote_requires_reset = TRUE; } remote_online_done: crm_trace("Remote node %s online=%s", this_node->details->id, this_node->details->online ? "TRUE" : "FALSE"); return this_node->details->online; } gboolean determine_online_status(xmlNode * node_state, node_t * this_node, pe_working_set_t * data_set) { gboolean online = FALSE; const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED); if (this_node == NULL) { crm_config_err("No node to check"); return online; } this_node->details->shutdown = FALSE; this_node->details->expected_up = FALSE; if (pe__shutdown_requested(this_node)) { this_node->details->shutdown = TRUE; } else if (safe_str_eq(exp_state, CRMD_JOINSTATE_MEMBER)) { this_node->details->expected_up = TRUE; } if (this_node->details->type == node_ping) { this_node->details->unclean = FALSE; online = FALSE; /* As far as resource management is concerned, * the node is safely offline. * Anyone caught abusing this logic will be shot */ } else if (is_set(data_set->flags, pe_flag_stonith_enabled) == FALSE) { online = determine_online_status_no_fencing(data_set, node_state, this_node); } else { online = determine_online_status_fencing(data_set, node_state, this_node); } if (online) { this_node->details->online = TRUE; } else { /* remove node from contention */ this_node->fixed = TRUE; this_node->weight = -INFINITY; } if (online && this_node->details->shutdown) { /* don't run resources here */ this_node->fixed = TRUE; this_node->weight = -INFINITY; } if (this_node->details->type == node_ping) { crm_info("Node %s is not a pacemaker node", this_node->details->uname); } else if (this_node->details->unclean) { pe_proc_warn("Node %s is unclean", this_node->details->uname); } else if (this_node->details->online) { crm_info("Node %s is %s", this_node->details->uname, this_node->details->shutdown ? "shutting down" : this_node->details->pending ? "pending" : this_node->details->standby ? "standby" : this_node->details->maintenance ? "maintenance" : "online"); } else { crm_trace("Node %s is offline", this_node->details->uname); } return online; } /*! * \internal * \brief Find the end of a resource's name, excluding any clone suffix * * \param[in] id Resource ID to check * * \return Pointer to last character of resource's base name */ const char * pe_base_name_end(const char *id) { if (!crm_strlen_zero(id)) { const char *end = id + strlen(id) - 1; for (const char *s = end; s > id; --s) { switch (*s) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': break; case ':': return (s == end)? s : (s - 1); default: return end; } } return end; } return NULL; } /*! * \internal * \brief Get a resource name excluding any clone suffix * * \param[in] last_rsc_id Resource ID to check * * \return Pointer to newly allocated string with resource's base name * \note It is the caller's responsibility to free() the result. * This asserts on error, so callers can assume result is not NULL. 
*/ char * clone_strip(const char *last_rsc_id) { const char *end = pe_base_name_end(last_rsc_id); char *basename = NULL; CRM_ASSERT(end); basename = strndup(last_rsc_id, end - last_rsc_id + 1); CRM_ASSERT(basename); return basename; } /*! * \internal * \brief Get the name of the first instance of a cloned resource * * \param[in] last_rsc_id Resource ID to check * * \return Pointer to newly allocated string with resource's base name plus :0 * \note It is the caller's responsibility to free() the result. * This asserts on error, so callers can assume result is not NULL. */ char * clone_zero(const char *last_rsc_id) { const char *end = pe_base_name_end(last_rsc_id); size_t base_name_len = end - last_rsc_id + 1; char *zero = NULL; CRM_ASSERT(end); zero = calloc(base_name_len + 3, sizeof(char)); CRM_ASSERT(zero); memcpy(zero, last_rsc_id, base_name_len); zero[base_name_len] = ':'; zero[base_name_len + 1] = '0'; return zero; } static resource_t * create_fake_resource(const char *rsc_id, xmlNode * rsc_entry, pe_working_set_t * data_set) { resource_t *rsc = NULL; xmlNode *xml_rsc = create_xml_node(NULL, XML_CIB_TAG_RESOURCE); copy_in_properties(xml_rsc, rsc_entry); crm_xml_add(xml_rsc, XML_ATTR_ID, rsc_id); crm_log_xml_debug(xml_rsc, "Orphan resource"); if (!common_unpack(xml_rsc, &rsc, NULL, data_set)) { return NULL; } if (xml_contains_remote_node(xml_rsc)) { node_t *node; crm_debug("Detected orphaned remote node %s", rsc_id); node = pe_find_node(data_set->nodes, rsc_id); if (node == NULL) { node = pe_create_node(rsc_id, rsc_id, "remote", NULL, data_set); } link_rsc2remotenode(data_set, rsc); if (node) { crm_trace("Setting node %s as shutting down due to orphaned connection resource", rsc_id); node->details->shutdown = TRUE; } } if (crm_element_value(rsc_entry, XML_RSC_ATTR_CONTAINER)) { /* This orphaned rsc needs to be mapped to a container. */ crm_trace("Detected orphaned container filler %s", rsc_id); set_bit(rsc->flags, pe_rsc_orphan_container_filler); } set_bit(rsc->flags, pe_rsc_orphan); data_set->resources = g_list_append(data_set->resources, rsc); return rsc; } /*! * \internal * \brief Create orphan instance for anonymous clone resource history */ static pe_resource_t * create_anonymous_orphan(pe_resource_t *parent, const char *rsc_id, pe_node_t *node, pe_working_set_t *data_set) { pe_resource_t *top = pe__create_clone_child(parent, data_set); // find_rsc() because we might be a cloned group pe_resource_t *orphan = top->fns->find_rsc(top, rsc_id, NULL, pe_find_clone); pe_rsc_debug(parent, "Created orphan %s for %s: %s on %s", top->id, parent->id, rsc_id, node->details->uname); return orphan; } /*! * \internal * \brief Check a node for an instance of an anonymous clone * * Return a child instance of the specified anonymous clone, in order of * preference: (1) the instance running on the specified node, if any; * (2) an inactive instance (i.e. within the total of clone-max instances); * (3) a newly created orphan (i.e. clone-max instances are already active). 
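 *
 * For example (hypothetical names): when unpacking history for "myclone"
 * on node2, an instance already known to be active on node2 is returned if
 * there is one; otherwise an inactive instance such as myclone:1 is
 * reused; and only when all clone-max instances are active elsewhere is a
 * brand-new orphan instance created.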
* * \param[in] data_set Cluster information * \param[in] node Node on which to check for instance * \param[in] parent Clone to check * \param[in] rsc_id Name of cloned resource in history (without instance) */ static resource_t * find_anonymous_clone(pe_working_set_t * data_set, node_t * node, resource_t * parent, const char *rsc_id) { GListPtr rIter = NULL; pe_resource_t *rsc = NULL; pe_resource_t *inactive_instance = NULL; gboolean skip_inactive = FALSE; CRM_ASSERT(parent != NULL); CRM_ASSERT(pe_rsc_is_clone(parent)); CRM_ASSERT(is_not_set(parent->flags, pe_rsc_unique)); // Check for active (or partially active, for cloned groups) instance pe_rsc_trace(parent, "Looking for %s on %s in %s", rsc_id, node->details->uname, parent->id); for (rIter = parent->children; rsc == NULL && rIter; rIter = rIter->next) { GListPtr locations = NULL; resource_t *child = rIter->data; /* Check whether this instance is already known to be active or pending * anywhere, at this stage of unpacking. Because this function is called * for a resource before the resource's individual operation history * entries are unpacked, locations will generally not contain the * desired node. * * However, there are three exceptions: * (1) when child is a cloned group and we have already unpacked the * history of another member of the group on the same node; * (2) when we've already unpacked the history of another numbered * instance on the same node (which can happen if globally-unique * was flipped from true to false); and * (3) when we re-run calculations on the same data set as part of a * simulation. */ child->fns->location(child, &locations, 2); if (locations) { /* We should never associate the same numbered anonymous clone * instance with multiple nodes, and clone instances can't migrate, * so there must be only one location, regardless of history. */ CRM_LOG_ASSERT(locations->next == NULL); if (((pe_node_t *)locations->data)->details == node->details) { /* This child instance is active on the requested node, so check * for a corresponding configured resource. We use find_rsc() * instead of child because child may be a cloned group, and we * need the particular member corresponding to rsc_id. * * If the history entry is orphaned, rsc will be NULL. */ rsc = parent->fns->find_rsc(child, rsc_id, NULL, pe_find_clone); if (rsc) { /* If there are multiple instance history entries for an * anonymous clone in a single node's history (which can * happen if globally-unique is switched from true to * false), we want to consider the instances beyond the * first as orphans, even if there are inactive instance * numbers available. */ if (rsc->running_on) { crm_notice("Active (now-)anonymous clone %s has " "multiple (orphan) instance histories on %s", parent->id, node->details->uname); skip_inactive = TRUE; rsc = NULL; } else { pe_rsc_trace(parent, "Resource %s, active", rsc->id); } } } g_list_free(locations); } else { pe_rsc_trace(parent, "Resource %s, skip inactive", child->id); if (!skip_inactive && !inactive_instance && is_not_set(child->flags, pe_rsc_block)) { // Remember one inactive instance in case we don't find active inactive_instance = parent->fns->find_rsc(child, rsc_id, NULL, pe_find_clone); /* ... 
but don't use it if it was already associated with a * pending action on another node */ if (inactive_instance && inactive_instance->pending_node && (inactive_instance->pending_node->details != node->details)) { inactive_instance = NULL; } } } } if ((rsc == NULL) && !skip_inactive && (inactive_instance != NULL)) { pe_rsc_trace(parent, "Resource %s, empty slot", inactive_instance->id); rsc = inactive_instance; } /* If the resource has "requires" set to "quorum" or "nothing", and we don't * have a clone instance for every node, we don't want to consume a valid * instance number for unclean nodes. Such instances may appear to be active * according to the history, but should be considered inactive, so we can * start an instance elsewhere. Treat such instances as orphans. * * An exception is instances running on guest nodes -- since guest node * "fencing" is actually just a resource stop, requires shouldn't apply. * * @TODO Ideally, we'd use an inactive instance number if it is not needed * for any clean instances. However, we don't know that at this point. */ if ((rsc != NULL) && is_not_set(rsc->flags, pe_rsc_needs_fencing) && (!node->details->online || node->details->unclean) && !pe__is_guest_node(node) && !pe__is_universal_clone(parent, data_set)) { rsc = NULL; } if (rsc == NULL) { rsc = create_anonymous_orphan(parent, rsc_id, node, data_set); pe_rsc_trace(parent, "Resource %s, orphan", rsc->id); } return rsc; } static resource_t * unpack_find_resource(pe_working_set_t * data_set, node_t * node, const char *rsc_id, xmlNode * rsc_entry) { resource_t *rsc = NULL; resource_t *parent = NULL; crm_trace("looking for %s", rsc_id); rsc = pe_find_resource(data_set->resources, rsc_id); if (rsc == NULL) { /* If we didn't find the resource by its name in the operation history, * check it again as a clone instance. Even when clone-max=0, we create * a single :0 orphan to match against here. */ char *clone0_id = clone_zero(rsc_id); resource_t *clone0 = pe_find_resource(data_set->resources, clone0_id); if (clone0 && is_not_set(clone0->flags, pe_rsc_unique)) { rsc = clone0; parent = uber_parent(clone0); crm_trace("%s found as %s (%s)", rsc_id, clone0_id, parent->id); } else { crm_trace("%s is not known as %s either (orphan)", rsc_id, clone0_id); } free(clone0_id); } else if (rsc->variant > pe_native) { crm_trace("Resource history for %s is orphaned because it is no longer primitive", rsc_id); return NULL; } else { parent = uber_parent(rsc); } if (pe_rsc_is_anon_clone(parent)) { if (pe_rsc_is_bundled(parent)) { rsc = pe__find_bundle_replica(parent->parent, node); } else { char *base = clone_strip(rsc_id); rsc = find_anonymous_clone(data_set, node, parent, base); free(base); CRM_ASSERT(rsc != NULL); } } if (rsc && safe_str_neq(rsc_id, rsc->id) && safe_str_neq(rsc_id, rsc->clone_name)) { free(rsc->clone_name); rsc->clone_name = strdup(rsc_id); pe_rsc_debug(rsc, "Internally renamed %s on %s to %s%s", rsc_id, node->details->uname, rsc->id, (is_set(rsc->flags, pe_rsc_orphan)? 
" (ORPHAN)" : "")); } return rsc; } static resource_t * process_orphan_resource(xmlNode * rsc_entry, node_t * node, pe_working_set_t * data_set) { resource_t *rsc = NULL; const char *rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID); crm_debug("Detected orphan resource %s on %s", rsc_id, node->details->uname); rsc = create_fake_resource(rsc_id, rsc_entry, data_set); if (is_set(data_set->flags, pe_flag_stop_rsc_orphans) == FALSE) { clear_bit(rsc->flags, pe_rsc_managed); } else { CRM_CHECK(rsc != NULL, return NULL); pe_rsc_trace(rsc, "Added orphan %s", rsc->id); resource_location(rsc, NULL, -INFINITY, "__orphan_do_not_run__", data_set); } return rsc; } static void process_rsc_state(resource_t * rsc, node_t * node, enum action_fail_response on_fail, xmlNode * migrate_op, pe_working_set_t * data_set) { node_t *tmpnode = NULL; char *reason = NULL; CRM_ASSERT(rsc); pe_rsc_trace(rsc, "Resource %s is %s on %s: on_fail=%s", rsc->id, role2text(rsc->role), node->details->uname, fail2text(on_fail)); /* process current state */ if (rsc->role != RSC_ROLE_UNKNOWN) { resource_t *iter = rsc; while (iter) { if (g_hash_table_lookup(iter->known_on, node->details->id) == NULL) { node_t *n = node_copy(node); pe_rsc_trace(rsc, "%s (aka. %s) known on %s", rsc->id, rsc->clone_name, n->details->uname); g_hash_table_insert(iter->known_on, (gpointer) n->details->id, n); } if (is_set(iter->flags, pe_rsc_unique)) { break; } iter = iter->parent; } } /* If a managed resource is believed to be running, but node is down ... */ if (rsc->role > RSC_ROLE_STOPPED && node->details->online == FALSE && node->details->maintenance == FALSE && is_set(rsc->flags, pe_rsc_managed)) { gboolean should_fence = FALSE; /* If this is a guest node, fence it (regardless of whether fencing is * enabled, because guest node fencing is done by recovery of the * container resource rather than by the fencer). Mark the resource * we're processing as failed. When the guest comes back up, its * operation history in the CIB will be cleared, freeing the affected * resource to run again once we are sure we know its state. */ if (pe__is_guest_node(node)) { set_bit(rsc->flags, pe_rsc_failed); should_fence = TRUE; } else if (is_set(data_set->flags, pe_flag_stonith_enabled)) { if (pe__is_remote_node(node) && node->details->remote_rsc && is_not_set(node->details->remote_rsc->flags, pe_rsc_failed)) { /* Setting unseen means that fencing of the remote node will * occur only if the connection resource is not going to start * somewhere. This allows connection resources on a failed * cluster node to move to another node without requiring the * remote nodes to be fenced as well. 
*/ node->details->unseen = TRUE; reason = crm_strdup_printf("%s is active there (fencing will be" " revoked if remote connection can " "be re-established elsewhere)", rsc->id); } should_fence = TRUE; } if (should_fence) { if (reason == NULL) { reason = crm_strdup_printf("%s is thought to be active there", rsc->id); } pe_fence_node(data_set, node, reason); } free(reason); } if (node->details->unclean) { /* No extra processing needed * Also allows resources to be started again after a node is shot */ on_fail = action_fail_ignore; } switch (on_fail) { case action_fail_ignore: /* nothing to do */ break; case action_fail_fence: /* treat it as if it is still running * but also mark the node as unclean */ reason = crm_strdup_printf("%s failed there", rsc->id); pe_fence_node(data_set, node, reason); free(reason); break; case action_fail_standby: node->details->standby = TRUE; node->details->standby_onfail = TRUE; break; case action_fail_block: /* is_managed == FALSE will prevent any * actions being sent for the resource */ clear_bit(rsc->flags, pe_rsc_managed); set_bit(rsc->flags, pe_rsc_block); break; case action_fail_migrate: /* make sure it comes up somewhere else * or not at all */ resource_location(rsc, node, -INFINITY, "__action_migration_auto__", data_set); break; case action_fail_stop: rsc->next_role = RSC_ROLE_STOPPED; break; case action_fail_recover: if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) { set_bit(rsc->flags, pe_rsc_failed); stop_action(rsc, node, FALSE); } break; case action_fail_restart_container: set_bit(rsc->flags, pe_rsc_failed); if (rsc->container && pe_rsc_is_bundled(rsc)) { /* A bundle's remote connection can run on a different node than * the bundle's container. We don't necessarily know where the * container is running yet, so remember it and add a stop * action for it later. */ data_set->stop_needed = g_list_prepend(data_set->stop_needed, rsc->container); } else if (rsc->container) { stop_action(rsc->container, node, FALSE); } else if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) { stop_action(rsc, node, FALSE); } break; case action_fail_reset_remote: set_bit(rsc->flags, pe_rsc_failed); if (is_set(data_set->flags, pe_flag_stonith_enabled)) { tmpnode = NULL; if (rsc->is_remote_node) { tmpnode = pe_find_node(data_set->nodes, rsc->id); } if (tmpnode && pe__is_remote_node(tmpnode) && tmpnode->details->remote_was_fenced == 0) { /* The remote connection resource failed in a way that * should result in fencing the remote node. */ pe_fence_node(data_set, tmpnode, "remote connection is unrecoverable"); } } /* require the stop action regardless if fencing is occurring or not. */ if (rsc->role > RSC_ROLE_STOPPED) { stop_action(rsc, node, FALSE); } /* if reconnect delay is in use, prevent the connection from exiting the * "STOPPED" role until the failure is cleared by the delay timeout. */ if (rsc->remote_reconnect_ms) { rsc->next_role = RSC_ROLE_STOPPED; } break; } /* ensure a remote-node connection failure forces an unclean remote-node * to be fenced. By setting unseen = FALSE, the remote-node failure will * result in a fencing operation regardless if we're going to attempt to * reconnect to the remote-node in this transition or not. 
*/ if (is_set(rsc->flags, pe_rsc_failed) && rsc->is_remote_node) { tmpnode = pe_find_node(data_set->nodes, rsc->id); if (tmpnode && tmpnode->details->unclean) { tmpnode->details->unseen = FALSE; } } if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) { if (is_set(rsc->flags, pe_rsc_orphan)) { if (is_set(rsc->flags, pe_rsc_managed)) { crm_config_warn("Detected active orphan %s running on %s", rsc->id, node->details->uname); } else { crm_config_warn("Cluster configured not to stop active orphans." " %s must be stopped manually on %s", rsc->id, node->details->uname); } } native_add_running(rsc, node, data_set); if (on_fail != action_fail_ignore) { set_bit(rsc->flags, pe_rsc_failed); } } else if (rsc->clone_name && strchr(rsc->clone_name, ':') != NULL) { /* Only do this for older status sections that included instance numbers * Otherwise stopped instances will appear as orphans */ pe_rsc_trace(rsc, "Resetting clone_name %s for %s (stopped)", rsc->clone_name, rsc->id); free(rsc->clone_name); rsc->clone_name = NULL; } else { GList *possible_matches = pe__resource_actions(rsc, node, RSC_STOP, FALSE); GListPtr gIter = possible_matches; for (; gIter != NULL; gIter = gIter->next) { action_t *stop = (action_t *) gIter->data; stop->flags |= pe_action_optional; } g_list_free(possible_matches); } } /* create active recurring operations as optional */ static void process_recurring(node_t * node, resource_t * rsc, int start_index, int stop_index, GListPtr sorted_op_list, pe_working_set_t * data_set) { int counter = -1; const char *task = NULL; const char *status = NULL; GListPtr gIter = sorted_op_list; CRM_ASSERT(rsc); pe_rsc_trace(rsc, "%s: Start index %d, stop index = %d", rsc->id, start_index, stop_index); for (; gIter != NULL; gIter = gIter->next) { xmlNode *rsc_op = (xmlNode *) gIter->data; guint interval_ms = 0; char *key = NULL; const char *id = ID(rsc_op); const char *interval_ms_s = NULL; counter++; if (node->details->online == FALSE) { pe_rsc_trace(rsc, "Skipping %s/%s: node is offline", rsc->id, node->details->uname); break; /* Need to check if there's a monitor for role="Stopped" */ } else if (start_index < stop_index && counter <= stop_index) { pe_rsc_trace(rsc, "Skipping %s/%s: resource is not active", id, node->details->uname); continue; } else if (counter < start_index) { pe_rsc_trace(rsc, "Skipping %s/%s: old %d", id, node->details->uname, counter); continue; } interval_ms_s = crm_element_value(rsc_op, XML_LRM_ATTR_INTERVAL_MS); interval_ms = crm_parse_ms(interval_ms_s); if (interval_ms == 0) { pe_rsc_trace(rsc, "Skipping %s/%s: non-recurring", id, node->details->uname); continue; } status = crm_element_value(rsc_op, XML_LRM_ATTR_OPSTATUS); if (safe_str_eq(status, "-1")) { pe_rsc_trace(rsc, "Skipping %s/%s: status", id, node->details->uname); continue; } task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); /* create the action */ key = generate_op_key(rsc->id, task, interval_ms); pe_rsc_trace(rsc, "Creating %s/%s", key, node->details->uname); custom_action(rsc, key, task, node, TRUE, TRUE, data_set); } } void calculate_active_ops(GListPtr sorted_op_list, int *start_index, int *stop_index) { int counter = -1; int implied_monitor_start = -1; int implied_clone_start = -1; const char *task = NULL; const char *status = NULL; GListPtr gIter = sorted_op_list; *stop_index = -1; *start_index = -1; for (; gIter != NULL; gIter = gIter->next) { xmlNode *rsc_op = (xmlNode *) gIter->data; counter++; task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); status = crm_element_value(rsc_op, 
XML_LRM_ATTR_OPSTATUS); if (safe_str_eq(task, CRMD_ACTION_STOP) && safe_str_eq(status, "0")) { *stop_index = counter; } else if (safe_str_eq(task, CRMD_ACTION_START) || safe_str_eq(task, CRMD_ACTION_MIGRATED)) { *start_index = counter; } else if ((implied_monitor_start <= *stop_index) && safe_str_eq(task, CRMD_ACTION_STATUS)) { const char *rc = crm_element_value(rsc_op, XML_LRM_ATTR_RC); if (safe_str_eq(rc, "0") || safe_str_eq(rc, "8")) { implied_monitor_start = counter; } } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE) || safe_str_eq(task, CRMD_ACTION_DEMOTE)) { implied_clone_start = counter; } } if (*start_index == -1) { if (implied_clone_start != -1) { *start_index = implied_clone_start; } else if (implied_monitor_start != -1) { *start_index = implied_monitor_start; } } } static resource_t * unpack_lrm_rsc_state(node_t * node, xmlNode * rsc_entry, pe_working_set_t * data_set) { GListPtr gIter = NULL; int stop_index = -1; int start_index = -1; enum rsc_role_e req_role = RSC_ROLE_UNKNOWN; const char *task = NULL; const char *rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID); resource_t *rsc = NULL; GListPtr op_list = NULL; GListPtr sorted_op_list = NULL; xmlNode *migrate_op = NULL; xmlNode *rsc_op = NULL; xmlNode *last_failure = NULL; enum action_fail_response on_fail = FALSE; enum rsc_role_e saved_role = RSC_ROLE_UNKNOWN; crm_trace("[%s] Processing %s on %s", crm_element_name(rsc_entry), rsc_id, node->details->uname); /* extract operations */ op_list = NULL; sorted_op_list = NULL; for (rsc_op = __xml_first_child_element(rsc_entry); rsc_op != NULL; rsc_op = __xml_next_element(rsc_op)) { if (crm_str_eq((const char *)rsc_op->name, XML_LRM_TAG_RSC_OP, TRUE)) { op_list = g_list_prepend(op_list, rsc_op); } } if (op_list == NULL) { /* if there are no operations, there is nothing to do */ return NULL; } /* find the resource */ rsc = unpack_find_resource(data_set, node, rsc_id, rsc_entry); if (rsc == NULL) { rsc = process_orphan_resource(rsc_entry, node, data_set); } CRM_ASSERT(rsc != NULL); /* process operations */ saved_role = rsc->role; on_fail = action_fail_ignore; rsc->role = RSC_ROLE_UNKNOWN; sorted_op_list = g_list_sort(op_list, sort_op_by_callid); for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) { xmlNode *rsc_op = (xmlNode *) gIter->data; task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); if (safe_str_eq(task, CRMD_ACTION_MIGRATED)) { migrate_op = rsc_op; } unpack_rsc_op(rsc, node, rsc_op, &last_failure, &on_fail, data_set); } /* create active recurring operations as optional */ calculate_active_ops(sorted_op_list, &start_index, &stop_index); process_recurring(node, rsc, start_index, stop_index, sorted_op_list, data_set); /* no need to free the contents */ g_list_free(sorted_op_list); process_rsc_state(rsc, node, on_fail, migrate_op, data_set); if (get_target_role(rsc, &req_role)) { if (rsc->next_role == RSC_ROLE_UNKNOWN || req_role < rsc->next_role) { pe_rsc_debug(rsc, "%s: Overwriting calculated next role %s" " with requested next role %s", rsc->id, role2text(rsc->next_role), role2text(req_role)); rsc->next_role = req_role; } else if (req_role > rsc->next_role) { pe_rsc_info(rsc, "%s: Not overwriting calculated next role %s" " with requested next role %s", rsc->id, role2text(rsc->next_role), role2text(req_role)); } } if (saved_role > rsc->role) { rsc->role = saved_role; } return rsc; } static void handle_orphaned_container_fillers(xmlNode * lrm_rsc_list, pe_working_set_t * data_set) { xmlNode *rsc_entry = NULL; for (rsc_entry = 
__xml_first_child_element(lrm_rsc_list); rsc_entry != NULL; rsc_entry = __xml_next_element(rsc_entry)) { resource_t *rsc; resource_t *container; const char *rsc_id; const char *container_id; if (safe_str_neq((const char *)rsc_entry->name, XML_LRM_TAG_RESOURCE)) { continue; } container_id = crm_element_value(rsc_entry, XML_RSC_ATTR_CONTAINER); rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID); if (container_id == NULL || rsc_id == NULL) { continue; } container = pe_find_resource(data_set->resources, container_id); if (container == NULL) { continue; } rsc = pe_find_resource(data_set->resources, rsc_id); if (rsc == NULL || is_set(rsc->flags, pe_rsc_orphan_container_filler) == FALSE || rsc->container != NULL) { continue; } pe_rsc_trace(rsc, "Mapped container of orphaned resource %s to %s", rsc->id, container_id); rsc->container = container; container->fillers = g_list_append(container->fillers, rsc); } } gboolean unpack_lrm_resources(node_t * node, xmlNode * lrm_rsc_list, pe_working_set_t * data_set) { xmlNode *rsc_entry = NULL; gboolean found_orphaned_container_filler = FALSE; CRM_CHECK(node != NULL, return FALSE); crm_trace("Unpacking resources on %s", node->details->uname); for (rsc_entry = __xml_first_child_element(lrm_rsc_list); rsc_entry != NULL; rsc_entry = __xml_next_element(rsc_entry)) { if (crm_str_eq((const char *)rsc_entry->name, XML_LRM_TAG_RESOURCE, TRUE)) { resource_t *rsc = unpack_lrm_rsc_state(node, rsc_entry, data_set); if (!rsc) { continue; } if (is_set(rsc->flags, pe_rsc_orphan_container_filler)) { found_orphaned_container_filler = TRUE; } } } /* now that all the resource state has been unpacked for this node * we have to go back and map any orphaned container fillers to their * container resource */ if (found_orphaned_container_filler) { handle_orphaned_container_fillers(lrm_rsc_list, data_set); } return TRUE; } static void set_active(resource_t * rsc) { resource_t *top = uber_parent(rsc); if (top && is_set(top->flags, pe_rsc_promotable)) { rsc->role = RSC_ROLE_SLAVE; } else { rsc->role = RSC_ROLE_STARTED; } } static void set_node_score(gpointer key, gpointer value, gpointer user_data) { node_t *node = value; int *score = user_data; node->weight = *score; } #define STATUS_PATH_MAX 1024 static xmlNode * find_lrm_op(const char *resource, const char *op, const char *node, const char *source, bool success_only, pe_working_set_t *data_set) { int offset = 0; char xpath[STATUS_PATH_MAX]; xmlNode *xml = NULL; offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "//node_state[@uname='%s']", node); offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "//" XML_LRM_TAG_RESOURCE "[@id='%s']", resource); /* Need to check against transition_magic too? 
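 *
 * For illustration (hypothetical resource/node names, and assuming the
 * usual expansions of XML_LRM_TAG_RESOURCE and XML_LRM_TAG_RSC_OP), the
 * xpath assembled here for a plain stop operation would resemble:
 *
 *     //node_state[@uname='node1']//lrm_resource[@id='myrsc']
 *         /lrm_rsc_op[@operation='stop']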
*/ if (source && safe_str_eq(op, CRMD_ACTION_MIGRATE)) { offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "/" XML_LRM_TAG_RSC_OP "[@operation='%s' and @migrate_target='%s']", op, source); } else if (source && safe_str_eq(op, CRMD_ACTION_MIGRATED)) { offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "/" XML_LRM_TAG_RSC_OP "[@operation='%s' and @migrate_source='%s']", op, source); } else { offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "/" XML_LRM_TAG_RSC_OP "[@operation='%s']", op); } CRM_LOG_ASSERT(offset > 0); xml = get_xpath_object(xpath, data_set->input, LOG_DEBUG); if (xml && success_only) { int rc = PCMK_OCF_UNKNOWN_ERROR; int status = PCMK_LRM_OP_ERROR; crm_element_value_int(xml, XML_LRM_ATTR_RC, &rc); crm_element_value_int(xml, XML_LRM_ATTR_OPSTATUS, &status); if ((rc != PCMK_OCF_OK) || (status != PCMK_LRM_OP_DONE)) { return NULL; } } return xml; } static int pe__call_id(xmlNode *op_xml) { int id = 0; if (op_xml) { crm_element_value_int(op_xml, XML_LRM_ATTR_CALLID, &id); } return id; } /*! * \brief Check whether a stop happened on the same node after some event * * \param[in] rsc Resource being checked * \param[in] node Node being checked * \param[in] xml_op Event that stop is being compared to * \param[in] data_set Cluster working set * * \return TRUE if stop happened after event, FALSE otherwise * * \note This is really unnecessary, but kept as a safety mechanism. We * currently don't save more than one successful event in history, so this * only matters when processing really old CIB files that we don't * technically support anymore, or as preparation for logging an extended * history in the future. */ static bool stop_happened_after(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, pe_working_set_t *data_set) { xmlNode *stop_op = find_lrm_op(rsc->id, CRMD_ACTION_STOP, node->details->uname, NULL, TRUE, data_set); return (stop_op && (pe__call_id(stop_op) > pe__call_id(xml_op))); } static void unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, pe_working_set_t *data_set) { /* A successful migration sequence is: * migrate_to on source node * migrate_from on target node * stop on source node * * If a migrate_to is followed by a stop, the entire migration (successful * or failed) is complete, and we don't care what happened on the target. * * If no migrate_from has happened, the migration is considered to be * "partial". If the migrate_from failed, make sure the resource gets * stopped on both source and target (if up). * * If the migrate_to and migrate_from both succeeded (which also implies the * resource is no longer running on the source), but there is no stop, the * migration is considered to be "dangling". Schedule a stop on the source * in this case. 
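 *
 * Schematic recap (S = source node, T = target node):
 *
 *     migrate_to(S) ... stop(S)                  -> migration complete
 *     migrate_to(S), no migrate_from(T) yet      -> partial migration
 *     migrate_to(S), failed migrate_from(T)      -> stop on S and T (if up)
 *     migrate_to(S), migrate_from(T), no stop(S) -> dangling; stop on S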
*/ int from_rc = 0; int from_status = 0; pe_node_t *target_node = NULL; pe_node_t *source_node = NULL; xmlNode *migrate_from = NULL; const char *source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE); const char *target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET); // Sanity check CRM_CHECK(source && target && !strcmp(source, node->details->uname), return); if (stop_happened_after(rsc, node, xml_op, data_set)) { return; } // Clones are not allowed to migrate, so role can't be master rsc->role = RSC_ROLE_STARTED; target_node = pe_find_node(data_set->nodes, target); source_node = pe_find_node(data_set->nodes, source); // Check whether there was a migrate_from action on the target migrate_from = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATED, target, source, FALSE, data_set); if (migrate_from) { crm_element_value_int(migrate_from, XML_LRM_ATTR_RC, &from_rc); crm_element_value_int(migrate_from, XML_LRM_ATTR_OPSTATUS, &from_status); pe_rsc_trace(rsc, "%s op on %s exited with status=%d, rc=%d", ID(migrate_from), target, from_status, from_rc); } if (migrate_from && from_rc == PCMK_OCF_OK && from_status == PCMK_LRM_OP_DONE) { /* The migrate_to and migrate_from both succeeded, so mark the migration * as "dangling". This will be used to schedule a stop action on the * source without affecting the target. */ pe_rsc_trace(rsc, "Detected dangling migration op: %s on %s", ID(xml_op), source); rsc->role = RSC_ROLE_STOPPED; rsc->dangling_migrations = g_list_prepend(rsc->dangling_migrations, node); } else if (migrate_from && (from_status != PCMK_LRM_OP_PENDING)) { // Failed if (target_node && target_node->details->online) { pe_rsc_trace(rsc, "Marking active on %s %p %d", target, target_node, target_node->details->online); native_add_running(rsc, target_node, data_set); } } else { // Pending, or complete but erased if (target_node && target_node->details->online) { pe_rsc_trace(rsc, "Marking active on %s %p %d", target, target_node, target_node->details->online); native_add_running(rsc, target_node, data_set); if (source_node && source_node->details->online) { /* This is a partial migration: the migrate_to completed * successfully on the source, but the migrate_from has not * completed. Remember the source and target; if the newly * chosen target remains the same when we schedule actions * later, we may continue with the migration. */ rsc->partial_migration_target = target_node; rsc->partial_migration_source = source_node; } } else { /* Consider it failed here - forces a restart, prevents migration */ set_bit(rsc->flags, pe_rsc_failed); clear_bit(rsc->flags, pe_rsc_allow_migrate); } } } static void unpack_migrate_to_failure(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, pe_working_set_t *data_set) { int target_stop_id = 0; int target_migrate_from_id = 0; xmlNode *target_stop = NULL; xmlNode *target_migrate_from = NULL; const char *source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE); const char *target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET); // Sanity check CRM_CHECK(source && target && !strcmp(source, node->details->uname), return); /* If a migration failed, we have to assume the resource is active. Clones * are not allowed to migrate, so role can't be master. 
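 *
 * The checks below look for a stop and a migrate_from on the target node
 * to decide whether the resource may still be active there, or whether
 * this failed migration should instead be treated as dangling on the
 * source.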
*/ rsc->role = RSC_ROLE_STARTED; // Check for stop on the target target_stop = find_lrm_op(rsc->id, CRMD_ACTION_STOP, target, NULL, TRUE, data_set); target_stop_id = pe__call_id(target_stop); // Check for migrate_from on the target target_migrate_from = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATED, target, source, TRUE, data_set); target_migrate_from_id = pe__call_id(target_migrate_from); if ((target_stop == NULL) || (target_stop_id < target_migrate_from_id)) { /* There was no stop on the source, or a stop that happened before a * migrate_from, so assume the resource is still active on the target * (if it is up). */ node_t *target_node = pe_find_node(data_set->nodes, target); pe_rsc_trace(rsc, "stop (%d) + migrate_from (%d)", target_stop_id, target_migrate_from_id); if (target_node && target_node->details->online) { native_add_running(rsc, target_node, data_set); } } else if (target_migrate_from == NULL) { /* We know there was a stop on the target, but there may not have been a * migrate_from (the stop could have happened before migrate_from was * scheduled or attempted). * * That means this could be a "dangling" migration. But first, check * whether there is a newer migrate_from or start on the source node -- * it's possible the failed migration was followed by a successful * full restart or migration in the reverse direction, in which case we * don't want to force it to stop. */ xmlNode *source_migrate_from = NULL; xmlNode *source_start = NULL; int source_migrate_to_id = pe__call_id(xml_op); source_migrate_from = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATED, source, NULL, TRUE, data_set); if (pe__call_id(source_migrate_from) > source_migrate_to_id) { return; } source_start = find_lrm_op(rsc->id, CRMD_ACTION_START, source, NULL, TRUE, data_set); if (pe__call_id(source_start) > source_migrate_to_id) { return; } // Mark node as having dangling migration so we can force a stop later rsc->dangling_migrations = g_list_prepend(rsc->dangling_migrations, node); } } static void unpack_migrate_from_failure(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, pe_working_set_t *data_set) { xmlNode *source_stop = NULL; xmlNode *source_migrate_to = NULL; const char *source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE); const char *target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET); // Sanity check CRM_CHECK(source && target && !strcmp(target, node->details->uname), return); /* If a migration failed, we have to assume the resource is active. Clones * are not allowed to migrate, so role can't be master. */ rsc->role = RSC_ROLE_STARTED; // Check for a stop on the source source_stop = find_lrm_op(rsc->id, CRMD_ACTION_STOP, source, NULL, TRUE, data_set); // Check for a migrate_to on the source source_migrate_to = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATE, source, target, TRUE, data_set); if ((source_stop == NULL) || (pe__call_id(source_stop) < pe__call_id(source_migrate_to))) { /* There was no stop on the source, or a stop that happened before * migrate_to, so assume the resource is still active on the source (if * it is up). 
*/ pe_node_t *source_node = pe_find_node(data_set->nodes, source); if (source_node && source_node->details->online) { native_add_running(rsc, source_node, data_set); } } } static void record_failed_op(xmlNode *op, const pe_node_t *node, const pe_resource_t *rsc, pe_working_set_t *data_set) { xmlNode *xIter = NULL; const char *op_key = crm_element_value(op, XML_LRM_ATTR_TASK_KEY); if (node->details->online == FALSE) { return; } for (xIter = data_set->failed->children; xIter; xIter = xIter->next) { const char *key = crm_element_value(xIter, XML_LRM_ATTR_TASK_KEY); const char *uname = crm_element_value(xIter, XML_ATTR_UNAME); if(safe_str_eq(op_key, key) && safe_str_eq(uname, node->details->uname)) { crm_trace("Skipping duplicate entry %s on %s", op_key, node->details->uname); return; } } crm_trace("Adding entry %s on %s", op_key, node->details->uname); crm_xml_add(op, XML_ATTR_UNAME, node->details->uname); crm_xml_add(op, XML_LRM_ATTR_RSCID, rsc->id); add_node_copy(data_set->failed, op); } static const char *get_op_key(xmlNode *xml_op) { const char *key = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); if(key == NULL) { key = ID(xml_op); } return key; } +static const char * +last_change_str(xmlNode *xml_op) +{ + time_t when; + const char *when_s = NULL; + + if (crm_element_value_epoch(xml_op, XML_RSC_OP_LAST_CHANGE, + &when) == pcmk_ok) { + when_s = crm_now_string(&when); + if (when_s) { + // Skip day of week to make message shorter + when_s = strchr(when_s, ' '); + if (when_s) { + ++when_s; + } + } + } + return ((when_s && *when_s)? when_s : "unknown time"); +} + static void unpack_rsc_op_failure(resource_t * rsc, node_t * node, int rc, xmlNode * xml_op, xmlNode ** last_failure, enum action_fail_response * on_fail, pe_working_set_t * data_set) { guint interval_ms = 0; - bool is_probe = FALSE; + bool is_probe = false; action_t *action = NULL; const char *key = get_op_key(xml_op); const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); + const char *exit_reason = crm_element_value(xml_op, + XML_LRM_ATTR_EXIT_REASON); CRM_ASSERT(rsc); + CRM_CHECK(task != NULL, return); *last_failure = xml_op; crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); - if ((interval_ms == 0) && safe_str_eq(task, CRMD_ACTION_STATUS)) { - is_probe = TRUE; - pe_rsc_trace(rsc, "is a probe: %s", key); + if ((interval_ms == 0) && !strcmp(task, CRMD_ACTION_STATUS)) { + is_probe = true; } - if (rc != PCMK_OCF_NOT_INSTALLED || is_set(data_set->flags, pe_flag_symmetric_cluster)) { - crm_warn("Processing failed %s of %s on %s: %s " CRM_XS " rc=%d", + if (exit_reason == NULL) { + exit_reason = ""; + } + + if (is_not_set(data_set->flags, pe_flag_symmetric_cluster) + && (rc == PCMK_OCF_NOT_INSTALLED)) { + crm_trace("Unexpected result (%s%s%s) was recorded for " + "%s of %s on %s at %s " CRM_XS " rc=%d id=%s", + services_ocf_exitcode_str(rc), + (*exit_reason? ": " : ""), exit_reason, + (is_probe? "probe" : task), rsc->id, node->details->uname, + last_change_str(xml_op), rc, ID(xml_op)); + } else { + crm_warn("Unexpected result (%s%s%s) was recorded for " + "%s of %s on %s at %s " CRM_XS " rc=%d id=%s", + services_ocf_exitcode_str(rc), + (*exit_reason? ": " : ""), exit_reason, (is_probe? 
"probe" : task), rsc->id, node->details->uname, - services_ocf_exitcode_str(rc), rc); + last_change_str(xml_op), rc, ID(xml_op)); if (is_probe && (rc != PCMK_OCF_OK) && (rc != PCMK_OCF_NOT_RUNNING) && (rc != PCMK_OCF_RUNNING_MASTER)) { /* A failed (not just unexpected) probe result could mean the user * didn't know resources will be probed even where they can't run. */ crm_notice("If it is not possible for %s to run on %s, see " "the resource-discovery option for location constraints", rsc->id, node->details->uname); } record_failed_op(xml_op, node, rsc, data_set); - - } else { - crm_trace("Processing failed op %s for %s on %s: %s (%d)", - task, rsc->id, node->details->uname, services_ocf_exitcode_str(rc), - rc); } action = custom_action(rsc, strdup(key), task, NULL, TRUE, FALSE, data_set); if ((action->on_fail <= action_fail_fence && *on_fail < action->on_fail) || (action->on_fail == action_fail_reset_remote && *on_fail <= action_fail_recover) || (action->on_fail == action_fail_restart_container && *on_fail <= action_fail_recover) || (*on_fail == action_fail_restart_container && action->on_fail >= action_fail_migrate)) { pe_rsc_trace(rsc, "on-fail %s -> %s for %s (%s)", fail2text(*on_fail), fail2text(action->on_fail), action->uuid, key); *on_fail = action->on_fail; } - if (safe_str_eq(task, CRMD_ACTION_STOP)) { + if (!strcmp(task, CRMD_ACTION_STOP)) { resource_location(rsc, node, -INFINITY, "__stop_fail__", data_set); - } else if (safe_str_eq(task, CRMD_ACTION_MIGRATE)) { + } else if (!strcmp(task, CRMD_ACTION_MIGRATE)) { unpack_migrate_to_failure(rsc, node, xml_op, data_set); - } else if (safe_str_eq(task, CRMD_ACTION_MIGRATED)) { + } else if (!strcmp(task, CRMD_ACTION_MIGRATED)) { unpack_migrate_from_failure(rsc, node, xml_op, data_set); - } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) { + } else if (!strcmp(task, CRMD_ACTION_PROMOTE)) { rsc->role = RSC_ROLE_MASTER; - } else if (safe_str_eq(task, CRMD_ACTION_DEMOTE)) { + } else if (!strcmp(task, CRMD_ACTION_DEMOTE)) { if (action->on_fail == action_fail_block) { rsc->role = RSC_ROLE_MASTER; rsc->next_role = RSC_ROLE_STOPPED; } else if(rc == PCMK_OCF_NOT_RUNNING) { rsc->role = RSC_ROLE_STOPPED; } else { /* * Staying in master role would put the PE/TE into a loop. Setting * slave role is not dangerous because the resource will be stopped * as part of recovery, and any master promotion will be ordered * after that stop. */ rsc->role = RSC_ROLE_SLAVE; } } if(is_probe && rc == PCMK_OCF_NOT_INSTALLED) { /* leave stopped */ pe_rsc_trace(rsc, "Leaving %s stopped", rsc->id); rsc->role = RSC_ROLE_STOPPED; } else if (rsc->role < RSC_ROLE_STARTED) { pe_rsc_trace(rsc, "Setting %s active", rsc->id); set_active(rsc); } pe_rsc_trace(rsc, "Resource %s: role=%s, unclean=%s, on_fail=%s, fail_role=%s", rsc->id, role2text(rsc->role), node->details->unclean ? "true" : "false", fail2text(action->on_fail), role2text(action->fail_role)); if (action->fail_role != RSC_ROLE_STARTED && rsc->next_role < action->fail_role) { rsc->next_role = action->fail_role; } if (action->fail_role == RSC_ROLE_STOPPED) { int score = -INFINITY; resource_t *fail_rsc = rsc; if (fail_rsc->parent) { resource_t *parent = uber_parent(fail_rsc); if (pe_rsc_is_clone(parent) && is_not_set(parent->flags, pe_rsc_unique)) { /* For clone resources, if a child fails on an operation * with on-fail = stop, all the resources fail. Do this by preventing * the parent from coming up again. 
*/ fail_rsc = parent; } } - crm_warn("Making sure %s doesn't come up again", fail_rsc->id); + crm_notice("%s will not be started under current conditions", + fail_rsc->id); /* make sure it doesn't come up again */ if (fail_rsc->allowed_nodes != NULL) { g_hash_table_destroy(fail_rsc->allowed_nodes); } fail_rsc->allowed_nodes = node_hash_from_list(data_set->nodes); g_hash_table_foreach(fail_rsc->allowed_nodes, set_node_score, &score); } pe_free_action(action); } /*! * \internal * \brief Remap operation status based on action result * * Given an action result, determine an appropriate operation status for the * purposes of responding to the action (the status provided by the executor is * not directly usable since the executor does not know what was expected). * * \param[in,out] rsc Resource that operation history entry is for * \param[in] rc Actual return code of operation * \param[in] target_rc Expected return code of operation * \param[in] node Node where operation was executed * \param[in] xml_op Operation history entry XML from CIB status * \param[in,out] on_fail What should be done about the result * \param[in] data_set Current cluster working set * * \return Operation status based on return code and action info * \note This may update the resource's current and next role. */ static int determine_op_status( resource_t *rsc, int rc, int target_rc, node_t * node, xmlNode * xml_op, enum action_fail_response * on_fail, pe_working_set_t * data_set) { guint interval_ms = 0; + bool is_probe = false; int result = PCMK_LRM_OP_DONE; - const char *key = get_op_key(xml_op); const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); - - bool is_probe = FALSE; + const char *exit_reason = crm_element_value(xml_op, + XML_LRM_ATTR_EXIT_REASON); CRM_ASSERT(rsc); + CRM_CHECK(task != NULL, return PCMK_LRM_OP_ERROR); + + if (exit_reason == NULL) { + exit_reason = ""; + } crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); - if ((interval_ms == 0) && safe_str_eq(task, CRMD_ACTION_STATUS)) { - is_probe = TRUE; + if ((interval_ms == 0) && !strcmp(task, CRMD_ACTION_STATUS)) { + is_probe = true; + task = "probe"; } if (target_rc < 0) { /* Pre-1.0 Pacemaker versions, and Pacemaker 1.1.6 or earlier with * Heartbeat 2.0.7 or earlier as the cluster layer, did not include the * target_rc in the transition key, which (along with the similar case * of a corrupted transition key in the CIB) will be reported to this * function as -1. Pacemaker 2.0+ does not support rolling upgrades from * those versions or processing of saved CIB files from those versions, * so we do not need to care much about this case. */ result = PCMK_LRM_OP_ERROR; crm_warn("Expected result not found for %s on %s (corrupt or obsolete CIB?)", key, node->details->uname); } else if (target_rc != rc) { result = PCMK_LRM_OP_ERROR; - pe_rsc_debug(rsc, "%s on %s returned '%s' (%d) instead of the expected value: '%s' (%d)", + pe_rsc_debug(rsc, "%s on %s: expected %d (%s), got %d (%s%s%s)", key, node->details->uname, - services_ocf_exitcode_str(rc), rc, - services_ocf_exitcode_str(target_rc), target_rc); + target_rc, services_ocf_exitcode_str(target_rc), + rc, services_ocf_exitcode_str(rc), + (*exit_reason? ": " : ""), exit_reason); } switch (rc) { case PCMK_OCF_OK: - // @TODO Should this be (rc != target_rc)? 
if (is_probe && (target_rc == PCMK_OCF_NOT_RUNNING)) { result = PCMK_LRM_OP_DONE; - pe_rsc_info(rsc, "Operation %s found resource %s active on %s", - task, rsc->id, node->details->uname); + pe_rsc_info(rsc, "Probe found %s active on %s at %s", + rsc->id, node->details->uname, + last_change_str(xml_op)); } break; case PCMK_OCF_NOT_RUNNING: if (is_probe || target_rc == rc || is_not_set(rsc->flags, pe_rsc_managed)) { result = PCMK_LRM_OP_DONE; rsc->role = RSC_ROLE_STOPPED; /* clear any previous failure actions */ *on_fail = action_fail_ignore; rsc->next_role = RSC_ROLE_UNKNOWN; } break; case PCMK_OCF_RUNNING_MASTER: if (is_probe && (rc != target_rc)) { result = PCMK_LRM_OP_DONE; - pe_rsc_info(rsc, "Operation %s found resource %s active in master mode on %s", - task, rsc->id, node->details->uname); + pe_rsc_info(rsc, + "Probe found %s active and promoted on %s at %s", + rsc->id, node->details->uname, + last_change_str(xml_op)); } rsc->role = RSC_ROLE_MASTER; break; case PCMK_OCF_DEGRADED_MASTER: case PCMK_OCF_FAILED_MASTER: rsc->role = RSC_ROLE_MASTER; result = PCMK_LRM_OP_ERROR; break; case PCMK_OCF_NOT_CONFIGURED: result = PCMK_LRM_OP_ERROR_FATAL; break; case PCMK_OCF_UNIMPLEMENT_FEATURE: if (interval_ms > 0) { result = PCMK_LRM_OP_NOTSUPPORTED; break; } // fall through case PCMK_OCF_NOT_INSTALLED: case PCMK_OCF_INVALID_PARAM: case PCMK_OCF_INSUFFICIENT_PRIV: if (!pe_can_fence(data_set, node) - && safe_str_eq(task, CRMD_ACTION_STOP)) { + && !strcmp(task, CRMD_ACTION_STOP)) { /* If a stop fails and we can't fence, there's nothing else we can do */ - pe_proc_err("No further recovery can be attempted for %s: %s action failed with '%s' (%d)", - rsc->id, task, services_ocf_exitcode_str(rc), rc); + pe_proc_err("No further recovery can be attempted for %s " + "because %s on %s failed (%s%s%s) at %s " + CRM_XS " rc=%d id=%s", rsc->id, task, + node->details->uname, services_ocf_exitcode_str(rc), + (*exit_reason? ": " : ""), exit_reason, + last_change_str(xml_op), rc, ID(xml_op)); clear_bit(rsc->flags, pe_rsc_managed); set_bit(rsc->flags, pe_rsc_block); } result = PCMK_LRM_OP_ERROR_HARD; break; default: if (result == PCMK_LRM_OP_DONE) { - crm_info("Treating unknown return code %d for %s on %s as failure", - rc, key, node->details->uname); + crm_info("Treating unknown exit status %d from %s of %s " + "on %s at %s as failure", + rc, task, rsc->id, node->details->uname, + last_change_str(xml_op)); result = PCMK_LRM_OP_ERROR; } break; } return result; } // return TRUE if start or monitor last failure but parameters changed static bool should_clear_for_param_change(xmlNode *xml_op, const char *task, pe_resource_t *rsc, pe_node_t *node, pe_working_set_t *data_set) { if (!strcmp(task, "start") || !strcmp(task, "monitor")) { if (pe__bundle_needs_remote_name(rsc)) { /* We haven't allocated resources yet, so we can't reliably * substitute addr parameters for the REMOTE_CONTAINER_HACK. * When that's needed, defer the check until later. 
*/ pe__add_param_check(xml_op, rsc, node, pe_check_last_failure, data_set); } else { op_digest_cache_t *digest_data = NULL; digest_data = rsc_action_digest_cmp(rsc, xml_op, node, data_set); switch (digest_data->rc) { case RSC_DIGEST_UNKNOWN: crm_trace("Resource %s history entry %s on %s" " has no digest to compare", rsc->id, get_op_key(xml_op), node->details->id); break; case RSC_DIGEST_MATCH: break; default: return TRUE; } } } return FALSE; } // Order action after fencing of remote node, given connection rsc static void order_after_remote_fencing(pe_action_t *action, pe_resource_t *remote_conn, pe_working_set_t *data_set) { pe_node_t *remote_node = pe_find_node(data_set->nodes, remote_conn->id); if (remote_node) { pe_action_t *fence = pe_fence_op(remote_node, NULL, TRUE, NULL, data_set); order_actions(fence, action, pe_order_implies_then); } } static bool should_ignore_failure_timeout(pe_resource_t *rsc, xmlNode *xml_op, const char *task, guint interval_ms, bool is_last_failure, pe_working_set_t *data_set) { /* Clearing failures of recurring monitors has special concerns. The * executor reports only changes in the monitor result, so if the * monitor is still active and still getting the same failure result, * that will go undetected after the failure is cleared. * * Also, the operation history will have the time when the recurring * monitor result changed to the given code, not the time when the * result last happened. * * @TODO We probably should clear such failures only when the failure * timeout has passed since the last occurrence of the failed result. * However we don't record that information. We could maybe approximate * that by clearing only if there is a more recent successful monitor or * stop result, but we don't even have that information at this point * since we are still unpacking the resource's operation history. * * This is especially important for remote connection resources with a * reconnect interval, so in that case, we skip clearing failures * if the remote node hasn't been fenced. */ if (rsc->remote_reconnect_ms && is_set(data_set->flags, pe_flag_stonith_enabled) && (interval_ms != 0) && safe_str_eq(task, CRMD_ACTION_STATUS)) { pe_node_t *remote_node = pe_find_node(data_set->nodes, rsc->id); if (remote_node && !remote_node->details->remote_was_fenced) { if (is_last_failure) { crm_info("Waiting to clear monitor failure for remote node %s" " until fencing has occurred", rsc->id); } return TRUE; } } return FALSE; } /*! * \internal * \brief Check operation age and schedule failure clearing when appropriate * * This function has two distinct purposes. The first is to check whether an * operation history entry is expired (i.e. the resource has a failure timeout, * the entry is older than the timeout, and the resource either has no fail * count or its fail count is entirely older than the timeout). The second is to * schedule fail count clearing when appropriate (i.e. the operation is expired * and either the resource has an expired fail count or the operation is a * last_failure for a remote connection resource with a reconnect interval, * or the operation is a last_failure for a start or monitor operation and the * resource's parameters have changed since the operation). 
* * \param[in] rsc Resource that operation happened to * \param[in] node Node that operation happened on * \param[in] rc Actual result of operation * \param[in] xml_op Operation history entry XML * \param[in] data_set Current working set * * \return TRUE if operation history entry is expired, FALSE otherwise */ static bool check_operation_expiry(pe_resource_t *rsc, pe_node_t *node, int rc, xmlNode *xml_op, pe_working_set_t *data_set) { bool expired = FALSE; bool is_last_failure = crm_ends_with(ID(xml_op), "_last_failure_0"); time_t last_run = 0; guint interval_ms = 0; int unexpired_fail_count = 0; const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); const char *clear_reason = NULL; crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); if ((rsc->failure_timeout > 0) && (crm_element_value_epoch(xml_op, XML_RSC_OP_LAST_CHANGE, &last_run) == 0)) { // Resource has a failure-timeout, and history entry has a timestamp time_t now = get_effective_time(data_set); time_t last_failure = 0; // Is this particular operation history older than the failure timeout? if ((now >= (last_run + rsc->failure_timeout)) && !should_ignore_failure_timeout(rsc, xml_op, task, interval_ms, is_last_failure, data_set)) { expired = TRUE; } // Does the resource as a whole have an unexpired fail count? unexpired_fail_count = pe_get_failcount(node, rsc, &last_failure, pe_fc_effective, xml_op, data_set); // Update scheduler recheck time according to *last* failure crm_trace("%s@%lld is %sexpired @%lld with unexpired_failures=%d timeout=%ds" " last-failure@%lld", ID(xml_op), (long long) last_run, (expired? "" : "not "), (long long) now, unexpired_fail_count, rsc->failure_timeout, (long long) last_failure); last_failure += rsc->failure_timeout + 1; if (unexpired_fail_count && (now < last_failure)) { pe__update_recheck_time(last_failure, data_set); } } if (expired) { if (pe_get_failcount(node, rsc, NULL, pe_fc_default, xml_op, data_set)) { // There is a fail count ignoring timeout if (unexpired_fail_count == 0) { // There is no fail count considering timeout clear_reason = "it expired"; } else { /* This operation is old, but there is an unexpired fail count. * In a properly functioning cluster, this should only be * possible if this operation is not a failure (otherwise the * fail count should be expired too), so this is really just a * failsafe. */ expired = FALSE; } } else if (is_last_failure && rsc->remote_reconnect_ms) { /* Clear any expired last failure when reconnect interval is set, * even if there is no fail count. */ clear_reason = "reconnect interval is set"; } } if (!expired && is_last_failure && should_clear_for_param_change(xml_op, task, rsc, node, data_set)) { clear_reason = "resource parameters have changed"; } if (clear_reason != NULL) { // Schedule clearing of the fail count pe_action_t *clear_op = pe__clear_failcount(rsc, node, clear_reason, data_set); if (is_set(data_set->flags, pe_flag_stonith_enabled) && rsc->remote_reconnect_ms) { /* If we're clearing a remote connection due to a reconnect * interval, we want to wait until any scheduled fencing * completes. * * We could limit this to remote_node->details->unclean, but at * this point, that's always true (it won't be reliable until * after unpack_node_loop() is done). 
*/ crm_info("Clearing %s failure will wait until any scheduled " "fencing of %s completes", task, rsc->id); order_after_remote_fencing(clear_op, rsc, data_set); } } if (expired && (interval_ms == 0) && safe_str_eq(task, CRMD_ACTION_STATUS)) { switch(rc) { case PCMK_OCF_OK: case PCMK_OCF_NOT_RUNNING: case PCMK_OCF_RUNNING_MASTER: case PCMK_OCF_DEGRADED: case PCMK_OCF_DEGRADED_MASTER: // Don't expire probes that return these values expired = FALSE; break; } } return expired; } int pe__target_rc_from_xml(xmlNode *xml_op) { int target_rc = 0; const char *key = crm_element_value(xml_op, XML_ATTR_TRANSITION_KEY); if (key == NULL) { return -1; } decode_transition_key(key, NULL, NULL, NULL, &target_rc); return target_rc; } static enum action_fail_response get_action_on_fail(resource_t *rsc, const char *key, const char *task, pe_working_set_t * data_set) { int result = action_fail_recover; action_t *action = custom_action(rsc, strdup(key), task, NULL, TRUE, FALSE, data_set); result = action->on_fail; pe_free_action(action); return result; } static void update_resource_state(resource_t * rsc, node_t * node, xmlNode * xml_op, const char * task, int rc, xmlNode * last_failure, enum action_fail_response * on_fail, pe_working_set_t * data_set) { gboolean clear_past_failure = FALSE; CRM_ASSERT(rsc); CRM_ASSERT(xml_op); if (rc == PCMK_OCF_NOT_RUNNING) { clear_past_failure = TRUE; } else if (rc == PCMK_OCF_NOT_INSTALLED) { rsc->role = RSC_ROLE_STOPPED; } else if (safe_str_eq(task, CRMD_ACTION_STATUS)) { if (last_failure) { const char *op_key = get_op_key(xml_op); const char *last_failure_key = get_op_key(last_failure); if (safe_str_eq(op_key, last_failure_key)) { clear_past_failure = TRUE; } } if (rsc->role < RSC_ROLE_STARTED) { set_active(rsc); } } else if (safe_str_eq(task, CRMD_ACTION_START)) { rsc->role = RSC_ROLE_STARTED; clear_past_failure = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_STOP)) { rsc->role = RSC_ROLE_STOPPED; clear_past_failure = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) { rsc->role = RSC_ROLE_MASTER; clear_past_failure = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_DEMOTE)) { /* Demote from Master does not clear an error */ rsc->role = RSC_ROLE_SLAVE; } else if (safe_str_eq(task, CRMD_ACTION_MIGRATED)) { rsc->role = RSC_ROLE_STARTED; clear_past_failure = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_MIGRATE)) { unpack_migrate_to_success(rsc, node, xml_op, data_set); } else if (rsc->role < RSC_ROLE_STARTED) { pe_rsc_trace(rsc, "%s active on %s", rsc->id, node->details->uname); set_active(rsc); } /* clear any previous failure actions */ if (clear_past_failure) { switch (*on_fail) { case action_fail_stop: case action_fail_fence: case action_fail_migrate: case action_fail_standby: pe_rsc_trace(rsc, "%s.%s is not cleared by a completed stop", rsc->id, fail2text(*on_fail)); break; case action_fail_block: case action_fail_ignore: case action_fail_recover: case action_fail_restart_container: *on_fail = action_fail_ignore; rsc->next_role = RSC_ROLE_UNKNOWN; break; case action_fail_reset_remote: if (rsc->remote_reconnect_ms == 0) { /* With no reconnect interval, the connection is allowed to * start again after the remote node is fenced and * completely stopped. (With a reconnect interval, we wait * for the failure to be cleared entirely before attempting * to reconnect.) */ *on_fail = action_fail_ignore; rsc->next_role = RSC_ROLE_UNKNOWN; } break; } } } /*! 
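/* [Editor's note -- illustrative standalone sketch, not part of this patch.]
 * The failure-timeout test in check_operation_expiry() above is plain
 * arithmetic on epoch times: a history entry is old enough to expire once
 * now >= last_run + failure_timeout, and while failures remain unexpired the
 * scheduler wants to be re-run just after last_failure + failure_timeout.
 * This sketch shows only that arithmetic; the function and parameter names
 * are invented for illustration.
 */
#include <stdbool.h>
#include <time.h>

static bool
sketch_entry_expired(time_t now, time_t last_run, time_t last_failure,
                     int failure_timeout, time_t *recheck)
{
    if (failure_timeout <= 0) {
        return false;                   /* no failure-timeout configured */
    }
    if ((last_failure > 0) && (now < (last_failure + failure_timeout + 1))) {
        *recheck = last_failure + failure_timeout + 1;  /* re-check then */
    }
    return now >= (last_run + failure_timeout);
}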
* \internal * \brief Remap informational monitor results to usual values * * Certain OCF result codes are for providing extended information to the * user about services that aren't yet failed but not entirely healthy either. * These must be treated as the "normal" result by pacemaker. * * \param[in] rc Actual result of a monitor action * \param[in] xml_op Operation history XML * \param[in] node Node that operation happened on * \param[in] rsc Resource that operation happened to * \param[in] data_set Cluster working set * * \return Result code that pacemaker should use * * \note If the result is remapped, and the node is not shutting down or failed, * the operation will be recorded in the data set's list of failed * operations, to highlight it for the user. */ static int remap_monitor_rc(int rc, xmlNode *xml_op, const pe_node_t *node, const pe_resource_t *rsc, pe_working_set_t *data_set) { int remapped_rc = rc; switch (rc) { case PCMK_OCF_DEGRADED: remapped_rc = PCMK_OCF_OK; break; case PCMK_OCF_DEGRADED_MASTER: remapped_rc = PCMK_OCF_RUNNING_MASTER; break; default: break; } if (rc != remapped_rc) { crm_trace("Remapping monitor result %d to %d", rc, remapped_rc); if (!node->details->shutdown || node->details->online) { record_failed_op(xml_op, node, rsc, data_set); } } return remapped_rc; } static void unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, xmlNode **last_failure, enum action_fail_response *on_fail, pe_working_set_t *data_set) { - int task_id = 0; - - const char *task = NULL; - const char *task_key = NULL; - int rc = 0; + int task_id = 0; + int target_rc = 0; int status = PCMK_LRM_OP_UNKNOWN; - int target_rc = pe__target_rc_from_xml(xml_op); guint interval_ms = 0; - + const char *task = NULL; + const char *task_key = NULL; + const char *exit_reason = NULL; bool expired = FALSE; resource_t *parent = rsc; enum action_fail_response failure_strategy = action_fail_recover; - CRM_CHECK(rsc != NULL, return); - CRM_CHECK(node != NULL, return); - CRM_CHECK(xml_op != NULL, return); + CRM_CHECK(rsc && node && xml_op, return); + target_rc = pe__target_rc_from_xml(xml_op); task_key = get_op_key(xml_op); - task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); + exit_reason = crm_element_value(xml_op, XML_LRM_ATTR_EXIT_REASON); + if (exit_reason == NULL) { + exit_reason = ""; + } crm_element_value_int(xml_op, XML_LRM_ATTR_RC, &rc); crm_element_value_int(xml_op, XML_LRM_ATTR_CALLID, &task_id); crm_element_value_int(xml_op, XML_LRM_ATTR_OPSTATUS, &status); crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); CRM_CHECK(task != NULL, return); CRM_CHECK(status <= PCMK_LRM_OP_INVALID, return); CRM_CHECK(status >= PCMK_LRM_OP_PENDING, return); - if (safe_str_eq(task, CRMD_ACTION_NOTIFY) || - safe_str_eq(task, CRMD_ACTION_METADATA)) { + if (!strcmp(task, CRMD_ACTION_NOTIFY) || + !strcmp(task, CRMD_ACTION_METADATA)) { /* safe to ignore these */ return; } if (is_not_set(rsc->flags, pe_rsc_unique)) { parent = uber_parent(rsc); } pe_rsc_trace(rsc, "Unpacking task %s/%s (call_id=%d, status=%d, rc=%d) on %s (role=%s)", task_key, task, task_id, status, rc, node->details->uname, role2text(rsc->role)); if (node->details->unclean) { pe_rsc_trace(rsc, "Node %s (where %s is running) is unclean." 
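/* [Editor's note -- illustrative standalone sketch, not part of this patch.]
 * remap_monitor_rc() above folds the "degraded but still working" monitor
 * results back into their healthy equivalents before the rest of the
 * scheduler sees them.  This sketch shows the same idea; the enum values are
 * illustrative, not the real Pacemaker exit codes.
 */
enum sketch_ocf {
    SKETCH_RC_OK,
    SKETCH_RC_RUNNING_MASTER,
    SKETCH_RC_DEGRADED,
    SKETCH_RC_DEGRADED_MASTER,
};

static enum sketch_ocf
sketch_remap_monitor_rc(enum sketch_ocf rc)
{
    switch (rc) {
        case SKETCH_RC_DEGRADED:
            return SKETCH_RC_OK;                /* usable, but worth flagging */
        case SKETCH_RC_DEGRADED_MASTER:
            return SKETCH_RC_RUNNING_MASTER;
        default:
            return rc;                          /* everything else unchanged */
    }
}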
" Further action depends on the value of the stop's on-fail attribute", node->details->uname, rsc->id); } /* It should be possible to call remap_monitor_rc() first then call * check_operation_expiry() only if rc != target_rc, because there should * never be a fail count without at least one unexpected result in the * resource history. That would be more efficient by avoiding having to call * check_operation_expiry() for expected results. * * However, we do have such configurations in the scheduler regression * tests, even if it shouldn't be possible with the current code. It's * probably a good idea anyway, but that would require updating the test * inputs to something currently possible. */ if ((status != PCMK_LRM_OP_NOT_INSTALLED) && check_operation_expiry(rsc, node, rc, xml_op, data_set)) { expired = TRUE; } if (!strcmp(task, CRMD_ACTION_STATUS)) { rc = remap_monitor_rc(rc, xml_op, node, rsc, data_set); } if (expired && (rc != target_rc)) { const char *magic = crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC); if (interval_ms == 0) { crm_notice("Ignoring expired %s failure on %s " CRM_XS " actual=%d expected=%d magic=%s", task_key, node->details->uname, rc, target_rc, magic); goto done; } else if(node->details->online && node->details->unclean == FALSE) { /* Reschedule the recurring monitor. CancelXmlOp() won't work at * this stage, so as a hacky workaround, forcibly change the restart * digest so check_action_definition() does what we want later. * * @TODO We should skip this if there is a newer successful monitor. * Also, this causes rescheduling only if the history entry * has an op-digest (which the expire-non-blocked-failure * scheduler regression test doesn't, but that may not be a * realistic scenario in production). */ crm_notice("Rescheduling %s after failure expired on %s " CRM_XS " actual=%d expected=%d magic=%s", task_key, node->details->uname, rc, target_rc, magic); crm_xml_add(xml_op, XML_LRM_ATTR_RESTART_DIGEST, "calculated-failure-timeout"); goto done; } } /* If the executor reported an operation status of anything but done or * error, consider that final. But for done or error, we know better whether * it should be treated as a failure or not, because we know the expected * result. */ if(status == PCMK_LRM_OP_DONE || status == PCMK_LRM_OP_ERROR) { status = determine_op_status(rsc, rc, target_rc, node, xml_op, on_fail, data_set); + pe_rsc_trace(rsc, "Remapped %s status to %d", task_key, status); } - pe_rsc_trace(rsc, "Handling status: %d", status); switch (status) { case PCMK_LRM_OP_CANCELLED: - /* do nothing?? */ - pe_err("Don't know what to do for cancelled ops yet"); + // Should never happen + pe_err("Resource history contains cancellation '%s' " + "(%s of %s on %s at %s)", + ID(xml_op), task, rsc->id, node->details->uname, + last_change_str(xml_op)); break; case PCMK_LRM_OP_PENDING: - if (safe_str_eq(task, CRMD_ACTION_START)) { + if (!strcmp(task, CRMD_ACTION_START)) { set_bit(rsc->flags, pe_rsc_start_pending); set_active(rsc); - } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) { + } else if (!strcmp(task, CRMD_ACTION_PROMOTE)) { rsc->role = RSC_ROLE_MASTER; - } else if (safe_str_eq(task, CRMD_ACTION_MIGRATE) && node->details->unclean) { + } else if (!strcmp(task, CRMD_ACTION_MIGRATE) && node->details->unclean) { /* If a pending migrate_to action is out on a unclean node, * we have to force the stop action on the target. 
*/ const char *migrate_target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET); node_t *target = pe_find_node(data_set->nodes, migrate_target); if (target) { stop_action(rsc, target, FALSE); } } if (rsc->pending_task == NULL) { - if (safe_str_eq(task, CRMD_ACTION_STATUS) && (interval_ms == 0)) { + if ((interval_ms != 0) || strcmp(task, CRMD_ACTION_STATUS)) { + rsc->pending_task = strdup(task); + rsc->pending_node = node; + } else { /* Pending probes are not printed, even if pending * operations are requested. If someone ever requests that - * behavior, uncomment this and the corresponding part of + * behavior, enable the below and the corresponding part of * native.c:native_pending_task(). */ - /*rsc->pending_task = strdup("probe");*/ - /*rsc->pending_node = node;*/ - } else { - rsc->pending_task = strdup(task); +#if 0 + rsc->pending_task = strdup("probe"); rsc->pending_node = node; +#endif } } break; case PCMK_LRM_OP_DONE: - pe_rsc_trace(rsc, "%s/%s completed on %s", rsc->id, task, node->details->uname); + pe_rsc_trace(rsc, "%s of %s on %s completed at %s " CRM_XS " id=%s", + task, rsc->id, node->details->uname, + last_change_str(xml_op), ID(xml_op)); update_resource_state(rsc, node, xml_op, task, rc, *last_failure, on_fail, data_set); break; case PCMK_LRM_OP_NOT_INSTALLED: failure_strategy = get_action_on_fail(rsc, task_key, task, data_set); if (failure_strategy == action_fail_ignore) { - crm_warn("Cannot ignore failed %s (status=%d, rc=%d) on %s: " - "Resource agent doesn't exist", - task_key, status, rc, node->details->uname); + crm_warn("Cannot ignore failed %s of %s on %s: " + "Resource agent doesn't exist " + CRM_XS " status=%d rc=%d id=%s", + task, rsc->id, node->details->uname, status, rc, + ID(xml_op)); /* Also for printing it as "FAILED" by marking it as pe_rsc_failed later */ *on_fail = action_fail_migrate; } resource_location(parent, node, -INFINITY, "hard-error", data_set); unpack_rsc_op_failure(rsc, node, rc, xml_op, last_failure, on_fail, data_set); break; case PCMK_LRM_OP_NOT_CONNECTED: if (pe__is_guest_or_remote_node(node) && is_set(node->details->remote_rsc->flags, pe_rsc_managed)) { /* We should never get into a situation where a managed remote * connection resource is considered OK but a resource action * behind the connection gets a "not connected" status. But as a * fail-safe in case a bug or unusual circumstances do lead to * that, ensure the remote connection is considered failed. */ set_bit(node->details->remote_rsc->flags, pe_rsc_failed); } // fall through case PCMK_LRM_OP_ERROR: case PCMK_LRM_OP_ERROR_HARD: case PCMK_LRM_OP_ERROR_FATAL: case PCMK_LRM_OP_TIMEOUT: case PCMK_LRM_OP_NOTSUPPORTED: case PCMK_LRM_OP_INVALID: failure_strategy = get_action_on_fail(rsc, task_key, task, data_set); if ((failure_strategy == action_fail_ignore) || (failure_strategy == action_fail_restart_container - && safe_str_eq(task, CRMD_ACTION_STOP))) { + && !strcmp(task, CRMD_ACTION_STOP))) { - crm_warn("Pretending the failure of %s (rc=%d) on %s succeeded", - task_key, rc, node->details->uname); + crm_warn("Pretending failed %s (%s%s%s) of %s on %s at %s " + "succeeded " CRM_XS " rc=%d id=%s", + task, services_ocf_exitcode_str(rc), + (*exit_reason? 
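/* [Editor's note -- illustrative standalone sketch, not part of this patch.]
 * Several log messages in this function append the executor's exit reason
 * only when one was given, via the "(%s%s%s)" format paired with
 * (*exit_reason? ": " : "").  The sketch below shows the same trick with
 * snprintf(); the function name and buffer handling are arbitrary.
 */
#include <stdio.h>

static void
sketch_describe_result(char *buf, size_t len,
                       const char *exitcode_str, const char *reason)
{
    if (reason == NULL) {
        reason = "";                    /* avoid printing "(null)" */
    }
    /* yields e.g. "not running" or "not running: port 80 not listening" */
    snprintf(buf, len, "%s%s%s", exitcode_str, (*reason? ": " : ""), reason);
}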
": " : ""), exit_reason, rsc->id, + node->details->uname, last_change_str(xml_op), rc, + ID(xml_op)); update_resource_state(rsc, node, xml_op, task, target_rc, *last_failure, on_fail, data_set); crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname); set_bit(rsc->flags, pe_rsc_failure_ignored); record_failed_op(xml_op, node, rsc, data_set); if (failure_strategy == action_fail_restart_container && *on_fail <= action_fail_recover) { *on_fail = failure_strategy; } } else { unpack_rsc_op_failure(rsc, node, rc, xml_op, last_failure, on_fail, data_set); if(status == PCMK_LRM_OP_ERROR_HARD) { do_crm_log(rc != PCMK_OCF_NOT_INSTALLED?LOG_ERR:LOG_NOTICE, - "Preventing %s from re-starting on %s: operation %s failed '%s' (%d)", + "Preventing %s from restarting on %s because " + "of hard failure (%s%s%s)" CRM_XS " rc=%d id=%s", parent->id, node->details->uname, - task, services_ocf_exitcode_str(rc), rc); - + services_ocf_exitcode_str(rc), + (*exit_reason? ": " : ""), exit_reason, + rc, ID(xml_op)); resource_location(parent, node, -INFINITY, "hard-error", data_set); } else if(status == PCMK_LRM_OP_ERROR_FATAL) { - crm_err("Preventing %s from re-starting anywhere: operation %s failed '%s' (%d)", - parent->id, task, services_ocf_exitcode_str(rc), rc); - + crm_err("Preventing %s from restarting anywhere because " + "of fatal failure (%s%s%s) " CRM_XS " rc=%d id=%s", + parent->id, services_ocf_exitcode_str(rc), + (*exit_reason? ": " : ""), exit_reason, + rc, ID(xml_op)); resource_location(parent, NULL, -INFINITY, "fatal-error", data_set); } } break; } done: pe_rsc_trace(rsc, "Resource %s after %s: role=%s, next=%s", rsc->id, task, role2text(rsc->role), role2text(rsc->next_role)); } static void add_node_attrs(xmlNode *xml_obj, pe_node_t *node, bool overwrite, pe_working_set_t *data_set) { const char *cluster_name = NULL; g_hash_table_insert(node->details->attrs, strdup(CRM_ATTR_UNAME), strdup(node->details->uname)); g_hash_table_insert(node->details->attrs, strdup(CRM_ATTR_ID), strdup(node->details->id)); if (safe_str_eq(node->details->id, data_set->dc_uuid)) { data_set->dc_node = node; node->details->is_dc = TRUE; g_hash_table_insert(node->details->attrs, strdup(CRM_ATTR_IS_DC), strdup(XML_BOOLEAN_TRUE)); } else { g_hash_table_insert(node->details->attrs, strdup(CRM_ATTR_IS_DC), strdup(XML_BOOLEAN_FALSE)); } cluster_name = g_hash_table_lookup(data_set->config_hash, "cluster-name"); if (cluster_name) { g_hash_table_insert(node->details->attrs, strdup(CRM_ATTR_CLUSTER_NAME), strdup(cluster_name)); } pe__unpack_dataset_nvpairs(xml_obj, XML_TAG_ATTR_SETS, NULL, node->details->attrs, NULL, overwrite, data_set); if (pe_node_attribute_raw(node, CRM_ATTR_SITE_NAME) == NULL) { const char *site_name = pe_node_attribute_raw(node, "site-name"); if (site_name) { g_hash_table_insert(node->details->attrs, strdup(CRM_ATTR_SITE_NAME), strdup(site_name)); } else if (cluster_name) { /* Default to cluster-name if unset */ g_hash_table_insert(node->details->attrs, strdup(CRM_ATTR_SITE_NAME), strdup(cluster_name)); } } } static GListPtr extract_operations(const char *node, const char *rsc, xmlNode * rsc_entry, gboolean active_filter) { int counter = -1; int stop_index = -1; int start_index = -1; xmlNode *rsc_op = NULL; GListPtr gIter = NULL; GListPtr op_list = NULL; GListPtr sorted_op_list = NULL; /* extract operations */ op_list = NULL; sorted_op_list = NULL; for (rsc_op = __xml_first_child_element(rsc_entry); rsc_op != NULL; rsc_op = __xml_next_element(rsc_op)) { if (crm_str_eq((const char *)rsc_op->name, 
XML_LRM_TAG_RSC_OP, TRUE)) { crm_xml_add(rsc_op, "resource", rsc); crm_xml_add(rsc_op, XML_ATTR_UNAME, node); op_list = g_list_prepend(op_list, rsc_op); } } if (op_list == NULL) { /* if there are no operations, there is nothing to do */ return NULL; } sorted_op_list = g_list_sort(op_list, sort_op_by_callid); /* create active recurring operations as optional */ if (active_filter == FALSE) { return sorted_op_list; } op_list = NULL; calculate_active_ops(sorted_op_list, &start_index, &stop_index); for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) { xmlNode *rsc_op = (xmlNode *) gIter->data; counter++; if (start_index < stop_index) { crm_trace("Skipping %s: not active", ID(rsc_entry)); break; } else if (counter < start_index) { crm_trace("Skipping %s: old", ID(rsc_op)); continue; } op_list = g_list_append(op_list, rsc_op); } g_list_free(sorted_op_list); return op_list; } GListPtr find_operations(const char *rsc, const char *node, gboolean active_filter, pe_working_set_t * data_set) { GListPtr output = NULL; GListPtr intermediate = NULL; xmlNode *tmp = NULL; xmlNode *status = find_xml_node(data_set->input, XML_CIB_TAG_STATUS, TRUE); node_t *this_node = NULL; xmlNode *node_state = NULL; for (node_state = __xml_first_child_element(status); node_state != NULL; node_state = __xml_next_element(node_state)) { if (crm_str_eq((const char *)node_state->name, XML_CIB_TAG_STATE, TRUE)) { const char *uname = crm_element_value(node_state, XML_ATTR_UNAME); if (node != NULL && safe_str_neq(uname, node)) { continue; } this_node = pe_find_node(data_set->nodes, uname); if(this_node == NULL) { CRM_LOG_ASSERT(this_node != NULL); continue; } else if (pe__is_guest_or_remote_node(this_node)) { determine_remote_online_status(data_set, this_node); } else { determine_online_status(node_state, this_node, data_set); } if (this_node->details->online || is_set(data_set->flags, pe_flag_stonith_enabled)) { /* offline nodes run no resources... * unless stonith is enabled in which case we need to * make sure rsc start events happen after the stonith */ xmlNode *lrm_rsc = NULL; tmp = find_xml_node(node_state, XML_CIB_TAG_LRM, FALSE); tmp = find_xml_node(tmp, XML_LRM_TAG_RESOURCES, FALSE); for (lrm_rsc = __xml_first_child_element(tmp); lrm_rsc != NULL; lrm_rsc = __xml_next_element(lrm_rsc)) { if (crm_str_eq((const char *)lrm_rsc->name, XML_LRM_TAG_RESOURCE, TRUE)) { const char *rsc_id = crm_element_value(lrm_rsc, XML_ATTR_ID); if (rsc != NULL && safe_str_neq(rsc_id, rsc)) { continue; } intermediate = extract_operations(uname, rsc_id, lrm_rsc, active_filter); output = g_list_concat(output, intermediate); } } } } } return output; } diff --git a/tools/crm_mon_output.c b/tools/crm_mon_output.c index 9a9f3ce74b..d012ef8d5d 100644 --- a/tools/crm_mon_output.c +++ b/tools/crm_mon_output.c @@ -1,1203 +1,1205 @@ /* * Copyright 2019 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include "crm_mon.h" static char * time_t_string(time_t when) { crm_time_t *crm_when = crm_time_new(NULL); char *buf = NULL; crm_time_set_timet(crm_when, &when); buf = crm_time_as_string(crm_when, crm_time_log_date | crm_time_log_timeofday | crm_time_log_with_timezone); crm_time_free(crm_when); return buf; } static char * failed_action_string(xmlNodePtr xml_op) { const char *op_key = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); int rc = crm_parse_int(crm_element_value(xml_op, XML_LRM_ATTR_RC), "0"); int status = crm_parse_int(crm_element_value(xml_op, XML_LRM_ATTR_OPSTATUS), "0"); const char *exit_reason = crm_element_value(xml_op, XML_LRM_ATTR_EXIT_REASON); time_t last_change = 0; if (crm_element_value_epoch(xml_op, XML_RSC_OP_LAST_CHANGE, &last_change) == pcmk_ok) { char *time = time_t_string(last_change); char *buf = crm_strdup_printf("%s on %s '%s' (%d): call=%s, status='%s', exitreason='%s', last-rc-change='%s', queued=%sms, exec=%sms", op_key ? op_key : ID(xml_op), crm_element_value(xml_op, XML_ATTR_UNAME), services_ocf_exitcode_str(rc), rc, crm_element_value(xml_op, XML_LRM_ATTR_CALLID), services_lrm_status_str(status), exit_reason ? exit_reason : "none", time, crm_element_value(xml_op, XML_RSC_OP_T_QUEUE), crm_element_value(xml_op, XML_RSC_OP_T_EXEC)); free(time); return buf; } else { return crm_strdup_printf("%s on %s '%s' (%d): call=%s, status=%s, exitreason='%s'", op_key ? op_key : ID(xml_op), crm_element_value(xml_op, XML_ATTR_UNAME), services_ocf_exitcode_str(rc), rc, crm_element_value(xml_op, XML_LRM_ATTR_CALLID), services_lrm_status_str(status), exit_reason ? exit_reason : "none"); } } static char * last_changed_string(const char *last_written, const char *user, const char *client, const char *origin) { if (last_written != NULL || user != NULL || client != NULL || origin != NULL) { return crm_strdup_printf("%s%s%s%s%s%s%s", last_written ? last_written : "", user ? " by " : "", user ? user : "", client ? " via " : "", client ? client : "", origin ? " on " : "", origin ? 
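/* [Editor's note -- illustrative standalone sketch, not part of this patch.]
 * time_t_string() above goes through the crm_time API; outside Pacemaker the
 * same "epoch to readable local timestamp" conversion is usually done with
 * localtime_r() and strftime(), as in this sketch.
 */
#include <time.h>

static void
sketch_format_epoch(time_t when, char *buf, size_t len)
{
    struct tm tm;

    localtime_r(&when, &tm);
    /* e.g. "Wed Dec 11 12:00:00 2019 +0000" */
    strftime(buf, len, "%a %b %d %H:%M:%S %Y %z", &tm);
}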
origin : ""); } else { return strdup(""); } } static char * op_history_string(xmlNode *xml_op, const char *task, const char *interval_ms_s, int rc, unsigned int mon_ops) { const char *call = crm_element_value(xml_op, XML_LRM_ATTR_CALLID); char *interval_str = NULL; char *buf = NULL; if (interval_ms_s && safe_str_neq(interval_ms_s, "0")) { char *pair = pcmk_format_nvpair("interval", interval_ms_s, "ms"); interval_str = crm_strdup_printf(" %s", pair); free(pair); } if (is_set(mon_ops, mon_op_print_timing)) { char *last_change_str = NULL; char *last_run_str = NULL; char *exec_str = NULL; char *queue_str = NULL; const char *value = NULL; time_t epoch = 0; if ((crm_element_value_epoch(xml_op, XML_RSC_OP_LAST_CHANGE, &epoch) == pcmk_ok) && (epoch > 0)) { char *time = pcmk_format_named_time(XML_RSC_OP_LAST_CHANGE, epoch); last_change_str = crm_strdup_printf(" %s", time); free(time); } if ((crm_element_value_epoch(xml_op, XML_RSC_OP_LAST_RUN, &epoch) == pcmk_ok) && (epoch > 0)) { char *time = pcmk_format_named_time(XML_RSC_OP_LAST_RUN, epoch); last_run_str = crm_strdup_printf(" %s", time); free(time); } value = crm_element_value(xml_op, XML_RSC_OP_T_EXEC); if (value) { char *pair = pcmk_format_nvpair(XML_RSC_OP_T_EXEC, value, "ms"); exec_str = crm_strdup_printf(" %s", pair); free(pair); } value = crm_element_value(xml_op, XML_RSC_OP_T_QUEUE); if (value) { char *pair = pcmk_format_nvpair(XML_RSC_OP_T_QUEUE, value, "ms"); queue_str = crm_strdup_printf(" %s", pair); free(pair); } buf = crm_strdup_printf("(%s) %s:%s%s%s%s%s rc=%d (%s)", call, task, interval_str ? interval_str : "", last_change_str ? last_change_str : "", last_run_str ? last_run_str : "", exec_str ? exec_str : "", queue_str ? queue_str : "", rc, services_ocf_exitcode_str(rc)); if (last_change_str) { free(last_change_str); } if (last_run_str) { free(last_run_str); } if (exec_str) { free(exec_str); } if (queue_str) { free(queue_str); } } else { buf = crm_strdup_printf("(%s) %s:%s", call, task, interval_str ? interval_str : ""); } if (interval_str) { free(interval_str); } return buf; } static char * resource_history_string(resource_t *rsc, const char *rsc_id, gboolean all, int failcount, time_t last_failure) { char *buf = NULL; if (rsc == NULL) { buf = crm_strdup_printf("%s: orphan", rsc_id); } else if (all || failcount || last_failure > 0) { char *failcount_s = failcount > 0 ? crm_strdup_printf(" %s=%d", CRM_FAIL_COUNT_PREFIX, failcount) : strdup(""); char *lastfail_s = last_failure > 0 ? crm_strdup_printf(" %s=%s", CRM_LAST_FAILURE_PREFIX, crm_now_string(&last_failure)) : strdup(""); buf = crm_strdup_printf("%s: migration-threshold=%d%s%s", rsc_id, rsc->migration_threshold, failcount_s, lastfail_s); free(failcount_s); free(lastfail_s); } else { buf = crm_strdup_printf("%s:", rsc_id); } return buf; } static int ban_html(pcmk__output_t *out, va_list args) { pe_node_t *pe_node = va_arg(args, pe_node_t *); pe__location_t *location = va_arg(args, pe__location_t *); unsigned int mon_ops = va_arg(args, unsigned int); char *node_name = get_node_display_name(pe_node, mon_ops); char *buf = crm_strdup_printf("%s\tprevents %s from running %son %s", location->id, location->rsc_lh->id, location->role_filter == RSC_ROLE_MASTER ? 
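/* [Editor's note -- illustrative standalone sketch, not part of this patch.]
 * last_changed_string() above assembles "TIME by USER via CLIENT on ORIGIN"
 * while silently dropping any part that is NULL, by pairing each value with
 * its own prefix in a single format string.  A standalone version of the
 * same idiom:
 */
#include <stdio.h>

static void
sketch_last_changed(char *buf, size_t len, const char *when,
                    const char *user, const char *client, const char *origin)
{
    snprintf(buf, len, "%s%s%s%s%s%s%s",
             (when? when : ""),
             (user? " by " : ""), (user? user : ""),
             (client? " via " : ""), (client? client : ""),
             (origin? " on " : ""), (origin? origin : ""));
}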
"as Master " : "", node_name); pcmk__output_create_html_node(out, "li", NULL, NULL, buf); free(node_name); free(buf); return 0; } static int ban_text(pcmk__output_t *out, va_list args) { pe_node_t *pe_node = va_arg(args, pe_node_t *); pe__location_t *location = va_arg(args, pe__location_t *); unsigned int mon_ops = va_arg(args, unsigned int); char *node_name = get_node_display_name(pe_node, mon_ops); out->list_item(out, NULL, "%s\tprevents %s from running %son %s", location->id, location->rsc_lh->id, location->role_filter == RSC_ROLE_MASTER ? "as Master " : "", node_name); free(node_name); return 0; } static int ban_xml(pcmk__output_t *out, va_list args) { xmlNodePtr node = pcmk__output_create_xml_node(out, "ban"); pe_node_t *pe_node = va_arg(args, pe_node_t *); pe__location_t *location = va_arg(args, pe__location_t *); char *weight_s = crm_itoa(pe_node->weight); xmlSetProp(node, (pcmkXmlStr) "id", (pcmkXmlStr) location->id); xmlSetProp(node, (pcmkXmlStr) "resource", (pcmkXmlStr) location->rsc_lh->id); xmlSetProp(node, (pcmkXmlStr) "node", (pcmkXmlStr) pe_node->details->uname); xmlSetProp(node, (pcmkXmlStr) "weight", (pcmkXmlStr) weight_s); xmlSetProp(node, (pcmkXmlStr) "master_only", (pcmkXmlStr) (location->role_filter == RSC_ROLE_MASTER ? "true" : "false")); free(weight_s); return 0; } static int cluster_counts_html(pcmk__output_t *out, va_list args) { xmlNodePtr nodes_node = pcmk__output_create_xml_node(out, "li"); xmlNodePtr resources_node = pcmk__output_create_xml_node(out, "li"); unsigned int nnodes = va_arg(args, unsigned int); unsigned int nresources = va_arg(args, unsigned int); unsigned int ndisabled = va_arg(args, unsigned int); unsigned int nblocked = va_arg(args, unsigned int); char *nnodes_str = crm_strdup_printf("%d node%s configured", nnodes, s_if_plural(nnodes)); pcmk_create_html_node(nodes_node, "span", NULL, NULL, nnodes_str); free(nnodes_str); if (ndisabled && nblocked) { char *s = crm_strdup_printf("%d resource instance%s configured (%d ", nresources, s_if_plural(nresources), ndisabled); pcmk_create_html_node(resources_node, "span", NULL, NULL, s); free(s); pcmk_create_html_node(resources_node, "span", NULL, "bold", "DISABLED"); s = crm_strdup_printf(", %d ", nblocked); pcmk_create_html_node(resources_node, "span", NULL, NULL, s); free(s); pcmk_create_html_node(resources_node, "span", NULL, "bold", "BLOCKED"); - pcmk_create_html_node(resources_node, "span", NULL, NULL, " from starting due to failure)"); + pcmk_create_html_node(resources_node, "span", NULL, NULL, + " from further action due to failure)"); } else if (ndisabled && !nblocked) { char *s = crm_strdup_printf("%d resource instance%s configured (%d ", nresources, s_if_plural(nresources), ndisabled); pcmk_create_html_node(resources_node, "span", NULL, NULL, s); free(s); pcmk_create_html_node(resources_node, "span", NULL, "bold", "DISABLED"); pcmk_create_html_node(resources_node, "span", NULL, NULL, ")"); } else if (!ndisabled && nblocked) { char *s = crm_strdup_printf("%d resource instance%s configured (%d ", nresources, s_if_plural(nresources), nblocked); pcmk_create_html_node(resources_node, "span", NULL, NULL, s); free(s); pcmk_create_html_node(resources_node, "span", NULL, "bold", "BLOCKED"); - pcmk_create_html_node(resources_node, "span", NULL, NULL, " from starting due to failure)"); + pcmk_create_html_node(resources_node, "span", NULL, NULL, + " from further action due to failure)"); } else { char *s = crm_strdup_printf("%d resource instance%s configured", nresources, s_if_plural(nresources)); 
pcmk_create_html_node(resources_node, "span", NULL, NULL, s); free(s); } return 0; } static int cluster_counts_text(pcmk__output_t *out, va_list args) { unsigned int nnodes = va_arg(args, unsigned int); unsigned int nresources = va_arg(args, unsigned int); unsigned int ndisabled = va_arg(args, unsigned int); unsigned int nblocked = va_arg(args, unsigned int); out->list_item(out, NULL, "%d node%s configured", nnodes, s_if_plural(nnodes)); if (ndisabled && nblocked) { out->list_item(out, NULL, "%d resource instance%s configured " "(%d DISABLED, %d BLOCKED from " "further action due to failure)", nresources, s_if_plural(nresources), ndisabled, nblocked); } else if (ndisabled && !nblocked) { out->list_item(out, NULL, "%d resource instance%s configured " "(%d DISABLED)", nresources, s_if_plural(nresources), ndisabled); } else if (!ndisabled && nblocked) { out->list_item(out, NULL, "%d resource instance%s configured " "(%d BLOCKED from further action " "due to failure)", nresources, s_if_plural(nresources), nblocked); } else { out->list_item(out, NULL, "%d resource instance%s configured", nresources, s_if_plural(nresources)); } return 0; } static int cluster_counts_xml(pcmk__output_t *out, va_list args) { xmlNodePtr nodes_node = pcmk__output_create_xml_node(out, "nodes_configured"); xmlNodePtr resources_node = pcmk__output_create_xml_node(out, "resources_configured"); unsigned int nnodes = va_arg(args, unsigned int); unsigned int nresources = va_arg(args, unsigned int); unsigned int ndisabled = va_arg(args, unsigned int); unsigned int nblocked = va_arg(args, unsigned int); char *s = crm_itoa(nnodes); xmlSetProp(nodes_node, (pcmkXmlStr) "number", (pcmkXmlStr) s); free(s); s = crm_itoa(nresources); xmlSetProp(resources_node, (pcmkXmlStr) "number", (pcmkXmlStr) s); free(s); s = crm_itoa(ndisabled); xmlSetProp(resources_node, (pcmkXmlStr) "disabled", (pcmkXmlStr) s); free(s); s = crm_itoa(nblocked); xmlSetProp(resources_node, (pcmkXmlStr) "blocked", (pcmkXmlStr) s); free(s); return 0; } static int cluster_dc_html(pcmk__output_t *out, va_list args) { xmlNodePtr node = pcmk__output_create_xml_node(out, "li"); node_t *dc = va_arg(args, node_t *); const char *quorum = va_arg(args, const char *); const char *dc_version_s = va_arg(args, const char *); const char *dc_name = va_arg(args, const char *); pcmk_create_html_node(node, "span", NULL, "bold", "Current DC: "); if (dc) { if (crm_is_true(quorum)) { char *buf = crm_strdup_printf("%s (version %s) - partition with quorum", dc_name, dc_version_s ? dc_version_s : "unknown"); pcmk_create_html_node(node, "span", NULL, NULL, buf); free(buf); } else { char *buf = crm_strdup_printf("%s (version %s) - partition", dc_name, dc_version_s ? dc_version_s : "unknown"); pcmk_create_html_node(node, "span", NULL, NULL, buf); free(buf); pcmk_create_html_node(node, "span", NULL, "warning", "WITHOUT"); pcmk_create_html_node(node, "span", NULL, NULL, "quorum"); } } else { pcmk_create_html_node(node ,"span", NULL, "warning", "NONE"); } return 0; } static int cluster_dc_text(pcmk__output_t *out, va_list args) { node_t *dc = va_arg(args, node_t *); const char *quorum = va_arg(args, const char *); const char *dc_version_s = va_arg(args, const char *); const char *dc_name = va_arg(args, const char *); if (dc) { out->list_item(out, "Current DC", "%s (version %s) - partition %s quorum", dc_name, dc_version_s ? dc_version_s : "unknown", crm_is_true(quorum) ? 
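/* [Editor's note -- illustrative standalone sketch, not part of this patch.]
 * The *_xml formatters in this file (cluster_counts_xml() above, ban_xml(),
 * and so on) emit everything as element attributes via xmlSetProp(),
 * converting numbers to strings first.  This standalone libxml2 example
 * shows the same pattern; the element and attribute values are just examples.
 */
#include <stdio.h>
#include <libxml/tree.h>

int
main(void)
{
    char nnodes[16];
    xmlDocPtr doc = xmlNewDoc(BAD_CAST "1.0");
    xmlNodePtr elem = xmlNewNode(NULL, BAD_CAST "nodes_configured");

    snprintf(nnodes, sizeof(nnodes), "%d", 3);          /* numeric -> string */
    xmlSetProp(elem, BAD_CAST "number", BAD_CAST nnodes);

    xmlDocSetRootElement(doc, elem);
    xmlSaveFormatFileEnc("-", doc, "UTF-8", 1);         /* dump to stdout */
    xmlFreeDoc(doc);
    return 0;
}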
"with" : "WITHOUT"); } else { out->list_item(out, "Current DC", "NONE"); } return 0; } static int cluster_dc_xml(pcmk__output_t *out, va_list args) { xmlNodePtr node = pcmk__output_create_xml_node(out, "current_dc"); node_t *dc = va_arg(args, node_t *); const char *quorum = va_arg(args, const char *); const char *dc_version_s = va_arg(args, const char *); if (dc) { xmlSetProp(node, (pcmkXmlStr) "present", (pcmkXmlStr) "true"); xmlSetProp(node, (pcmkXmlStr) "version", (pcmkXmlStr) (dc_version_s ? dc_version_s : "")); xmlSetProp(node, (pcmkXmlStr) "name", (pcmkXmlStr) dc->details->uname); xmlSetProp(node, (pcmkXmlStr) "id", (pcmkXmlStr) dc->details->id); xmlSetProp(node, (pcmkXmlStr) "with_quorum", (pcmkXmlStr) (crm_is_true(quorum) ? "true" : "false")); } else { xmlSetProp(node, (pcmkXmlStr) "present", (pcmkXmlStr) "false"); } return 0; } static int cluster_options_html(pcmk__output_t *out, va_list args) { pe_working_set_t *data_set = va_arg(args, pe_working_set_t *); /* Kind of a hack - close the list started by print_cluster_summary so we * can put all the options in their own list, but just for HTML output. */ out->end_list(out); /* And then this list will be closed by print_cluster_summary since it * wants to close the list it created unconditionally. */ out->begin_list(out, NULL, NULL, "Config Options"); out->list_item(out, NULL, "STONITH of failed nodes %s", is_set(data_set->flags, pe_flag_stonith_enabled) ? "enabled" : "disabled"); out->list_item(out, NULL, "Cluster is %s", is_set(data_set->flags, pe_flag_symmetric_cluster) ? "symmetric" : "asymmetric"); switch (data_set->no_quorum_policy) { case no_quorum_freeze: out->list_item(out, NULL, "No Quorum policy: Freeze resources"); break; case no_quorum_stop: out->list_item(out, NULL, "No Quorum policy: Stop ALL resources"); break; case no_quorum_ignore: out->list_item(out, NULL, "No Quorum policy: Ignore"); break; case no_quorum_suicide: out->list_item(out, NULL, "No Quorum policy: Suicide"); break; } if (is_set(data_set->flags, pe_flag_maintenance_mode)) { xmlNodePtr node = pcmk__output_create_xml_node(out, "li"); pcmk_create_html_node(node, "span", NULL, "bold", "DISABLED"); pcmk_create_html_node(node, "span", NULL, NULL, " (the cluster will not attempt to start, stop, or recover services)"); } else { out->list_item(out, NULL, "Resource management enabled"); } return 0; } static int cluster_options_text(pcmk__output_t *out, va_list args) { pe_working_set_t *data_set = va_arg(args, pe_working_set_t *); if (is_set(data_set->flags, pe_flag_maintenance_mode)) { fprintf(out->dest, "\n *** Resource management is DISABLED ***"); fprintf(out->dest, "\n The cluster will not attempt to start, stop or recover services"); fprintf(out->dest, "\n"); } return 0; } static int cluster_options_xml(pcmk__output_t *out, va_list args) { xmlNodePtr node = pcmk__output_create_xml_node(out, "cluster_options"); pe_working_set_t *data_set = va_arg(args, pe_working_set_t *); xmlSetProp(node, (pcmkXmlStr) "stonith-enabled", (pcmkXmlStr) (is_set(data_set->flags, pe_flag_stonith_enabled) ? "true" : "false")); xmlSetProp(node, (pcmkXmlStr) "symmetric-cluster", (pcmkXmlStr) (is_set(data_set->flags, pe_flag_symmetric_cluster) ? 
"true" : "false")); switch (data_set->no_quorum_policy) { case no_quorum_freeze: xmlSetProp(node, (pcmkXmlStr) "no-quorum-policy", (pcmkXmlStr) "freeze"); break; case no_quorum_stop: xmlSetProp(node, (pcmkXmlStr) "no-quorum-policy", (pcmkXmlStr) "stop"); break; case no_quorum_ignore: xmlSetProp(node, (pcmkXmlStr) "no-quorum-policy", (pcmkXmlStr) "ignore"); break; case no_quorum_suicide: xmlSetProp(node, (pcmkXmlStr) "no-quorum-policy", (pcmkXmlStr) "suicide"); break; } xmlSetProp(node, (pcmkXmlStr) "maintenance-mode", (pcmkXmlStr) (is_set(data_set->flags, pe_flag_maintenance_mode) ? "true" : "false")); return 0; } static int cluster_stack_html(pcmk__output_t *out, va_list args) { xmlNodePtr node = pcmk__output_create_xml_node(out, "li"); const char *stack_s = va_arg(args, const char *); pcmk_create_html_node(node, "span", NULL, "bold", "Stack: "); pcmk_create_html_node(node, "span", NULL, NULL, stack_s); return 0; } static int cluster_stack_text(pcmk__output_t *out, va_list args) { const char *stack_s = va_arg(args, const char *); out->list_item(out, "Stack", "%s", stack_s); return 0; } static int cluster_stack_xml(pcmk__output_t *out, va_list args) { xmlNodePtr node = pcmk__output_create_xml_node(out, "stack"); const char *stack_s = va_arg(args, const char *); xmlSetProp(node, (pcmkXmlStr) "type", (pcmkXmlStr) stack_s); return 0; } static int cluster_times_html(pcmk__output_t *out, va_list args) { xmlNodePtr updated_node = pcmk__output_create_xml_node(out, "li"); xmlNodePtr changed_node = pcmk__output_create_xml_node(out, "li"); const char *last_written = va_arg(args, const char *); const char *user = va_arg(args, const char *); const char *client = va_arg(args, const char *); const char *origin = va_arg(args, const char *); char *buf = last_changed_string(last_written, user, client, origin); pcmk_create_html_node(updated_node, "span", NULL, "bold", "Last updated: "); pcmk_create_html_node(updated_node, "span", NULL, NULL, crm_now_string(NULL)); pcmk_create_html_node(changed_node, "span", NULL, "bold", "Last change: "); pcmk_create_html_node(changed_node, "span", NULL, NULL, buf); free(buf); return 0; } static int cluster_times_xml(pcmk__output_t *out, va_list args) { xmlNodePtr updated_node = pcmk__output_create_xml_node(out, "last_update"); xmlNodePtr changed_node = pcmk__output_create_xml_node(out, "last_change"); const char *last_written = va_arg(args, const char *); const char *user = va_arg(args, const char *); const char *client = va_arg(args, const char *); const char *origin = va_arg(args, const char *); xmlSetProp(updated_node, (pcmkXmlStr) "time", (pcmkXmlStr) crm_now_string(NULL)); xmlSetProp(changed_node, (pcmkXmlStr) "time", (pcmkXmlStr) (last_written ? last_written : "")); xmlSetProp(changed_node, (pcmkXmlStr) "user", (pcmkXmlStr) (user ? user : "")); xmlSetProp(changed_node, (pcmkXmlStr) "client", (pcmkXmlStr) (client ? client : "")); xmlSetProp(changed_node, (pcmkXmlStr) "origin", (pcmkXmlStr) (origin ? 
origin : "")); return 0; } static int cluster_times_text(pcmk__output_t *out, va_list args) { const char *last_written = va_arg(args, const char *); const char *user = va_arg(args, const char *); const char *client = va_arg(args, const char *); const char *origin = va_arg(args, const char *); char *buf = last_changed_string(last_written, user, client, origin); out->list_item(out, "Last updated", "%s", crm_now_string(NULL)); out->list_item(out, "Last change", " %s", buf); free(buf); return 0; } static int failed_action_console(pcmk__output_t *out, va_list args) { xmlNodePtr xml_op = va_arg(args, xmlNodePtr); char *s = failed_action_string(xml_op); curses_indented_printf(out, "%s\n", s); free(s); return 0; } static int failed_action_html(pcmk__output_t *out, va_list args) { xmlNodePtr xml_op = va_arg(args, xmlNodePtr); char *s = failed_action_string(xml_op); pcmk__output_create_html_node(out, "li", NULL, NULL, s); free(s); return 0; } static int failed_action_text(pcmk__output_t *out, va_list args) { xmlNodePtr xml_op = va_arg(args, xmlNodePtr); char *s = failed_action_string(xml_op); pcmk__indented_printf(out, "%s\n", s); free(s); return 0; } static int failed_action_xml(pcmk__output_t *out, va_list args) { xmlNodePtr xml_op = va_arg(args, xmlNodePtr); const char *op_key = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); const char *last = crm_element_value(xml_op, XML_RSC_OP_LAST_CHANGE); int rc = crm_parse_int(crm_element_value(xml_op, XML_LRM_ATTR_RC), "0"); int status = crm_parse_int(crm_element_value(xml_op, XML_LRM_ATTR_OPSTATUS), "0"); const char *exit_reason = crm_element_value(xml_op, XML_LRM_ATTR_EXIT_REASON); char *rc_s = crm_itoa(rc); char *reason_s = crm_xml_escape(exit_reason ? exit_reason : "none"); xmlNodePtr node = pcmk__output_create_xml_node(out, "failure"); xmlSetProp(node, (pcmkXmlStr) (op_key ? "op_key" : "id"), (pcmkXmlStr) (op_key ? 
op_key : "id")); xmlSetProp(node, (pcmkXmlStr) "node", (pcmkXmlStr) crm_element_value(xml_op, XML_ATTR_UNAME)); xmlSetProp(node, (pcmkXmlStr) "exitstatus", (pcmkXmlStr) services_ocf_exitcode_str(rc)); xmlSetProp(node, (pcmkXmlStr) "exitreason", (pcmkXmlStr) reason_s); xmlSetProp(node, (pcmkXmlStr) "exitcode", (pcmkXmlStr) rc_s); xmlSetProp(node, (pcmkXmlStr) "call", (pcmkXmlStr) crm_element_value(xml_op, XML_LRM_ATTR_CALLID)); xmlSetProp(node, (pcmkXmlStr) "status", (pcmkXmlStr) services_lrm_status_str(status)); if (last) { char *s = crm_itoa(crm_parse_ms(crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL_MS))); char *rc_change = time_t_string(crm_parse_int(last, "0")); xmlSetProp(node, (pcmkXmlStr) "last-rc-change", (pcmkXmlStr) rc_change); xmlSetProp(node, (pcmkXmlStr) "queued", (pcmkXmlStr) crm_element_value(xml_op, XML_RSC_OP_T_QUEUE)); xmlSetProp(node, (pcmkXmlStr) "exec", (pcmkXmlStr) crm_element_value(xml_op, XML_RSC_OP_T_EXEC)); xmlSetProp(node, (pcmkXmlStr) "interval", (pcmkXmlStr) s); xmlSetProp(node, (pcmkXmlStr) "task", (pcmkXmlStr) crm_element_value(xml_op, XML_LRM_ATTR_TASK)); free(s); free(rc_change); } free(reason_s); free(rc_s); return 0; } static int node_html(pcmk__output_t *out, va_list args) { node_t *node = va_arg(args, node_t *); unsigned int mon_ops = va_arg(args, unsigned int); gboolean full = va_arg(args, gboolean); char *node_name = get_node_display_name(node, mon_ops); char *buf = crm_strdup_printf("Node: %s", node_name); int print_opts = get_resource_display_options(mon_ops, mon_output_html); if (full) { xmlNodePtr item_node = pcmk__output_create_xml_node(out, "li"); pcmk_create_html_node(item_node, "span", NULL, NULL, buf); if (node->details->standby_onfail && node->details->online) { pcmk_create_html_node(item_node, "span", NULL, "standby", " standby (on-fail)"); } else if (node->details->standby && node->details->online) { char *s = crm_strdup_printf(" standby%s", node->details->running_rsc ? 
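/* [Editor's note -- illustrative standalone sketch, not part of this patch.]
 * node_html() picks exactly one status label per node, and the order of its
 * if/else ladder (continued just below) is what gives standby and
 * maintenance precedence over plain online/offline.  This sketch condenses
 * that precedence, ignoring the standby-on-fail and active-resources
 * refinements shown in the surrounding code.
 */
#include <stdbool.h>

static const char *
sketch_node_status(bool online, bool standby, bool maintenance)
{
    if (standby) {
        return online? "standby" : "OFFLINE (standby)";
    }
    if (maintenance) {
        return online? "maintenance" : "OFFLINE (maintenance)";
    }
    return online? "online" : "OFFLINE";
}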
" (with active resources)" : ""); pcmk_create_html_node(item_node, "span", NULL, " standby", s); free(s); } else if (node->details->standby) { pcmk_create_html_node(item_node, "span", NULL, "offline", " OFFLINE (standby)"); } else if (node->details->maintenance && node->details->online) { pcmk_create_html_node(item_node, "span", NULL, "maint", " maintenance"); } else if (node->details->maintenance) { pcmk_create_html_node(item_node, "span", NULL, "offline", " OFFLINE (maintenance)"); } else if (node->details->online) { pcmk_create_html_node(item_node, "span", NULL, "online", " online"); } else { pcmk_create_html_node(item_node, "span", NULL, "offline", " OFFLINE"); } if (is_set(mon_ops, mon_op_print_brief) && is_set(mon_ops, mon_op_group_by_node)) { out->begin_list(out, NULL, NULL, NULL); pe__rscs_brief_output(out, node->details->running_rsc, print_opts | pe_print_rsconly, FALSE); out->end_list(out); } else if (is_set(mon_ops, mon_op_group_by_node)) { GListPtr lpc2 = NULL; out->begin_list(out, NULL, NULL, NULL); for (lpc2 = node->details->running_rsc; lpc2 != NULL; lpc2 = lpc2->next) { resource_t *rsc = (resource_t *) lpc2->data; out->message(out, crm_map_element_name(rsc->xml), print_opts | pe_print_rsconly, rsc); } out->end_list(out); } } else { out->begin_list(out, NULL, NULL, "%s", buf); } free(buf); free(node_name); return 0; } static int node_text(pcmk__output_t *out, va_list args) { node_t *node = va_arg(args, node_t *); unsigned int mon_ops = va_arg(args, unsigned int); gboolean full = va_arg(args, gboolean); if (full) { const char *node_mode = va_arg(args, const char *); char *node_name = get_node_display_name(node, mon_ops); int print_opts = get_resource_display_options(mon_ops, mon_output_xml); char *buf = NULL; /* Print the node name and status */ if (pe__is_guest_node(node)) { buf = crm_strdup_printf("GuestNode %s: %s", node_name, node_mode); } else if (pe__is_remote_node(node)) { buf = crm_strdup_printf("RemoteNode %s: %s", node_name, node_mode); } else { buf = crm_strdup_printf("Node %s: %s", node_name, node_mode); } /* If we're grouping by node, print its resources */ if (is_set(mon_ops, mon_op_group_by_node)) { out->begin_list(out, NULL, NULL, "%s", buf); out->begin_list(out, NULL, NULL, "Resources"); if (is_set(mon_ops, mon_op_print_brief)) { pe__rscs_brief_output(out, node->details->running_rsc, print_opts | pe_print_rsconly, FALSE); } else { GListPtr gIter2 = NULL; for (gIter2 = node->details->running_rsc; gIter2 != NULL; gIter2 = gIter2->next) { resource_t *rsc = (resource_t *) gIter2->data; out->message(out, crm_map_element_name(rsc->xml), print_opts | pe_print_rsconly, rsc); } } out->end_list(out); out->end_list(out); } else { out->list_item(out, NULL, "%s", buf); } free(buf); free(node_name); } else { out->begin_list(out, NULL, NULL, "Node: %s", get_node_display_name(node, mon_ops)); } return 0; } static int node_xml(pcmk__output_t *out, va_list args) { node_t *node = va_arg(args, node_t *); unsigned int mon_ops G_GNUC_UNUSED = va_arg(args, unsigned int); gboolean full = va_arg(args, gboolean); if (full) { const char *node_type = "unknown"; int print_opts = get_resource_display_options(mon_ops, mon_output_xml); char *length_s = crm_itoa(g_list_length(node->details->running_rsc)); switch (node->details->type) { case node_member: node_type = "member"; break; case node_remote: node_type = "remote"; break; case node_ping: node_type = "ping"; break; } pe__name_and_nvpairs_xml(out, true, "node", 13, "name", node->details->uname, "id", node->details->id, "online", 
node->details->online ? "true" : "false", "standby", node->details->standby ? "true" : "false", "standby_onfail", node->details->standby_onfail ? "true" : "false", "maintenance", node->details->maintenance ? "true" : "false", "pending", node->details->pending ? "true" : "false", "unclean", node->details->unclean ? "true" : "false", "shutdown", node->details->shutdown ? "true" : "false", "expected_up", node->details->expected_up ? "true" : "false", "is_dc", node->details->is_dc ? "true" : "false", "resources_running", length_s, "type", node_type); if (pe__is_guest_node(node)) { xmlNodePtr xml_node = pcmk__output_xml_peek_parent(out); xmlSetProp(xml_node, (pcmkXmlStr) "id_as_resource", (pcmkXmlStr) node->details->remote_rsc->container->id); } if (is_set(mon_ops, mon_op_group_by_node)) { GListPtr lpc = NULL; for (lpc = node->details->running_rsc; lpc != NULL; lpc = lpc->next) { resource_t *rsc = (resource_t *) lpc->data; out->message(out, crm_map_element_name(rsc->xml), print_opts | pe_print_rsconly, rsc); } } free(length_s); out->end_list(out); } else { xmlNodePtr parent = pcmk__output_xml_create_parent(out, "node"); xmlSetProp(parent, (pcmkXmlStr) "name", (pcmkXmlStr) node->details->uname); } return 0; } static int node_attribute_text(pcmk__output_t *out, va_list args) { const char *name = va_arg(args, const char *); const char *value = va_arg(args, const char *); gboolean add_extra = va_arg(args, gboolean); int expected_score = va_arg(args, int); if (add_extra) { int v = crm_parse_int(value, "0"); if (v <= 0) { out->list_item(out, NULL, "%-32s\t: %-10s\t: Connectivity is lost", name, value); } else if (v < expected_score) { out->list_item(out, NULL, "%-32s\t: %-10s\t: Connectivity is degraded (Expected=%d)", name, value, expected_score); } else { out->list_item(out, NULL, "%-32s\t: %-10s", name, value); } } else { out->list_item(out, NULL, "%-32s\t: %-10s", name, value); } return 0; } static int node_attribute_html(pcmk__output_t *out, va_list args) { const char *name = va_arg(args, const char *); const char *value = va_arg(args, const char *); gboolean add_extra = va_arg(args, gboolean); int expected_score = va_arg(args, int); if (add_extra) { int v = crm_parse_int(value, "0"); char *s = crm_strdup_printf("%s: %s", name, value); xmlNodePtr item_node = pcmk__output_create_xml_node(out, "li"); pcmk_create_html_node(item_node, "span", NULL, NULL, s); free(s); if (v <= 0) { pcmk_create_html_node(item_node, "span", NULL, "bold", "(connectivity is lost)"); } else if (v < expected_score) { char *buf = crm_strdup_printf("(connectivity is degraded -- expected %d", expected_score); pcmk_create_html_node(item_node, "span", NULL, "bold", buf); free(buf); } } else { out->list_item(out, NULL, "%s: %s", name, value); } return 0; } static int node_attribute_xml(pcmk__output_t *out, va_list args) { const char *name = va_arg(args, const char *); const char *value = va_arg(args, const char *); gboolean add_extra = va_arg(args, gboolean); int expected_score = va_arg(args, int); xmlNodePtr node = pcmk__output_create_xml_node(out, "attribute"); xmlSetProp(node, (pcmkXmlStr) "name", (pcmkXmlStr) name); xmlSetProp(node, (pcmkXmlStr) "value", (pcmkXmlStr) value); if (add_extra) { char *buf = crm_itoa(expected_score); xmlSetProp(node, (pcmkXmlStr) "expected", (pcmkXmlStr) buf); free(buf); } return 0; } static int op_history_text(pcmk__output_t *out, va_list args) { xmlNode *xml_op = va_arg(args, xmlNode *); const char *task = va_arg(args, const char *); const char *interval_ms_s = va_arg(args, const char *); int 
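/* [Editor's note -- illustrative standalone sketch, not part of this patch.]
 * node_attribute_text() and node_attribute_html() above grade a connectivity
 * attribute the same way: a value of zero or less means connectivity is
 * lost, anything below the expected score means it is degraded, and
 * otherwise it is healthy.  A standalone version of that three-way check:
 */
enum sketch_conn { SKETCH_CONN_LOST, SKETCH_CONN_DEGRADED, SKETCH_CONN_OK };

static enum sketch_conn
sketch_grade_connectivity(int value, int expected_score)
{
    if (value <= 0) {
        return SKETCH_CONN_LOST;
    }
    if (value < expected_score) {
        return SKETCH_CONN_DEGRADED;
    }
    return SKETCH_CONN_OK;
}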
rc = va_arg(args, int); unsigned int mon_ops = va_arg(args, unsigned int); char *buf = op_history_string(xml_op, task, interval_ms_s, rc, mon_ops); out->list_item(out, NULL, "%s", buf); free(buf); return 0; } static int op_history_xml(pcmk__output_t *out, va_list args) { xmlNode *xml_op = va_arg(args, xmlNode *); const char *task = va_arg(args, const char *); const char *interval_ms_s = va_arg(args, const char *); int rc = va_arg(args, int); unsigned int mon_ops = va_arg(args, unsigned int); char *rc_s = NULL; xmlNodePtr node = pcmk__output_create_xml_node(out, "operation_history"); xmlSetProp(node, (pcmkXmlStr) "call", (pcmkXmlStr) crm_element_value(xml_op, XML_LRM_ATTR_CALLID)); xmlSetProp(node, (pcmkXmlStr) "task", (pcmkXmlStr) task); if (interval_ms_s && safe_str_neq(interval_ms_s, "0")) { char *s = crm_strdup_printf("%sms", interval_ms_s); xmlSetProp(node, (pcmkXmlStr) "interval", (pcmkXmlStr) s); free(s); } if (is_set(mon_ops, mon_op_print_timing)) { const char *value = NULL; value = crm_element_value(xml_op, XML_RSC_OP_LAST_CHANGE); if (value) { time_t int_value = (time_t) crm_parse_int(value, NULL); if (int_value > 0) { xmlSetProp(node, (pcmkXmlStr) XML_RSC_OP_LAST_CHANGE, (pcmkXmlStr) crm_now_string(&int_value)); } } value = crm_element_value(xml_op, XML_RSC_OP_LAST_RUN); if (value) { time_t int_value = (time_t) crm_parse_int(value, NULL); if (int_value > 0) { xmlSetProp(node, (pcmkXmlStr) XML_RSC_OP_LAST_RUN, (pcmkXmlStr) crm_now_string(&int_value)); } } value = crm_element_value(xml_op, XML_RSC_OP_T_EXEC); if (value) { char *s = crm_strdup_printf("%sms", value); xmlSetProp(node, (pcmkXmlStr) XML_RSC_OP_T_EXEC, (pcmkXmlStr) s); free(s); } value = crm_element_value(xml_op, XML_RSC_OP_T_QUEUE); if (value) { char *s = crm_strdup_printf("%sms", value); xmlSetProp(node, (pcmkXmlStr) XML_RSC_OP_T_QUEUE, (pcmkXmlStr) s); free(s); } } rc_s = crm_itoa(rc); xmlSetProp(node, (pcmkXmlStr) "rc", (pcmkXmlStr) rc_s); xmlSetProp(node, (pcmkXmlStr) "rc_text", (pcmkXmlStr) services_ocf_exitcode_str(rc)); free(rc_s); return 0; } static int resource_history_text(pcmk__output_t *out, va_list args) { resource_t *rsc = va_arg(args, resource_t *); const char *rsc_id = va_arg(args, const char *); gboolean all = va_arg(args, gboolean); int failcount = va_arg(args, int); time_t last_failure = va_arg(args, int); char *buf = resource_history_string(rsc, rsc_id, all, failcount, last_failure); out->begin_list(out, NULL, NULL, "%s", buf); free(buf); return 0; } static int resource_history_xml(pcmk__output_t *out, va_list args) { resource_t *rsc = va_arg(args, resource_t *); const char *rsc_id = va_arg(args, const char *); gboolean all = va_arg(args, gboolean); int failcount = va_arg(args, int); time_t last_failure = va_arg(args, int); xmlNodePtr node = pcmk__output_xml_create_parent(out, "resource_history"); xmlSetProp(node, (pcmkXmlStr) "id", (pcmkXmlStr) rsc_id); if (rsc == NULL) { xmlSetProp(node, (pcmkXmlStr) "orphan", (pcmkXmlStr) "true"); } else if (all || failcount || last_failure > 0) { char *migration_s = crm_itoa(rsc->migration_threshold); xmlSetProp(node, (pcmkXmlStr) "orphan", (pcmkXmlStr) "false"); xmlSetProp(node, (pcmkXmlStr) "migration-threshold", (pcmkXmlStr) migration_s); free(migration_s); if (failcount > 0) { char *s = crm_itoa(failcount); xmlSetProp(node, (pcmkXmlStr) CRM_FAIL_COUNT_PREFIX, (pcmkXmlStr) s); free(s); } if (last_failure > 0) { xmlSetProp(node, (pcmkXmlStr) CRM_LAST_FAILURE_PREFIX, (pcmkXmlStr) crm_now_string(&last_failure)); } } return 0; } static int 
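/* [Editor's note -- illustrative standalone sketch, not part of this patch.]
 * Every formatter in this file reads its arguments off a va_list in a fixed,
 * documented order, and the fmt_functions table below maps a message name
 * plus output format to the right handler.  This sketch shows the calling
 * convention with invented names.
 */
#include <stdarg.h>
#include <stdio.h>

static int
sketch_handler(va_list args)
{
    const char *name = va_arg(args, const char *);  /* fixed argument order */
    int value = va_arg(args, int);

    printf("%s=%d\n", name, value);
    return 0;
}

static int
sketch_dispatch(int (*handler)(va_list), ...)
{
    va_list args;
    int rc;

    va_start(args, handler);
    rc = handler(args);        /* handler consumes the variadic arguments */
    va_end(args);
    return rc;
}

/* usage: sketch_dispatch(sketch_handler, "nodes", 3); */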
static int
stonith_event_console(pcmk__output_t *out, va_list args) {
    stonith_history_t *event = va_arg(args, stonith_history_t *);
    int full_history = va_arg(args, int);
    gboolean later_succeeded = va_arg(args, gboolean);

    char *buf = NULL;

    buf = time_t_string(event->completed);

    switch (event->state) {
        case st_failed:
            curses_indented_printf(out,
                                   "%s of %s failed: delegate=%s, client=%s, origin=%s, %s='%s %s'\n",
                                   stonith_action_str(event->action), event->target,
                                   event->delegate ? event->delegate : "",
                                   event->client, event->origin,
                                   full_history ? "completed" : "last-failed", buf,
                                   later_succeeded ? "(a later attempt succeeded)" : "");
            break;

        case st_done:
            curses_indented_printf(out,
                                   "%s of %s successful: delegate=%s, client=%s, origin=%s, %s='%s'\n",
                                   stonith_action_str(event->action), event->target,
                                   event->delegate ? event->delegate : "",
                                   event->client, event->origin,
                                   full_history ? "completed" : "last-successful", buf);
            break;

        default:
            curses_indented_printf(out, "%s of %s pending: client=%s, origin=%s\n",
                                   stonith_action_str(event->action), event->target,
                                   event->client, event->origin);
            break;
    }

    free(buf);
    return 0;
}

static int
ticket_console(pcmk__output_t *out, va_list args) {
    ticket_t *ticket = va_arg(args, ticket_t *);

    if (ticket->last_granted > -1) {
        char *time = pcmk_format_named_time("last-granted", ticket->last_granted);

        out->list_item(out, ticket->id, "\t%s%s %s",
                       ticket->granted ? "granted" : "revoked",
                       ticket->standby ? " [standby]" : "",
                       time);
        free(time);
    } else {
        out->list_item(out, ticket->id, "\t%s%s",
                       ticket->granted ? "granted" : "revoked",
                       ticket->standby ? " [standby]" : "");
    }

    return 0;
}

static pcmk__message_entry_t fmt_functions[] = {
    { "ban", "console", ban_text },
    { "ban", "html", ban_html },
    { "ban", "text", ban_text },
    { "ban", "xml", ban_xml },
    { "bundle", "console", pe__bundle_text },
    { "clone", "console", pe__clone_text },
    { "cluster-counts", "console", cluster_counts_text },
    { "cluster-counts", "html", cluster_counts_html },
    { "cluster-counts", "text", cluster_counts_text },
    { "cluster-counts", "xml", cluster_counts_xml },
    { "cluster-dc", "console", cluster_dc_text },
    { "cluster-dc", "html", cluster_dc_html },
    { "cluster-dc", "text", cluster_dc_text },
    { "cluster-dc", "xml", cluster_dc_xml },
    { "cluster-options", "console", cluster_options_text },
    { "cluster-options", "html", cluster_options_html },
    { "cluster-options", "text", cluster_options_text },
    { "cluster-options", "xml", cluster_options_xml },
    { "cluster-stack", "console", cluster_stack_text },
    { "cluster-stack", "html", cluster_stack_html },
    { "cluster-stack", "text", cluster_stack_text },
    { "cluster-stack", "xml", cluster_stack_xml },
    { "cluster-times", "console", cluster_times_text },
    { "cluster-times", "html", cluster_times_html },
    { "cluster-times", "text", cluster_times_text },
    { "cluster-times", "xml", cluster_times_xml },
    { "failed-action", "console", failed_action_console },
    { "failed-action", "html", failed_action_html },
    { "failed-action", "text", failed_action_text },
    { "failed-action", "xml", failed_action_xml },
    { "group", "console", pe__group_text },
    { "node", "console", node_text },
    { "node", "html", node_html },
    { "node", "text", node_text },
    { "node", "xml", node_xml },
    { "node-attribute", "console", node_attribute_text },
    { "node-attribute", "html", node_attribute_html },
    { "node-attribute", "text", node_attribute_text },
    { "node-attribute", "xml", node_attribute_xml },
    { "op-history", "console", op_history_text },
    { "op-history", "html", op_history_text },
    { "op-history", "text", op_history_text },
    { "op-history", "xml", op_history_xml },
    { "primitive", "console", pe__resource_text },
    { "resource-history", "console", resource_history_text },
    { "resource-history", "html", resource_history_text },
    { "resource-history", "text", resource_history_text },
    { "resource-history", "xml", resource_history_xml },
    { "stonith-event", "console", stonith_event_console },
    { "ticket", "console", ticket_console },

    { NULL, NULL, NULL }
};

void
crm_mon_register_messages(pcmk__output_t *out) {
    pcmk__register_messages(out, fmt_functions);
}