diff --git a/cts/lab/CTStests.py b/cts/lab/CTStests.py index ceda289415..9c5929b0d3 100644 --- a/cts/lab/CTStests.py +++ b/cts/lab/CTStests.py @@ -1,907 +1,785 @@ """ Test-specific classes for Pacemaker's Cluster Test Suite (CTS) """ __copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" # # SPECIAL NOTE: # # Tests may NOT implement any cluster-manager-specific code in them. # EXTEND the ClusterManager object to provide the base capabilities # the test needs if you need to do something that the current CM classes # do not. Otherwise you screw up the whole point of the object structure # in CTS. # # Thank you. # import re import time from stat import * from pacemaker import BuildOptions from pacemaker._cts.CTS import NodeStatus from pacemaker._cts.audits import AuditResource from pacemaker._cts.tests import * from pacemaker._cts.timer import Timer AllTestClasses = [ ] AllTestClasses.append(FlipTest) AllTestClasses.append(RestartTest) AllTestClasses.append(StonithdTest) AllTestClasses.append(StartOnebyOne) AllTestClasses.append(SimulStart) AllTestClasses.append(SimulStop) AllTestClasses.append(StopOnebyOne) AllTestClasses.append(RestartOnebyOne) AllTestClasses.append(PartialStart) AllTestClasses.append(StandbyTest) AllTestClasses.append(MaintenanceMode) AllTestClasses.append(ResourceRecover) - - -class ComponentFail(CTSTest): - def __init__(self, cm): - CTSTest.__init__(self,cm) - self.name = "ComponentFail" - self._startall = SimulStartLite(cm) - self.complist = cm.Components() - self.patterns = [] - self.okerrpatterns = [] - self.is_unsafe = True - - def __call__(self, node): - '''Perform the 'ComponentFail' test. ''' - self.incr("calls") - self.patterns = [] - self.okerrpatterns = [] - - # start all nodes - ret = self._startall(None) - if not ret: - return self.failure("Setup failed") - - if not self._cm.cluster_stable(self._env["StableTime"]): - return self.failure("Setup failed - unstable") - - node_is_dc = self._cm.is_node_dc(node, None) - - # select a component to kill - chosen = self._env.random_gen.choice(self.complist) - while chosen.dc_only and node_is_dc == 0: - chosen = self._env.random_gen.choice(self.complist) - - self.debug("...component %s (dc=%d)" % (chosen.name, node_is_dc)) - self.incr(chosen.name) - - if chosen.name != "corosync": - self.patterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name)) - self.patterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name)) - - self.patterns.extend(chosen.pats) - if node_is_dc: - self.patterns.extend(chosen.dc_pats) - - # @TODO this should be a flag in the Component - if chosen.name in [ "corosync", "pacemaker-based", "pacemaker-fenced" ]: - # Ignore actions for fence devices if fencer will respawn - # (their registration will be lost, and probes will fail) - self.okerrpatterns = [ self.templates["Pat:Fencing_active"] ] - (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) - for line in lines: - if re.search("^Resource", line): - r = AuditResource(self._cm, line) - if r.rclass == "stonith": - self.okerrpatterns.append(self.templates["Pat:Fencing_recover"] % r.id) - self.okerrpatterns.append(self.templates["Pat:Fencing_probe"] % r.id) - - # supply a copy so self.patterns doesn't end up empty - tmpPats = [] - tmpPats.extend(self.patterns) - self.patterns.extend(chosen.badnews_ignore) - - # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status - stonithPats = [] - stonithPats.append(self.templates["Pat:Fencing_ok"] % node) - stonith = self.create_watch(stonithPats, 0) - stonith.set_watch() - - # set the watch for stable - watch = self.create_watch( - tmpPats, self._env["DeadTime"] + self._env["StableTime"] + self._env["StartTime"]) - watch.set_watch() - - # kill the component - chosen.kill(node) - - self.debug("Waiting for the cluster to recover") - self._cm.cluster_stable() - - self.debug("Waiting for any fenced node to come back up") - self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) - - self.debug("Waiting for the cluster to re-stabilize with all nodes") - self._cm.cluster_stable(self._env["StartTime"]) - - self.debug("Checking if %s was shot" % node) - shot = stonith.look(60) - if shot: - self.debug("Found: " + repr(shot)) - self.okerrpatterns.append(self.templates["Pat:Fencing_start"] % node) - - if not self._env["at-boot"]: - self._cm.ShouldBeStatus[node] = "down" - - # If fencing occurred, chances are many (if not all) the expected logs - # will not be sent - or will be lost when the node reboots - return self.success() - - # check for logs indicating a graceful recovery - matched = watch.look_for_all(allow_multiple_matches=True) - if watch.unmatched: - self._logger.log("Patterns not found: " + repr(watch.unmatched)) - - self.debug("Waiting for the cluster to re-stabilize with all nodes") - is_stable = self._cm.cluster_stable(self._env["StartTime"]) - - if not matched: - return self.failure("Didn't find all expected %s patterns" % chosen.name) - elif not is_stable: - return self.failure("Cluster did not become stable after killing %s" % chosen.name) - - return self.success() - - @property - def errors_to_ignore(self): - """ Return list of errors which should be ignored """ - - # Note that okerrpatterns refers to the last time we ran this test - # The good news is that this works fine for us... - self.okerrpatterns.extend(self.patterns) - return self.okerrpatterns - AllTestClasses.append(ComponentFail) class SplitBrainTest(CTSTest): '''It is used to test split-brain. when the path between the two nodes break check the two nodes both take over the resource''' def __init__(self,cm): CTSTest.__init__(self,cm) self.name = "SplitBrain" self._start = StartTest(cm) self._startall = SimulStartLite(cm) self.is_experimental = True def isolate_partition(self, partition): other_nodes = [] other_nodes.extend(self._env["nodes"]) for node in partition: try: other_nodes.remove(node) except ValueError: self._logger.log("Node "+node+" not in " + repr(self._env["nodes"]) + " from " +repr(partition)) if len(other_nodes) == 0: return 1 self.debug("Creating partition: " + repr(partition)) self.debug("Everyone else: " + repr(other_nodes)) for node in partition: if not self._cm.isolate_node(node, other_nodes): self._logger.log("Could not isolate %s" % node) return 0 return 1 def heal_partition(self, partition): other_nodes = [] other_nodes.extend(self._env["nodes"]) for node in partition: try: other_nodes.remove(node) except ValueError: self._logger.log("Node "+node+" not in " + repr(self._env["nodes"])) if len(other_nodes) == 0: return 1 self.debug("Healing partition: " + repr(partition)) self.debug("Everyone else: " + repr(other_nodes)) for node in partition: self._cm.unisolate_node(node, other_nodes) def __call__(self, node): '''Perform split-brain test''' self.incr("calls") self.passed = True partitions = {} ret = self._startall(None) if not ret: return self.failure("Setup failed") while 1: # Retry until we get multiple partitions partitions = {} p_max = len(self._env["nodes"]) for node in self._env["nodes"]: p = self._env.random_gen.randint(1, p_max) if not p in partitions: partitions[p] = [] partitions[p].append(node) p_max = len(list(partitions.keys())) if p_max > 1: break # else, try again self.debug("Created %d partitions" % p_max) for key in list(partitions.keys()): self.debug("Partition["+str(key)+"]:\t"+repr(partitions[key])) # Disabling STONITH to reduce test complexity for now self._rsh(node, "crm_attribute -V -n stonith-enabled -v false") for key in list(partitions.keys()): self.isolate_partition(partitions[key]) count = 30 while count > 0: if len(self._cm.find_partitions()) != p_max: time.sleep(10) else: break else: self.failure("Expected partitions were not created") # Target number of partitions formed - wait for stability if not self._cm.cluster_stable(): self.failure("Partitioned cluster not stable") # Now audit the cluster state self._cm.partitions_expected = p_max if not self.audit(): self.failure("Audits failed") self._cm.partitions_expected = 1 # And heal them again for key in list(partitions.keys()): self.heal_partition(partitions[key]) # Wait for a single partition to form count = 30 while count > 0: if len(self._cm.find_partitions()) != 1: time.sleep(10) count -= 1 else: break else: self.failure("Cluster did not reform") # Wait for it to have the right number of members count = 30 while count > 0: members = [] partitions = self._cm.find_partitions() if len(partitions) > 0: members = partitions[0].split() if len(members) != len(self._env["nodes"]): time.sleep(10) count -= 1 else: break else: self.failure("Cluster did not completely reform") # Wait up to 20 minutes - the delay is more preferable than # trying to continue with in a messed up state if not self._cm.cluster_stable(1200): self.failure("Reformed cluster not stable") if self._env["continue"]: answer = "Y" else: try: answer = input('Continue? [nY]') except EOFError as e: answer = "n" if answer and answer == "n": raise ValueError("Reformed cluster not stable") # Turn fencing back on if self._env["DoFencing"]: self._rsh(node, "crm_attribute -V -D -n stonith-enabled") self._cm.cluster_stable() if self.passed: return self.success() return self.failure("See previous errors") @property def errors_to_ignore(self): """ Return list of errors which should be ignored """ return [ r"Another DC detected:", r"(ERROR|error).*: .*Application of an update diff failed", r"pacemaker-controld.*:.*not in our membership list", r"CRIT:.*node.*returning after partition" ] def is_applicable(self): if not CTSTest.is_applicable(self): return False return len(self._env["nodes"]) > 2 AllTestClasses.append(SplitBrainTest) class Reattach(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "Reattach" self._startall = SimulStartLite(cm) self.restart1 = RestartTest(cm) self.stopall = SimulStopLite(cm) self.is_unsafe = False def _is_managed(self, node): (_, is_managed) = self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", verbose=1) is_managed = is_managed[0].strip() return is_managed == "true" def _set_unmanaged(self, node): self.debug("Disable resource management") self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false") def _set_managed(self, node): self.debug("Re-enable resource management") self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D") def setup(self, node): attempt = 0 if not self._startall(None): return None # Make sure we are really _really_ stable and that all # resources, including those that depend on transient node # attributes, are started while not self._cm.cluster_stable(double_check=True): if attempt < 5: attempt += 1 self.debug("Not stable yet, re-testing") else: self._logger.log("Cluster is not stable") return None return 1 def teardown(self, node): # Make sure 'node' is up start = StartTest(self._cm) start(node) if not self._is_managed(node): self._logger.log("Attempting to re-enable resource management on %s" % node) self._set_managed(node) self._cm.cluster_stable() if not self._is_managed(node): self._logger.log("Could not re-enable resource management") return 0 return 1 def can_run_now(self, node): """ Return True if we can meaningfully run right now""" if self._find_ocfs2_resources(node): self._logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present") return False return True def __call__(self, node): self.incr("calls") pats = [] # Conveniently, the scheduler will display this message when disabling # management, even if fencing is not enabled, so we can rely on it. managed = self.create_watch(["No fencing will be done"], 60) managed.set_watch() self._set_unmanaged(node) if not managed.look_for_all(): self._logger.log("Patterns not found: " + repr(managed.unmatched)) return self.failure("Resource management not disabled") pats = [] pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*")) pats.append(self.templates["Pat:RscOpOK"] % ("stop", ".*")) pats.append(self.templates["Pat:RscOpOK"] % ("promote", ".*")) pats.append(self.templates["Pat:RscOpOK"] % ("demote", ".*")) pats.append(self.templates["Pat:RscOpOK"] % ("migrate", ".*")) watch = self.create_watch(pats, 60, "ShutdownActivity") watch.set_watch() self.debug("Shutting down the cluster") ret = self.stopall(None) if not ret: self._set_managed(node) return self.failure("Couldn't shut down the cluster") self.debug("Bringing the cluster back up") ret = self._startall(None) time.sleep(5) # allow ping to update the CIB if not ret: self._set_managed(node) return self.failure("Couldn't restart the cluster") if self.local_badnews("ResourceActivity:", watch): self._set_managed(node) return self.failure("Resources stopped or started during cluster restart") watch = self.create_watch(pats, 60, "StartupActivity") watch.set_watch() # Re-enable resource management (and verify it happened). self._set_managed(node) self._cm.cluster_stable() if not self._is_managed(node): return self.failure("Could not re-enable resource management") # Ignore actions for STONITH resources ignore = [] (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) for line in lines: if re.search("^Resource", line): r = AuditResource(self._cm, line) if r.rclass == "stonith": self.debug("Ignoring start actions for %s" % r.id) ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id)) if self.local_badnews("ResourceActivity:", watch, ignore): return self.failure("Resources stopped or started after resource management was re-enabled") return ret @property def errors_to_ignore(self): """ Return list of errors which should be ignored """ return [ r"resource( was|s were) active at shutdown" ] def is_applicable(self): return True AllTestClasses.append(Reattach) class SpecialTest1(CTSTest): '''Set up a custom test to cause quorum failure issues for Andrew''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "SpecialTest1" self._startall = SimulStartLite(cm) self.restart1 = RestartTest(cm) self.stopall = SimulStopLite(cm) def __call__(self, node): '''Perform the 'SpecialTest1' test for Andrew. ''' self.incr("calls") # Shut down all the nodes... ret = self.stopall(None) if not ret: return self.failure("Could not stop all nodes") # Test config recovery when the other nodes come up self._rsh(node, "rm -f " + BuildOptions.CIB_DIR + "/cib*") # Start the selected node ret = self.restart1(node) if not ret: return self.failure("Could not start "+node) # Start all remaining nodes ret = self._startall(None) if not ret: return self.failure("Could not start the remaining nodes") return self.success() @property def errors_to_ignore(self): """ Return list of errors which should be ignored """ # Errors that occur as a result of the CIB being wiped return [ r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed", r"error.*: Resource start-up disabled since no STONITH resources have been defined", r"error.*: Either configure some or disable STONITH with the stonith-enabled option", r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity" ] AllTestClasses.append(SpecialTest1) class NearQuorumPointTest(CTSTest): ''' This test brings larger clusters near the quorum point (50%). In addition, it will test doing starts and stops at the same time. Here is how I think it should work: - loop over the nodes and decide randomly which will be up and which will be down Use a 50% probability for each of up/down. - figure out what to do to get into that state from the current state - in parallel, bring up those going up and bring those going down. ''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "NearQuorumPoint" def __call__(self, dummy): '''Perform the 'NearQuorumPoint' test. ''' self.incr("calls") startset = [] stopset = [] stonith = self._cm.prepare_fencing_watcher("NearQuorumPoint") #decide what to do with each node for node in self._env["nodes"]: action = self._env.random_gen.choice(["start","stop"]) #action = self._env.random_gen.choice(["start","stop","no change"]) if action == "start" : startset.append(node) elif action == "stop" : stopset.append(node) self.debug("start nodes:" + repr(startset)) self.debug("stop nodes:" + repr(stopset)) #add search patterns watchpats = [ ] for node in stopset: if self._cm.ShouldBeStatus[node] == "up": watchpats.append(self.templates["Pat:We_stopped"] % node) for node in startset: if self._cm.ShouldBeStatus[node] == "down": #watchpats.append(self.templates["Pat:NonDC_started"] % node) watchpats.append(self.templates["Pat:Local_started"] % node) else: for stopping in stopset: if self._cm.ShouldBeStatus[stopping] == "up": watchpats.append(self.templates["Pat:They_stopped"] % (node, self._cm.key_for_node(stopping))) if len(watchpats) == 0: return self.skipped() if len(startset) != 0: watchpats.append(self.templates["Pat:DC_IDLE"]) watch = self.create_watch(watchpats, self._env["DeadTime"]+10) watch.set_watch() #begin actions for node in stopset: if self._cm.ShouldBeStatus[node] == "up": self._cm.StopaCMnoBlock(node) for node in startset: if self._cm.ShouldBeStatus[node] == "down": self._cm.StartaCMnoBlock(node) #get the result if watch.look_for_all(): self._cm.cluster_stable() self._cm.fencing_cleanup("NearQuorumPoint", stonith) return self.success() self._logger.log("Warn: Patterns not found: " + repr(watch.unmatched)) #get the "bad" nodes upnodes = [] for node in stopset: if self._cm.StataCM(node) == 1: upnodes.append(node) downnodes = [] for node in startset: if self._cm.StataCM(node) == 0: downnodes.append(node) self._cm.fencing_cleanup("NearQuorumPoint", stonith) if upnodes == [] and downnodes == []: self._cm.cluster_stable() # Make sure they're completely down with no residule for node in stopset: self._rsh(node, self.templates["StopCmd"]) return self.success() if len(upnodes) > 0: self._logger.log("Warn: Unstoppable nodes: " + repr(upnodes)) if len(downnodes) > 0: self._logger.log("Warn: Unstartable nodes: " + repr(downnodes)) return self.failure() def is_applicable(self): return True AllTestClasses.append(NearQuorumPointTest) def TestList(cm, audits): result = [] for testclass in AllTestClasses: bound_test = testclass(cm) if bound_test.is_applicable(): bound_test.audits = audits result.append(bound_test) return result class RemoteLXC(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "RemoteLXC" self._start = StartTest(cm) self._startall = SimulStartLite(cm) self.num_containers = 2 self.is_container = True self.fail_string = "" def start_lxc_simple(self, node): # restore any artifacts laying around from a previous test. self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null") # generate the containers, put them in the config, add some resources to them pats = [ ] watch = self.create_watch(pats, 120) watch.set_watch() pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc1")) pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc2")) pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc-ms")) pats.append(self.templates["Pat:RscOpOK"] % ("promote", "lxc-ms")) self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -g -a -m -s -c %d &>/dev/null" % self.num_containers) with Timer(self._logger, self.name, "remoteSimpleInit"): watch.look_for_all() if watch.unmatched: self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched)) self.failed = True def cleanup_lxc_simple(self, node): pats = [ ] # if the test failed, attempt to clean up the cib and libvirt environment # as best as possible if self.failed: # restore libvirt and cib self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null") return watch = self.create_watch(pats, 120) watch.set_watch() pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container1")) pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container2")) self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -p &>/dev/null") with Timer(self._logger, self.name, "remoteSimpleCleanup"): watch.look_for_all() if watch.unmatched: self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched)) self.failed = True # cleanup libvirt self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null") def __call__(self, node): '''Perform the 'RemoteLXC' test. ''' self.incr("calls") ret = self._startall(None) if not ret: return self.failure("Setup failed, start all nodes failed.") (rc, _) = self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v &>/dev/null") if rc == 1: self.log("Environment test for lxc support failed.") return self.skipped() self.start_lxc_simple(node) self.cleanup_lxc_simple(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() @property def errors_to_ignore(self): """ Return list of errors which should be ignored """ return [ r"Updating failcount for ping", r"schedulerd.*: Recover\s+(ping|lxc-ms|container)\s+\(.*\)", # The orphaned lxc-ms resource causes an expected transition error # that is a result of the scheduler not having knowledge that the # promotable resource used to be a clone. As a result, it looks like that # resource is running in multiple locations when it shouldn't... But in # this instance we know why this error is occurring and that it is expected. r"Calculated [Tt]ransition .*pe-error", r"Resource lxc-ms .* is active on 2 nodes attempting recovery", r"Unknown operation: fail", r"VirtualDomain.*ERROR: Unable to determine emulator" ] AllTestClasses.append(RemoteLXC) class RemoteBasic(RemoteDriver): def __init__(self, cm): RemoteDriver.__init__(self, cm) self.name = "RemoteBasic" def __call__(self, node): '''Perform the 'RemoteBaremetal' test. ''' if not self.start_new_test(node): return self.failure(self.fail_string) self.test_attributes(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() AllTestClasses.append(RemoteBasic) class RemoteStonithd(RemoteDriver): def __init__(self, cm): RemoteDriver.__init__(self, cm) self.name = "RemoteStonithd" def __call__(self, node): '''Perform the 'RemoteStonithd' test. ''' if not self.start_new_test(node): return self.failure(self.fail_string) self.fail_connection(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() def is_applicable(self): if not RemoteDriver.is_applicable(self): return False if "DoFencing" in self._env: return self._env["DoFencing"] return True @property def errors_to_ignore(self): """ Return list of errors which should be ignored """ return [ r"Lost connection to Pacemaker Remote node", r"Software caused connection abort", r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor", r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*", r"schedulerd.*:\s+Recover\s+remote-.*\s+\(.*\)", r"error: Result of monitor operation for .* on remote-.*: Internal communication failure" ] + super().errors_to_ignore AllTestClasses.append(RemoteStonithd) class RemoteMigrate(RemoteDriver): def __init__(self, cm): RemoteDriver.__init__(self, cm) self.name = "RemoteMigrate" def __call__(self, node): '''Perform the 'RemoteMigrate' test. ''' if not self.start_new_test(node): return self.failure(self.fail_string) self.migrate_connection(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() def is_applicable(self): if not RemoteDriver.is_applicable(self): return 0 # This test requires at least three nodes: one to convert to a # remote node, one to host the connection originally, and one # to migrate the connection to. if len(self._env["nodes"]) < 3: return 0 return 1 AllTestClasses.append(RemoteMigrate) class RemoteRscFailure(RemoteDriver): def __init__(self, cm): RemoteDriver.__init__(self, cm) self.name = "RemoteRscFailure" def __call__(self, node): '''Perform the 'RemoteRscFailure' test. ''' if not self.start_new_test(node): return self.failure(self.fail_string) # This is an important step. We are migrating the connection # before failing the resource. This verifies that the migration # has properly maintained control over the remote-node. self.migrate_connection(node) self.fail_rsc(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() @property def errors_to_ignore(self): """ Return list of errors which should be ignored """ return [ r"schedulerd.*: Recover\s+remote-rsc\s+\(.*\)", r"Dummy.*: No process state file found" ] + super().errors_to_ignore def is_applicable(self): if not RemoteDriver.is_applicable(self): return 0 # This test requires at least three nodes: one to convert to a # remote node, one to host the connection originally, and one # to migrate the connection to. if len(self._env["nodes"]) < 3: return 0 return 1 AllTestClasses.append(RemoteRscFailure) # vim:ts=4:sw=4:et: diff --git a/python/pacemaker/_cts/tests/Makefile.am b/python/pacemaker/_cts/tests/Makefile.am index 7b4872b8f3..3fbcdecc31 100644 --- a/python/pacemaker/_cts/tests/Makefile.am +++ b/python/pacemaker/_cts/tests/Makefile.am @@ -1,31 +1,32 @@ # # Copyright 2023 the Pacemaker project contributors # # The version control history for this file may have further details. # # This source code is licensed under the GNU General Public License version 2 # or later (GPLv2+) WITHOUT ANY WARRANTY. # MAINTAINERCLEANFILES = Makefile.in pkgpythondir = $(pythondir)/$(PACKAGE)/_cts/tests pkgpython_PYTHON = __init__.py \ + componentfail.py \ ctstest.py \ fliptest.py \ maintenancemode.py \ partialstart.py \ remotedriver.py \ resourcerecover.py \ restarttest.py \ restartonebyone.py \ simulstart.py \ simulstop.py \ simulstartlite.py \ simulstoplite.py \ standbytest.py \ startonebyone.py \ starttest.py \ stonithdtest.py \ stoptest.py diff --git a/python/pacemaker/_cts/tests/__init__.py b/python/pacemaker/_cts/tests/__init__.py index b6d91906f8..d9bda20213 100644 --- a/python/pacemaker/_cts/tests/__init__.py +++ b/python/pacemaker/_cts/tests/__init__.py @@ -1,25 +1,26 @@ """ Test classes for the `pacemaker._cts` package. """ __copyright__ = "Copyright 2023 the Pacemaker project contributors" __license__ = "GNU Lesser General Public License version 2.1 or later (LGPLv2.1+)" +from pacemaker._cts.tests.componentfail import ComponentFail from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.fliptest import FlipTest from pacemaker._cts.tests.maintenancemode import MaintenanceMode from pacemaker._cts.tests.partialstart import PartialStart from pacemaker._cts.tests.restartonebyone import RestartOnebyOne from pacemaker._cts.tests.resourcerecover import ResourceRecover from pacemaker._cts.tests.restarttest import RestartTest from pacemaker._cts.tests.remotedriver import RemoteDriver from pacemaker._cts.tests.simulstart import SimulStart from pacemaker._cts.tests.simulstop import SimulStop from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.tests.simulstoplite import SimulStopLite from pacemaker._cts.tests.standbytest import StandbyTest from pacemaker._cts.tests.starttest import StartTest from pacemaker._cts.tests.startonebyone import StartOnebyOne from pacemaker._cts.tests.stonithdtest import StonithdTest from pacemaker._cts.tests.stoponebyone import StopOnebyOne from pacemaker._cts.tests.stoptest import StopTest diff --git a/python/pacemaker/_cts/tests/componentfail.py b/python/pacemaker/_cts/tests/componentfail.py new file mode 100644 index 0000000000..868fb81e2a --- /dev/null +++ b/python/pacemaker/_cts/tests/componentfail.py @@ -0,0 +1,133 @@ +""" Test-specific classes for Pacemaker's Cluster Test Suite (CTS) +""" + +__all__ = ["ComponentFail"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import re + +from pacemaker._cts.audits import AuditResource +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite + + +class ComponentFail(CTSTest): + def __init__(self, cm): + CTSTest.__init__(self,cm) + self.name = "ComponentFail" + self._startall = SimulStartLite(cm) + self.complist = cm.Components() + self.patterns = [] + self.okerrpatterns = [] + self.is_unsafe = True + + def __call__(self, node): + '''Perform the 'ComponentFail' test. ''' + self.incr("calls") + self.patterns = [] + self.okerrpatterns = [] + + # start all nodes + ret = self._startall(None) + if not ret: + return self.failure("Setup failed") + + if not self._cm.cluster_stable(self._env["StableTime"]): + return self.failure("Setup failed - unstable") + + node_is_dc = self._cm.is_node_dc(node, None) + + # select a component to kill + chosen = self._env.random_gen.choice(self.complist) + while chosen.dc_only and node_is_dc == 0: + chosen = self._env.random_gen.choice(self.complist) + + self.debug("...component %s (dc=%d)" % (chosen.name, node_is_dc)) + self.incr(chosen.name) + + if chosen.name != "corosync": + self.patterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name)) + self.patterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name)) + + self.patterns.extend(chosen.pats) + if node_is_dc: + self.patterns.extend(chosen.dc_pats) + + # @TODO this should be a flag in the Component + if chosen.name in [ "corosync", "pacemaker-based", "pacemaker-fenced" ]: + # Ignore actions for fence devices if fencer will respawn + # (their registration will be lost, and probes will fail) + self.okerrpatterns = [ self.templates["Pat:Fencing_active"] ] + (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) + for line in lines: + if re.search("^Resource", line): + r = AuditResource(self._cm, line) + if r.rclass == "stonith": + self.okerrpatterns.append(self.templates["Pat:Fencing_recover"] % r.id) + self.okerrpatterns.append(self.templates["Pat:Fencing_probe"] % r.id) + + # supply a copy so self.patterns doesn't end up empty + tmpPats = [] + tmpPats.extend(self.patterns) + self.patterns.extend(chosen.badnews_ignore) + + # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status + stonithPats = [] + stonithPats.append(self.templates["Pat:Fencing_ok"] % node) + stonith = self.create_watch(stonithPats, 0) + stonith.set_watch() + + # set the watch for stable + watch = self.create_watch( + tmpPats, self._env["DeadTime"] + self._env["StableTime"] + self._env["StartTime"]) + watch.set_watch() + + # kill the component + chosen.kill(node) + + self.debug("Waiting for the cluster to recover") + self._cm.cluster_stable() + + self.debug("Waiting for any fenced node to come back up") + self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) + + self.debug("Waiting for the cluster to re-stabilize with all nodes") + self._cm.cluster_stable(self._env["StartTime"]) + + self.debug("Checking if %s was shot" % node) + shot = stonith.look(60) + if shot: + self.debug("Found: " + repr(shot)) + self.okerrpatterns.append(self.templates["Pat:Fencing_start"] % node) + + if not self._env["at-boot"]: + self._cm.ShouldBeStatus[node] = "down" + + # If fencing occurred, chances are many (if not all) the expected logs + # will not be sent - or will be lost when the node reboots + return self.success() + + # check for logs indicating a graceful recovery + matched = watch.look_for_all(allow_multiple_matches=True) + if watch.unmatched: + self._logger.log("Patterns not found: " + repr(watch.unmatched)) + + self.debug("Waiting for the cluster to re-stabilize with all nodes") + is_stable = self._cm.cluster_stable(self._env["StartTime"]) + + if not matched: + return self.failure("Didn't find all expected %s patterns" % chosen.name) + elif not is_stable: + return self.failure("Cluster did not become stable after killing %s" % chosen.name) + + return self.success() + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + # Note that okerrpatterns refers to the last time we ran this test + # The good news is that this works fine for us... + self.okerrpatterns.extend(self.patterns) + return self.okerrpatterns