diff --git a/cts/lab/CTStests.py b/cts/lab/CTStests.py
index 9fbee2320e..7741087ea4 100644
--- a/cts/lab/CTStests.py
+++ b/cts/lab/CTStests.py
@@ -1,1223 +1,1047 @@
 """ Test-specific classes for Pacemaker's Cluster Test Suite (CTS)
 """

 __copyright__ = "Copyright 2000-2023 the Pacemaker project contributors"
 __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"

 #
 # SPECIAL NOTE:
 #
 # Tests may NOT implement any cluster-manager-specific code in them.
 # EXTEND the ClusterManager object to provide the base capabilities
 # the test needs if you need to do something that the current CM classes
 # do not. Otherwise you screw up the whole point of the object structure
 # in CTS.
 #
 # Thank you.
 #

 import re
 import time

 from stat import *

 from pacemaker import BuildOptions
 from pacemaker._cts.CTS import NodeStatus
 from pacemaker._cts.audits import AuditResource
 from pacemaker._cts.tests import *
 from pacemaker._cts.timer import Timer

 AllTestClasses = [ ]
 AllTestClasses.append(FlipTest)
 AllTestClasses.append(RestartTest)
 AllTestClasses.append(StonithdTest)
 AllTestClasses.append(StartOnebyOne)
 AllTestClasses.append(SimulStart)
 AllTestClasses.append(SimulStop)
 AllTestClasses.append(StopOnebyOne)
 AllTestClasses.append(RestartOnebyOne)
 AllTestClasses.append(PartialStart)
 AllTestClasses.append(StandbyTest)
-
-
-###################################################################
-class MaintenanceMode(CTSTest):
-###################################################################
-    def __init__(self, cm):
-        CTSTest.__init__(self,cm)
-        self.name = "MaintenanceMode"
-        self._start = StartTest(cm)
-        self._startall = SimulStartLite(cm)
-        self.max = 30
-        self.benchmark = True
-        self.action = "asyncmon"
-        self.interval = 0
-        self.rid = "maintenanceDummy"
-
-    def toggleMaintenanceMode(self, node, action):
-        pats = []
-        pats.append(self.templates["Pat:DC_IDLE"])
-
-        # fail the resource right after turning Maintenance mode on
-        # verify it is not recovered until maintenance mode is turned off
-        if action == "On":
-            pats.append(self.templates["Pat:RscOpFail"] % (self.action, self.rid))
-        else:
-            pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
-            pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid))
-
-        watch = self.create_watch(pats, 60)
-        watch.set_watch()
-
-        self.debug("Turning maintenance mode %s" % action)
-        self._rsh(node, self.templates["MaintenanceMode%s" % (action)])
-        if (action == "On"):
-            self._rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))
-
-        with Timer(self._logger, self.name, "recover%s" % action):
-            watch.look_for_all()
-
-        if watch.unmatched:
-            self.debug("Failed to find patterns when turning maintenance mode %s" % action)
-            return repr(watch.unmatched)
-
-        return ""
-
-    def insertMaintenanceDummy(self, node):
-        pats = []
-        pats.append(("%s.*" % node) + (self.templates["Pat:RscOpOK"] % ("start", self.rid)))
-
-        watch = self.create_watch(pats, 60)
-        watch.set_watch()
-
-        self._cm.AddDummyRsc(node, self.rid)
-
-        with Timer(self._logger, self.name, "addDummy"):
-            watch.look_for_all()
-
-        if watch.unmatched:
-            self.debug("Failed to find patterns when adding maintenance dummy resource")
-            return repr(watch.unmatched)
-        return ""
-
-    def removeMaintenanceDummy(self, node):
-        pats = []
-        pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
-
-        watch = self.create_watch(pats, 60)
-        watch.set_watch()
-        self._cm.RemoveDummyRsc(node, self.rid)
-
-        with Timer(self._logger, self.name, "removeDummy"):
-            watch.look_for_all()
-
-        if watch.unmatched:
-            self.debug("Failed to find patterns when removing maintenance dummy resource")
-            return repr(watch.unmatched)
-        return ""
-
-    def managedRscList(self, node):
-        rscList = []
-        (_, lines) = self._rsh(node, "crm_resource -c", verbose=1)
-        for line in lines:
-            if re.search("^Resource", line):
-                tmp = AuditResource(self._cm, line)
-                if tmp.managed:
-                    rscList.append(tmp.id)
-
-        return rscList
-
-    def verifyResources(self, node, rscList, managed):
-        managedList = list(rscList)
-        managed_str = "managed"
-        if not managed:
-            managed_str = "unmanaged"
-
-        (_, lines) = self._rsh(node, "crm_resource -c", verbose=1)
-        for line in lines:
-            if re.search("^Resource", line):
-                tmp = AuditResource(self._cm, line)
-                if managed and not tmp.managed:
-                    continue
-                elif not managed and tmp.managed:
-                    continue
-                elif managedList.count(tmp.id):
-                    managedList.remove(tmp.id)
-
-        if len(managedList) == 0:
-            self.debug("Found all %s resources on %s" % (managed_str, node))
-            return True
-
-        self._logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, managedList))
-        return False
-
-    def __call__(self, node):
-        '''Perform the 'MaintenanceMode' test. '''
-        self.incr("calls")
-        verify_managed = False
-        verify_unmanaged = False
-        failPat = ""
-
-        ret = self._startall(None)
-        if not ret:
-            return self.failure("Setup failed")
-
-        # get a list of all the managed resources. We use this list
-        # after enabling maintenance mode to verify all managed resources
-        # become un-managed. After maintenance mode is turned off, we use
-        # this list to verify all the resources become managed again.
-        managedResources = self.managedRscList(node)
-        if len(managedResources) == 0:
-            self._logger.log("No managed resources on %s" % node)
-            return self.skipped()
-
-        # insert a fake resource we can fail during maintenance mode
-        # so we can verify recovery does not take place until after maintenance
-        # mode is disabled.
-        failPat = failPat + self.insertMaintenanceDummy(node)
-
-        # toggle maintenance mode ON, then fail dummy resource.
-        failPat = failPat + self.toggleMaintenanceMode(node, "On")
-
-        # verify all the resources are now unmanaged
-        if self.verifyResources(node, managedResources, False):
-            verify_unmanaged = True
-
-        # Toggle maintenance mode OFF, verify dummy is recovered.
-        failPat = failPat + self.toggleMaintenanceMode(node, "Off")
-
-        # verify all the resources are now managed again
-        if self.verifyResources(node, managedResources, True):
-            verify_managed = True
-
-        # Remove our maintenance dummy resource.
-        failPat = failPat + self.removeMaintenanceDummy(node)
-
-        self._cm.cluster_stable()
-
-        if failPat != "":
-            return self.failure("Unmatched patterns: %s" % (failPat))
-        elif verify_unmanaged is False:
-            return self.failure("Failed to verify resources became unmanaged during maintenance mode")
-        elif verify_managed is False:
-            return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode")
-
-        return self.success()
-
-    @property
-    def errors_to_ignore(self):
-        """ Return list of errors which should be ignored """
-
-        return [ r"Updating failcount for %s" % self.rid,
-                 r"schedulerd.*: Recover\s+%s\s+\(.*\)" % self.rid,
-                 r"Unknown operation: fail",
-                 self.templates["Pat:RscOpOK"] % (self.action, self.rid),
-                 r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval) ]
-
 AllTestClasses.append(MaintenanceMode)

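# A minimal sketch of the failure-injection idiom used by both
# MaintenanceMode (above) and ResourceRecover (below). The run_on()
# helper is a hypothetical stand-in for the suite's self._rsh()
# remote-shell wrapper; the crm_resource invocation itself matches
# what the tests issue.
import subprocess

def run_on(node, cmd):
    """ Run a shell command on a cluster node, returning its exit status """
    return subprocess.call(["ssh", node, cmd])

def inject_failure(node, rid):
    # Ask the controller to fake an "asyncmon" failure of resource `rid`
    # on `node`; the tests then watch the logs for the matching
    # Pat:RscOpFail / Pat:RscOpOK entries.
    return run_on(node, "crm_resource -V -F -r %s -H %s" % (rid, node))
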
 class ResourceRecover(CTSTest):
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "ResourceRecover"
         self._start = StartTest(cm)
         self._startall = SimulStartLite(cm)
         self.max = 30
         self.rid = None
         self.rid_alt = None
         self.benchmark = True

         # these are the values used for the new LRM API call
         self.action = "asyncmon"
         self.interval = 0

     def __call__(self, node):
         '''Perform the 'ResourceRecover' test. '''
         self.incr("calls")

         ret = self._startall(None)
         if not ret:
             return self.failure("Setup failed")

         # List all resources active on the node (skip test if none)
         resourcelist = self._cm.active_resources(node)
         if len(resourcelist) == 0:
             self._logger.log("No active resources on %s" % node)
             return self.skipped()

         # Choose one resource at random
         rsc = self.choose_resource(node, resourcelist)
         if rsc is None:
             return self.failure("Could not get details of resource '%s'" % self.rid)
         if rsc.id == rsc.clone_id:
             self.debug("Failing " + rsc.id)
         else:
             self.debug("Failing " + rsc.id + " (also known as " + rsc.clone_id + ")")

         # Log patterns to watch for (failure, plus restart if managed)
         pats = []
         pats.append(self.templates["Pat:CloneOpFail"] % (self.action, rsc.id, rsc.clone_id))
         if rsc.managed:
             pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
             if rsc.unique:
                 pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid))
             else:
                 # Anonymous clones may get restarted with a different clone number
                 pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))

         # Fail resource. (Ideally, we'd fail it twice, to ensure the fail count
         # is incrementing properly, but it might restart on a different node.
         # We'd have to temporarily ban it from all other nodes and ensure the
         # migration-threshold hasn't been reached.)
         if self.fail_resource(rsc, node, pats) is None:
             return None # self.failure() already called

         return self.success()

     def choose_resource(self, node, resourcelist):
         """ Choose a random resource to target """

         self.rid = self._env.random_gen.choice(resourcelist)
         self.rid_alt = self.rid
         (_, lines) = self._rsh(node, "crm_resource -c", verbose=1)
         for line in lines:
             if line.startswith("Resource: "):
                 rsc = AuditResource(self._cm, line)
                 if rsc.id == self.rid:
                     # Handle anonymous clones that get renamed
                     self.rid = rsc.clone_id
                     return rsc
         return None

     def get_failcount(self, node):
         """ Check the fail count of targeted resource on given node """

         (rc, lines) = self._rsh(node,
                                 "crm_failcount --quiet --query --resource %s "
                                 "--operation %s --interval %d "
                                 "--node %s" % (self.rid, self.action,
                                                self.interval, node), verbose=1)
         if rc != 0 or len(lines) != 1:
             self._logger.log("crm_failcount on %s failed (%d): %s" % (node, rc,
                              " // ".join(map(str.strip, lines))))
             return -1
         try:
             failcount = int(lines[0])
         except (IndexError, ValueError):
             self._logger.log("crm_failcount output on %s unparseable: %s" % (node,
                              ' '.join(lines)))
             return -1
         return failcount

     def fail_resource(self, rsc, node, pats):
         """ Fail the targeted resource, and verify as expected """

         orig_failcount = self.get_failcount(node)

         watch = self.create_watch(pats, 60)
         watch.set_watch()

         self._rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))

         with Timer(self._logger, self.name, "recover"):
             watch.look_for_all()

         self._cm.cluster_stable()
         recovered = self._cm.ResourceLocation(self.rid)

         if watch.unmatched:
             return self.failure("Patterns not found: %s" % repr(watch.unmatched))

         elif rsc.unique and len(recovered) > 1:
             return self.failure("%s is now active on more than one node: %s"%(self.rid, repr(recovered)))

         elif len(recovered) > 0:
             self.debug("%s is running on: %s" % (self.rid, repr(recovered)))

         elif rsc.managed:
             return self.failure("%s was not recovered and is inactive" % self.rid)

         new_failcount = self.get_failcount(node)
         if new_failcount != (orig_failcount + 1):
             return self.failure("%s fail count is %d not %d" % (self.rid,
                                 new_failcount, orig_failcount + 1))

         return 0 # Anything but None is success

     @property
     def errors_to_ignore(self):
         """ Return list of errors which should be ignored """

         return [ r"Updating failcount for %s" % self.rid,
                  r"schedulerd.*: Recover\s+(%s|%s)\s+\(.*\)" % (self.rid, self.rid_alt),
                  r"Unknown operation: fail",
                  self.templates["Pat:RscOpOK"] % (self.action, self.rid),
                  r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval) ]

 AllTestClasses.append(ResourceRecover)

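# Sketch of the fail-count query that get_failcount() wraps. The
# crm_failcount options mirror the test exactly; the ssh plumbing is
# illustrative, and assumes --quiet prints just the count, which is what
# the test's parsing of a single output line expects.
import subprocess

def query_failcount(node, rid, action="asyncmon", interval=0):
    cmd = ("crm_failcount --quiet --query --resource %s "
           "--operation %s --interval %d --node %s"
           % (rid, action, interval, node))
    out = subprocess.check_output(["ssh", node, cmd], text=True)
    return int(out.strip())
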
 class ComponentFail(CTSTest):
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "ComponentFail"
         self._startall = SimulStartLite(cm)
         self.complist = cm.Components()
         self.patterns = []
         self.okerrpatterns = []
         self.is_unsafe = True

     def __call__(self, node):
         '''Perform the 'ComponentFail' test. '''
         self.incr("calls")
         self.patterns = []
         self.okerrpatterns = []

         # start all nodes
         ret = self._startall(None)
         if not ret:
             return self.failure("Setup failed")

         if not self._cm.cluster_stable(self._env["StableTime"]):
             return self.failure("Setup failed - unstable")

         node_is_dc = self._cm.is_node_dc(node, None)

         # select a component to kill
         chosen = self._env.random_gen.choice(self.complist)
         while chosen.dc_only and node_is_dc == 0:
             chosen = self._env.random_gen.choice(self.complist)

         self.debug("...component %s (dc=%d)" % (chosen.name, node_is_dc))
         self.incr(chosen.name)

         if chosen.name != "corosync":
             self.patterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name))
             self.patterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name))

         self.patterns.extend(chosen.pats)
         if node_is_dc:
             self.patterns.extend(chosen.dc_pats)

         # @TODO this should be a flag in the Component
         if chosen.name in [ "corosync", "pacemaker-based", "pacemaker-fenced" ]:
             # Ignore actions for fence devices if fencer will respawn
             # (their registration will be lost, and probes will fail)
             self.okerrpatterns = [ self.templates["Pat:Fencing_active"] ]
             (_, lines) = self._rsh(node, "crm_resource -c", verbose=1)
             for line in lines:
                 if re.search("^Resource", line):
                     r = AuditResource(self._cm, line)
                     if r.rclass == "stonith":
                         self.okerrpatterns.append(self.templates["Pat:Fencing_recover"] % r.id)
                         self.okerrpatterns.append(self.templates["Pat:Fencing_probe"] % r.id)

         # supply a copy so self.patterns doesn't end up empty
         tmpPats = []
         tmpPats.extend(self.patterns)
         self.patterns.extend(chosen.badnews_ignore)

         # Look for STONITH ops; depending on Env["at-boot"], we might need to
         # change the node's status
         stonithPats = []
         stonithPats.append(self.templates["Pat:Fencing_ok"] % node)
         stonith = self.create_watch(stonithPats, 0)
         stonith.set_watch()

         # set the watch for stable
         watch = self.create_watch(
             tmpPats, self._env["DeadTime"] + self._env["StableTime"] + self._env["StartTime"])
         watch.set_watch()

         # kill the component
         chosen.kill(node)

         self.debug("Waiting for the cluster to recover")
         self._cm.cluster_stable()

         self.debug("Waiting for any fenced node to come back up")
         self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600)

         self.debug("Waiting for the cluster to re-stabilize with all nodes")
         self._cm.cluster_stable(self._env["StartTime"])

         self.debug("Checking if %s was shot" % node)
         shot = stonith.look(60)
         if shot:
             self.debug("Found: " + repr(shot))
             self.okerrpatterns.append(self.templates["Pat:Fencing_start"] % node)

             if not self._env["at-boot"]:
                 self._cm.ShouldBeStatus[node] = "down"

             # If fencing occurred, chances are many (if not all) the expected logs
             # will not be sent - or will be lost when the node reboots
             return self.success()

         # check for logs indicating a graceful recovery
         matched = watch.look_for_all(allow_multiple_matches=True)
         if watch.unmatched:
             self._logger.log("Patterns not found: " + repr(watch.unmatched))

         self.debug("Waiting for the cluster to re-stabilize with all nodes")
         is_stable = self._cm.cluster_stable(self._env["StartTime"])

         if not matched:
             return self.failure("Didn't find all expected %s patterns" % chosen.name)
         elif not is_stable:
             return self.failure("Cluster did not become stable after killing %s" % chosen.name)

         return self.success()

     @property
     def errors_to_ignore(self):
         """ Return list of errors which should be ignored """

         # Note that okerrpatterns refers to the last time we ran this test
         # The good news is that this works fine for us...
         self.okerrpatterns.extend(self.patterns)
         return self.okerrpatterns

 AllTestClasses.append(ComponentFail)

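# Why __call__() copies self.patterns into tmpPats before extending it
# with badnews_ignore: plain assignment would only alias the list, so the
# stability watch would see the ignore patterns too. A standalone
# illustration of the pitfall:
patterns = ["a"]
alias = patterns         # same list object, not a copy
copy = []
copy.extend(patterns)    # independent copy, as the test builds tmpPats
patterns.extend(["ignored"])
assert alias == ["a", "ignored"]   # the alias grew with the original
assert copy == ["a"]               # the copy was unaffected
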
 class SplitBrainTest(CTSTest):
     '''Test split-brain handling: break the communication path between
        node partitions, and check whether both sides try to take over
        the same resources.'''
     def __init__(self,cm):
         CTSTest.__init__(self,cm)
         self.name = "SplitBrain"
         self._start = StartTest(cm)
         self._startall = SimulStartLite(cm)
         self.is_experimental = True

     def isolate_partition(self, partition):
         other_nodes = []
         other_nodes.extend(self._env["nodes"])

         for node in partition:
             try:
                 other_nodes.remove(node)
             except ValueError:
                 self._logger.log("Node "+node+" not in " + repr(self._env["nodes"]) + " from " +repr(partition))

         if len(other_nodes) == 0:
             return 1

         self.debug("Creating partition: " + repr(partition))
         self.debug("Everyone else: " + repr(other_nodes))

         for node in partition:
             if not self._cm.isolate_node(node, other_nodes):
                 self._logger.log("Could not isolate %s" % node)
                 return 0

         return 1

     def heal_partition(self, partition):
         other_nodes = []
         other_nodes.extend(self._env["nodes"])

         for node in partition:
             try:
                 other_nodes.remove(node)
             except ValueError:
                 self._logger.log("Node "+node+" not in " + repr(self._env["nodes"]))

         if len(other_nodes) == 0:
             return 1

         self.debug("Healing partition: " + repr(partition))
         self.debug("Everyone else: " + repr(other_nodes))

         for node in partition:
             self._cm.unisolate_node(node, other_nodes)

     def __call__(self, node):
         '''Perform split-brain test'''
         self.incr("calls")
         self.passed = True
         partitions = {}

         ret = self._startall(None)
         if not ret:
             return self.failure("Setup failed")

         while 1:
             # Retry until we get multiple partitions
             partitions = {}
             p_max = len(self._env["nodes"])
             for node in self._env["nodes"]:
                 p = self._env.random_gen.randint(1, p_max)
                 if not p in partitions:
                     partitions[p] = []
                 partitions[p].append(node)
             p_max = len(list(partitions.keys()))
             if p_max > 1:
                 break
             # else, try again

         self.debug("Created %d partitions" % p_max)
         for key in list(partitions.keys()):
             self.debug("Partition["+str(key)+"]:\t"+repr(partitions[key]))

         # Disabling STONITH to reduce test complexity for now
         self._rsh(node, "crm_attribute -V -n stonith-enabled -v false")

         for key in list(partitions.keys()):
             self.isolate_partition(partitions[key])

         count = 30
         while count > 0:
             if len(self._cm.find_partitions()) != p_max:
                 time.sleep(10)
                 count -= 1
             else:
                 break
         else:
             self.failure("Expected partitions were not created")

         # Target number of partitions formed - wait for stability
         if not self._cm.cluster_stable():
             self.failure("Partitioned cluster not stable")

         # Now audit the cluster state
         self._cm.partitions_expected = p_max
         if not self.audit():
             self.failure("Audits failed")
         self._cm.partitions_expected = 1

         # And heal them again
         for key in list(partitions.keys()):
             self.heal_partition(partitions[key])

         # Wait for a single partition to form
         count = 30
         while count > 0:
             if len(self._cm.find_partitions()) != 1:
                 time.sleep(10)
                 count -= 1
             else:
                 break
         else:
             self.failure("Cluster did not reform")

         # Wait for it to have the right number of members
         count = 30
         while count > 0:
             members = []

             partitions = self._cm.find_partitions()
             if len(partitions) > 0:
                 members = partitions[0].split()

             if len(members) != len(self._env["nodes"]):
                 time.sleep(10)
                 count -= 1
             else:
                 break
         else:
             self.failure("Cluster did not completely reform")

         # Wait up to 20 minutes - the delay is more preferable than
         # trying to continue in a messed-up state
         if not self._cm.cluster_stable(1200):
             self.failure("Reformed cluster not stable")
             if self._env["continue"]:
                 answer = "Y"
             else:
                 try:
                     answer = input('Continue? [nY]')
                 except EOFError:
                     answer = "n"
             if answer and answer == "n":
                 raise ValueError("Reformed cluster not stable")

         # Turn fencing back on
         if self._env["DoFencing"]:
             self._rsh(node, "crm_attribute -V -D -n stonith-enabled")

         self._cm.cluster_stable()

         if self.passed:
             return self.success()
         return self.failure("See previous errors")

     @property
     def errors_to_ignore(self):
         """ Return list of errors which should be ignored """

         return [ r"Another DC detected:",
                  r"(ERROR|error).*: .*Application of an update diff failed",
                  r"pacemaker-controld.*:.*not in our membership list",
                  r"CRIT:.*node.*returning after partition" ]

     def is_applicable(self):
         if not CTSTest.is_applicable(self):
             return False
         return len(self._env["nodes"]) > 2

 AllTestClasses.append(SplitBrainTest)

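# Standalone sketch of the partition-assignment loop in SplitBrainTest:
# every node draws a partition number in [1, N], and the outer loop
# retries until at least two partitions are non-empty.
import random

def split_nodes(nodes, rng=random):
    while True:
        partitions = {}
        for node in nodes:
            p = rng.randint(1, len(nodes))
            partitions.setdefault(p, []).append(node)
        if len(partitions) > 1:
            return partitions

# e.g. split_nodes(["n1", "n2", "n3"]) might return {1: ["n1", "n3"], 3: ["n2"]}
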
 class Reattach(CTSTest):
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "Reattach"
         self._startall = SimulStartLite(cm)
         self.restart1 = RestartTest(cm)
         self.stopall = SimulStopLite(cm)
         self.is_unsafe = False

     def _is_managed(self, node):
         (_, is_managed) = self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", verbose=1)
         is_managed = is_managed[0].strip()
         return is_managed == "true"

     def _set_unmanaged(self, node):
         self.debug("Disable resource management")
         self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false")

     def _set_managed(self, node):
         self.debug("Re-enable resource management")
         self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D")

     def setup(self, node):
         attempt = 0
         if not self._startall(None):
             return None

         # Make sure we are really _really_ stable and that all
         # resources, including those that depend on transient node
         # attributes, are started
         while not self._cm.cluster_stable(double_check=True):
             if attempt < 5:
                 attempt += 1
                 self.debug("Not stable yet, re-testing")
             else:
                 self._logger.log("Cluster is not stable")
                 return None

         return 1

     def teardown(self, node):

         # Make sure 'node' is up
         start = StartTest(self._cm)
         start(node)

         if not self._is_managed(node):
             self._logger.log("Attempting to re-enable resource management on %s" % node)
             self._set_managed(node)
             self._cm.cluster_stable()
             if not self._is_managed(node):
                 self._logger.log("Could not re-enable resource management")
                 return 0

         return 1

     def can_run_now(self, node):
         """ Return True if we can meaningfully run right now"""
         if self._find_ocfs2_resources(node):
             self._logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present")
             return False

         return True

     def __call__(self, node):
         self.incr("calls")

         pats = []
         # Conveniently, the scheduler will display this message when disabling
         # management, even if fencing is not enabled, so we can rely on it.
         managed = self.create_watch(["No fencing will be done"], 60)
         managed.set_watch()

         self._set_unmanaged(node)

         if not managed.look_for_all():
             self._logger.log("Patterns not found: " + repr(managed.unmatched))
             return self.failure("Resource management not disabled")

         pats = []
         pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))
         pats.append(self.templates["Pat:RscOpOK"] % ("stop", ".*"))
         pats.append(self.templates["Pat:RscOpOK"] % ("promote", ".*"))
         pats.append(self.templates["Pat:RscOpOK"] % ("demote", ".*"))
         pats.append(self.templates["Pat:RscOpOK"] % ("migrate", ".*"))

         watch = self.create_watch(pats, 60, "ShutdownActivity")
         watch.set_watch()

         self.debug("Shutting down the cluster")
         ret = self.stopall(None)
         if not ret:
             self._set_managed(node)
             return self.failure("Couldn't shut down the cluster")

         self.debug("Bringing the cluster back up")
         ret = self._startall(None)
         time.sleep(5) # allow ping to update the CIB
         if not ret:
             self._set_managed(node)
             return self.failure("Couldn't restart the cluster")

         if self.local_badnews("ResourceActivity:", watch):
             self._set_managed(node)
             return self.failure("Resources stopped or started during cluster restart")

         watch = self.create_watch(pats, 60, "StartupActivity")
         watch.set_watch()

         # Re-enable resource management (and verify it happened).
         self._set_managed(node)
         self._cm.cluster_stable()
         if not self._is_managed(node):
             return self.failure("Could not re-enable resource management")

         # Ignore actions for STONITH resources
         ignore = []
         (_, lines) = self._rsh(node, "crm_resource -c", verbose=1)
         for line in lines:
             if re.search("^Resource", line):
                 r = AuditResource(self._cm, line)
                 if r.rclass == "stonith":
                     self.debug("Ignoring start actions for %s" % r.id)
                     ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id))

         if self.local_badnews("ResourceActivity:", watch, ignore):
             return self.failure("Resources stopped or started after resource management was re-enabled")

         return ret

     @property
     def errors_to_ignore(self):
         """ Return list of errors which should be ignored """

         return [ r"resource( was|s were) active at shutdown" ]

     def is_applicable(self):
         return True

 AllTestClasses.append(Reattach)

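# Sketch of the rsc_defaults check that Reattach._is_managed() performs:
# query the cluster-wide is-managed default, falling back to "true" when
# it is unset. The crm_attribute options match the test; the ssh call is
# illustrative.
import subprocess

def is_managed(node):
    out = subprocess.check_output(
        ["ssh", node,
         "crm_attribute -t rsc_defaults -n is-managed -q -G -d true"],
        text=True)
    return out.strip() == "true"
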
 class SpecialTest1(CTSTest):
     '''Set up a custom test to cause quorum failure issues for Andrew'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "SpecialTest1"
         self._startall = SimulStartLite(cm)
         self.restart1 = RestartTest(cm)
         self.stopall = SimulStopLite(cm)

     def __call__(self, node):
         '''Perform the 'SpecialTest1' test for Andrew. '''
         self.incr("calls")

         # Shut down all the nodes...
         ret = self.stopall(None)
         if not ret:
             return self.failure("Could not stop all nodes")

         # Test config recovery when the other nodes come up
         self._rsh(node, "rm -f " + BuildOptions.CIB_DIR + "/cib*")

         # Start the selected node
         ret = self.restart1(node)
         if not ret:
             return self.failure("Could not start "+node)

         # Start all remaining nodes
         ret = self._startall(None)
         if not ret:
             return self.failure("Could not start the remaining nodes")

         return self.success()

     @property
     def errors_to_ignore(self):
         """ Return list of errors which should be ignored """

         # Errors that occur as a result of the CIB being wiped
         return [ r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed",
                  r"error.*: Resource start-up disabled since no STONITH resources have been defined",
                  r"error.*: Either configure some or disable STONITH with the stonith-enabled option",
                  r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity" ]

 AllTestClasses.append(SpecialTest1)


 class NearQuorumPointTest(CTSTest):
     '''
     This test brings larger clusters near the quorum point (50%).
     In addition, it will test doing starts and stops at the same time.

     Here is how I think it should work:
     - loop over the nodes and decide randomly which will be up and which
       will be down. Use a 50% probability for each of up/down.
     - figure out what to do to get into that state from the current state
     - in parallel, bring up those going up and bring those going down.
     '''

     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "NearQuorumPoint"

     def __call__(self, dummy):
         '''Perform the 'NearQuorumPoint' test. '''
         self.incr("calls")
         startset = []
         stopset = []

         stonith = self._cm.prepare_fencing_watcher("NearQuorumPoint")
         #decide what to do with each node
         for node in self._env["nodes"]:
             action = self._env.random_gen.choice(["start","stop"])
             #action = self._env.random_gen.choice(["start","stop","no change"])
             if action == "start" :
                 startset.append(node)
             elif action == "stop" :
                 stopset.append(node)

         self.debug("start nodes:" + repr(startset))
         self.debug("stop nodes:" + repr(stopset))

         #add search patterns
         watchpats = [ ]
         for node in stopset:
             if self._cm.ShouldBeStatus[node] == "up":
                 watchpats.append(self.templates["Pat:We_stopped"] % node)

         for node in startset:
             if self._cm.ShouldBeStatus[node] == "down":
                 #watchpats.append(self.templates["Pat:NonDC_started"] % node)
                 watchpats.append(self.templates["Pat:Local_started"] % node)
             else:
                 for stopping in stopset:
                     if self._cm.ShouldBeStatus[stopping] == "up":
                         watchpats.append(self.templates["Pat:They_stopped"] % (node, self._cm.key_for_node(stopping)))

         if len(watchpats) == 0:
             return self.skipped()

         if len(startset) != 0:
             watchpats.append(self.templates["Pat:DC_IDLE"])

         watch = self.create_watch(watchpats, self._env["DeadTime"]+10)
         watch.set_watch()

         #begin actions
         for node in stopset:
             if self._cm.ShouldBeStatus[node] == "up":
                 self._cm.StopaCMnoBlock(node)

         for node in startset:
             if self._cm.ShouldBeStatus[node] == "down":
                 self._cm.StartaCMnoBlock(node)

         #get the result
         if watch.look_for_all():
             self._cm.cluster_stable()
             self._cm.fencing_cleanup("NearQuorumPoint", stonith)
             return self.success()

         self._logger.log("Warn: Patterns not found: " + repr(watch.unmatched))

         #get the "bad" nodes
         upnodes = []
         for node in stopset:
             if self._cm.StataCM(node) == 1:
                 upnodes.append(node)

         downnodes = []
         for node in startset:
             if self._cm.StataCM(node) == 0:
                 downnodes.append(node)

         self._cm.fencing_cleanup("NearQuorumPoint", stonith)
         if upnodes == [] and downnodes == []:
             self._cm.cluster_stable()

             # Make sure they're completely down with no residue
             for node in stopset:
                 self._rsh(node, self.templates["StopCmd"])

             return self.success()

         if len(upnodes) > 0:
             self._logger.log("Warn: Unstoppable nodes: " + repr(upnodes))

         if len(downnodes) > 0:
             self._logger.log("Warn: Unstartable nodes: " + repr(downnodes))

         return self.failure()

     def is_applicable(self):
         return True

 AllTestClasses.append(NearQuorumPointTest)


 def TestList(cm, audits):
     result = []
     for testclass in AllTestClasses:
         bound_test = testclass(cm)
         if bound_test.is_applicable():
             bound_test.audits = audits
             result.append(bound_test)
     return result

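# Example of how a scenario might consume TestList(); `cm` and `audits`
# are placeholders for the cluster manager and audit list that the CTS
# lab environment constructs before tests run.
def tests_by_name(cm, audits):
    # TestList() binds each class to cm and keeps only applicable tests
    return { test.name: test for test in TestList(cm, audits) }
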
 class RemoteLXC(CTSTest):
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "RemoteLXC"
         self._start = StartTest(cm)
         self._startall = SimulStartLite(cm)
         self.num_containers = 2
         self.is_container = True
         self.fail_string = ""

     def start_lxc_simple(self, node):

         # restore any artifacts lying around from a previous test.
         self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")

         # generate the containers, put them in the config, add some resources to them
         pats = [ ]
         watch = self.create_watch(pats, 120)
         watch.set_watch()
         pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc1"))
         pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc2"))
         pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc-ms"))
         pats.append(self.templates["Pat:RscOpOK"] % ("promote", "lxc-ms"))

         self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -g -a -m -s -c %d &>/dev/null" % self.num_containers)

         with Timer(self._logger, self.name, "remoteSimpleInit"):
             watch.look_for_all()

         if watch.unmatched:
             self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
             self.failed = True

     def cleanup_lxc_simple(self, node):

         pats = [ ]
         # if the test failed, attempt to clean up the cib and libvirt environment
         # as best as possible
         if self.failed:
             # restore libvirt and cib
             self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
             return

         watch = self.create_watch(pats, 120)
         watch.set_watch()

         pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container1"))
         pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container2"))

         self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -p &>/dev/null")

         with Timer(self._logger, self.name, "remoteSimpleCleanup"):
             watch.look_for_all()

         if watch.unmatched:
             self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
             self.failed = True

         # cleanup libvirt
         self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")

     def __call__(self, node):
         '''Perform the 'RemoteLXC' test. '''
         self.incr("calls")

         ret = self._startall(None)
         if not ret:
             return self.failure("Setup failed: could not start all nodes")

         (rc, _) = self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v &>/dev/null")
         if rc == 1:
             self.log("Environment test for lxc support failed.")
             return self.skipped()

         self.start_lxc_simple(node)
         self.cleanup_lxc_simple(node)

         self.debug("Waiting for the cluster to recover")
         self._cm.cluster_stable()

         if self.failed:
             return self.failure(self.fail_string)

         return self.success()

     @property
     def errors_to_ignore(self):
         """ Return list of errors which should be ignored """

         return [ r"Updating failcount for ping",
                  r"schedulerd.*: Recover\s+(ping|lxc-ms|container)\s+\(.*\)",
                  # The orphaned lxc-ms resource causes an expected transition error
                  # that is a result of the scheduler not having knowledge that the
                  # promotable resource used to be a clone. As a result, it looks like that
                  # resource is running in multiple locations when it shouldn't... But in
                  # this instance we know why this error is occurring and that it is expected.
                  r"Calculated [Tt]ransition .*pe-error",
                  r"Resource lxc-ms .* is active on 2 nodes attempting recovery",
                  r"Unknown operation: fail",
                  r"VirtualDomain.*ERROR: Unable to determine emulator" ]

 AllTestClasses.append(RemoteLXC)


 class RemoteBasic(RemoteDriver):
     def __init__(self, cm):
         RemoteDriver.__init__(self, cm)
         self.name = "RemoteBasic"

     def __call__(self, node):
         '''Perform the 'RemoteBasic' test. '''
         if not self.start_new_test(node):
             return self.failure(self.fail_string)

         self.test_attributes(node)
         self.cleanup_metal(node)

         self.debug("Waiting for the cluster to recover")
         self._cm.cluster_stable()
         if self.failed:
             return self.failure(self.fail_string)

         return self.success()

 AllTestClasses.append(RemoteBasic)


 class RemoteStonithd(RemoteDriver):
     def __init__(self, cm):
         RemoteDriver.__init__(self, cm)
         self.name = "RemoteStonithd"

     def __call__(self, node):
         '''Perform the 'RemoteStonithd' test. '''
         if not self.start_new_test(node):
             return self.failure(self.fail_string)

         self.fail_connection(node)
         self.cleanup_metal(node)

         self.debug("Waiting for the cluster to recover")
         self._cm.cluster_stable()
         if self.failed:
             return self.failure(self.fail_string)

         return self.success()

     def is_applicable(self):
         if not RemoteDriver.is_applicable(self):
             return False

         if "DoFencing" in self._env:
             return self._env["DoFencing"]

         return True

     @property
     def errors_to_ignore(self):
         """ Return list of errors which should be ignored """

         return [ r"Lost connection to Pacemaker Remote node",
                  r"Software caused connection abort",
                  r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor",
                  r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*",
                  r"schedulerd.*:\s+Recover\s+remote-.*\s+\(.*\)",
                  r"error: Result of monitor operation for .* on remote-.*: Internal communication failure" ] + super().errors_to_ignore

 AllTestClasses.append(RemoteStonithd)


 class RemoteMigrate(RemoteDriver):
     def __init__(self, cm):
         RemoteDriver.__init__(self, cm)
         self.name = "RemoteMigrate"

     def __call__(self, node):
         '''Perform the 'RemoteMigrate' test. '''
         if not self.start_new_test(node):
             return self.failure(self.fail_string)

         self.migrate_connection(node)
         self.cleanup_metal(node)

         self.debug("Waiting for the cluster to recover")
         self._cm.cluster_stable()
         if self.failed:
             return self.failure(self.fail_string)

         return self.success()

     def is_applicable(self):
         if not RemoteDriver.is_applicable(self):
             return 0

         # This test requires at least three nodes: one to convert to a
         # remote node, one to host the connection originally, and one
         # to migrate the connection to.
         if len(self._env["nodes"]) < 3:
             return 0

         return 1

 AllTestClasses.append(RemoteMigrate)

 class RemoteRscFailure(RemoteDriver):
     def __init__(self, cm):
         RemoteDriver.__init__(self, cm)
         self.name = "RemoteRscFailure"

     def __call__(self, node):
         '''Perform the 'RemoteRscFailure' test. '''
         if not self.start_new_test(node):
             return self.failure(self.fail_string)

         # This is an important step. We are migrating the connection
         # before failing the resource. This verifies that the migration
         # has properly maintained control over the remote-node.
         self.migrate_connection(node)

         self.fail_rsc(node)
         self.cleanup_metal(node)

         self.debug("Waiting for the cluster to recover")
         self._cm.cluster_stable()
         if self.failed:
             return self.failure(self.fail_string)

         return self.success()

     @property
     def errors_to_ignore(self):
         """ Return list of errors which should be ignored """

         return [ r"schedulerd.*: Recover\s+remote-rsc\s+\(.*\)",
                  r"Dummy.*: No process state file found" ] + super().errors_to_ignore

     def is_applicable(self):
         if not RemoteDriver.is_applicable(self):
             return 0

         # This test requires at least three nodes: one to convert to a
         # remote node, one to host the connection originally, and one
         # to migrate the connection to.
         if len(self._env["nodes"]) < 3:
             return 0

         return 1

 AllTestClasses.append(RemoteRscFailure)

 # vim:ts=4:sw=4:et:
diff --git a/python/pacemaker/_cts/tests/Makefile.am b/python/pacemaker/_cts/tests/Makefile.am
index f12cbc46e6..a688d0c4f2 100644
--- a/python/pacemaker/_cts/tests/Makefile.am
+++ b/python/pacemaker/_cts/tests/Makefile.am
@@ -1,29 +1,30 @@
 #
 # Copyright 2023 the Pacemaker project contributors
 #
 # The version control history for this file may have further details.
 #
 # This source code is licensed under the GNU General Public License version 2
 # or later (GPLv2+) WITHOUT ANY WARRANTY.
 #

 MAINTAINERCLEANFILES = Makefile.in

 pkgpythondir = $(pythondir)/$(PACKAGE)/_cts/tests

 pkgpython_PYTHON = __init__.py          \
                    ctstest.py           \
                    fliptest.py          \
+                   maintenancemode.py   \
                    partialstart.py      \
                    remotedriver.py      \
                    restarttest.py       \
                    restartonebyone.py   \
                    simulstart.py        \
                    simulstop.py         \
                    simulstartlite.py    \
                    simulstoplite.py     \
                    standbytest.py       \
                    startonebyone.py     \
                    starttest.py         \
                    stonithdtest.py      \
                    stoptest.py
""" __copyright__ = "Copyright 2023 the Pacemaker project contributors" __license__ = "GNU Lesser General Public License version 2.1 or later (LGPLv2.1+)" from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.fliptest import FlipTest +from pacemaker._cts.tests.maintenancemode import MaintenanceMode from pacemaker._cts.tests.partialstart import PartialStart from pacemaker._cts.tests.restartonebyone import RestartOnebyOne from pacemaker._cts.tests.restarttest import RestartTest from pacemaker._cts.tests.remotedriver import RemoteDriver from pacemaker._cts.tests.simulstart import SimulStart from pacemaker._cts.tests.simulstop import SimulStop from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.tests.simulstoplite import SimulStopLite from pacemaker._cts.tests.standbytest import StandbyTest from pacemaker._cts.tests.starttest import StartTest from pacemaker._cts.tests.startonebyone import StartOnebyOne from pacemaker._cts.tests.stonithdtest import StonithdTest from pacemaker._cts.tests.stoponebyone import StopOnebyOne from pacemaker._cts.tests.stoptest import StopTest diff --git a/python/pacemaker/_cts/tests/maintenancemode.py b/python/pacemaker/_cts/tests/maintenancemode.py new file mode 100644 index 0000000000..aae39f187f --- /dev/null +++ b/python/pacemaker/_cts/tests/maintenancemode.py @@ -0,0 +1,189 @@ +""" Test-specific classes for Pacemaker's Cluster Test Suite (CTS) +""" + +__all__ = ["MaintenanceMode"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import re + +from pacemaker._cts.audits import AuditResource +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.starttest import StartTest +from pacemaker._cts.timer import Timer + + +################################################################### +class MaintenanceMode(CTSTest): +################################################################### + def __init__(self, cm): + CTSTest.__init__(self,cm) + self.name = "MaintenanceMode" + self._start = StartTest(cm) + self._startall = SimulStartLite(cm) + self.max = 30 + self.benchmark = True + self.action = "asyncmon" + self.interval = 0 + self.rid = "maintenanceDummy" + + def toggleMaintenanceMode(self, node, action): + pats = [] + pats.append(self.templates["Pat:DC_IDLE"]) + + # fail the resource right after turning Maintenance mode on + # verify it is not recovered until maintenance mode is turned off + if action == "On": + pats.append(self.templates["Pat:RscOpFail"] % (self.action, self.rid)) + else: + pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid)) + pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid)) + + watch = self.create_watch(pats, 60) + watch.set_watch() + + self.debug("Turning maintenance mode %s" % action) + self._rsh(node, self.templates["MaintenanceMode%s" % (action)]) + if (action == "On"): + self._rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node)) + + with Timer(self._logger, self.name, "recover%s" % action): + watch.look_for_all() + + if watch.unmatched: + self.debug("Failed to find patterns when turning maintenance mode %s" % action) + return repr(watch.unmatched) + + return "" + + def insertMaintenanceDummy(self, node): + pats = [] + pats.append(("%s.*" % node) + (self.templates["Pat:RscOpOK"] % ("start", self.rid))) + + watch = self.create_watch(pats, 60) + 
diff --git a/python/pacemaker/_cts/tests/maintenancemode.py b/python/pacemaker/_cts/tests/maintenancemode.py
new file mode 100644
index 0000000000..aae39f187f
--- /dev/null
+++ b/python/pacemaker/_cts/tests/maintenancemode.py
@@ -0,0 +1,189 @@
+""" Test-specific classes for Pacemaker's Cluster Test Suite (CTS)
+"""
+
+__all__ = ["MaintenanceMode"]
+__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors"
+__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
+
+import re
+
+from pacemaker._cts.audits import AuditResource
+from pacemaker._cts.tests.ctstest import CTSTest
+from pacemaker._cts.tests.simulstartlite import SimulStartLite
+from pacemaker._cts.tests.starttest import StartTest
+from pacemaker._cts.timer import Timer
+
+
+###################################################################
+class MaintenanceMode(CTSTest):
+###################################################################
+    def __init__(self, cm):
+        CTSTest.__init__(self,cm)
+        self.name = "MaintenanceMode"
+        self._start = StartTest(cm)
+        self._startall = SimulStartLite(cm)
+        self.max = 30
+        self.benchmark = True
+        self.action = "asyncmon"
+        self.interval = 0
+        self.rid = "maintenanceDummy"
+
+    def toggleMaintenanceMode(self, node, action):
+        pats = []
+        pats.append(self.templates["Pat:DC_IDLE"])
+
+        # fail the resource right after turning Maintenance mode on
+        # verify it is not recovered until maintenance mode is turned off
+        if action == "On":
+            pats.append(self.templates["Pat:RscOpFail"] % (self.action, self.rid))
+        else:
+            pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
+            pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid))
+
+        watch = self.create_watch(pats, 60)
+        watch.set_watch()
+
+        self.debug("Turning maintenance mode %s" % action)
+        self._rsh(node, self.templates["MaintenanceMode%s" % (action)])
+        if (action == "On"):
+            self._rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))
+
+        with Timer(self._logger, self.name, "recover%s" % action):
+            watch.look_for_all()
+
+        if watch.unmatched:
+            self.debug("Failed to find patterns when turning maintenance mode %s" % action)
+            return repr(watch.unmatched)
+
+        return ""
+
+    def insertMaintenanceDummy(self, node):
+        pats = []
+        pats.append(("%s.*" % node) + (self.templates["Pat:RscOpOK"] % ("start", self.rid)))
+
+        watch = self.create_watch(pats, 60)
+        watch.set_watch()
+
+        self._cm.AddDummyRsc(node, self.rid)
+
+        with Timer(self._logger, self.name, "addDummy"):
+            watch.look_for_all()
+
+        if watch.unmatched:
+            self.debug("Failed to find patterns when adding maintenance dummy resource")
+            return repr(watch.unmatched)
+        return ""
+
+    def removeMaintenanceDummy(self, node):
+        pats = []
+        pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
+
+        watch = self.create_watch(pats, 60)
+        watch.set_watch()
+        self._cm.RemoveDummyRsc(node, self.rid)
+
+        with Timer(self._logger, self.name, "removeDummy"):
+            watch.look_for_all()
+
+        if watch.unmatched:
+            self.debug("Failed to find patterns when removing maintenance dummy resource")
+            return repr(watch.unmatched)
+        return ""
+
+    def managedRscList(self, node):
+        rscList = []
+        (_, lines) = self._rsh(node, "crm_resource -c", verbose=1)
+        for line in lines:
+            if re.search("^Resource", line):
+                tmp = AuditResource(self._cm, line)
+                if tmp.managed:
+                    rscList.append(tmp.id)
+
+        return rscList
+
+    def verifyResources(self, node, rscList, managed):
+        managedList = list(rscList)
+        managed_str = "managed"
+        if not managed:
+            managed_str = "unmanaged"
+
+        (_, lines) = self._rsh(node, "crm_resource -c", verbose=1)
+        for line in lines:
+            if re.search("^Resource", line):
+                tmp = AuditResource(self._cm, line)
+                if managed and not tmp.managed:
+                    continue
+                elif not managed and tmp.managed:
+                    continue
+                elif managedList.count(tmp.id):
+                    managedList.remove(tmp.id)
+
+        if len(managedList) == 0:
+            self.debug("Found all %s resources on %s" % (managed_str, node))
+            return True
+
+        self._logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, managedList))
+        return False
+
+    def __call__(self, node):
+        '''Perform the 'MaintenanceMode' test. '''
+        self.incr("calls")
+        verify_managed = False
+        verify_unmanaged = False
+        failPat = ""
+
+        ret = self._startall(None)
+        if not ret:
+            return self.failure("Setup failed")
+
+        # get a list of all the managed resources. We use this list
+        # after enabling maintenance mode to verify all managed resources
+        # become un-managed. After maintenance mode is turned off, we use
+        # this list to verify all the resources become managed again.
+        managedResources = self.managedRscList(node)
+        if len(managedResources) == 0:
+            self._logger.log("No managed resources on %s" % node)
+            return self.skipped()
+
+        # insert a fake resource we can fail during maintenance mode
+        # so we can verify recovery does not take place until after maintenance
+        # mode is disabled.
+        failPat = failPat + self.insertMaintenanceDummy(node)
+
+        # toggle maintenance mode ON, then fail dummy resource.
+        failPat = failPat + self.toggleMaintenanceMode(node, "On")
+
+        # verify all the resources are now unmanaged
+        if self.verifyResources(node, managedResources, False):
+            verify_unmanaged = True
+
+        # Toggle maintenance mode OFF, verify dummy is recovered.
+        failPat = failPat + self.toggleMaintenanceMode(node, "Off")
+
+        # verify all the resources are now managed again
+        if self.verifyResources(node, managedResources, True):
+            verify_managed = True
+
+        # Remove our maintenance dummy resource.
+        failPat = failPat + self.removeMaintenanceDummy(node)
+
+        self._cm.cluster_stable()
+
+        if failPat != "":
+            return self.failure("Unmatched patterns: %s" % (failPat))
+        elif verify_unmanaged is False:
+            return self.failure("Failed to verify resources became unmanaged during maintenance mode")
+        elif verify_managed is False:
+            return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode")
+
+        return self.success()
+
+    @property
+    def errors_to_ignore(self):
+        """ Return list of errors which should be ignored """
+
+        return [ r"Updating failcount for %s" % self.rid,
+                 r"schedulerd.*: Recover\s+%s\s+\(.*\)" % self.rid,
+                 r"Unknown operation: fail",
+                 self.templates["Pat:RscOpOK"] % (self.action, self.rid),
+                 r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval) ]
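
# Usage is unchanged by the move: CTStests.py keeps its
# AllTestClasses.append(MaintenanceMode) registration, so scenarios still
# schedule the test as before. A direct invocation would look like this,
# with `cm` and `node` supplied by the CTS lab environment:
test = MaintenanceMode(cm)
result = test(node)   # __call__ returns self.success() or self.failure()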