diff --git a/cts/lab/CTStests.py b/cts/lab/CTStests.py index f47b1f7a37..31d2eef4a7 100644 --- a/cts/lab/CTStests.py +++ b/cts/lab/CTStests.py @@ -1,1317 +1,310 @@ """ Test-specific classes for Pacemaker's Cluster Test Suite (CTS) """ __copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" # # SPECIAL NOTE: # # Tests may NOT implement any cluster-manager-specific code in them. # EXTEND the ClusterManager object to provide the base capabilities # the test needs if you need to do something that the current CM classes # do not. Otherwise you screw up the whole point of the object structure # in CTS. # # Thank you. # import re import time from stat import * from pacemaker import BuildOptions from pacemaker._cts.CTS import NodeStatus from pacemaker._cts.audits import AuditResource from pacemaker._cts.tests import * from pacemaker._cts.timer import Timer AllTestClasses = [ ] AllTestClasses.append(FlipTest) AllTestClasses.append(RestartTest) AllTestClasses.append(StonithdTest) AllTestClasses.append(StartOnebyOne) AllTestClasses.append(SimulStart) AllTestClasses.append(SimulStop) AllTestClasses.append(StopOnebyOne) AllTestClasses.append(RestartOnebyOne) AllTestClasses.append(PartialStart) AllTestClasses.append(StandbyTest) - - -################################################################### -class MaintenanceMode(CTSTest): -################################################################### - def __init__(self, cm): - CTSTest.__init__(self,cm) - self.name = "MaintenanceMode" - self._start = StartTest(cm) - self._startall = SimulStartLite(cm) - self.max = 30 - self.benchmark = True - self.action = "asyncmon" - self.interval = 0 - self.rid = "maintenanceDummy" - - def toggleMaintenanceMode(self, node, action): - pats = [] - pats.append(self.templates["Pat:DC_IDLE"]) - - # fail the resource right after turning Maintenance mode on - # verify it is not recovered until maintenance mode is turned off - if action == "On": - pats.append(self.templates["Pat:RscOpFail"] % (self.action, self.rid)) - else: - pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid)) - pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid)) - - watch = self.create_watch(pats, 60) - watch.set_watch() - - self.debug("Turning maintenance mode %s" % action) - self._rsh(node, self.templates["MaintenanceMode%s" % (action)]) - if (action == "On"): - self._rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node)) - - with Timer(self._logger, self.name, "recover%s" % action): - watch.look_for_all() - - if watch.unmatched: - self.debug("Failed to find patterns when turning maintenance mode %s" % action) - return repr(watch.unmatched) - - return "" - - def insertMaintenanceDummy(self, node): - pats = [] - pats.append(("%s.*" % node) + (self.templates["Pat:RscOpOK"] % ("start", self.rid))) - - watch = self.create_watch(pats, 60) - watch.set_watch() - - self._cm.AddDummyRsc(node, self.rid) - - with Timer(self._logger, self.name, "addDummy"): - watch.look_for_all() - - if watch.unmatched: - self.debug("Failed to find patterns when adding maintenance dummy resource") - return repr(watch.unmatched) - return "" - - def removeMaintenanceDummy(self, node): - pats = [] - pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid)) - - watch = self.create_watch(pats, 60) - watch.set_watch() - self._cm.RemoveDummyRsc(node, self.rid) - - with Timer(self._logger, self.name, "removeDummy"): - 
watch.look_for_all() - - if watch.unmatched: - self.debug("Failed to find patterns when removing maintenance dummy resource") - return repr(watch.unmatched) - return "" - - def managedRscList(self, node): - rscList = [] - (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) - for line in lines: - if re.search("^Resource", line): - tmp = AuditResource(self._cm, line) - if tmp.managed: - rscList.append(tmp.id) - - return rscList - - def verifyResources(self, node, rscList, managed): - managedList = list(rscList) - managed_str = "managed" - if not managed: - managed_str = "unmanaged" - - (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) - for line in lines: - if re.search("^Resource", line): - tmp = AuditResource(self._cm, line) - if managed and not tmp.managed: - continue - elif not managed and tmp.managed: - continue - elif managedList.count(tmp.id): - managedList.remove(tmp.id) - - if len(managedList) == 0: - self.debug("Found all %s resources on %s" % (managed_str, node)) - return True - - self._logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, managedList)) - return False - - def __call__(self, node): - '''Perform the 'MaintenanceMode' test. ''' - self.incr("calls") - verify_managed = False - verify_unmanaged = False - failPat = "" - - ret = self._startall(None) - if not ret: - return self.failure("Setup failed") - - # get a list of all the managed resources. We use this list - # after enabling maintenance mode to verify all managed resources - # become un-managed. After maintenance mode is turned off, we use - # this list to verify all the resources become managed again. - managedResources = self.managedRscList(node) - if len(managedResources) == 0: - self._logger.log("No managed resources on %s" % node) - return self.skipped() - - # insert a fake resource we can fail during maintenance mode - # so we can verify recovery does not take place until after maintenance - # mode is disabled. - failPat = failPat + self.insertMaintenanceDummy(node) - - # toggle maintenance mode ON, then fail dummy resource. - failPat = failPat + self.toggleMaintenanceMode(node, "On") - - # verify all the resources are now unmanaged - if self.verifyResources(node, managedResources, False): - verify_unmanaged = True - - # Toggle maintenance mode OFF, verify dummy is recovered. - failPat = failPat + self.toggleMaintenanceMode(node, "Off") - - # verify all the resources are now managed again - if self.verifyResources(node, managedResources, True): - verify_managed = True - - # Remove our maintenance dummy resource. 
- failPat = failPat + self.removeMaintenanceDummy(node) - - self._cm.cluster_stable() - - if failPat != "": - return self.failure("Unmatched patterns: %s" % (failPat)) - elif verify_unmanaged is False: - return self.failure("Failed to verify resources became unmanaged during maintenance mode") - elif verify_managed is False: - return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode") - - return self.success() - - @property - def errors_to_ignore(self): - """ Return list of errors which should be ignored """ - - return [ r"Updating failcount for %s" % self.rid, - r"schedulerd.*: Recover\s+%s\s+\(.*\)" % self.rid, - r"Unknown operation: fail", - self.templates["Pat:RscOpOK"] % (self.action, self.rid), - r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval) ] - AllTestClasses.append(MaintenanceMode) - - -class ResourceRecover(CTSTest): - def __init__(self, cm): - CTSTest.__init__(self,cm) - self.name = "ResourceRecover" - self._start = StartTest(cm) - self._startall = SimulStartLite(cm) - self.max = 30 - self.rid = None - self.rid_alt = None - self.benchmark = True - - # these are the values used for the new LRM API call - self.action = "asyncmon" - self.interval = 0 - - def __call__(self, node): - '''Perform the 'ResourceRecover' test. ''' - self.incr("calls") - - ret = self._startall(None) - if not ret: - return self.failure("Setup failed") - - # List all resources active on the node (skip test if none) - resourcelist = self._cm.active_resources(node) - if len(resourcelist) == 0: - self._logger.log("No active resources on %s" % node) - return self.skipped() - - # Choose one resource at random - rsc = self.choose_resource(node, resourcelist) - if rsc is None: - return self.failure("Could not get details of resource '%s'" % self.rid) - if rsc.id == rsc.clone_id: - self.debug("Failing " + rsc.id) - else: - self.debug("Failing " + rsc.id + " (also known as " + rsc.clone_id + ")") - - # Log patterns to watch for (failure, plus restart if managed) - pats = [] - pats.append(self.templates["Pat:CloneOpFail"] % (self.action, rsc.id, rsc.clone_id)) - if rsc.managed: - pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid)) - if rsc.unique: - pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid)) - else: - # Anonymous clones may get restarted with a different clone number - pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*")) - - # Fail resource. (Ideally, we'd fail it twice, to ensure the fail count - # is incrementing properly, but it might restart on a different node. - # We'd have to temporarily ban it from all other nodes and ensure the - # migration-threshold hasn't been reached.) 
- if self.fail_resource(rsc, node, pats) is None: - return None # self.failure() already called - - return self.success() - - def choose_resource(self, node, resourcelist): - """ Choose a random resource to target """ - - self.rid = self._env.random_gen.choice(resourcelist) - self.rid_alt = self.rid - (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) - for line in lines: - if line.startswith("Resource: "): - rsc = AuditResource(self._cm, line) - if rsc.id == self.rid: - # Handle anonymous clones that get renamed - self.rid = rsc.clone_id - return rsc - return None - - def get_failcount(self, node): - """ Check the fail count of targeted resource on given node """ - - (rc, lines) = self._rsh(node, - "crm_failcount --quiet --query --resource %s " - "--operation %s --interval %d " - "--node %s" % (self.rid, self.action, - self.interval, node), verbose=1) - if rc != 0 or len(lines) != 1: - self._logger.log("crm_failcount on %s failed (%d): %s" % (node, rc, - " // ".join(map(str.strip, lines)))) - return -1 - try: - failcount = int(lines[0]) - except (IndexError, ValueError): - self._logger.log("crm_failcount output on %s unparseable: %s" % (node, - ' '.join(lines))) - return -1 - return failcount - - def fail_resource(self, rsc, node, pats): - """ Fail the targeted resource, and verify as expected """ - - orig_failcount = self.get_failcount(node) - - watch = self.create_watch(pats, 60) - watch.set_watch() - - self._rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node)) - - with Timer(self._logger, self.name, "recover"): - watch.look_for_all() - - self._cm.cluster_stable() - recovered = self._cm.ResourceLocation(self.rid) - - if watch.unmatched: - return self.failure("Patterns not found: %s" % repr(watch.unmatched)) - - elif rsc.unique and len(recovered) > 1: - return self.failure("%s is now active on more than one node: %s"%(self.rid, repr(recovered))) - - elif len(recovered) > 0: - self.debug("%s is running on: %s" % (self.rid, repr(recovered))) - - elif rsc.managed: - return self.failure("%s was not recovered and is inactive" % self.rid) - - new_failcount = self.get_failcount(node) - if new_failcount != (orig_failcount + 1): - return self.failure("%s fail count is %d not %d" % (self.rid, - new_failcount, orig_failcount + 1)) - - return 0 # Anything but None is success - - @property - def errors_to_ignore(self): - """ Return list of errors which should be ignored """ - - return [ r"Updating failcount for %s" % self.rid, - r"schedulerd.*: Recover\s+(%s|%s)\s+\(.*\)" % (self.rid, self.rid_alt), - r"Unknown operation: fail", - self.templates["Pat:RscOpOK"] % (self.action, self.rid), - r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval) ] - AllTestClasses.append(ResourceRecover) - - -class ComponentFail(CTSTest): - def __init__(self, cm): - CTSTest.__init__(self,cm) - self.name = "ComponentFail" - self._startall = SimulStartLite(cm) - self.complist = cm.Components() - self.patterns = [] - self.okerrpatterns = [] - self.is_unsafe = True - - def __call__(self, node): - '''Perform the 'ComponentFail' test. 
''' - self.incr("calls") - self.patterns = [] - self.okerrpatterns = [] - - # start all nodes - ret = self._startall(None) - if not ret: - return self.failure("Setup failed") - - if not self._cm.cluster_stable(self._env["StableTime"]): - return self.failure("Setup failed - unstable") - - node_is_dc = self._cm.is_node_dc(node, None) - - # select a component to kill - chosen = self._env.random_gen.choice(self.complist) - while chosen.dc_only and node_is_dc == 0: - chosen = self._env.random_gen.choice(self.complist) - - self.debug("...component %s (dc=%d)" % (chosen.name, node_is_dc)) - self.incr(chosen.name) - - if chosen.name != "corosync": - self.patterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name)) - self.patterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name)) - - self.patterns.extend(chosen.pats) - if node_is_dc: - self.patterns.extend(chosen.dc_pats) - - # @TODO this should be a flag in the Component - if chosen.name in [ "corosync", "pacemaker-based", "pacemaker-fenced" ]: - # Ignore actions for fence devices if fencer will respawn - # (their registration will be lost, and probes will fail) - self.okerrpatterns = [ self.templates["Pat:Fencing_active"] ] - (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) - for line in lines: - if re.search("^Resource", line): - r = AuditResource(self._cm, line) - if r.rclass == "stonith": - self.okerrpatterns.append(self.templates["Pat:Fencing_recover"] % r.id) - self.okerrpatterns.append(self.templates["Pat:Fencing_probe"] % r.id) - - # supply a copy so self.patterns doesn't end up empty - tmpPats = [] - tmpPats.extend(self.patterns) - self.patterns.extend(chosen.badnews_ignore) - - # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status - stonithPats = [] - stonithPats.append(self.templates["Pat:Fencing_ok"] % node) - stonith = self.create_watch(stonithPats, 0) - stonith.set_watch() - - # set the watch for stable - watch = self.create_watch( - tmpPats, self._env["DeadTime"] + self._env["StableTime"] + self._env["StartTime"]) - watch.set_watch() - - # kill the component - chosen.kill(node) - - self.debug("Waiting for the cluster to recover") - self._cm.cluster_stable() - - self.debug("Waiting for any fenced node to come back up") - self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) - - self.debug("Waiting for the cluster to re-stabilize with all nodes") - self._cm.cluster_stable(self._env["StartTime"]) - - self.debug("Checking if %s was shot" % node) - shot = stonith.look(60) - if shot: - self.debug("Found: " + repr(shot)) - self.okerrpatterns.append(self.templates["Pat:Fencing_start"] % node) - - if not self._env["at-boot"]: - self._cm.ShouldBeStatus[node] = "down" - - # If fencing occurred, chances are many (if not all) the expected logs - # will not be sent - or will be lost when the node reboots - return self.success() - - # check for logs indicating a graceful recovery - matched = watch.look_for_all(allow_multiple_matches=True) - if watch.unmatched: - self._logger.log("Patterns not found: " + repr(watch.unmatched)) - - self.debug("Waiting for the cluster to re-stabilize with all nodes") - is_stable = self._cm.cluster_stable(self._env["StartTime"]) - - if not matched: - return self.failure("Didn't find all expected %s patterns" % chosen.name) - elif not is_stable: - return self.failure("Cluster did not become stable after killing %s" % chosen.name) - - return self.success() - - @property - def errors_to_ignore(self): - """ Return list of errors which should be 
ignored """ - - # Note that okerrpatterns refers to the last time we ran this test - # The good news is that this works fine for us... - self.okerrpatterns.extend(self.patterns) - return self.okerrpatterns - AllTestClasses.append(ComponentFail) - - -class SplitBrainTest(CTSTest): - '''It is used to test split-brain. when the path between the two nodes break - check the two nodes both take over the resource''' - def __init__(self,cm): - CTSTest.__init__(self,cm) - self.name = "SplitBrain" - self._start = StartTest(cm) - self._startall = SimulStartLite(cm) - self.is_experimental = True - - def isolate_partition(self, partition): - other_nodes = [] - other_nodes.extend(self._env["nodes"]) - - for node in partition: - try: - other_nodes.remove(node) - except ValueError: - self._logger.log("Node "+node+" not in " + repr(self._env["nodes"]) + " from " +repr(partition)) - - if len(other_nodes) == 0: - return 1 - - self.debug("Creating partition: " + repr(partition)) - self.debug("Everyone else: " + repr(other_nodes)) - - for node in partition: - if not self._cm.isolate_node(node, other_nodes): - self._logger.log("Could not isolate %s" % node) - return 0 - - return 1 - - def heal_partition(self, partition): - other_nodes = [] - other_nodes.extend(self._env["nodes"]) - - for node in partition: - try: - other_nodes.remove(node) - except ValueError: - self._logger.log("Node "+node+" not in " + repr(self._env["nodes"])) - - if len(other_nodes) == 0: - return 1 - - self.debug("Healing partition: " + repr(partition)) - self.debug("Everyone else: " + repr(other_nodes)) - - for node in partition: - self._cm.unisolate_node(node, other_nodes) - - def __call__(self, node): - '''Perform split-brain test''' - self.incr("calls") - self.passed = True - partitions = {} - - ret = self._startall(None) - if not ret: - return self.failure("Setup failed") - - while 1: - # Retry until we get multiple partitions - partitions = {} - p_max = len(self._env["nodes"]) - for node in self._env["nodes"]: - p = self._env.random_gen.randint(1, p_max) - if not p in partitions: - partitions[p] = [] - partitions[p].append(node) - p_max = len(list(partitions.keys())) - if p_max > 1: - break - # else, try again - - self.debug("Created %d partitions" % p_max) - for key in list(partitions.keys()): - self.debug("Partition["+str(key)+"]:\t"+repr(partitions[key])) - - # Disabling STONITH to reduce test complexity for now - self._rsh(node, "crm_attribute -V -n stonith-enabled -v false") - - for key in list(partitions.keys()): - self.isolate_partition(partitions[key]) - - count = 30 - while count > 0: - if len(self._cm.find_partitions()) != p_max: - time.sleep(10) - else: - break - else: - self.failure("Expected partitions were not created") - - # Target number of partitions formed - wait for stability - if not self._cm.cluster_stable(): - self.failure("Partitioned cluster not stable") - - # Now audit the cluster state - self._cm.partitions_expected = p_max - if not self.audit(): - self.failure("Audits failed") - self._cm.partitions_expected = 1 - - # And heal them again - for key in list(partitions.keys()): - self.heal_partition(partitions[key]) - - # Wait for a single partition to form - count = 30 - while count > 0: - if len(self._cm.find_partitions()) != 1: - time.sleep(10) - count -= 1 - else: - break - else: - self.failure("Cluster did not reform") - - # Wait for it to have the right number of members - count = 30 - while count > 0: - members = [] - - partitions = self._cm.find_partitions() - if len(partitions) > 0: - members = 
partitions[0].split() - - if len(members) != len(self._env["nodes"]): - time.sleep(10) - count -= 1 - else: - break - else: - self.failure("Cluster did not completely reform") - - # Wait up to 20 minutes - the delay is more preferable than - # trying to continue with in a messed up state - if not self._cm.cluster_stable(1200): - self.failure("Reformed cluster not stable") - if self._env["continue"]: - answer = "Y" - else: - try: - answer = input('Continue? [nY]') - except EOFError as e: - answer = "n" - if answer and answer == "n": - raise ValueError("Reformed cluster not stable") - - # Turn fencing back on - if self._env["DoFencing"]: - self._rsh(node, "crm_attribute -V -D -n stonith-enabled") - - self._cm.cluster_stable() - - if self.passed: - return self.success() - return self.failure("See previous errors") - - @property - def errors_to_ignore(self): - """ Return list of errors which should be ignored """ - - return [ r"Another DC detected:", - r"(ERROR|error).*: .*Application of an update diff failed", - r"pacemaker-controld.*:.*not in our membership list", - r"CRIT:.*node.*returning after partition" ] - - def is_applicable(self): - if not CTSTest.is_applicable(self): - return False - return len(self._env["nodes"]) > 2 - AllTestClasses.append(SplitBrainTest) - - -class Reattach(CTSTest): - def __init__(self, cm): - CTSTest.__init__(self,cm) - self.name = "Reattach" - self._startall = SimulStartLite(cm) - self.restart1 = RestartTest(cm) - self.stopall = SimulStopLite(cm) - self.is_unsafe = False - - def _is_managed(self, node): - (_, is_managed) = self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", verbose=1) - is_managed = is_managed[0].strip() - return is_managed == "true" - - def _set_unmanaged(self, node): - self.debug("Disable resource management") - self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false") - - def _set_managed(self, node): - self.debug("Re-enable resource management") - self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D") - - def setup(self, node): - attempt = 0 - if not self._startall(None): - return None - - # Make sure we are really _really_ stable and that all - # resources, including those that depend on transient node - # attributes, are started - while not self._cm.cluster_stable(double_check=True): - if attempt < 5: - attempt += 1 - self.debug("Not stable yet, re-testing") - else: - self._logger.log("Cluster is not stable") - return None - - return 1 - - def teardown(self, node): - - # Make sure 'node' is up - start = StartTest(self._cm) - start(node) - - if not self._is_managed(node): - self._logger.log("Attempting to re-enable resource management on %s" % node) - self._set_managed(node) - self._cm.cluster_stable() - if not self._is_managed(node): - self._logger.log("Could not re-enable resource management") - return 0 - - return 1 - - def can_run_now(self, node): - """ Return True if we can meaningfully run right now""" - if self._find_ocfs2_resources(node): - self._logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present") - return False - - return True - - def __call__(self, node): - self.incr("calls") - - pats = [] - # Conveniently, the scheduler will display this message when disabling - # management, even if fencing is not enabled, so we can rely on it. 
- managed = self.create_watch(["No fencing will be done"], 60) - managed.set_watch() - - self._set_unmanaged(node) - - if not managed.look_for_all(): - self._logger.log("Patterns not found: " + repr(managed.unmatched)) - return self.failure("Resource management not disabled") - - pats = [] - pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*")) - pats.append(self.templates["Pat:RscOpOK"] % ("stop", ".*")) - pats.append(self.templates["Pat:RscOpOK"] % ("promote", ".*")) - pats.append(self.templates["Pat:RscOpOK"] % ("demote", ".*")) - pats.append(self.templates["Pat:RscOpOK"] % ("migrate", ".*")) - - watch = self.create_watch(pats, 60, "ShutdownActivity") - watch.set_watch() - - self.debug("Shutting down the cluster") - ret = self.stopall(None) - if not ret: - self._set_managed(node) - return self.failure("Couldn't shut down the cluster") - - self.debug("Bringing the cluster back up") - ret = self._startall(None) - time.sleep(5) # allow ping to update the CIB - if not ret: - self._set_managed(node) - return self.failure("Couldn't restart the cluster") - - if self.local_badnews("ResourceActivity:", watch): - self._set_managed(node) - return self.failure("Resources stopped or started during cluster restart") - - watch = self.create_watch(pats, 60, "StartupActivity") - watch.set_watch() - - # Re-enable resource management (and verify it happened). - self._set_managed(node) - self._cm.cluster_stable() - if not self._is_managed(node): - return self.failure("Could not re-enable resource management") - - # Ignore actions for STONITH resources - ignore = [] - (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) - for line in lines: - if re.search("^Resource", line): - r = AuditResource(self._cm, line) - if r.rclass == "stonith": - - self.debug("Ignoring start actions for %s" % r.id) - ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id)) - - if self.local_badnews("ResourceActivity:", watch, ignore): - return self.failure("Resources stopped or started after resource management was re-enabled") - - return ret - - @property - def errors_to_ignore(self): - """ Return list of errors which should be ignored """ - - return [ r"resource( was|s were) active at shutdown" ] - - def is_applicable(self): - return True - AllTestClasses.append(Reattach) - - -class SpecialTest1(CTSTest): - '''Set up a custom test to cause quorum failure issues for Andrew''' - def __init__(self, cm): - CTSTest.__init__(self,cm) - self.name = "SpecialTest1" - self._startall = SimulStartLite(cm) - self.restart1 = RestartTest(cm) - self.stopall = SimulStopLite(cm) - - def __call__(self, node): - '''Perform the 'SpecialTest1' test for Andrew. ''' - self.incr("calls") - - # Shut down all the nodes... 
- ret = self.stopall(None) - if not ret: - return self.failure("Could not stop all nodes") - - # Test config recovery when the other nodes come up - self._rsh(node, "rm -f " + BuildOptions.CIB_DIR + "/cib*") - - # Start the selected node - ret = self.restart1(node) - if not ret: - return self.failure("Could not start "+node) - - # Start all remaining nodes - ret = self._startall(None) - if not ret: - return self.failure("Could not start the remaining nodes") - - return self.success() - - @property - def errors_to_ignore(self): - """ Return list of errors which should be ignored """ - - # Errors that occur as a result of the CIB being wiped - return [ r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed", - r"error.*: Resource start-up disabled since no STONITH resources have been defined", - r"error.*: Either configure some or disable STONITH with the stonith-enabled option", - r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity" ] - -AllTestClasses.append(SpecialTest1) - - -class NearQuorumPointTest(CTSTest): - ''' - This test brings larger clusters near the quorum point (50%). - In addition, it will test doing starts and stops at the same time. - - Here is how I think it should work: - - loop over the nodes and decide randomly which will be up and which - will be down Use a 50% probability for each of up/down. - - figure out what to do to get into that state from the current state - - in parallel, bring up those going up and bring those going down. - ''' - - def __init__(self, cm): - CTSTest.__init__(self,cm) - self.name = "NearQuorumPoint" - - def __call__(self, dummy): - '''Perform the 'NearQuorumPoint' test. ''' - self.incr("calls") - startset = [] - stopset = [] - - stonith = self._cm.prepare_fencing_watcher("NearQuorumPoint") - #decide what to do with each node - for node in self._env["nodes"]: - action = self._env.random_gen.choice(["start","stop"]) - #action = self._env.random_gen.choice(["start","stop","no change"]) - if action == "start" : - startset.append(node) - elif action == "stop" : - stopset.append(node) - - self.debug("start nodes:" + repr(startset)) - self.debug("stop nodes:" + repr(stopset)) - - #add search patterns - watchpats = [ ] - for node in stopset: - if self._cm.ShouldBeStatus[node] == "up": - watchpats.append(self.templates["Pat:We_stopped"] % node) - - for node in startset: - if self._cm.ShouldBeStatus[node] == "down": - #watchpats.append(self.templates["Pat:NonDC_started"] % node) - watchpats.append(self.templates["Pat:Local_started"] % node) - else: - for stopping in stopset: - if self._cm.ShouldBeStatus[stopping] == "up": - watchpats.append(self.templates["Pat:They_stopped"] % (node, self._cm.key_for_node(stopping))) - - if len(watchpats) == 0: - return self.skipped() - - if len(startset) != 0: - watchpats.append(self.templates["Pat:DC_IDLE"]) - - watch = self.create_watch(watchpats, self._env["DeadTime"]+10) - - watch.set_watch() - - #begin actions - for node in stopset: - if self._cm.ShouldBeStatus[node] == "up": - self._cm.StopaCMnoBlock(node) - - for node in startset: - if self._cm.ShouldBeStatus[node] == "down": - self._cm.StartaCMnoBlock(node) - - #get the result - if watch.look_for_all(): - self._cm.cluster_stable() - self._cm.fencing_cleanup("NearQuorumPoint", stonith) - return self.success() - - self._logger.log("Warn: Patterns not found: " + repr(watch.unmatched)) - - #get the "bad" nodes - upnodes = [] - for node in stopset: - if self._cm.StataCM(node) == 1: - upnodes.append(node) 
- - downnodes = [] - for node in startset: - if self._cm.StataCM(node) == 0: - downnodes.append(node) - - self._cm.fencing_cleanup("NearQuorumPoint", stonith) - if upnodes == [] and downnodes == []: - self._cm.cluster_stable() - - # Make sure they're completely down with no residule - for node in stopset: - self._rsh(node, self.templates["StopCmd"]) - - return self.success() - - if len(upnodes) > 0: - self._logger.log("Warn: Unstoppable nodes: " + repr(upnodes)) - - if len(downnodes) > 0: - self._logger.log("Warn: Unstartable nodes: " + repr(downnodes)) - - return self.failure() - - def is_applicable(self): - return True - +AllTestClasses.append(ResyncCIB) AllTestClasses.append(NearQuorumPointTest) -class RollingUpgradeTest(CTSTest): - '''Perform a rolling upgrade of the cluster''' - def __init__(self, cm): - CTSTest.__init__(self,cm) - self.name = "RollingUpgrade" - self._start = StartTest(cm) - self._stop = StopTest(cm) - self.stopall = SimulStopLite(cm) - self._startall = SimulStartLite(cm) - - def setup(self, node): - # Start all remaining nodes - ret = self.stopall(None) - if not ret: - return self.failure("Couldn't stop all nodes") - - for node in self._env["nodes"]: - if not self.downgrade(node, None): - return self.failure("Couldn't downgrade %s" % node) - - ret = self._startall(None) - if not ret: - return self.failure("Couldn't start all nodes") - return self.success() - - def teardown(self, node): - # Stop everything - ret = self.stopall(None) - if not ret: - return self.failure("Couldn't stop all nodes") - - for node in self._env["nodes"]: - if not self.upgrade(node, None): - return self.failure("Couldn't upgrade %s" % node) - - return self.success() - - def install(self, node, version, start=1, flags="--force"): - - target_dir = "/tmp/rpm-%s" % version - src_dir = "%s/%s" % (self._env["rpm-dir"], version) - - self._logger.log("Installing %s on %s with %s" % (version, node, flags)) - if not self._stop(node): - return self.failure("stop failure: "+node) - - self._rsh(node, "mkdir -p %s" % target_dir) - self._rsh(node, "rm -f %s/*.rpm" % target_dir) - (_, lines) = self._rsh(node, "ls -1 %s/*.rpm" % src_dir, verbose=1) - for line in lines: - line = line[:-1] - rc = self._rsh.copy("%s" % (line), "%s:%s/" % (node, target_dir)) - self._rsh(node, "rpm -Uvh %s %s/*.rpm" % (flags, target_dir)) - - if start and not self._start(node): - return self.failure("start failure: "+node) - - return self.success() - - def upgrade(self, node, start=1): - return self.install(node, self._env["current-version"], start) - - def downgrade(self, node, start=1): - return self.install(node, self._env["previous-version"], start, "--force --nodeps") - - def __call__(self, node): - '''Perform the 'Rolling Upgrade' test. 
''' - self.incr("calls") - - for node in self._env["nodes"]: - if self.upgrade(node): - return self.failure("Couldn't upgrade %s" % node) - - self._cm.cluster_stable() - - return self.success() - - def is_applicable(self): - if not CTSTest.is_applicable(self): - return None - - if "rpm-dir" not in self._env: - return None - if "current-version" not in self._env: - return None - if "previous-version" not in self._env: - return None - - return 1 - -# Register RestartTest as a good test to run -AllTestClasses.append(RollingUpgradeTest) - - def TestList(cm, audits): result = [] for testclass in AllTestClasses: bound_test = testclass(cm) if bound_test.is_applicable(): bound_test.audits = audits result.append(bound_test) return result class RemoteLXC(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "RemoteLXC" self._start = StartTest(cm) self._startall = SimulStartLite(cm) self.num_containers = 2 self.is_container = True self.fail_string = "" def start_lxc_simple(self, node): # restore any artifacts laying around from a previous test. self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null") # generate the containers, put them in the config, add some resources to them pats = [ ] watch = self.create_watch(pats, 120) watch.set_watch() pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc1")) pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc2")) pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc-ms")) pats.append(self.templates["Pat:RscOpOK"] % ("promote", "lxc-ms")) self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -g -a -m -s -c %d &>/dev/null" % self.num_containers) with Timer(self._logger, self.name, "remoteSimpleInit"): watch.look_for_all() if watch.unmatched: self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched)) self.failed = True def cleanup_lxc_simple(self, node): pats = [ ] # if the test failed, attempt to clean up the cib and libvirt environment # as best as possible if self.failed: # restore libvirt and cib self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null") return watch = self.create_watch(pats, 120) watch.set_watch() pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container1")) pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container2")) self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -p &>/dev/null") with Timer(self._logger, self.name, "remoteSimpleCleanup"): watch.look_for_all() if watch.unmatched: self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched)) self.failed = True # cleanup libvirt self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null") def __call__(self, node): '''Perform the 'RemoteLXC' test. 
''' self.incr("calls") ret = self._startall(None) if not ret: return self.failure("Setup failed, start all nodes failed.") (rc, _) = self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v &>/dev/null") if rc == 1: self.log("Environment test for lxc support failed.") return self.skipped() self.start_lxc_simple(node) self.cleanup_lxc_simple(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() @property def errors_to_ignore(self): """ Return list of errors which should be ignored """ return [ r"Updating failcount for ping", r"schedulerd.*: Recover\s+(ping|lxc-ms|container)\s+\(.*\)", # The orphaned lxc-ms resource causes an expected transition error # that is a result of the scheduler not having knowledge that the # promotable resource used to be a clone. As a result, it looks like that # resource is running in multiple locations when it shouldn't... But in # this instance we know why this error is occurring and that it is expected. r"Calculated [Tt]ransition .*pe-error", r"Resource lxc-ms .* is active on 2 nodes attempting recovery", r"Unknown operation: fail", r"VirtualDomain.*ERROR: Unable to determine emulator" ] AllTestClasses.append(RemoteLXC) class RemoteBasic(RemoteDriver): def __init__(self, cm): RemoteDriver.__init__(self, cm) self.name = "RemoteBasic" def __call__(self, node): '''Perform the 'RemoteBaremetal' test. ''' if not self.start_new_test(node): return self.failure(self.fail_string) self.test_attributes(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() AllTestClasses.append(RemoteBasic) class RemoteStonithd(RemoteDriver): def __init__(self, cm): RemoteDriver.__init__(self, cm) self.name = "RemoteStonithd" def __call__(self, node): '''Perform the 'RemoteStonithd' test. ''' if not self.start_new_test(node): return self.failure(self.fail_string) self.fail_connection(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() def is_applicable(self): if not RemoteDriver.is_applicable(self): return False if "DoFencing" in self._env: return self._env["DoFencing"] return True @property def errors_to_ignore(self): """ Return list of errors which should be ignored """ return [ r"Lost connection to Pacemaker Remote node", r"Software caused connection abort", r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor", r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*", r"schedulerd.*:\s+Recover\s+remote-.*\s+\(.*\)", r"error: Result of monitor operation for .* on remote-.*: Internal communication failure" ] + super().errors_to_ignore AllTestClasses.append(RemoteStonithd) class RemoteMigrate(RemoteDriver): def __init__(self, cm): RemoteDriver.__init__(self, cm) self.name = "RemoteMigrate" def __call__(self, node): '''Perform the 'RemoteMigrate' test. 
''' if not self.start_new_test(node): return self.failure(self.fail_string) self.migrate_connection(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() def is_applicable(self): if not RemoteDriver.is_applicable(self): return 0 # This test requires at least three nodes: one to convert to a # remote node, one to host the connection originally, and one # to migrate the connection to. if len(self._env["nodes"]) < 3: return 0 return 1 AllTestClasses.append(RemoteMigrate) class RemoteRscFailure(RemoteDriver): def __init__(self, cm): RemoteDriver.__init__(self, cm) self.name = "RemoteRscFailure" def __call__(self, node): '''Perform the 'RemoteRscFailure' test. ''' if not self.start_new_test(node): return self.failure(self.fail_string) # This is an important step. We are migrating the connection # before failing the resource. This verifies that the migration # has properly maintained control over the remote-node. self.migrate_connection(node) self.fail_rsc(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() @property def errors_to_ignore(self): """ Return list of errors which should be ignored """ return [ r"schedulerd.*: Recover\s+remote-rsc\s+\(.*\)", r"Dummy.*: No process state file found" ] + super().errors_to_ignore def is_applicable(self): if not RemoteDriver.is_applicable(self): return 0 # This test requires at least three nodes: one to convert to a # remote node, one to host the connection originally, and one # to migrate the connection to. if len(self._env["nodes"]) < 3: return 0 return 1 AllTestClasses.append(RemoteRscFailure) # vim:ts=4:sw=4:et: diff --git a/python/pacemaker/_cts/tests/Makefile.am b/python/pacemaker/_cts/tests/Makefile.am index f12cbc46e6..f4354cbaff 100644 --- a/python/pacemaker/_cts/tests/Makefile.am +++ b/python/pacemaker/_cts/tests/Makefile.am @@ -1,29 +1,36 @@ # # Copyright 2023 the Pacemaker project contributors # # The version control history for this file may have further details. # # This source code is licensed under the GNU General Public License version 2 # or later (GPLv2+) WITHOUT ANY WARRANTY. # MAINTAINERCLEANFILES = Makefile.in pkgpythondir = $(pythondir)/$(PACKAGE)/_cts/tests pkgpython_PYTHON = __init__.py \ + componentfail.py \ ctstest.py \ fliptest.py \ + maintenancemode.py \ + nearquorumpointtest.py \ partialstart.py \ + reattach.py \ remotedriver.py \ + resourcerecover.py \ restarttest.py \ restartonebyone.py \ + resynccib.py \ simulstart.py \ simulstop.py \ simulstartlite.py \ simulstoplite.py \ + splitbraintest.py \ standbytest.py \ startonebyone.py \ starttest.py \ stonithdtest.py \ stoptest.py diff --git a/python/pacemaker/_cts/tests/__init__.py b/python/pacemaker/_cts/tests/__init__.py index b2ad0dc8c5..eb78c4527c 100644 --- a/python/pacemaker/_cts/tests/__init__.py +++ b/python/pacemaker/_cts/tests/__init__.py @@ -1,23 +1,30 @@ """ Test classes for the `pacemaker._cts` package. 
""" __copyright__ = "Copyright 2023 the Pacemaker project contributors" __license__ = "GNU Lesser General Public License version 2.1 or later (LGPLv2.1+)" +from pacemaker._cts.tests.componentfail import ComponentFail from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.fliptest import FlipTest +from pacemaker._cts.tests.maintenancemode import MaintenanceMode +from pacemaker._cts.tests.nearquorumpointtest import NearQuorumPointTest from pacemaker._cts.tests.partialstart import PartialStart +from pacemaker._cts.tests.reattach import Reattach from pacemaker._cts.tests.restartonebyone import RestartOnebyOne +from pacemaker._cts.tests.resourcerecover import ResourceRecover from pacemaker._cts.tests.restarttest import RestartTest +from pacemaker._cts.tests.resynccib import ResyncCIB from pacemaker._cts.tests.remotedriver import RemoteDriver from pacemaker._cts.tests.simulstart import SimulStart from pacemaker._cts.tests.simulstop import SimulStop from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.tests.simulstoplite import SimulStopLite +from pacemaker._cts.tests.splitbraintest import SplitBrainTest from pacemaker._cts.tests.standbytest import StandbyTest from pacemaker._cts.tests.starttest import StartTest from pacemaker._cts.tests.startonebyone import StartOnebyOne from pacemaker._cts.tests.stonithdtest import StonithdTest from pacemaker._cts.tests.stoponebyone import StopOnebyOne from pacemaker._cts.tests.stoptest import StopTest diff --git a/python/pacemaker/_cts/tests/componentfail.py b/python/pacemaker/_cts/tests/componentfail.py new file mode 100644 index 0000000000..329ba2fa38 --- /dev/null +++ b/python/pacemaker/_cts/tests/componentfail.py @@ -0,0 +1,159 @@ +""" Kill a pacemaker daemon and test how the cluster recovers """ + +__all__ = ["ComponentFail"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import re + +from pacemaker._cts.audits import AuditResource +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable +# pylint doesn't understand that self._env is subscriptable. 
+# pylint: disable=unsubscriptable-object + + +class ComponentFail(CTSTest): + """ A concrete test that kills a random pacemaker daemon and waits for the + cluster to recover + """ + + def __init__(self, cm): + """ Create a new ComponentFail instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.is_unsafe = True + self.name = "ComponentFail" + + self._complist = cm.Components() + self._okerrpatterns = [] + self._patterns = [] + self._startall = SimulStartLite(cm) + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + self._patterns = [] + self._okerrpatterns = [] + + # start all nodes + ret = self._startall(None) + if not ret: + return self.failure("Setup failed") + + if not self._cm.cluster_stable(self._env["StableTime"]): + return self.failure("Setup failed - unstable") + + node_is_dc = self._cm.is_node_dc(node, None) + + # select a component to kill + chosen = self._env.random_gen.choice(self._complist) + while chosen.dc_only and node_is_dc == 0: + chosen = self._env.random_gen.choice(self._complist) + + self.debug("...component %s (dc=%d)" % (chosen.name, node_is_dc)) + self.incr(chosen.name) + + if chosen.name != "corosync": + self._patterns.extend([ self.templates["Pat:ChildKilled"] % (node, chosen.name), + self.templates["Pat:ChildRespawn"] % (node, chosen.name) ]) + + self._patterns.extend(chosen.pats) + if node_is_dc: + self._patterns.extend(chosen.dc_pats) + + # @TODO this should be a flag in the Component + if chosen.name in [ "corosync", "pacemaker-based", "pacemaker-fenced" ]: + # Ignore actions for fence devices if fencer will respawn + # (their registration will be lost, and probes will fail) + self._okerrpatterns = [ self.templates["Pat:Fencing_active"] ] + (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) + + for line in lines: + if re.search("^Resource", line): + r = AuditResource(self._cm, line) + + if r.rclass == "stonith": + self._okerrpatterns.extend([ self.templates["Pat:Fencing_recover"] % r.id, + self.templates["Pat:Fencing_probe"] % r.id ]) + + # supply a copy so self.patterns doesn't end up empty + tmp_pats = self._patterns.copy() + self._patterns.extend(chosen.badnews_ignore) + + # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status + stonith_pats = [ self.templates["Pat:Fencing_ok"] % node ] + stonith = self.create_watch(stonith_pats, 0) + stonith.set_watch() + + # set the watch for stable + watch = self.create_watch( + tmp_pats, self._env["DeadTime"] + self._env["StableTime"] + self._env["StartTime"]) + + watch.set_watch() + + # kill the component + chosen.kill(node) + + self.debug("Waiting for the cluster to recover") + self._cm.cluster_stable() + + self.debug("Waiting for any fenced node to come back up") + self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) + + self.debug("Waiting for the cluster to re-stabilize with all nodes") + self._cm.cluster_stable(self._env["StartTime"]) + + self.debug("Checking if %s was shot" % node) + shot = stonith.look(60) + + if shot: + self.debug("Found: %r" % shot) + self._okerrpatterns.append(self.templates["Pat:Fencing_start"] % node) + + if not self._env["at-boot"]: + self._cm.ShouldBeStatus[node] = "down" + + # If fencing occurred, chances are many (if not all) the expected logs + # will not be sent - or will be lost when the node reboots + return self.success() + + # check for logs indicating a graceful recovery + matched = watch.look_for_all(allow_multiple_matches=True) + if watch.unmatched: 
+ self._logger.log("Patterns not found: %r" % watch.unmatched) + + self.debug("Waiting for the cluster to re-stabilize with all nodes") + is_stable = self._cm.cluster_stable(self._env["StartTime"]) + + if not matched: + return self.failure("Didn't find all expected %s patterns" % chosen.name) + + if not is_stable: + return self.failure("Cluster did not become stable after killing %s" % chosen.name) + + return self.success() + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + # Note that okerrpatterns refers to the last time we ran this test + # The good news is that this works fine for us... + self._okerrpatterns.extend(self._patterns) + return self._okerrpatterns diff --git a/python/pacemaker/_cts/tests/maintenancemode.py b/python/pacemaker/_cts/tests/maintenancemode.py new file mode 100644 index 0000000000..f9ed8678bd --- /dev/null +++ b/python/pacemaker/_cts/tests/maintenancemode.py @@ -0,0 +1,228 @@ +""" Toggle nodes in and out of maintenance mode """ + +__all__ = ["MaintenanceMode"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import re + +from pacemaker._cts.audits import AuditResource +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.starttest import StartTest +from pacemaker._cts.timer import Timer + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. 
+# pylint: disable=not-callable + + +class MaintenanceMode(CTSTest): + """ A concrete test that toggles nodes in and out of maintenance mode """ + + def __init__(self, cm): + """ Create a new MaintenanceMode instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.benchmark = True + self.name = "MaintenanceMode" + + self._action = "asyncmon" + self._rid = "maintenanceDummy" + self._start = StartTest(cm) + self._startall = SimulStartLite(cm) + + def _toggle_maintenance_mode(self, node, enabled): + """ Toggle maintenance mode on the given node """ + + pats = [ self.templates["Pat:DC_IDLE"] ] + + if enabled: + action = "On" + else: + action = "Off" + + # fail the resource right after turning Maintenance mode on + # verify it is not recovered until maintenance mode is turned off + if enabled: + pats.append(self.templates["Pat:RscOpFail"] % (self._action, self._rid)) + else: + pats.extend([ self.templates["Pat:RscOpOK"] % ("stop", self._rid), + self.templates["Pat:RscOpOK"] % ("start", self._rid) ]) + + watch = self.create_watch(pats, 60) + watch.set_watch() + + self.debug("Turning maintenance mode %s" % action) + self._rsh(node, self.templates["MaintenanceMode%s" % action]) + + if enabled: + self._rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self._rid, node)) + + with Timer(self._logger, self.name, "recover%s" % action): + watch.look_for_all() + + if watch.unmatched: + self.debug("Failed to find patterns when turning maintenance mode %s" % action) + return repr(watch.unmatched) + + return "" + + def _insert_maintenance_dummy(self, node): + """ Create a dummy resource on the given node """ + + pats = [ ("%s.*" % node) + (self.templates["Pat:RscOpOK"] % ("start", self._rid)) ] + + watch = self.create_watch(pats, 60) + watch.set_watch() + + self._cm.AddDummyRsc(node, self._rid) + + with Timer(self._logger, self.name, "addDummy"): + watch.look_for_all() + + if watch.unmatched: + self.debug("Failed to find patterns when adding maintenance dummy resource") + return repr(watch.unmatched) + + return "" + + def _remove_maintenance_dummy(self, node): + """ Remove the previously created dummy resource on the given node """ + + pats = [ self.templates["Pat:RscOpOK"] % ("stop", self._rid) ] + + watch = self.create_watch(pats, 60) + watch.set_watch() + self._cm.RemoveDummyRsc(node, self._rid) + + with Timer(self._logger, self.name, "removeDummy"): + watch.look_for_all() + + if watch.unmatched: + self.debug("Failed to find patterns when removing maintenance dummy resource") + return repr(watch.unmatched) + + return "" + + def _managed_rscs(self, node): + """ Return a list of all resources managed by the cluster """ + + rscs = [] + (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) + + for line in lines: + if re.search("^Resource", line): + tmp = AuditResource(self._cm, line) + + if tmp.managed: + rscs.append(tmp.id) + + return rscs + + def _verify_resources(self, node, rscs, managed): + """ Verify that all resources in rscList are managed if they are expected + to be, or unmanaged if they are expected to be. 
+ """ + + managed_rscs = rscs + managed_str = "managed" + + if not managed: + managed_str = "unmanaged" + + (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) + for line in lines: + if re.search("^Resource", line): + tmp = AuditResource(self._cm, line) + + if managed and not tmp.managed: + continue + + if not managed and tmp.managed: + continue + + if managed_rscs.count(tmp.id): + managed_rscs.remove(tmp.id) + + if not managed_rscs: + self.debug("Found all %s resources on %s" % (managed_str, node)) + return True + + self._logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, managed_rscs)) + return False + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + verify_managed = False + verify_unmanaged = False + fail_pat = "" + + if not self._startall(None): + return self.failure("Setup failed") + + # get a list of all the managed resources. We use this list + # after enabling maintenance mode to verify all managed resources + # become un-managed. After maintenance mode is turned off, we use + # this list to verify all the resources become managed again. + managed_rscs = self._managed_rscs(node) + if not managed_rscs: + self._logger.log("No managed resources on %s" % node) + return self.skipped() + + # insert a fake resource we can fail during maintenance mode + # so we can verify recovery does not take place until after maintenance + # mode is disabled. + fail_pat += self._insert_maintenance_dummy(node) + + # toggle maintenance mode ON, then fail dummy resource. + fail_pat += self._toggle_maintenance_mode(node, True) + + # verify all the resources are now unmanaged + if self._verify_resources(node, managed_rscs, False): + verify_unmanaged = True + + # Toggle maintenance mode OFF, verify dummy is recovered. + fail_pat += self._toggle_maintenance_mode(node, False) + + # verify all the resources are now managed again + if self._verify_resources(node, managed_rscs, True): + verify_managed = True + + # Remove our maintenance dummy resource. 
+ fail_pat += self._remove_maintenance_dummy(node) + + self._cm.cluster_stable() + + if fail_pat != "": + return self.failure("Unmatched patterns: %s" % fail_pat) + + if not verify_unmanaged: + return self.failure("Failed to verify resources became unmanaged during maintenance mode") + + if not verify_managed: + return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode") + + return self.success() + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + return [ r"Updating failcount for %s" % self._rid, + r"schedulerd.*: Recover\s+%s\s+\(.*\)" % self._rid, + r"Unknown operation: fail", + self.templates["Pat:RscOpOK"] % (self._action, self._rid), + r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self._rid, self._action, 0) ] diff --git a/python/pacemaker/_cts/tests/nearquorumpointtest.py b/python/pacemaker/_cts/tests/nearquorumpointtest.py new file mode 100644 index 0000000000..b342415516 --- /dev/null +++ b/python/pacemaker/_cts/tests/nearquorumpointtest.py @@ -0,0 +1,125 @@ +""" Randomly start and stop nodes to bring the cluster close to the quorum point """ + +__all__ = ["NearQuorumPointTest"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.ctstest import CTSTest + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable +# pylint doesn't understand that self._env is subscriptable. 
+# pylint: disable=unsubscriptable-object + + +class NearQuorumPointTest(CTSTest): + """ A concrete test that randomly starts and stops nodes to bring the + cluster close to the quorum point + """ + + def __init__(self, cm): + """ Create a new NearQuorumPointTest instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.name = "NearQuorumPoint" + + def __call__(self, dummy): + """ Perform this test """ + + self.incr("calls") + startset = [] + stopset = [] + + stonith = self._cm.prepare_fencing_watcher("NearQuorumPoint") + #decide what to do with each node + for node in self._env["nodes"]: + action = self._env.random_gen.choice(["start", "stop"]) + + if action == "start" : + startset.append(node) + elif action == "stop" : + stopset.append(node) + + self.debug("start nodes:%r" % startset) + self.debug("stop nodes:%r" % stopset) + + #add search patterns + watchpats = [ ] + for node in stopset: + if self._cm.ShouldBeStatus[node] == "up": + watchpats.append(self.templates["Pat:We_stopped"] % node) + + for node in startset: + if self._cm.ShouldBeStatus[node] == "down": + watchpats.append(self.templates["Pat:Local_started"] % node) + else: + for stopping in stopset: + if self._cm.ShouldBeStatus[stopping] == "up": + watchpats.append(self.templates["Pat:They_stopped"] % (node, self._cm.key_for_node(stopping))) + + if not watchpats: + return self.skipped() + + if startset: + watchpats.append(self.templates["Pat:DC_IDLE"]) + + watch = self.create_watch(watchpats, self._env["DeadTime"] + 10) + + watch.set_watch() + + #begin actions + for node in stopset: + if self._cm.ShouldBeStatus[node] == "up": + self._cm.StopaCMnoBlock(node) + + for node in startset: + if self._cm.ShouldBeStatus[node] == "down": + self._cm.StartaCMnoBlock(node) + + #get the result + if watch.look_for_all(): + self._cm.cluster_stable() + self._cm.fencing_cleanup("NearQuorumPoint", stonith) + return self.success() + + self._logger.log("Warn: Patterns not found: %r" % watch.unmatched) + + #get the "bad" nodes + upnodes = [] + for node in stopset: + if self._cm.StataCM(node) == 1: + upnodes.append(node) + + downnodes = [] + for node in startset: + if self._cm.StataCM(node) == 0: + downnodes.append(node) + + self._cm.fencing_cleanup("NearQuorumPoint", stonith) + if not upnodes and not downnodes: + self._cm.cluster_stable() + + # Make sure they're completely down with no residule + for node in stopset: + self._rsh(node, self.templates["StopCmd"]) + + return self.success() + + if upnodes: + self._logger.log("Warn: Unstoppable nodes: %r" % upnodes) + + if downnodes: + self._logger.log("Warn: Unstartable nodes: %r" % downnodes) + + return self.failure() diff --git a/python/pacemaker/_cts/tests/reattach.py b/python/pacemaker/_cts/tests/reattach.py new file mode 100644 index 0000000000..d008acdeda --- /dev/null +++ b/python/pacemaker/_cts/tests/reattach.py @@ -0,0 +1,182 @@ +""" Restart the cluster and verify resources remain running """ + +__all__ = ["Reattach"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import re +import time + +from pacemaker._cts.audits import AuditResource +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.simulstoplite import SimulStopLite +from pacemaker._cts.tests.starttest import StartTest + +# Disable various pylint warnings that occur in so many places 
throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable + + +class Reattach(CTSTest): + """ A concrete test that restarts the cluster and verifies that resources + remain running throughout + """ + + def __init__(self, cm): + """ Create a new Reattach instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.name = "Reattach" + + self._startall = SimulStartLite(cm) + self._stopall = SimulStopLite(cm) + + def _is_managed(self, node): + """ Are resources managed by the cluster? """ + + (_, is_managed) = self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", verbose=1) + is_managed = is_managed[0].strip() + return is_managed == "true" + + def _set_unmanaged(self, node): + """ Disable resource management """ + + self.debug("Disable resource management") + self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false") + + def _set_managed(self, node): + """ Enable resource management """ + + self.debug("Re-enable resource management") + self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D") + + def setup(self, node): + """ Setup this test """ + + attempt = 0 + if not self._startall(None): + return self.failure("Startall failed") + + # Make sure we are really _really_ stable and that all + # resources, including those that depend on transient node + # attributes, are started + while not self._cm.cluster_stable(double_check=True): + if attempt < 5: + attempt += 1 + self.debug("Not stable yet, re-testing") + else: + self._logger.log("Cluster is not stable") + return self.failure("Cluster is not stable") + + return self.success() + + def teardown(self, node): + """ Tear down this test """ + + # Make sure 'node' is up + start = StartTest(self._cm) + start(node) + + if not self._is_managed(node): + self._logger.log("Attempting to re-enable resource management on %s" % node) + self._set_managed(node) + self._cm.cluster_stable() + + if not self._is_managed(node): + self._logger.log("Could not re-enable resource management") + return self.failure("Could not re-establish resource management") + + return self.success() + + def can_run_now(self, node): + """ Return True if we can meaningfully run right now """ + + if self._find_ocfs2_resources(node): + self._logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present") + return False + + return True + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + + # Conveniently, the scheduler will display this message when disabling + # management, even if fencing is not enabled, so we can rely on it. 
+ managed = self.create_watch(["No fencing will be done"], 60) + managed.set_watch() + + self._set_unmanaged(node) + + if not managed.look_for_all(): + self._logger.log("Patterns not found: %r" % managed.unmatched) + return self.failure("Resource management not disabled") + + pats = [ self.templates["Pat:RscOpOK"] % ("start", ".*"), + self.templates["Pat:RscOpOK"] % ("stop", ".*"), + self.templates["Pat:RscOpOK"] % ("promote", ".*"), + self.templates["Pat:RscOpOK"] % ("demote", ".*"), + self.templates["Pat:RscOpOK"] % ("migrate", ".*") ] + + watch = self.create_watch(pats, 60, "ShutdownActivity") + watch.set_watch() + + self.debug("Shutting down the cluster") + ret = self._stopall(None) + if not ret: + self._set_managed(node) + return self.failure("Couldn't shut down the cluster") + + self.debug("Bringing the cluster back up") + ret = self._startall(None) + time.sleep(5) # allow ping to update the CIB + if not ret: + self._set_managed(node) + return self.failure("Couldn't restart the cluster") + + if self.local_badnews("ResourceActivity:", watch): + self._set_managed(node) + return self.failure("Resources stopped or started during cluster restart") + + watch = self.create_watch(pats, 60, "StartupActivity") + watch.set_watch() + + # Re-enable resource management (and verify it happened). + self._set_managed(node) + self._cm.cluster_stable() + if not self._is_managed(node): + return self.failure("Could not re-enable resource management") + + # Ignore actions for STONITH resources + ignore = [] + (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) + for line in lines: + if re.search("^Resource", line): + r = AuditResource(self._cm, line) + + if r.rclass == "stonith": + self.debug("Ignoring start actions for %s" % r.id) + ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id)) + + if self.local_badnews("ResourceActivity:", watch, ignore): + return self.failure("Resources stopped or started after resource management was re-enabled") + + return ret + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + return [ r"resource( was|s were) active at shutdown" ] diff --git a/python/pacemaker/_cts/tests/remotedriver.py b/python/pacemaker/_cts/tests/remotedriver.py index 852976113d..d6e2ca2fde 100644 --- a/python/pacemaker/_cts/tests/remotedriver.py +++ b/python/pacemaker/_cts/tests/remotedriver.py @@ -1,533 +1,538 @@ """ Base classes for CTS tests """ __all__ = ["RemoteDriver"] __copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import os import time import subprocess import tempfile from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.tests.starttest import StartTest from pacemaker._cts.tests.stoptest import StopTest from pacemaker._cts.timer import Timer # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable class RemoteDriver(CTSTest): """ A specialized base class for cluster tests that run on Pacemaker Remote nodes. This builds on top of CTSTest to provide methods for starting and stopping services and resources, and managing remote nodes. 
This is still just an abstract class -- specific tests need to implement their own specialized behavior. """ def __init__(self, cm): """ Create a new RemoteDriver instance Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "RemoteDriver" self._corosync_enabled = False self._pacemaker_enabled = False self._remote_node = None self._remote_rsc = "remote-rsc" self._start = StartTest(cm) self._startall = SimulStartLite(cm) self._stop = StopTest(cm) self.reset() def reset(self): """ Reset the state of this test back to what it was before the test was run """ self.failed = False self.fail_string = "" self._pcmk_started = False self._remote_node_added = False self._remote_rsc_added = False self._remote_use_reconnect_interval = self._env.random_gen.choice([True, False]) def fail(self, msg): """ Mark test as failed """ self.failed = True # Always log the failure. self._logger.log(msg) # Use first failure as test status, as it's likely to be most useful. if not self.fail_string: self.fail_string = msg def _get_other_node(self, node): """ Get the first cluster node out of the environment that is not the given node. Typically, this is used to find some node that will still be active that we can run cluster commands on. """ for othernode in self._env["nodes"]: if othernode == node: # we don't want to try to use the CIB that we just shut down. # find a cluster node that is not our soon-to-be remote node. continue return othernode def _del_rsc(self, node, rsc): """ Delete the given named resource from the cluster. The given `node` is the cluster node on which we should *not* run the delete command. """ othernode = self._get_other_node(node) (rc, _) = self._rsh(othernode, "crm_resource -D -r %s -t primitive" % rsc) if rc != 0: self.fail("Removal of resource '%s' failed" % rsc) def _add_rsc(self, node, rsc_xml): """ Add a resource given in XML format to the cluster. The given `node` is the cluster node on which we should *not* run the add command. """ othernode = self._get_other_node(node) (rc, _) = self._rsh(othernode, "cibadmin -C -o resources -X '%s'" % rsc_xml) if rc != 0: self.fail("resource creation failed") def _add_primitive_rsc(self, node): """ Add a primitive heartbeat resource for the remote node to the cluster. The given `node` is the cluster node on which we should *not* run the add command. """ rsc_xml = """ """ % { "node": self._remote_rsc } self._add_rsc(node, rsc_xml) if not self.failed: self._remote_rsc_added = True def _add_connection_rsc(self, node): """ Add a primitive connection resource for the remote node to the cluster. The given `node` is the cluster node on which we should *not* run the add command.
""" rsc_xml = """ """ % { "node": self._remote_node, "server": node } if self._remote_use_reconnect_interval: # Set reconnect interval on resource rsc_xml += """ """ % self._remote_node rsc_xml += """ """ % { "node": self._remote_node } self._add_rsc(node, rsc_xml) if not self.failed: self._remote_node_added = True def _disable_services(self, node): """ Disable the corosync and pacemaker services on the given node """ self._corosync_enabled = self._env.service_is_enabled(node, "corosync") if self._corosync_enabled: self._env.disable_service(node, "corosync") self._pacemaker_enabled = self._env.service_is_enabled(node, "pacemaker") if self._pacemaker_enabled: self._env.disable_service(node, "pacemaker") def _enable_services(self, node): """ Enable the corosync and pacemaker services on the given node """ if self._corosync_enabled: self._env.enable_service(node, "corosync") if self._pacemaker_enabled: self._env.enable_service(node, "pacemaker") def _stop_pcmk_remote(self, node): """ Stop the Pacemaker Remote service on the given node """ for _ in range(10): (rc, _) = self._rsh(node, "service pacemaker_remote stop") if rc != 0: time.sleep(6) else: break def _start_pcmk_remote(self, node): """ Start the Pacemaker Remote service on the given node """ for _ in range(10): (rc, _) = self._rsh(node, "service pacemaker_remote start") if rc != 0: time.sleep(6) else: self._pcmk_started = True break def _freeze_pcmk_remote(self, node): """ Simulate a Pacemaker Remote daemon failure """ self._rsh(node, "killall -STOP pacemaker-remoted") def _resume_pcmk_remote(self, node): """ Simulate the Pacemaker Remote daemon recovering """ self._rsh(node, "killall -CONT pacemaker-remoted") def _start_metal(self, node): """ Setup a Pacemaker Remote configuration. Remove any existing connection resources or nodes. Start the pacemaker_remote service. Create a connection resource. """ # Cluster nodes are reused as remote nodes in remote tests. If cluster # services were enabled at boot, in case the remote node got fenced, the # cluster node would join instead of the expected remote one. Meanwhile # pacemaker_remote would not be able to start. Depending on the chances, # the situations might not be able to be orchestrated gracefully any more. # # Temporarily disable any enabled cluster serivces. 
self._disable_services(node) # make sure the resource doesn't already exist for some reason self._rsh(node, "crm_resource -D -r %s -t primitive" % self._remote_rsc) self._rsh(node, "crm_resource -D -r %s -t primitive" % self._remote_node) if not self._stop(node): self.fail("Failed to shutdown cluster node %s" % node) return self._start_pcmk_remote(node) if not self._pcmk_started: self.fail("Failed to start pacemaker_remote on node %s" % node) return # Convert node to baremetal now that it has shutdown the cluster stack - pats = [ self.templates["Pat:RscOpOK"] % ("start", self._remote_node), - self.templates["Pat:DC_IDLE"] ] + pats = [ ] watch = self.create_watch(pats, 120) watch.set_watch() + pats.extend([ self.templates["Pat:RscOpOK"] % ("start", self._remote_node), + self.templates["Pat:DC_IDLE"] ]) + self._add_connection_rsc(node) with Timer(self._logger, self.name, "remoteMetalInit"): watch.look_for_all() if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) def migrate_connection(self, node): """ Move the remote connection resource from the node it's currently running on to any other available node """ if self.failed: return pats = [ self.templates["Pat:RscOpOK"] % ("migrate_to", self._remote_node), self.templates["Pat:RscOpOK"] % ("migrate_from", self._remote_node), self.templates["Pat:DC_IDLE"] ] watch = self.create_watch(pats, 120) watch.set_watch() (rc, _) = self._rsh(node, "crm_resource -M -r %s" % self._remote_node, verbose=1) if rc != 0: self.fail("failed to move remote node connection resource") return with Timer(self._logger, self.name, "remoteMetalMigrate"): watch.look_for_all() if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) def fail_rsc(self, node): """ Cause the dummy resource running on a Pacemaker Remote node to fail and verify that the failure is logged correctly """ if self.failed: return watchpats = [ self.templates["Pat:RscRemoteOpOK"] % ("stop", self._remote_rsc, self._remote_node), self.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node), self.templates["Pat:DC_IDLE"] ] watch = self.create_watch(watchpats, 120) watch.set_watch() self.debug("causing dummy rsc to fail.") self._rsh(node, "rm -f /var/run/resource-agents/Dummy*") with Timer(self._logger, self.name, "remoteRscFail"): watch.look_for_all() if watch.unmatched: self.fail("Unmatched patterns during rsc fail: %s" % watch.unmatched) def fail_connection(self, node): """ Cause the remote connection resource to fail and verify that the node is fenced and the connection resource is restarted on another node. """ if self.failed: return watchpats = [ self.templates["Pat:Fencing_ok"] % self._remote_node, self.templates["Pat:NodeFenced"] % self._remote_node ] watch = self.create_watch(watchpats, 120) watch.set_watch() # freeze the pcmk remote daemon. 
This will result in fencing self.debug("Force stopped active remote node") self._freeze_pcmk_remote(node) self.debug("Waiting for remote node to be fenced.") with Timer(self._logger, self.name, "remoteMetalFence"): watch.look_for_all() if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) return self.debug("Waiting for the remote node to come back up") self._cm.ns.wait_for_node(node, 120) - pats = [ self.templates["Pat:RscOpOK"] % ("start", self._remote_node) ] + pats = [ ] + + watch = self.create_watch(pats, 240) + watch.set_watch() + + pats.append(self.templates["Pat:RscOpOK"] % ("start", self._remote_node)) if self._remote_rsc_added: pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node)) - watch = self.create_watch([], 240) - watch.set_watch() - # start the remote node again and watch it integrate back into the cluster. self._start_pcmk_remote(node) if not self._pcmk_started: self.fail("Failed to start pacemaker_remote on node %s" % node) return self.debug("Waiting for remote node to rejoin cluster after being fenced.") with Timer(self._logger, self.name, "remoteMetalRestart"): watch.look_for_all() if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) def _add_dummy_rsc(self, node): """ Add a dummy resource that runs on the Pacemaker Remote node """ if self.failed: return # verify we can put a resource on the remote node - pats = [ self.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node), - self.templates["Pat:DC_IDLE"] ] - + pats = [ ] watch = self.create_watch(pats, 120) watch.set_watch() + pats.extend([ self.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node), + self.templates["Pat:DC_IDLE"] ]) + # Add a resource that must live on remote-node self._add_primitive_rsc(node) # force that rsc to prefer the remote node. (rc, _) = self._cm.rsh(node, "crm_resource -M -r %s -N %s -f" % (self._remote_rsc, self._remote_node), verbose=1) if rc != 0: self.fail("Failed to place remote resource on remote node.") return with Timer(self._logger, self.name, "remoteMetalRsc"): watch.look_for_all() if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) def test_attributes(self, node): """ Verify that attributes can be set on the Pacemaker Remote node """ if self.failed: return # This verifies permanent attributes can be set on a remote-node. It also # verifies the remote-node can edit its own cib node section remotely. (rc, line) = self._cm.rsh(node, "crm_attribute -l forever -n testattr -v testval -N %s" % self._remote_node, verbose=1) if rc != 0: self.fail("Failed to set remote-node attribute. rc:%s output:%s" % (rc, line)) return (rc, _) = self._cm.rsh(node, "crm_attribute -l forever -n testattr -q -N %s" % self._remote_node, verbose=1) if rc != 0: self.fail("Failed to get remote-node attribute") return (rc, _) = self._cm.rsh(node, "crm_attribute -l forever -n testattr -D -N %s" % self._remote_node, verbose=1) if rc != 0: self.fail("Failed to delete remote-node attribute") def cleanup_metal(self, node): """ Clean up the Pacemaker Remote node configuration previously created by _start_metal. Stop and remove dummy resources and connection resources. Stop the pacemaker_remote service. Remove the remote node itself.
""" self._enable_services(node) if not self._pcmk_started: return pats = [ ] watch = self.create_watch(pats, 120) watch.set_watch() if self._remote_rsc_added: pats.append(self.templates["Pat:RscOpOK"] % ("stop", self._remote_rsc)) if self._remote_node_added: pats.append(self.templates["Pat:RscOpOK"] % ("stop", self._remote_node)) with Timer(self._logger, self.name, "remoteMetalCleanup"): self._resume_pcmk_remote(node) if self._remote_rsc_added: # Remove dummy resource added for remote node tests self.debug("Cleaning up dummy rsc put on remote node") self._rsh(self._get_other_node(node), "crm_resource -U -r %s" % self._remote_rsc) self._del_rsc(node, self._remote_rsc) if self._remote_node_added: # Remove remote node's connection resource self.debug("Cleaning up remote node connection resource") self._rsh(self._get_other_node(node), "crm_resource -U -r %s" % self._remote_node) self._del_rsc(node, self._remote_node) watch.look_for_all() if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) self._stop_pcmk_remote(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() if self._remote_node_added: # Remove remote node itself self.debug("Cleaning up node entry for remote node") self._rsh(self._get_other_node(node), "crm_node --force --remove %s" % self._remote_node) def _setup_env(self, node): """ Setup the environment to allow Pacemaker Remote to function. This involves generating a key and copying it to all nodes in the cluster. """ self._remote_node = "remote-%s" % node # we are assuming if all nodes have a key, that it is # the right key... If any node doesn't have a remote # key, we regenerate it everywhere. if self._rsh.exists_on_all("/etc/pacemaker/authkey", self._env["nodes"]): return # create key locally (handle, keyfile) = tempfile.mkstemp(".cts") os.close(handle) subprocess.check_call(["dd", "if=/dev/urandom", "of=%s" % keyfile, "bs=4096", "count=1"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # sync key throughout the cluster for n in self._env["nodes"]: self._rsh(n, "mkdir -p --mode=0750 /etc/pacemaker") self._rsh.copy(keyfile, "root@%s:/etc/pacemaker/authkey" % n) self._rsh(n, "chgrp haclient /etc/pacemaker /etc/pacemaker/authkey") self._rsh(n, "chmod 0640 /etc/pacemaker/authkey") os.unlink(keyfile) def is_applicable(self): """ Return True if this test is applicable in the current test configuration. 
""" if not CTSTest.is_applicable(self): return False for node in self._env["nodes"]: (rc, _) = self._rsh(node, "which pacemaker-remoted >/dev/null 2>&1") if rc != 0: return False return True def start_new_test(self, node): """ Prepare a remote test for running by setting up its environment and resources """ self.incr("calls") self.reset() ret = self._startall(None) if not ret: return self.failure("setup failed: could not start all nodes") self._setup_env(node) self._start_metal(node) self._add_dummy_rsc(node) return True def __call__(self, node): """ Perform this test """ raise NotImplementedError @property def errors_to_ignore(self): """ Return list of errors which should be ignored """ return [ r"""is running on remote.*which isn't allowed""", r"""Connection terminated""", r"""Could not send remote""" ] diff --git a/python/pacemaker/_cts/tests/resourcerecover.py b/python/pacemaker/_cts/tests/resourcerecover.py new file mode 100644 index 0000000000..f04abda463 --- /dev/null +++ b/python/pacemaker/_cts/tests/resourcerecover.py @@ -0,0 +1,171 @@ +""" Fail a random resource and verify its fail count increases """ + +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.audits import AuditResource +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.starttest import StartTest +from pacemaker._cts.timer import Timer + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. 
+# pylint: disable=not-callable + + +class ResourceRecover(CTSTest): + """ A concrete test that fails a random resource """ + + def __init__(self, cm): + """ Create a new ResourceRecover instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.benchmark = True + self.name = "ResourceRecover" + + self._action = "asyncmon" + self._interval = 0 + self._rid = None + self._rid_alt = None + self._start = StartTest(cm) + self._startall = SimulStartLite(cm) + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + + if not self._startall(None): + return self.failure("Setup failed") + + # List all resources active on the node (skip test if none) + resourcelist = self._cm.active_resources(node) + if not resourcelist: + self._logger.log("No active resources on %s" % node) + return self.skipped() + + # Choose one resource at random + rsc = self._choose_resource(node, resourcelist) + if rsc is None: + return self.failure("Could not get details of resource '%s'" % self._rid) + + if rsc.id == rsc.clone_id: + self.debug("Failing %s" % rsc.id) + else: + self.debug("Failing %s (also known as %s)" % (rsc.id, rsc.clone_id)) + + # Log patterns to watch for (failure, plus restart if managed) + pats = [ self.templates["Pat:CloneOpFail"] % (self._action, rsc.id, rsc.clone_id) ] + + if rsc.managed: + pats.append(self.templates["Pat:RscOpOK"] % ("stop", self._rid)) + + if rsc.unique: + pats.append(self.templates["Pat:RscOpOK"] % ("start", self._rid)) + else: + # Anonymous clones may get restarted with a different clone number + pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*")) + + # Fail resource. (Ideally, we'd fail it twice, to ensure the fail count + # is incrementing properly, but it might restart on a different node. + # We'd have to temporarily ban it from all other nodes and ensure the + # migration-threshold hasn't been reached.) 
+ if self._fail_resource(rsc, node, pats) is None: + # self.failure() already called + return None + + return self.success() + + def _choose_resource(self, node, resourcelist): + """ Choose a random resource to target """ + + self._rid = self._env.random_gen.choice(resourcelist) + self._rid_alt = self._rid + (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) + + for line in lines: + if line.startswith("Resource: "): + rsc = AuditResource(self._cm, line) + + if rsc.id == self._rid: + # Handle anonymous clones that get renamed + self._rid = rsc.clone_id + return rsc + + return None + + def _get_failcount(self, node): + """ Check the fail count of targeted resource on given node """ + + cmd = "crm_failcount --quiet --query --resource %s --operation %s --interval %d --node %s" + (rc, lines) = self._rsh(node, cmd % (self._rid, self._action, self._interval, node), + verbose=1) + + if rc != 0 or len(lines) != 1: + lines = [l.strip() for l in lines] + self._logger.log("crm_failcount on %s failed (%d): %s" % (node, rc, " // ".join(lines))) + return -1 + + try: + failcount = int(lines[0]) + except (IndexError, ValueError): + self._logger.log("crm_failcount output on %s unparseable: %s" % (node, " ".join(lines))) + return -1 + + return failcount + + def _fail_resource(self, rsc, node, pats): + """ Fail the targeted resource, and verify as expected """ + + orig_failcount = self._get_failcount(node) + + watch = self.create_watch(pats, 60) + watch.set_watch() + + self._rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self._rid, node)) + + with Timer(self._logger, self.name, "recover"): + watch.look_for_all() + + self._cm.cluster_stable() + recovered = self._cm.ResourceLocation(self._rid) + + if watch.unmatched: + return self.failure("Patterns not found: %r" % watch.unmatched) + + if rsc.unique and len(recovered) > 1: + return self.failure("%s is now active on more than one node: %r" % (self._rid, recovered)) + + if recovered: + self.debug("%s is running on: %r" % (self._rid, recovered)) + + elif rsc.managed: + return self.failure("%s was not recovered and is inactive" % self._rid) + + new_failcount = self._get_failcount(node) + if new_failcount != orig_failcount + 1: + return self.failure("%s fail count is %d not %d" % (self._rid, + new_failcount, orig_failcount + 1)) + + return 0 # Anything but None is success + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + return [ r"Updating failcount for %s" % self._rid, + r"schedulerd.*: Recover\s+(%s|%s)\s+\(.*\)" % (self._rid, self._rid_alt), + r"Unknown operation: fail", + self.templates["Pat:RscOpOK"] % (self._action, self._rid), + r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self._rid, self._action, self._interval) ] diff --git a/python/pacemaker/_cts/tests/resynccib.py b/python/pacemaker/_cts/tests/resynccib.py new file mode 100644 index 0000000000..3e7179072e --- /dev/null +++ b/python/pacemaker/_cts/tests/resynccib.py @@ -0,0 +1,73 @@ +""" Start the cluster without a CIB and verify it gets copied from another node """ + +__all__ = ["ResyncCIB"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker import BuildOptions +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.restarttest import RestartTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.simulstoplite 
import SimulStopLite + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable + + +class ResyncCIB(CTSTest): + """ A concrete test that starts the cluster on one node without a CIB and + verifies the CIB is copied over when the remaining nodes join + """ + + def __init__(self, cm): + """ Create a new ResyncCIB instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.name = "ResyncCIB" + + self._restart1 = RestartTest(cm) + self._startall = SimulStartLite(cm) + self._stopall = SimulStopLite(cm) + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + + # Shut down all the nodes... + if not self._stopall(None): + return self.failure("Could not stop all nodes") + + # Test config recovery when the other nodes come up + self._rsh(node, "rm -f %s/cib*" % BuildOptions.CIB_DIR) + + # Start the selected node + if not self._restart1(node): + return self.failure("Could not start %s" % node) + + # Start all remaining nodes + if not self._startall(None): + return self.failure("Could not start the remaining nodes") + + return self.success() + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + # Errors that occur as a result of the CIB being wiped + return [ r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed", + r"error.*: Resource start-up disabled since no STONITH resources have been defined", + r"error.*: Either configure some or disable STONITH with the stonith-enabled option", + r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity" ] diff --git a/python/pacemaker/_cts/tests/splitbraintest.py b/python/pacemaker/_cts/tests/splitbraintest.py new file mode 100644 index 0000000000..6664281d7e --- /dev/null +++ b/python/pacemaker/_cts/tests/splitbraintest.py @@ -0,0 +1,213 @@ +""" Create a split brain cluster and verify a resource is multiply managed """ + +__all__ = ["SplitBrainTest"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import time + +from pacemaker._cts.input import should_continue +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.starttest import StartTest + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class SplitBrainTest(CTSTest): + """ A concrete test that creates a split brain cluster and verifies that + one node in each partition takes over the resource, resulting in two + nodes running the same resource. 
+ """ + + def __init__(self, cm): + """ Create a new SplitBrainTest instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.is_experimental = True + self.name = "SplitBrain" + + self._start = StartTest(cm) + self._startall = SimulStartLite(cm) + + def _isolate_partition(self, partition): + """ Create a new partition containing the given nodes """ + + other_nodes = self._env["nodes"].copy() + + for node in partition: + try: + other_nodes.remove(node) + except ValueError: + self._logger.log("Node %s not in %r from %r" % (node,self._env["nodes"], partition)) + + if not other_nodes: + return + + self.debug("Creating partition: %r" % partition) + self.debug("Everyone else: %r" % other_nodes) + + for node in partition: + if not self._cm.isolate_node(node, other_nodes): + self._logger.log("Could not isolate %s" % node) + return + + def _heal_partition(self, partition): + """ Move the given nodes out of their own partition back into the cluster """ + + other_nodes = self._env["nodes"].copy() + + for node in partition: + try: + other_nodes.remove(node) + except ValueError: + self._logger.log("Node %s not in %r" % (node, self._env["nodes"])) + + if len(other_nodes) == 0: + return + + self.debug("Healing partition: %r" % partition) + self.debug("Everyone else: %r" % other_nodes) + + for node in partition: + self._cm.unisolate_node(node, other_nodes) + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + self.passed = True + partitions = {} + + if not self._startall(None): + return self.failure("Setup failed") + + while True: + # Retry until we get multiple partitions + partitions = {} + p_max = len(self._env["nodes"]) + + for n in self._env["nodes"]: + p = self._env.random_gen.randint(1, p_max) + + if p not in partitions: + partitions[p] = [] + + partitions[p].append(n) + + p_max = len(partitions) + if p_max > 1: + break + # else, try again + + self.debug("Created %d partitions" % p_max) + for (key, val) in partitions.items(): + self.debug("Partition[%s]:\t%r" % (key, val)) + + # Disabling STONITH to reduce test complexity for now + self._rsh(node, "crm_attribute -V -n stonith-enabled -v false") + + for val in partitions.values(): + self._isolate_partition(val) + + count = 30 + while count > 0: + if len(self._cm.find_partitions()) != p_max: + time.sleep(10) + else: + break + else: + self.failure("Expected partitions were not created") + + # Target number of partitions formed - wait for stability + if not self._cm.cluster_stable(): + self.failure("Partitioned cluster not stable") + + # Now audit the cluster state + self._cm.partitions_expected = p_max + if not self.audit(): + self.failure("Audits failed") + + self._cm.partitions_expected = 1 + + # And heal them again + for val in partitions.values(): + self._heal_partition(val) + + # Wait for a single partition to form + count = 30 + while count > 0: + if len(self._cm.find_partitions()) != 1: + time.sleep(10) + count -= 1 + else: + break + else: + self.failure("Cluster did not reform") + + # Wait for it to have the right number of members + count = 30 + while count > 0: + members = [] + + partitions = self._cm.find_partitions() + if partitions: + members = partitions[0].split() + + if len(members) != len(self._env["nodes"]): + time.sleep(10) + count -= 1 + else: + break + else: + self.failure("Cluster did not completely reform") + + # Wait up to 20 minutes - the delay is more preferable than + # trying to continue with in a messed up state + if not 
self._cm.cluster_stable(1200): + self.failure("Reformed cluster not stable") + + if not should_continue(self._env): + raise ValueError("Reformed cluster not stable") + + # Turn fencing back on + if self._env["DoFencing"]: + self._rsh(node, "crm_attribute -V -D -n stonith-enabled") + + self._cm.cluster_stable() + + if self.passed: + return self.success() + + return self.failure("See previous errors") + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + return [ r"Another DC detected:", + r"(ERROR|error).*: .*Application of an update diff failed", + r"pacemaker-controld.*:.*not in our membership list", + r"CRIT:.*node.*returning after partition" ] + + def is_applicable(self): + """ Return True if this test is applicable in the current test configuration. """ + + if not CTSTest.is_applicable(self): + return False + + return len(self._env["nodes"]) > 2
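
For reference, the partition-selection loop in SplitBrainTest.__call__ above draws a random partition number for every node and retries until more than one partition results. The following is a minimal, self-contained sketch of that same retry-until-split idea, not part of the patch: the helper name random_partitions and the node names are invented for illustration, and it uses only the standard library in place of the CTS environment's random_gen.

import random

def random_partitions(nodes, rng=None):
    """ Split nodes into at least two randomly chosen partitions """
    rng = rng or random.Random()
    while True:
        partitions = {}
        p_max = len(nodes)

        for node in nodes:
            # Each node gets a random partition number in [1, p_max]
            partitions.setdefault(rng.randint(1, p_max), []).append(node)

        if len(partitions) > 1:
            # At least two non-empty partitions formed, so the split is usable
            return partitions
        # Otherwise every node landed in the same partition; try again
        # (needs more than one node, just as the test requires > 2 nodes)

if __name__ == "__main__":
    print(random_partitions(["node-a", "node-b", "node-c", "node-d"]))

In the test itself, p_max is then reassigned to the number of partitions actually formed, and each group is isolated with _isolate_partition() before the cluster is audited and healed.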