diff --git a/python/pacemaker/_cts/tests/cibsecret.py b/python/pacemaker/_cts/tests/cibsecret.py index 679f8b0dfb..20bc5564f9 100644 --- a/python/pacemaker/_cts/tests/cibsecret.py +++ b/python/pacemaker/_cts/tests/cibsecret.py @@ -1,231 +1,231 @@ """Test managing secrets with cibsecret.""" __all__ = ["CibsecretTest"] __copyright__ = "Copyright 2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.timer import Timer # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable # This comes from include/config.h as private API, assuming pacemaker is built # with cibsecrets support. I don't want to expose this value publicly, at # least not until we default to including cibsecrets, so it's just set here # for now. SECRETS_DIR = "/var/lib/pacemaker/lrm/secrets" class CibsecretTest(CTSTest): """Test managing secrets with cibsecret.""" def __init__(self, cm): """ Create a new CibsecretTest instance. 
Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "Cibsecret" self._secret = "passwd" self._secret_val = "SecreT_PASS" self._rid = "secretDummy" self._startall = SimulStartLite(cm) def _insert_dummy(self, node): """Create a dummy resource on the given node.""" pats = [ - f"{node}.*" + (self.templates["Pat:RscOpOK"] % ("start", self._rid)) + f"{node}.*" + (self._cm.templates["Pat:RscOpOK"] % ("start", self._rid)) ] watch = self.create_watch(pats, 60) watch.set_watch() self._cm.add_dummy_rsc(node, self._rid) with Timer(self._logger, self.name, "addDummy"): watch.look_for_all() if watch.unmatched: self.debug("Failed to find patterns when adding dummy resource") return repr(watch.unmatched) return "" def _check_cib_value(self, node, expected): """Check that the secret has the expected value.""" (rc, lines) = self._rsh(node, f"crm_resource -r {self._rid} -g {self._secret}", verbose=1) s = " ".join(lines).strip() if rc != 0 or s != expected: return self.failure(f"Secret set to '{s}' in CIB, not '{expected}'") # This is self.success, except without incrementing the success counter return True def _test_check(self, node): """Test the 'cibsecret check' subcommand.""" (rc, _) = self._rsh(node, f"cibsecret check {self._rid} {self._secret}", verbose=1) if rc != 0: return self.failure("Failed to check secret") # This is self.success, except without incrementing the success counter return True def _test_delete(self, node): """Test the 'cibsecret delete' subcommand.""" (rc, _) = self._rsh(node, f"cibsecret delete {self._rid} {self._secret}", verbose=1) if rc != 0: return self.failure("Failed to delete secret") # This is self.success, except without incrementing the success counter return True def _test_get(self, node, expected): """Test the 'cibsecret get' subcommand.""" (rc, lines) = self._rsh(node, f"cibsecret get {self._rid} {self._secret}", verbose=1) s = " ".join(lines).strip() if rc != 0 or s != expected: return self.failure(f"Secret set 
to '{s}' in local file, not '{expected}'") # This is self.success, except without incrementing the success counter return True def _test_set(self, node): """Test the 'cibsecret set' subcommand.""" (rc, _) = self._rsh(node, f"cibsecret set {self._rid} {self._secret} {self._secret_val}", verbose=1) if rc != 0: return self.failure("Failed to set secret") # This is self.success, except without incrementing the success counter return True def _test_stash(self, node): """Test the 'cibsecret stash' subcommand.""" (rc, _) = self._rsh(node, f"cibsecret stash {self._rid} {self._secret}", verbose=1) if rc != 0: return self.failure(f"Failed to stash secret {self._secret}") # This is self.success, except without incrementing the success counter return True def _test_sync(self, node): """Test the 'cibsecret sync' subcommand.""" (rc, _) = self._rsh(node, "cibsecret sync", verbose=1) if rc != 0: return self.failure("Failed to sync secrets") # This is self.success, except without incrementing the success counter return True def _test_unstash(self, node): """Test the 'cibsecret unstash' subcommand.""" (rc, _) = self._rsh(node, f"cibsecret unstash {self._rid} {self._secret}", verbose=1) if rc != 0: return self.failure(f"Failed to unstash secret {self._secret}") # This is self.success, except without incrementing the success counter return True def _test_secrets_removed(self): """Verify that the secret and its checksum file has been removed.""" f = f"{SECRETS_DIR}/{self._rid}/{self._secret}" if not self._rsh.exists_on_none(f, self._env["nodes"]): return self.failure(f"{f} not deleted from all hosts") f = f"{SECRETS_DIR}/{self._rid}/{self._secret}.sign" if not self._rsh.exists_on_none(f, self._env["nodes"]): return self.failure(f"{f} not deleted from all hosts") return True # @TODO: Two improvements that could be made to this test: # # (1) Add a test for the 'cibsecret sync' command. 
This requires modifying # the test so it brings down one node before creating secrets, then # bringing the node back up, running 'cibsecret sync', and verifying the # secrets are copied over. All of this is possible with ctslab, it's # just kind of a lot of code. # # (2) Add some tests for failure cases like trying to stash a value that's # already secret, etc. def __call__(self, node): """Perform this test.""" self.incr("calls") ret = self._startall(None) if not ret: return self.failure("Start all nodes failed") ret = self._insert_dummy(node) if ret != "": return self.failure(ret) # Test setting a new secret, verifying its value in both the CIB and # the local store on each node. if not self._test_set(node): return False if not self._check_cib_value(node, "lrm://"): return False for n in self._env["nodes"]: if not self._test_get(n, self._secret_val): return False # Test checking the secret on each node. for n in self._env["nodes"]: if not self._test_check(n): return False # Test moving the secret into the CIB, but now we can only verify that # its value in the CIB is correct since it's no longer a secret. We # can also verify that it's been removed from the local store everywhere. if not self._test_unstash(node): return False if not self._check_cib_value(node, self._secret_val): return False self._test_secrets_removed() # Test moving the secret back out of the CIB, again verifying its # value in both places. 
if not self._test_stash(node): return False if not self._check_cib_value(node, "lrm://"): return False for n in self._env["nodes"]: if not self._test_get(n, self._secret_val): return False # Delete the secret if not self._test_delete(node): return False self._test_secrets_removed() return self.success() @property def errors_to_ignore(self): return [r"Reloading .* \(agent\)"] diff --git a/python/pacemaker/_cts/tests/componentfail.py b/python/pacemaker/_cts/tests/componentfail.py index dba0bc8a25..ba06124084 100644 --- a/python/pacemaker/_cts/tests/componentfail.py +++ b/python/pacemaker/_cts/tests/componentfail.py @@ -1,164 +1,164 @@ """Kill a pacemaker daemon and test how the cluster recovers.""" __all__ = ["ComponentFail"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import re from pacemaker._cts.audits import AuditResource from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object # @TODO Separate this into a separate test for each component, so the patterns # can be made specific to each component, investigating failures is a little # easier, and specific testing can be done for each component (for example, # set attributes before and after killing pacemaker-attrd and check values). class ComponentFail(CTSTest): """Kill a random pacemaker daemon and wait for the cluster to recover.""" def __init__(self, cm): """ Create a new ComponentFail instance. 
Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.is_unsafe = True self.name = "ComponentFail" self._complist = cm.components self._okerrpatterns = [] self._patterns = [] self._startall = SimulStartLite(cm) def __call__(self, node): """Perform this test.""" self.incr("calls") self._patterns = [] self._okerrpatterns = [] # start all nodes ret = self._startall(None) if not ret: return self.failure("Setup failed") if not self._cm.cluster_stable(self._env["StableTime"]): return self.failure("Setup failed - unstable") node_is_dc = self._cm.is_node_dc(node, None) # select a component to kill chosen = self._env.random_gen.choice(self._complist) self.debug(f"...component {chosen.name} (dc={node_is_dc})") self.incr(chosen.name) if chosen.name != "corosync": self._patterns.extend([ - self.templates["Pat:ChildKilled"] % (node, chosen.name), - self.templates["Pat:ChildRespawn"] % (node, chosen.name), + self._cm.templates["Pat:ChildKilled"] % (node, chosen.name), + self._cm.templates["Pat:ChildRespawn"] % (node, chosen.name), ]) self._patterns.extend(chosen.pats) if node_is_dc: self._patterns.extend(chosen.dc_pats) # @TODO this should be a flag in the Component if chosen.name in ["corosync", "pacemaker-based", "pacemaker-fenced"]: # Ignore actions for fence devices if fencer will respawn # (their registration will be lost, and probes will fail) self._okerrpatterns = [ - self.templates["Pat:Fencing_active"], + self._cm.templates["Pat:Fencing_active"], ] (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) for line in lines: if re.search("^Resource", line): r = AuditResource(self._cm, line) if r.rclass == "stonith": self._okerrpatterns.extend([ - self.templates["Pat:Fencing_recover"] % r.id, - self.templates["Pat:Fencing_probe"] % r.id, + self._cm.templates["Pat:Fencing_recover"] % r.id, + self._cm.templates["Pat:Fencing_probe"] % r.id, ]) # supply a copy so self.patterns doesn't end up empty tmp_pats = self._patterns.copy() 
self._patterns.extend(chosen.badnews_ignore) # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status stonith_pats = [ - self.templates["Pat:Fencing_ok"] % node + self._cm.templates["Pat:Fencing_ok"] % node ] stonith = self.create_watch(stonith_pats, 0) stonith.set_watch() # set the watch for stable watch = self.create_watch( tmp_pats, self._env["DeadTime"] + self._env["StableTime"] + self._env["StartTime"]) watch.set_watch() # kill the component chosen.kill(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() self.debug("Waiting for any fenced node to come back up") self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) self.debug("Waiting for the cluster to re-stabilize with all nodes") self._cm.cluster_stable(self._env["StartTime"]) self.debug(f"Checking if {node} was shot") shot = stonith.look(60) if shot: self.debug(f"Found: {shot!r}") - self._okerrpatterns.append(self.templates["Pat:Fencing_start"] % node) + self._okerrpatterns.append(self._cm.templates["Pat:Fencing_start"] % node) if not self._env["at-boot"]: self._cm.expected_status[node] = "down" # If fencing occurred, chances are many (if not all) the expected logs # will not be sent - or will be lost when the node reboots return self.success() # check for logs indicating a graceful recovery matched = watch.look_for_all(allow_multiple_matches=True) if watch.unmatched: self._logger.log(f"Patterns not found: {watch.unmatched!r}") self.debug("Waiting for the cluster to re-stabilize with all nodes") is_stable = self._cm.cluster_stable(self._env["StartTime"]) if not matched: return self.failure(f"Didn't find all expected {chosen.name} patterns") if not is_stable: return self.failure(f"Cluster did not become stable after killing {chosen.name}") return self.success() @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" # Note that okerrpatterns refers to the last time we ran this test # The good news is that 
this works fine for us... self._okerrpatterns.extend(self._patterns) return self._okerrpatterns diff --git a/python/pacemaker/_cts/tests/ctstest.py b/python/pacemaker/_cts/tests/ctstest.py index 9e78baa4fe..95f04f0128 100644 --- a/python/pacemaker/_cts/tests/ctstest.py +++ b/python/pacemaker/_cts/tests/ctstest.py @@ -1,242 +1,240 @@ """Base classes for CTS tests.""" __all__ = ["CTSTest"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import re from pacemaker._cts.environment import EnvFactory from pacemaker._cts.logging import LogFactory -from pacemaker._cts.patterns import PatternSelector from pacemaker._cts.remote import RemoteFactory from pacemaker._cts.timer import Timer from pacemaker._cts.watcher import LogWatcher # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. class CTSTest: """ The base class for all cluster tests. This implements a basic set of properties and behaviors like setup, tear down, time keeping, and statistics tracking. It is up to specific tests to implement their own specialized behavior on top of this class. """ def __init__(self, cm): """ Create a new CTSTest instance. 
Arguments: cm -- A ClusterManager instance """ # pylint: disable=invalid-name self.audits = [] self.name = None - self.templates = PatternSelector(cm.name) self.stats = { "auditfail": 0, "calls": 0, "failure": 0, "skipped": 0, "success": 0 } self._cm = cm self._env = EnvFactory().getInstance() self._rsh = RemoteFactory().getInstance() self._logger = LogFactory() self._timers = {} self.benchmark = True # which tests to benchmark self.failed = False self.is_experimental = False self.is_loop = False self.is_unsafe = False self.passed = True def log(self, args): """Log a message.""" self._logger.log(args) def debug(self, args): """Log a debug message.""" self._logger.debug(args) def get_timer(self, key="test"): """Get the start time of the given timer.""" try: return self._timers[key].start_time except KeyError: return 0 def set_timer(self, key="test"): """Set the start time of the given timer to now, and return that time.""" if key not in self._timers: self._timers[key] = Timer(self._logger, self.name, key) self._timers[key].start() return self._timers[key].start_time def log_timer(self, key="test"): """Log the elapsed time of the given timer.""" if key not in self._timers: return elapsed = self._timers[key].elapsed self.debug(f"{self.name}:{key} runtime: {elapsed:.2f}") del self._timers[key] def incr(self, name): """Increment the given stats key.""" if name not in self.stats: self.stats[name] = 0 self.stats[name] += 1 # Reset the test passed boolean if name == "calls": self.passed = True def failure(self, reason="none"): """Increment the failure count, with an optional failure reason.""" self.passed = False self.incr("failure") self._logger.log(f"{f'Test {self.name}':<35} FAILED: {reason}") return False def success(self): """Increment the success count.""" self.incr("success") return True def skipped(self): """Increment the skipped count.""" self.incr("skipped") return True def __call__(self, node): """Perform this test.""" raise NotImplementedError def audit(self): 
"""Perform all the relevant audits (see ClusterAudit), returning whether or not they all passed.""" passed = True for audit in self.audits: if not audit(): self._logger.log(f"Internal {self.name} Audit {audit.name} FAILED.") self.incr("auditfail") passed = False return passed def setup(self, node): """Set up this test.""" # node is used in subclasses # pylint: disable=unused-argument return self.success() def teardown(self, node): """Tear down this test.""" # node is used in subclasses # pylint: disable=unused-argument return self.success() def create_watch(self, patterns, timeout, name=None): """ Create a new LogWatcher object. This object can be used to search log files for matching patterns during this test's run. Arguments: patterns -- A list of regular expressions to match against the log timeout -- Default number of seconds to watch a log file at a time; this can be overridden by the timeout= parameter to self.look on an as-needed basis name -- A unique name to use when logging about this watch """ if not name: name = self.name return LogWatcher(self._env["LogFileName"], patterns, self._env["nodes"], self._env["log_kind"], name, timeout) def local_badnews(self, prefix, watch, local_ignore=None): """ Search through log files for messages. Arguments: prefix -- The string to look for at the beginning of lines, or "LocalBadNews:" if None. watch -- The LogWatcher object to use for searching. local_ignore -- A list of regexes that, if found in a line, will cause that line to be ignored. Return the number of matches found. 
""" errcount = 0 if not prefix: prefix = "LocalBadNews:" ignorelist = [" CTS: ", prefix] if local_ignore: ignorelist += local_ignore while errcount < 100: match = watch.look(0) if match: add_err = True for ignore in ignorelist: if add_err and re.search(ignore, match): add_err = False if add_err: self._logger.log(f"{prefix} {match}") errcount += 1 else: break else: self._logger.log("Too many errors!") watch.end() return errcount def is_applicable(self): """ Return True if this test is applicable in the current test configuration. This method must be implemented by all subclasses. """ if self.is_loop and not self._env["loop-tests"]: return False if self.is_unsafe and not self._env["unsafe-tests"]: return False if self.is_experimental and not self._env["experimental-tests"]: return False if self._env["benchmark"] and not self.benchmark: return False return True @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" return [] diff --git a/python/pacemaker/_cts/tests/maintenancemode.py b/python/pacemaker/_cts/tests/maintenancemode.py index 5026a6cf0e..4846cf7539 100644 --- a/python/pacemaker/_cts/tests/maintenancemode.py +++ b/python/pacemaker/_cts/tests/maintenancemode.py @@ -1,228 +1,228 @@ """Toggle nodes in and out of maintenance mode.""" __all__ = ["MaintenanceMode"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import re from pacemaker._cts.audits import AuditResource from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.tests.starttest import StartTest from pacemaker._cts.timer import Timer # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. 
This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable class MaintenanceMode(CTSTest): """Toggle nodes in and out of maintenance mode.""" def __init__(self, cm): """ Create a new MaintenanceMode instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.benchmark = True self.name = "MaintenanceMode" self._action = "asyncmon" self._rid = "maintenanceDummy" self._start = StartTest(cm) self._startall = SimulStartLite(cm) def _toggle_maintenance_mode(self, node, enabled): """Toggle maintenance mode on the given node.""" pats = [ - self.templates["Pat:DC_IDLE"] + self._cm.templates["Pat:DC_IDLE"] ] if enabled: action = "On" else: action = "Off" # fail the resource right after turning Maintenance mode on # verify it is not recovered until maintenance mode is turned off if enabled: - pats.append(self.templates["Pat:RscOpFail"] % (self._action, self._rid)) + pats.append(self._cm.templates["Pat:RscOpFail"] % (self._action, self._rid)) else: pats.extend([ - self.templates["Pat:RscOpOK"] % ("stop", self._rid), - self.templates["Pat:RscOpOK"] % ("start", self._rid) + self._cm.templates["Pat:RscOpOK"] % ("stop", self._rid), + self._cm.templates["Pat:RscOpOK"] % ("start", self._rid) ]) watch = self.create_watch(pats, 60) watch.set_watch() self.debug(f"Turning maintenance mode {action}") - self._rsh(node, self.templates[f"MaintenanceMode{action}"]) + self._rsh(node, self._cm.templates[f"MaintenanceMode{action}"]) if enabled: self._rsh(node, f"crm_resource -V -F -r {self._rid} -H {node} &>/dev/null") with Timer(self._logger, self.name, f"recover{action}"): watch.look_for_all() if watch.unmatched: self.debug(f"Failed to find patterns when turning maintenance mode {action}") return repr(watch.unmatched) return "" def _insert_maintenance_dummy(self, node): """Create a dummy resource on the given 
node.""" pats = [ - f"{node}.*" + (self.templates["Pat:RscOpOK"] % ("start", self._rid)) + f"{node}.*" + (self._cm.templates["Pat:RscOpOK"] % ("start", self._rid)) ] watch = self.create_watch(pats, 60) watch.set_watch() self._cm.add_dummy_rsc(node, self._rid) with Timer(self._logger, self.name, "addDummy"): watch.look_for_all() if watch.unmatched: self.debug("Failed to find patterns when adding maintenance dummy resource") return repr(watch.unmatched) return "" def _remove_maintenance_dummy(self, node): """Remove the previously created dummy resource on the given node.""" pats = [ - self.templates["Pat:RscOpOK"] % ("stop", self._rid) + self._cm.templates["Pat:RscOpOK"] % ("stop", self._rid) ] watch = self.create_watch(pats, 60) watch.set_watch() self._cm.remove_dummy_rsc(node, self._rid) with Timer(self._logger, self.name, "removeDummy"): watch.look_for_all() if watch.unmatched: self.debug("Failed to find patterns when removing maintenance dummy resource") return repr(watch.unmatched) return "" def _managed_rscs(self, node): """Return a list of all resources managed by the cluster.""" rscs = [] (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) for line in lines: if re.search("^Resource", line): tmp = AuditResource(self._cm, line) if tmp.managed: rscs.append(tmp.id) return rscs def _verify_resources(self, node, rscs, managed): """Verify that all resources are managed or unmanaged as expected.""" managed_rscs = rscs managed_str = "managed" if not managed: managed_str = "unmanaged" (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) for line in lines: if re.search("^Resource", line): tmp = AuditResource(self._cm, line) if managed and not tmp.managed: continue if not managed and tmp.managed: continue if managed_rscs.count(tmp.id): managed_rscs.remove(tmp.id) if not managed_rscs: self.debug(f"Found all {managed_str} resources on {node}") return True self._logger.log(f"Could not find all {managed_str} resources on {node}. 
{managed_rscs}") return False def __call__(self, node): """Perform this test.""" self.incr("calls") verify_managed = False verify_unmanaged = False fail_pat = "" if not self._startall(None): return self.failure("Setup failed") # get a list of all the managed resources. We use this list # after enabling maintenance mode to verify all managed resources # become un-managed. After maintenance mode is turned off, we use # this list to verify all the resources become managed again. managed_rscs = self._managed_rscs(node) if not managed_rscs: self._logger.log(f"No managed resources on {node}") return self.skipped() # insert a fake resource we can fail during maintenance mode # so we can verify recovery does not take place until after maintenance # mode is disabled. fail_pat += self._insert_maintenance_dummy(node) # toggle maintenance mode ON, then fail dummy resource. fail_pat += self._toggle_maintenance_mode(node, True) # verify all the resources are now unmanaged if self._verify_resources(node, managed_rscs, False): verify_unmanaged = True # Toggle maintenance mode OFF, verify dummy is recovered. fail_pat += self._toggle_maintenance_mode(node, False) # verify all the resources are now managed again if self._verify_resources(node, managed_rscs, True): verify_managed = True # Remove our maintenance dummy resource. 
fail_pat += self._remove_maintenance_dummy(node) self._cm.cluster_stable() if fail_pat != "": return self.failure(f"Unmatched patterns: {fail_pat}") if not verify_unmanaged: return self.failure("Failed to verify resources became unmanaged during maintenance mode") if not verify_managed: return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode") return self.success() @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" return [ f"Updating failcount for {self._rid}", fr"schedulerd.*: Recover\s+{self._rid}\s+\(.*\)", r"Unknown operation: fail", - self.templates["Pat:RscOpOK"] % (self._action, self._rid), + self._cm.templates["Pat:RscOpOK"] % (self._action, self._rid), f"(ERROR|error).*: Action {self._rid}_{self._action}_0 .* initiated outside of a transition", ] diff --git a/python/pacemaker/_cts/tests/nearquorumpointtest.py b/python/pacemaker/_cts/tests/nearquorumpointtest.py index 955926a028..74d85230cf 100644 --- a/python/pacemaker/_cts/tests/nearquorumpointtest.py +++ b/python/pacemaker/_cts/tests/nearquorumpointtest.py @@ -1,121 +1,121 @@ """Randomly start and stop nodes to bring the cluster close to the quorum point.""" __all__ = ["NearQuorumPointTest"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.tests.ctstest import CTSTest # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable # pylint doesn't understand that self._env is subscriptable. 
# pylint: disable=unsubscriptable-object class NearQuorumPointTest(CTSTest): """Randomly start and stop nodes to bring the cluster close to the quorum point.""" def __init__(self, cm): """ Create a new NearQuorumPointTest instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "NearQuorumPoint" def __call__(self, dummy): """Perform this test.""" self.incr("calls") startset = [] stopset = [] stonith = self._cm.prepare_fencing_watcher() # decide what to do with each node for node in self._env["nodes"]: action = self._env.random_gen.choice(["start", "stop"]) if action == "start": startset.append(node) elif action == "stop": stopset.append(node) self.debug(f"start nodes:{startset!r}") self.debug(f"stop nodes:{stopset!r}") # add search patterns watchpats = [] for node in stopset: if self._cm.expected_status[node] == "up": - watchpats.append(self.templates["Pat:We_stopped"] % node) + watchpats.append(self._cm.templates["Pat:We_stopped"] % node) for node in startset: if self._cm.expected_status[node] == "down": - watchpats.append(self.templates["Pat:Local_started"] % node) + watchpats.append(self._cm.templates["Pat:Local_started"] % node) else: for stopping in stopset: if self._cm.expected_status[stopping] == "up": - watchpats.append(self.templates["Pat:They_stopped"] % (node, stopping)) + watchpats.append(self._cm.templates["Pat:They_stopped"] % (node, stopping)) if not watchpats: return self.skipped() if startset: - watchpats.append(self.templates["Pat:DC_IDLE"]) + watchpats.append(self._cm.templates["Pat:DC_IDLE"]) watch = self.create_watch(watchpats, self._env["DeadTime"] + 10) watch.set_watch() # begin actions for node in stopset: if self._cm.expected_status[node] == "up": self._cm.stop_cm_async(node) for node in startset: if self._cm.expected_status[node] == "down": self._cm.start_cm_async(node) # get the result if watch.look_for_all(): self._cm.cluster_stable() self._cm.fencing_cleanup("NearQuorumPoint", stonith) return 
self.success() self._logger.log(f"Warn: Patterns not found: {watch.unmatched!r}") # get the "bad" nodes upnodes = [] for node in stopset: if self._cm.stat_cm(node): upnodes.append(node) downnodes = [] for node in startset: if not self._cm.stat_cm(node): downnodes.append(node) self._cm.fencing_cleanup("NearQuorumPoint", stonith) if not upnodes and not downnodes: self._cm.cluster_stable() # Make sure they're completely down with no residue for node in stopset: - self._rsh(node, self.templates["StopCmd"]) + self._rsh(node, self._cm.templates["StopCmd"]) return self.success() if upnodes: self._logger.log(f"Warn: Unstoppable nodes: {upnodes!r}") if downnodes: self._logger.log(f"Warn: Unstartable nodes: {downnodes!r}") return self.failure() diff --git a/python/pacemaker/_cts/tests/reattach.py b/python/pacemaker/_cts/tests/reattach.py index 6d445e9818..cc84107ed3 100644 --- a/python/pacemaker/_cts/tests/reattach.py +++ b/python/pacemaker/_cts/tests/reattach.py @@ -1,207 +1,207 @@ """Restart the cluster and verify resources remain running.""" __all__ = ["Reattach"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import re import time from pacemaker.exitstatus import ExitStatus from pacemaker._cts.audits import AuditResource from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.tests.simulstoplite import SimulStopLite from pacemaker._cts.tests.starttest import StartTest # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. 
# pylint: disable=not-callable class Reattach(CTSTest): """Restart the cluster and verify that resources remain running throughout.""" def __init__(self, cm): """ Create a new Reattach instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "Reattach" self._startall = SimulStartLite(cm) self._stopall = SimulStopLite(cm) def _is_managed(self, node): """Return whether resources are managed by the cluster.""" (_, is_managed) = self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", verbose=1) is_managed = is_managed[0].strip() return is_managed == "true" def _set_unmanaged(self, node): """Disable resource management.""" self.debug("Disable resource management") self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false") def _set_managed(self, node): """Enable resource management.""" self.debug("Re-enable resource management") self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D") def _disable_incompatible_rscs(self, node): """ Disable resources that are incompatible with this test. Starts and stops of stonith-class resources are implemented internally by Pacemaker, which means that they must stop when Pacemaker is stopped, even if unmanaged. Disable them before running the Reattach test so they don't affect resource placement. Set target-role to "Stopped" for any of these resources in the CIB. """ self.debug("Disable incompatible resources") xml = """' ' --scope rsc_defaults""" return self._rsh(node, self._cm.templates['CibAddXml'] % xml) def _enable_incompatible_rscs(self, node): """Re-enable resources that were incompatible with this test.""" self.debug("Re-enable incompatible resources") xml = """""" return self._rsh(node, f"""cibadmin --delete --xml-text '{xml}'""") def _reprobe(self, node): """ Reprobe all resources. The placement of some resources (such as promotable-1 in the lab-generated CIB) is affected by constraints using node-attribute-based rules. 
An earlier test may have erased the relevant node attribute, so do a reprobe, which should add the attribute back. """ return self._rsh(node, """crm_resource --refresh""") def setup(self, node): """Set up this test.""" if not self._startall(None): return self.failure("Startall failed") (rc, _) = self._disable_incompatible_rscs(node) if rc != ExitStatus.OK: return self.failure("Couldn't modify CIB to stop incompatible resources") (rc, _) = self._reprobe(node) if rc != ExitStatus.OK: return self.failure("Couldn't reprobe resources") if not self._cm.cluster_stable(double_check=True): return self.failure("Cluster did not stabilize after setup") return self.success() def teardown(self, node): """Tear down this test.""" # Make sure 'node' is up start = StartTest(self._cm) start(node) if not self._is_managed(node): self._set_managed(node) (rc, _) = self._enable_incompatible_rscs(node) if rc != ExitStatus.OK: return self.failure("Couldn't modify CIB to re-enable incompatible resources") if not self._cm.cluster_stable(): return self.failure("Cluster did not stabilize after teardown") if not self._is_managed(node): return self.failure("Could not re-enable resource management") return self.success() def __call__(self, node): """Perform this test.""" self.incr("calls") # Conveniently, the scheduler will display this message when disabling # management, even if fencing is not enabled, so we can rely on it. 
managed = self.create_watch(["No fencing will be done"], 60) managed.set_watch() self._set_unmanaged(node) if not managed.look_for_all(): self._logger.log(f"Patterns not found: {managed.unmatched!r}") return self.failure("Resource management not disabled") pats = [ - self.templates["Pat:RscOpOK"] % ("start", ".*"), - self.templates["Pat:RscOpOK"] % ("stop", ".*"), - self.templates["Pat:RscOpOK"] % ("promote", ".*"), - self.templates["Pat:RscOpOK"] % ("demote", ".*"), - self.templates["Pat:RscOpOK"] % ("migrate", ".*") + self._cm.templates["Pat:RscOpOK"] % ("start", ".*"), + self._cm.templates["Pat:RscOpOK"] % ("stop", ".*"), + self._cm.templates["Pat:RscOpOK"] % ("promote", ".*"), + self._cm.templates["Pat:RscOpOK"] % ("demote", ".*"), + self._cm.templates["Pat:RscOpOK"] % ("migrate", ".*") ] watch = self.create_watch(pats, 60, "ShutdownActivity") watch.set_watch() self.debug("Shutting down the cluster") ret = self._stopall(None) if not ret: self._set_managed(node) return self.failure("Couldn't shut down the cluster") self.debug("Bringing the cluster back up") ret = self._startall(None) time.sleep(5) # allow ping to update the CIB if not ret: self._set_managed(node) return self.failure("Couldn't restart the cluster") if self.local_badnews("ResourceActivity:", watch): self._set_managed(node) return self.failure("Resources stopped or started during cluster restart") watch = self.create_watch(pats, 60, "StartupActivity") watch.set_watch() # Re-enable resource management (and verify it happened). 
self._set_managed(node) self._cm.cluster_stable() if not self._is_managed(node): return self.failure("Could not re-enable resource management") # Ignore actions for STONITH resources ignore = [] (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) for line in lines: if re.search("^Resource", line): r = AuditResource(self._cm, line) if r.rclass == "stonith": self.debug(f"Ignoring start actions for {r.id}") - ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id)) + ignore.append(self._cm.templates["Pat:RscOpOK"] % ("start", r.id)) if self.local_badnews("ResourceActivity:", watch, ignore): return self.failure("Resources stopped or started after resource management was re-enabled") return ret @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" return [ r"resource( was|s were) active at shutdown" ] diff --git a/python/pacemaker/_cts/tests/remotedriver.py b/python/pacemaker/_cts/tests/remotedriver.py index a0d916d7b4..535eacf1d7 100644 --- a/python/pacemaker/_cts/tests/remotedriver.py +++ b/python/pacemaker/_cts/tests/remotedriver.py @@ -1,542 +1,542 @@ """Base classes for CTS tests.""" __all__ = ["RemoteDriver"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import os import time import subprocess import tempfile from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.tests.starttest import StartTest from pacemaker._cts.tests.stoptest import StopTest from pacemaker._cts.timer import Timer # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. 
# pylint: disable=not-callable class RemoteDriver(CTSTest): """ A specialized base class for cluster tests that run on Pacemaker Remote nodes. This builds on top of CTSTest to provide methods for starting and stopping services and resources, and managing remote nodes. This is still just an abstract class -- specific tests need to implement their own specialized behavior. """ def __init__(self, cm): """ Create a new RemoteDriver instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "RemoteDriver" self._corosync_enabled = False self._pacemaker_enabled = False self._remote_node = None self._remote_rsc = "remote-rsc" self._start = StartTest(cm) self._startall = SimulStartLite(cm) self._stop = StopTest(cm) self.reset() def reset(self): """Reset the state of this test back to what it was before the test was run.""" self.failed = False self.fail_string = "" self._pcmk_started = False self._remote_node_added = False self._remote_rsc_added = False self._remote_use_reconnect_interval = self._env.random_gen.choice([True, False]) def fail(self, msg): """Mark test as failed.""" self.failed = True # Always log the failure. self._logger.log(msg) # Use first failure as test status, as it's likely to be most useful. if not self.fail_string: self.fail_string = msg def _get_other_node(self, node): """ Get the first cluster node out of the environment that is not the given node. Typically, this is used to find some node that will still be active that we can run cluster commands on. """ for othernode in self._env["nodes"]: if othernode == node: # we don't want to try and use the cib that we just shutdown. # find a cluster node that is not our soon to be remote-node. continue return othernode def _del_rsc(self, node, rsc): """ Delete the given named resource from the cluster. The given `node` is the cluster node on which we should *not* run the delete command. 
""" othernode = self._get_other_node(node) (rc, _) = self._rsh(othernode, f"crm_resource -D -r {rsc} -t primitive") if rc != 0: self.fail(f"Removal of resource '{rsc}' failed") def _add_rsc(self, node, rsc_xml): """ Add a resource given in XML format to the cluster. The given `node` is the cluster node on which we should *not* run the add command. """ othernode = self._get_other_node(node) (rc, _) = self._rsh(othernode, f"cibadmin -C -o resources -X '{rsc_xml}'") if rc != 0: self.fail("resource creation failed") def _add_primitive_rsc(self, node): """ Add a primitive heartbeat resource for the remote node to the cluster. The given `node` is the cluster node on which we should *not* run the add command. """ rsc_xml = f""" """ self._add_rsc(node, rsc_xml) if not self.failed: self._remote_rsc_added = True def _add_connection_rsc(self, node): """ Add a primitive connection resource for the remote node to the cluster. The given `node` is the cluster node on which we should *not* run the add command. 
""" rsc_xml = f""" """ if self._remote_use_reconnect_interval: # Set reconnect interval on resource rsc_xml += f""" """ rsc_xml += f""" """ self._add_rsc(node, rsc_xml) if not self.failed: self._remote_node_added = True def _disable_services(self, node): """Disable the corosync and pacemaker services on the given node.""" self._corosync_enabled = self._env.service_is_enabled(node, "corosync") if self._corosync_enabled: self._env.disable_service(node, "corosync") self._pacemaker_enabled = self._env.service_is_enabled(node, "pacemaker") if self._pacemaker_enabled: self._env.disable_service(node, "pacemaker") def _enable_services(self, node): """Enable the corosync and pacemaker services on the given node.""" if self._corosync_enabled: self._env.enable_service(node, "corosync") if self._pacemaker_enabled: self._env.enable_service(node, "pacemaker") def _stop_pcmk_remote(self, node): """Stop the Pacemaker Remote service on the given node.""" for _ in range(10): (rc, _) = self._rsh(node, "service pacemaker_remote stop") if rc != 0: time.sleep(6) else: break def _start_pcmk_remote(self, node): """Start the Pacemaker Remote service on the given node.""" for _ in range(10): (rc, _) = self._rsh(node, "service pacemaker_remote start") if rc != 0: time.sleep(6) else: self._pcmk_started = True break def _freeze_pcmk_remote(self, node): """Simulate a Pacemaker Remote daemon failure.""" self._rsh(node, "killall -STOP pacemaker-remoted") def _resume_pcmk_remote(self, node): """Simulate the Pacemaker Remote daemon recovering.""" self._rsh(node, "killall -CONT pacemaker-remoted") def _start_metal(self, node): """ Set up a Pacemaker Remote configuration. Remove any existing connection resources or nodes. Start the pacemaker_remote service. Create a connection resource. """ # Cluster nodes are reused as remote nodes in remote tests. If cluster # services were enabled at boot, in case the remote node got fenced, the # cluster node would join instead of the expected remote one. 
Meanwhile # pacemaker_remote would not be able to start. Depending on the chances, # the situations might not be able to be orchestrated gracefully any more. # # Temporarily disable any enabled cluster serivces. self._disable_services(node) # make sure the resource doesn't already exist for some reason self._rsh(node, f"crm_resource -D -r {self._remote_rsc} -t primitive") self._rsh(node, f"crm_resource -D -r {self._remote_node} -t primitive") if not self._stop(node): self.fail(f"Failed to shutdown cluster node {node}") return self._start_pcmk_remote(node) if not self._pcmk_started: self.fail(f"Failed to start pacemaker_remote on node {node}") return # Convert node to baremetal now that it has shutdown the cluster stack pats = [] watch = self.create_watch(pats, 120) watch.set_watch() pats.extend([ - self.templates["Pat:RscOpOK"] % ("start", self._remote_node), - self.templates["Pat:DC_IDLE"] + self._cm.templates["Pat:RscOpOK"] % ("start", self._remote_node), + self._cm.templates["Pat:DC_IDLE"] ]) self._add_connection_rsc(node) with Timer(self._logger, self.name, "remoteMetalInit"): watch.look_for_all() if watch.unmatched: self.fail(f"Unmatched patterns: {watch.unmatched}") def migrate_connection(self, node): """Move the remote connection resource to any other available node.""" if self.failed: return pats = [ - self.templates["Pat:RscOpOK"] % ("migrate_to", self._remote_node), - self.templates["Pat:RscOpOK"] % ("migrate_from", self._remote_node), - self.templates["Pat:DC_IDLE"] + self._cm.templates["Pat:RscOpOK"] % ("migrate_to", self._remote_node), + self._cm.templates["Pat:RscOpOK"] % ("migrate_from", self._remote_node), + self._cm.templates["Pat:DC_IDLE"] ] watch = self.create_watch(pats, 120) watch.set_watch() (rc, _) = self._rsh(node, f"crm_resource -M -r {self._remote_node}", verbose=1) if rc != 0: self.fail("failed to move remote node connection resource") return with Timer(self._logger, self.name, "remoteMetalMigrate"): watch.look_for_all() if 
watch.unmatched: self.fail(f"Unmatched patterns: {watch.unmatched}") def fail_rsc(self, node): """ Cause the dummy resource running on a Pacemaker Remote node to fail. Verify that the failure is logged correctly. """ if self.failed: return watchpats = [ - self.templates["Pat:RscRemoteOpOK"] % ("stop", self._remote_rsc, self._remote_node), - self.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node), - self.templates["Pat:DC_IDLE"] + self._cm.templates["Pat:RscRemoteOpOK"] % ("stop", self._remote_rsc, self._remote_node), + self._cm.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node), + self._cm.templates["Pat:DC_IDLE"] ] watch = self.create_watch(watchpats, 120) watch.set_watch() self.debug("causing dummy rsc to fail.") self._rsh(node, "rm -f /var/run/resource-agents/Dummy*") with Timer(self._logger, self.name, "remoteRscFail"): watch.look_for_all() if watch.unmatched: self.fail(f"Unmatched patterns during rsc fail: {watch.unmatched}") def fail_connection(self, node): """ Cause the remote connection resource to fail. Verify that the node is fenced and the connection resource is restarted on another node. """ if self.failed: return watchpats = [ - self.templates["Pat:Fencing_ok"] % self._remote_node, - self.templates["Pat:NodeFenced"] % self._remote_node + self._cm.templates["Pat:Fencing_ok"] % self._remote_node, + self._cm.templates["Pat:NodeFenced"] % self._remote_node ] watch = self.create_watch(watchpats, 120) watch.set_watch() # freeze the pcmk remote daemon. 
this will result in fencing self.debug("Force stopped active remote node") self._freeze_pcmk_remote(node) self.debug("Waiting for remote node to be fenced.") with Timer(self._logger, self.name, "remoteMetalFence"): watch.look_for_all() if watch.unmatched: self.fail(f"Unmatched patterns: {watch.unmatched}") return self.debug("Waiting for the remote node to come back up") self._cm.ns.wait_for_node(node, 120) pats = [] watch = self.create_watch(pats, 240) watch.set_watch() - pats.append(self.templates["Pat:RscOpOK"] % ("start", self._remote_node)) + pats.append(self._cm.templates["Pat:RscOpOK"] % ("start", self._remote_node)) if self._remote_rsc_added: - pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node)) + pats.append(self._cm.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node)) # start the remote node again watch it integrate back into cluster. self._start_pcmk_remote(node) if not self._pcmk_started: self.fail(f"Failed to start pacemaker_remote on node {node}") return self.debug("Waiting for remote node to rejoin cluster after being fenced.") with Timer(self._logger, self.name, "remoteMetalRestart"): watch.look_for_all() if watch.unmatched: self.fail(f"Unmatched patterns: {watch.unmatched}") def _add_dummy_rsc(self, node): """Add a dummy resource that runs on the Pacemaker Remote node.""" if self.failed: return # verify we can put a resource on the remote node pats = [] watch = self.create_watch(pats, 120) watch.set_watch() pats.extend([ - self.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node), - self.templates["Pat:DC_IDLE"] + self._cm.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node), + self._cm.templates["Pat:DC_IDLE"] ]) # Add a resource that must live on remote-node self._add_primitive_rsc(node) # force that rsc to prefer the remote node. 
(rc, _) = self._cm.rsh(node, f"crm_resource -M -r {self._remote_rsc} -N {self._remote_node} -f", verbose=1) if rc != 0: self.fail("Failed to place remote resource on remote node.") return with Timer(self._logger, self.name, "remoteMetalRsc"): watch.look_for_all() if watch.unmatched: self.fail(f"Unmatched patterns: {watch.unmatched}") def test_attributes(self, node): """Verify that attributes can be set on the Pacemaker Remote node.""" if self.failed: return # This verifies permanent attributes can be set on a remote-node. It also # verifies the remote-node can edit its own cib node section remotely. (rc, line) = self._cm.rsh(node, f"crm_attribute -l forever -n testattr -v testval -N {self._remote_node}", verbose=1) if rc != 0: self.fail(f"Failed to set remote-node attribute. rc:{rc} output:{line}") return (rc, _) = self._cm.rsh(node, f"crm_attribute -l forever -n testattr -q -N {self._remote_node}", verbose=1) if rc != 0: self.fail("Failed to get remote-node attribute") return (rc, _) = self._cm.rsh(node, f"crm_attribute -l forever -n testattr -D -N {self._remote_node}", verbose=1) if rc != 0: self.fail("Failed to delete remote-node attribute") def cleanup_metal(self, node): """ Clean up the Pacemaker Remote node configuration previously created by _setup_metal. Stop and remove dummy resources and connection resources. Stop the pacemaker_remote service. Remove the remote node itself. 
""" self._enable_services(node) if not self._pcmk_started: return pats = [] watch = self.create_watch(pats, 120) watch.set_watch() if self._remote_rsc_added: - pats.append(self.templates["Pat:RscOpOK"] % ("stop", self._remote_rsc)) + pats.append(self._cm.templates["Pat:RscOpOK"] % ("stop", self._remote_rsc)) if self._remote_node_added: - pats.append(self.templates["Pat:RscOpOK"] % ("stop", self._remote_node)) + pats.append(self._cm.templates["Pat:RscOpOK"] % ("stop", self._remote_node)) with Timer(self._logger, self.name, "remoteMetalCleanup"): self._resume_pcmk_remote(node) if self._remote_rsc_added: # Remove dummy resource added for remote node tests self.debug("Cleaning up dummy rsc put on remote node") self._rsh(self._get_other_node(node), f"crm_resource -U -r {self._remote_rsc}") self._del_rsc(node, self._remote_rsc) if self._remote_node_added: # Remove remote node's connection resource self.debug("Cleaning up remote node connection resource") self._rsh(self._get_other_node(node), f"crm_resource -U -r {self._remote_node}") self._del_rsc(node, self._remote_node) watch.look_for_all() if watch.unmatched: self.fail(f"Unmatched patterns: {watch.unmatched}") self._stop_pcmk_remote(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() if self._remote_node_added: # Remove remote node itself self.debug("Cleaning up node entry for remote node") self._rsh(self._get_other_node(node), f"crm_node --force --remove {self._remote_node}") def _setup_env(self, node): """ Set up the environment to allow Pacemaker Remote to function. This involves generating a key and copying it to all nodes in the cluster. """ self._remote_node = f"remote-{node}" # we are assuming if all nodes have a key, that it is # the right key... If any node doesn't have a remote # key, we regenerate it everywhere. 
if self._rsh.exists_on_all("/etc/pacemaker/authkey", self._env["nodes"]): return # create key locally (handle, keyfile) = tempfile.mkstemp(".cts") os.close(handle) subprocess.check_call(["dd", "if=/dev/urandom", f"of={keyfile}", "bs=4096", "count=1"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # sync key throughout the cluster for n in self._env["nodes"]: self._rsh(n, "mkdir -p --mode=0750 /etc/pacemaker") self._rsh.copy(keyfile, f"root@{n}:/etc/pacemaker/authkey") self._rsh(n, "chgrp haclient /etc/pacemaker /etc/pacemaker/authkey") self._rsh(n, "chmod 0640 /etc/pacemaker/authkey") os.unlink(keyfile) def is_applicable(self): """Return True if this test is applicable in the current test configuration.""" if not CTSTest.is_applicable(self): return False for node in self._env["nodes"]: (rc, _) = self._rsh(node, "which pacemaker-remoted >/dev/null 2>&1") if rc != 0: return False return True def start_new_test(self, node): """Prepare a remote test for running by setting up its environment and resources.""" self.incr("calls") self.reset() ret = self._startall(None) if not ret: return self.failure("setup failed: could not start all nodes") self._setup_env(node) self._start_metal(node) self._add_dummy_rsc(node) return True def __call__(self, node): """Perform this test.""" raise NotImplementedError @property def errors_to_ignore(self): """Return list of errors which should be ignored.""" return [ r"""is running on remote.*which isn't allowed""", r"""Connection terminated""", r"""Could not send remote""" ] diff --git a/python/pacemaker/_cts/tests/resourcerecover.py b/python/pacemaker/_cts/tests/resourcerecover.py index 2d25900b8f..a14bab5628 100644 --- a/python/pacemaker/_cts/tests/resourcerecover.py +++ b/python/pacemaker/_cts/tests/resourcerecover.py @@ -1,171 +1,171 @@ """Fail a random resource and verify its fail count increases.""" __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or 
later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.audits import AuditResource from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.tests.starttest import StartTest from pacemaker._cts.timer import Timer # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable class ResourceRecover(CTSTest): """Fail a random resource.""" def __init__(self, cm): """ Create a new ResourceRecover instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.benchmark = True self.name = "ResourceRecover" self._action = "asyncmon" self._interval = 0 self._rid = None self._rid_alt = None self._start = StartTest(cm) self._startall = SimulStartLite(cm) def __call__(self, node): """Perform this test.""" self.incr("calls") if not self._startall(None): return self.failure("Setup failed") # List all resources active on the node (skip test if none) resourcelist = self._cm.active_resources(node) if not resourcelist: self._logger.log(f"No active resources on {node}") return self.skipped() # Choose one resource at random rsc = self._choose_resource(node, resourcelist) if rsc is None: return self.failure(f"Could not get details of resource '{self._rid}'") if rsc.id == rsc.clone_id: self.debug(f"Failing {rsc.id}") else: self.debug(f"Failing {rsc.id} (also known as {rsc.clone_id})") # Log patterns to watch for (failure, plus restart if managed) pats = [ - self.templates["Pat:CloneOpFail"] % (self._action, rsc.id, rsc.clone_id) + self._cm.templates["Pat:CloneOpFail"] % (self._action, rsc.id, rsc.clone_id) ] if rsc.managed: - pats.append(self.templates["Pat:RscOpOK"] % ("stop", self._rid)) + 
pats.append(self._cm.templates["Pat:RscOpOK"] % ("stop", self._rid)) if rsc.unique: - pats.append(self.templates["Pat:RscOpOK"] % ("start", self._rid)) + pats.append(self._cm.templates["Pat:RscOpOK"] % ("start", self._rid)) else: # Anonymous clones may get restarted with a different clone number - pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*")) + pats.append(self._cm.templates["Pat:RscOpOK"] % ("start", ".*")) # Fail resource. (Ideally, we'd fail it twice, to ensure the fail count # is incrementing properly, but it might restart on a different node. # We'd have to temporarily ban it from all other nodes and ensure the # migration-threshold hasn't been reached.) if self._fail_resource(rsc, node, pats) is None: # self.failure() already called return None return self.success() def _choose_resource(self, node, resourcelist): """Choose a random resource to target.""" self._rid = self._env.random_gen.choice(resourcelist) self._rid_alt = self._rid (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) for line in lines: if line.startswith("Resource: "): rsc = AuditResource(self._cm, line) if rsc.id == self._rid: # Handle anonymous clones that get renamed self._rid = rsc.clone_id return rsc return None def _get_failcount(self, node): """Check the fail count of targeted resource on given node.""" cmd = "crm_failcount --quiet --query --resource %s --operation %s --interval %d --node %s" (rc, lines) = self._rsh(node, cmd % (self._rid, self._action, self._interval, node), verbose=1) if rc != 0 or len(lines) != 1: lines = [line.strip() for line in lines] s = " // ".join(lines) self._logger.log(f"crm_failcount on {node} failed ({rc}): {s}") return -1 try: failcount = int(lines[0]) except (IndexError, ValueError): s = " ".join(lines) self._logger.log(f"crm_failcount output on {node} unparseable: {s}") return -1 return failcount def _fail_resource(self, rsc, node, pats): """Fail the targeted resource, and verify as expected.""" orig_failcount = 
self._get_failcount(node) watch = self.create_watch(pats, 60) watch.set_watch() self._rsh(node, f"crm_resource -V -F -r {self._rid} -H {node} &>/dev/null") with Timer(self._logger, self.name, "recover"): watch.look_for_all() self._cm.cluster_stable() recovered = self._cm.resource_location(self._rid) if watch.unmatched: return self.failure(f"Patterns not found: {watch.unmatched!r}") if rsc.unique and len(recovered) > 1: return self.failure(f"{self._rid} is now active on more than one node: {recovered!r}") if recovered: self.debug(f"{self._rid} is running on: {recovered!r}") elif rsc.managed: return self.failure(f"{self._rid} was not recovered and is inactive") new_failcount = self._get_failcount(node) if new_failcount != orig_failcount + 1: return self.failure(f"{self._rid} fail count is {new_failcount} not {orig_failcount + 1}") # Anything but None is success return 0 @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" return [ f"Updating failcount for {self._rid}", fr"schedulerd.*: Recover\s+({self._rid}|{self._rid_alt})\s+\(.*\)", r"Unknown operation: fail", - self.templates["Pat:RscOpOK"] % (self._action, self._rid), + self._cm.templates["Pat:RscOpOK"] % (self._action, self._rid), f"(ERROR|error).*: Action {self._rid}_{self._action}_{self._interval} .* initiated outside of a transition", ] diff --git a/python/pacemaker/_cts/tests/simulstartlite.py b/python/pacemaker/_cts/tests/simulstartlite.py index a327e39d1b..3c14050601 100644 --- a/python/pacemaker/_cts/tests/simulstartlite.py +++ b/python/pacemaker/_cts/tests/simulstartlite.py @@ -1,128 +1,128 @@ """Simultaneously start stopped nodes.""" __all__ = ["SimulStartLite"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.tests.ctstest import CTSTest # Disable various pylint warnings that occur in so many places throughout this # file it's 
easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object class SimulStartLite(CTSTest): """ A pseudo-test that sets up conditions before running some other test. This class starts any stopped nodes more or less simultaneously. Other test classes should not use this one as a superclass. """ def __init__(self, cm): """ Create a new SimulStartLite instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "SimulStartLite" def __call__(self, dummy): """Return whether starting all stopped nodes more or less simultaneously succeeds.""" self.incr("calls") self.debug(f"Setup: {self.name}") # We ignore the "node" parameter... node_list = [] for node in self._env["nodes"]: if self._cm.expected_status[node] == "down": self.incr("WasStopped") node_list.append(node) self.set_timer() while len(node_list) > 0: # Repeat until all nodes come up - uppat = self.templates["Pat:NonDC_started"] + uppat = self._cm.templates["Pat:NonDC_started"] if self._cm.upcount() == 0: - uppat = self.templates["Pat:Local_started"] + uppat = self._cm.templates["Pat:Local_started"] watchpats = [ - self.templates["Pat:DC_IDLE"] + self._cm.templates["Pat:DC_IDLE"] ] for node in node_list: watchpats.extend([uppat % node, - self.templates["Pat:InfraUp"] % node, - self.templates["Pat:PacemakerUp"] % node]) + self._cm.templates["Pat:InfraUp"] % node, + self._cm.templates["Pat:PacemakerUp"] % node]) # Start all the nodes - at about the same time... 
watch = self.create_watch(watchpats, self._env["DeadTime"] + 10) watch.set_watch() stonith = self._cm.prepare_fencing_watcher() for node in node_list: self._cm.start_cm_async(node) watch.look_for_all() node_list = self._cm.fencing_cleanup(self.name, stonith) if node_list is None: return self.failure("Cluster did not stabilize") # Remove node_list messages from watch.unmatched for node in node_list: self._logger.debug(f"Dealing with stonith operations for {node_list}") if watch.unmatched: try: watch.unmatched.remove(uppat % node) except ValueError: self.debug(f"Already matched: {uppat % node}") try: - watch.unmatched.remove(self.templates["Pat:InfraUp"] % node) + watch.unmatched.remove(self._cm.templates["Pat:InfraUp"] % node) except ValueError: - self.debug(f"Already matched: {self.templates['Pat:InfraUp'] % node}") + self.debug(f"Already matched: {self._cm.templates['Pat:InfraUp'] % node}") try: - watch.unmatched.remove(self.templates["Pat:PacemakerUp"] % node) + watch.unmatched.remove(self._cm.templates["Pat:PacemakerUp"] % node) except ValueError: - self.debug(f"Already matched: {self.templates['Pat:PacemakerUp'] % node}") + self.debug(f"Already matched: {self._cm.templates['Pat:PacemakerUp'] % node}") if watch.unmatched: for regex in watch.unmatched: self._logger.log(f"Warn: Startup pattern not found: {regex}") if not self._cm.cluster_stable(): return self.failure("Cluster did not stabilize") did_fail = False unstable = [] for node in self._env["nodes"]: if not self._cm.stat_cm(node): did_fail = True unstable.append(node) if did_fail: return self.failure(f"Unstarted nodes exist: {unstable}") unstable = [] for node in self._env["nodes"]: if not self._cm.node_stable(node): did_fail = True unstable.append(node) if did_fail: return self.failure(f"Unstable cluster nodes exist: {unstable}") return self.success() def is_applicable(self): """Return True if this test is applicable in the current test configuration.""" return False diff --git 
a/python/pacemaker/_cts/tests/simulstoplite.py b/python/pacemaker/_cts/tests/simulstoplite.py index 1bb8ddc5a0..b3a2f87ffe 100644 --- a/python/pacemaker/_cts/tests/simulstoplite.py +++ b/python/pacemaker/_cts/tests/simulstoplite.py @@ -1,86 +1,86 @@ """Simultaneously stop running nodes.""" __all__ = ["SimulStopLite"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.tests.ctstest import CTSTest # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object class SimulStopLite(CTSTest): """ A pseudo-test that sets up conditions before running some other test. This class stops any running nodes more or less simultaneously. It can be used both to set up a test or to clean up a test. Other test classes should not use this one as a superclass. """ def __init__(self, cm): """ Create a new SimulStopLite instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "SimulStopLite" def __call__(self, dummy): """Return whether stopping all running nodes more or less simultaneously succeeds.""" self.incr("calls") self.debug(f"Setup: {self.name}") # We ignore the "node" parameter... watchpats = [] for node in self._env["nodes"]: if self._cm.expected_status[node] == "up": self.incr("WasStarted") - watchpats.append(self.templates["Pat:We_stopped"] % node) + watchpats.append(self._cm.templates["Pat:We_stopped"] % node) if len(watchpats) == 0: return self.success() # Stop all the nodes - at about the same time... 
watch = self.create_watch(watchpats, self._env["DeadTime"] + 10) watch.set_watch() self.set_timer() for node in self._env["nodes"]: if self._cm.expected_status[node] == "up": self._cm.stop_cm_async(node) if watch.look_for_all(): # Make sure they're completely down with no residule for node in self._env["nodes"]: - self._rsh(node, self.templates["StopCmd"]) + self._rsh(node, self._cm.templates["StopCmd"]) return self.success() did_fail = False up_nodes = [] for node in self._env["nodes"]: if self._cm.stat_cm(node): did_fail = True up_nodes.append(node) if did_fail: return self.failure(f"Active nodes exist: {up_nodes}") self._logger.log(f"Warn: All nodes stopped but CTS didn't detect: {watch.unmatched}") return self.failure(f"Missing log message: {watch.unmatched}") def is_applicable(self): """Return True if this test is applicable in the current test configuration.""" return False diff --git a/python/pacemaker/_cts/tests/stonithdtest.py b/python/pacemaker/_cts/tests/stonithdtest.py index c2e59f80bc..8daeb3db84 100644 --- a/python/pacemaker/_cts/tests/stonithdtest.py +++ b/python/pacemaker/_cts/tests/stonithdtest.py @@ -1,141 +1,141 @@ """Fence a running node and wait for it to restart.""" __all__ = ["StonithdTest"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker.exitstatus import ExitStatus from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.timer import Timer # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. 
# pylint: disable=not-callable # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object class StonithdTest(CTSTest): """Fence a running node and wait for it to restart.""" def __init__(self, cm): """ Create a new StonithdTest instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.benchmark = True self.name = "Stonithd" self._startall = SimulStartLite(cm) def __call__(self, node): """Perform this test.""" self.incr("calls") if len(self._env["nodes"]) < 2: return self.skipped() ret = self._startall(None) if not ret: return self.failure("Setup failed") watchpats = [ - self.templates["Pat:Fencing_ok"] % node, - self.templates["Pat:NodeFenced"] % node, + self._cm.templates["Pat:Fencing_ok"] % node, + self._cm.templates["Pat:NodeFenced"] % node, ] if not self._env["at-boot"]: self.debug(f"Expecting {node} to stay down") self._cm.expected_status[node] = "down" else: self.debug(f"Expecting {node} to come up again {self._env['at-boot']}") watchpats.extend([ f"{node}.* S_STARTING -> S_PENDING", f"{node}.* S_PENDING -> S_NOT_DC", ]) watch = self.create_watch(watchpats, 30 + self._env["DeadTime"] + self._env["StableTime"] + self._env["StartTime"]) watch.set_watch() origin = self._env.random_gen.choice(self._env["nodes"]) (rc, _) = self._rsh(origin, f"stonith_admin --reboot {node} -VVVVVV") if rc == ExitStatus.TIMEOUT: # Look for the patterns, usually this means the required # device was running on the node to be fenced - or that # the required devices were in the process of being loaded # and/or moved # # Effectively the node committed suicide so there will be # no confirmation, but pacemaker should be watching and # fence the node again self._logger.log(f"Fencing command on {origin} to fence {node} timed out") elif origin != node and rc != 0: self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() self.debug("Waiting for fenced node to come back up") 
self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) self._logger.log(f"Fencing command on {origin} failed to fence {node} (rc={rc})") elif origin == node and rc != 255: # 255 == broken pipe, ie. the node was fenced as expected self._logger.log(f"Locally originated fencing returned {rc}") with Timer(self._logger, self.name, "fence"): matched = watch.look_for_all() self.set_timer("reform") if watch.unmatched: self._logger.log(f"Patterns not found: {watch.unmatched!r}") self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() self.debug("Waiting for fenced node to come back up") self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) self.debug("Waiting for the cluster to re-stabilize with all nodes") is_stable = self._cm.cluster_stable(self._env["StartTime"]) if not matched: return self.failure("Didn't find all expected patterns") if not is_stable: return self.failure("Cluster did not become stable") self.log_timer("reform") return self.success() @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" return [ - self.templates["Pat:Fencing_start"] % ".*", - self.templates["Pat:Fencing_ok"] % ".*", - self.templates["Pat:Fencing_active"], + self._cm.templates["Pat:Fencing_start"] % ".*", + self._cm.templates["Pat:Fencing_ok"] % ".*", + self._cm.templates["Pat:Fencing_active"], r"error.*: Operation 'reboot' targeting .* by .* for stonith_admin.*: Timer expired" ] def is_applicable(self): """Return True if this test is applicable in the current test configuration.""" if not CTSTest.is_applicable(self): return False # pylint gets confused because of EnvFactory here. 
# pylint: disable=unsupported-membership-test if "DoFencing" in self._env: return self._env["DoFencing"] return True diff --git a/python/pacemaker/_cts/tests/stoptest.py b/python/pacemaker/_cts/tests/stoptest.py index c4d9b559a2..ddac1cc2b3 100644 --- a/python/pacemaker/_cts/tests/stoptest.py +++ b/python/pacemaker/_cts/tests/stoptest.py @@ -1,97 +1,97 @@ """Stop the cluster manager on a given node.""" __all__ = ["StopTest"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.tests.ctstest import CTSTest # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object class StopTest(CTSTest): """ A pseudo-test that sets up conditions before running some other test. This class stops the cluster manager on a given node. Other test classes should not use this one as a superclass. """ def __init__(self, cm): """ Create a new StopTest instance. 
Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "Stop" def __call__(self, node): """Stop the given node, returning whether this succeeded or not.""" self.incr("calls") if self._cm.expected_status[node] != "up": return self.skipped() # Technically we should always be able to notice ourselves stopping patterns = [ - self.templates["Pat:We_stopped"] % node, + self._cm.templates["Pat:We_stopped"] % node, ] # Any active node needs to notice this one left # (note that this won't work if we have multiple partitions) for other in self._env["nodes"]: if self._cm.expected_status[other] == "up" and other != node: - patterns.append(self.templates["Pat:They_stopped"] % (other, node)) + patterns.append(self._cm.templates["Pat:They_stopped"] % (other, node)) watch = self.create_watch(patterns, self._env["DeadTime"]) watch.set_watch() if node == self._cm.our_node: self.incr("us") else: if self._cm.upcount() <= 1: self.incr("all") else: self.incr("them") self._cm.stop_cm(node) watch.look_for_all() failreason = None unmatched_str = "||" if watch.unmatched: (_, output) = self._rsh(node, "/bin/ps axf", verbose=1) for line in output: self.debug(line) (_, output) = self._rsh(node, "/usr/sbin/dlm_tool dump 2>/dev/null", verbose=1) for line in output: self.debug(line) for regex in watch.unmatched: self._logger.log(f"ERROR: Shutdown pattern not found: {regex}") unmatched_str += f"{regex}||" failreason = "Missing shutdown pattern" self._cm.cluster_stable(self._env["DeadTime"]) if not watch.unmatched or self._cm.upcount() == 0: return self.success() if len(watch.unmatched) >= self._cm.upcount(): return self.failure(f"no match against ({unmatched_str})") if failreason is None: return self.success() return self.failure(failreason)