diff --git a/python/pacemaker/_cts/patterns.py b/python/pacemaker/_cts/patterns.py
index 8bbf4e24f1..819eae654e 100644
--- a/python/pacemaker/_cts/patterns.py
+++ b/python/pacemaker/_cts/patterns.py
@@ -1,386 +1,386 @@
 """Pattern-holding classes for Pacemaker's Cluster Test Suite (CTS)."""
 
 __all__ = ["PatternSelector"]
 __copyright__ = "Copyright 2008-2025 the Pacemaker project contributors"
 __license__ = "GNU General Public License version 2 or later (GPLv2+)"
 
 from pacemaker.buildoptions import BuildOptions
 
 
 class BasePatterns:
     """
     The base class for holding a stack-specific set of command and log file/stdout patterns.
 
     Stack-specific classes need to be built on top of this one.
     """
 
     def __init__(self):
         """Create a new BasePatterns instance which holds a very minimal set of basic patterns."""
         self._bad_news = []
         self._components = {}
         self._name = "crm-base"
 
         self._ignore = [
             "avoid confusing Valgrind",
 
             # Logging bug in some versions of libvirtd
             r"libvirtd.*: internal error: Failed to parse PCI config address",
 
             # pcs can log this when node is fenced, but fencing is OK in some
             # tests (and we will catch it in pacemaker logs when not OK)
             r"pcs.daemon:No response from: .* request: get_configs, error:",
 
             # This is overbroad, but there's no way to say that only certain
             # transition errors are acceptable. We have to rely on causes of a
             # transition error logging their own error message, which should
             # always be the case.
             r"pacemaker-schedulerd.* Calculated transition .*/pe-error",
 
             # This message comes up periodically but doesn't actually seem to
             # be related to any specific test failure, so just ignore it.
             r"pacemaker-based.* Local CIB .* differs from",
         ]
 
         self._commands = {
             "StatusCmd": "crmadmin -t 60 -S %s 2>/dev/null",
             "CibQuery": "cibadmin -Q",
             "CibAddXml": "cibadmin --modify -c --xml-text %s",
             "CibDelXpath": "cibadmin --delete --xpath %s",
             "RscRunning": BuildOptions.DAEMON_DIR + "/cts-exec-helper -R -r %s",
             "CIBfile": "%s:" + BuildOptions.CIB_DIR + "/cib.xml",
             "TmpDir": "/tmp",
 
             "BreakCommCmd": "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1",
             "FixCommCmd": "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1",
 
             "MaintenanceModeOn": "cibadmin --modify -c --xml-text '<cluster_property_set id=\"cib-bootstrap-options\"><nvpair id=\"cts-maintenance-mode-setting\" name=\"maintenance-mode\" value=\"true\"/></cluster_property_set>'",
             "MaintenanceModeOff": "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"",
 
             "StandbyCmd": "crm_attribute -Vq -U %s -n standby -l forever -v %s 2>/dev/null",
             "StandbyQueryCmd": "crm_attribute -qG -U %s -n standby -l forever -d off 2>/dev/null",
         }
 
         self._search = {
             "Pat:DC_IDLE": r"pacemaker-controld.*State transition.*-> S_IDLE",
 
             # This won't work if we have multiple partitions
             "Pat:Local_started": r"%s\W.*controller successfully started",
             "Pat:NonDC_started": r"%s\W.*State transition.*-> S_NOT_DC",
             "Pat:DC_started": r"%s\W.*State transition.*-> S_IDLE",
             "Pat:We_stopped": r"%s\W.*OVERRIDE THIS PATTERN",
             "Pat:They_stopped": r"%s\W.*LOST:.* %s ",
             "Pat:They_dead": r"node %s.*: is dead",
             "Pat:They_up": r"%s %s\W.*OVERRIDE THIS PATTERN",
             "Pat:TransitionComplete": "Transition status: Complete: complete",
 
             "Pat:Fencing_start": r"Requesting peer fencing .* targeting %s",
             "Pat:Fencing_ok": r"pacemaker-fenced.*:\s*Operation .* targeting %s by .* for .*@.*: OK",
             "Pat:Fencing_recover": r"pacemaker-schedulerd.*: Recover\s+%s",
-            "Pat:Fencing_active": r"stonith resource .* is active on 2 nodes (attempting recovery)",
+            "Pat:Resource_active": r"resource .* might be active on \d+ nodes \(attempting recovery\)",
             "Pat:Fencing_probe": r"pacemaker-controld.* Result of probe operation for %s on .*: Error",
 
             "Pat:RscOpOK": r"pacemaker-controld.*:\s+Result of %s operation for %s.*: (0 \()?OK",
             "Pat:RscOpFail": r"pacemaker-schedulerd.*:.*Unexpected result .* recorded for %s of %s ",
             "Pat:CloneOpFail": r"pacemaker-schedulerd.*:.*Unexpected result .* recorded for %s of (%s|%s) ",
             "Pat:RscRemoteOpOK": r"pacemaker-controld.*:\s+Result of %s operation for %s on %s: (0 \()?OK",
             "Pat:NodeFenced": r"pacemaker-controld.*:\s* Peer %s was terminated \(.*\) by .* on behalf of .*: OK",
         }
 
     def get_component(self, key):
         """
         Return the patterns for a single component as a list, given by key.
 
         This is typically the name of some subprogram (pacemaker-based,
         pacemaker-fenced, etc.) or various special purpose keys.  If key is
         unknown, return an empty list.
         """
         if key in self._components:
             return self._components[key]
 
         print(f"Unknown component '{key}' for {self._name}")
         return []
 
     def get_patterns(self, key):
         """
         Return various patterns supported by this object, given by key.
 
         Depending on the key, this could either be a list or a hash.  If key
         is unknown, return None.
         """
         if key == "BadNews":
             return self._bad_news
 
         if key == "BadNewsIgnore":
             return self._ignore
 
         if key == "Commands":
             return self._commands
 
         if key == "Search":
             return self._search
 
         if key == "Components":
             return self._components
 
         print(f"Unknown pattern '{key}' for {self._name}")
         return None
 
     def __getitem__(self, key):
         if key in self._commands:
             return self._commands[key]
 
         if key in self._search:
             return self._search[key]
 
         print(f"Unknown template '{key}' for {self._name}")
         return None
 
 
 class Corosync2Patterns(BasePatterns):
     """Patterns for Corosync version 2 cluster manager class."""
 
     # @FIXME Some of the templates here look like they start with
     # incorrect daemon names. Also, many of them aren't Corosync-
     # specific and should probably go in BasePatterns.
 
     def __init__(self):
         BasePatterns.__init__(self)
         self._name = "crm-corosync"
 
         self._commands.update({
             "StartCmd": "service corosync start && service pacemaker start",
             "StopCmd": "service pacemaker stop; [ ! -e /usr/sbin/pacemaker-remoted ] || service pacemaker_remote stop; service corosync stop",
 
             "EpochCmd": "crm_node -e",
             "QuorumCmd": "crm_node -q",
             "PartitionCmd": "crm_node -p",
         })
 
         self._search.update({
             # Close enough ... "Corosync Cluster Engine exiting normally"
             # isn't printed reliably.
             "Pat:We_stopped": r"%s\W.*Unloading all Corosync service engines",
             "Pat:They_stopped": r"%s\W.*pacemaker-controld.*Node %s(\[|\s).*state is now lost",
             "Pat:They_dead": r"pacemaker-controld.*Node %s(\[|\s).*state is now lost",
             "Pat:They_up": r"\W%s\W.*pacemaker-controld.*Node %s state is now member",
 
             "Pat:ChildExit": r"\[[0-9]+\] exited with status [0-9]+ \(",
             # "with signal 9" == pcmk_child_exit(), "$" == check_active_before_startup_processes()
             "Pat:ChildKilled": r"%s\W.*pacemakerd.*%s\[[0-9]+\] terminated( with signal 9|$)",
             "Pat:ChildRespawn": r"%s\W.*pacemakerd.*Respawning subdaemon %s after unexpected exit",
 
             "Pat:InfraUp": r"%s\W.*corosync.*Initializing transport",
             "Pat:PacemakerUp": r"%s\W.*pacemakerd.*Starting Pacemaker",
         })
 
         self._ignore += [
             r"crm_mon:",
             r"crmadmin:",
             r"update_trace_data",
             r"async_notify:.*strange, client not found",
             r"Parse error: Ignoring unknown option .*nodename",
             r"error.*: Operation 'reboot' .* using FencingFail returned ",
             r"getinfo response error: 1$",
             r"sbd.* error: inquisitor_child: DEBUG MODE IS ACTIVE",
             r"sbd.* pcmk:\s*error:.*Connection to cib_ro.* (failed|closed)",
         ]
 
         self._bad_news = [
             r"[^(]error:",
             r"crit:",
             r"ERROR:",
             r"CRIT:",
             r"Shutting down...NOW",
             r"Timer I_TERMINATE just popped",
             r"input=I_ERROR",
             r"input=I_FAIL",
             r"input=I_INTEGRATED cause=C_TIMER_POPPED",
             r"input=I_FINALIZED cause=C_TIMER_POPPED",
             r"input=I_ERROR",
             r"(pacemakerd|pacemaker-execd|pacemaker-controld):.*, exiting",
             r"schedulerd.*Attempting recovery of resource",
             r"is taking more than 2x its timeout",
             r"Confirm not received from",
             r"Welcome reply not received from",
             r"Attempting to schedule .* after a stop",
             r"Resource .* was active at shutdown",
             r"duplicate entries for call_id",
             r"Search terminated:",
             r":global_timer_callback",
             r"Faking parameter digest creation",
             r"Parameters to .* action changed:",
             r"Parameters to .* changed",
             r"pacemakerd.*\[[0-9]+\] terminated( with signal|$)",
             r"pacemakerd.*\[[0-9]+\] .* will now be killed",
             r"pacemaker-schedulerd.*Recover\s+.*\(.* -\> .*\)",
             r"rsyslogd.* lost .* due to rate-limiting",
             r"Peer is not part of our cluster",
             r"We appear to be in an election loop",
             r"Unknown node -> we will not deliver message",
             r"(Blackbox dump requested|Problem detected)",
             r"pacemakerd.*Could not connect to Cluster Configuration Database API",
             r"Receiving messages from a node we think is dead",
             r"share the same cluster nodeid",
             r"share the same name",
             r"pacemaker-controld:.*Transition failed: terminated",
             r"Local CIB .* differs from .*:",
             r"warn.*:\s*Continuing but .* will NOT be used",
             r"warn.*:\s*Cluster configuration file .* is corrupt",
             r"Election storm",
             r"stalled the FSA with pending inputs",
         ]
 
         components_common_ignore = [
             r"Pending action:",
             r"resource( was|s were) active at shutdown",
             r"pending LRM operations at shutdown",
             r"Lost connection to the CIB manager",
             r"pacemaker-controld.*:\s*Action A_RECOVER .* not supported",
             r"pacemaker-controld.*:\s*Exiting now due to errors",
             r".*:\s*Requesting fencing \([^)]+\) targeting node ",
             r"(Blackbox dump requested|Problem detected)",
         ]
 
         self._components["corosync-ignore"] = components_common_ignore + [
             r"Could not connect to Corosync CFG: CS_ERR_LIBRARY",
             r"error:.*Connection to the CPG API failed: Library error",
             r"\[[0-9]+\] exited with status [0-9]+ \(",
             r"\[[0-9]+\] terminated with signal 15",
             r"pacemaker-based.*error:.*Corosync connection lost",
             r"pacemaker-fenced.*error:.*Corosync connection terminated",
             r"pacemaker-controld.*State transition .* S_RECOVERY",
             r"pacemaker-controld.*error:.*Input (I_ERROR|I_TERMINATE ) .*received in state",
             r"pacemaker-controld.*error:.*Could not recover from internal error",
             r"error:.*Connection to cib_(shm|rw).* (failed|closed)",
             r"error:.*cib_(shm|rw) IPC provider disconnected while waiting",
             r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)",
             r"error: Lost fencer connection",
         ]
 
         self._components["corosync"] = [
             # We expect each daemon to lose its cluster connection.
             # However, if the CIB manager loses its connection first,
             # it's possible for another daemon to lose that connection and
             # exit before losing the cluster connection.
             r"pacemakerd.*:\s*warning:.*Lost connection to cluster layer",
             r"pacemaker-attrd.*:\s*(crit|error):.*Lost connection to (Corosync process group|the CIB manager)",
             r"pacemaker-based.*:\s*crit:.*Exiting immediately after losing connection to cluster layer",
             r"pacemaker-controld.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)",
             r"pacemaker-fenced.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)",
             r"schedulerd.*Scheduling node .* for fencing",
             r"pacemaker-controld.*:\s*Peer .* was terminated \(.*\) by .* on behalf of .*:\s*OK",
         ]
 
         self._components["pacemaker-based"] = [
             r"pacemakerd.* pacemaker-attrd\[[0-9]+\] exited with status 102",
             r"pacemakerd.* pacemaker-controld\[[0-9]+\] exited with status 1",
             r"pacemakerd.* Respawning subdaemon pacemaker-attrd after unexpected exit",
             r"pacemakerd.* Respawning subdaemon pacemaker-based after unexpected exit",
             r"pacemakerd.* Respawning subdaemon pacemaker-controld after unexpected exit",
             r"pacemakerd.* Respawning subdaemon pacemaker-fenced after unexpected exit",
             r"pacemaker-.* Connection to cib_.* (failed|closed)",
             r"pacemaker-attrd.*:.*Lost connection to the CIB manager",
             r"pacemaker-controld.*:.*Lost connection to the CIB manager",
             r"pacemaker-controld.*I_ERROR.*handle_cib_disconnect",
             r"pacemaker-controld.* State transition .* S_RECOVERY",
             r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover",
             r"pacemaker-controld.*Could not recover from internal error",
         ]
 
         self._components["pacemaker-based-ignore"] = components_common_ignore + [
             r"pacemaker-execd.*Connection to (fencer|stonith-ng).* (closed|failed|lost)",
             r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error \(Lost connection to fencer\)",
             r"pacemaker-controld.*:Could not connect to attrd: Connection refused",
         ]
 
         self._components["pacemaker-execd"] = [
             r"pacemaker-controld.*Lost connection to local executor",
             r"pacemaker-controld.*I_ERROR.*lrm_connection_destroy",
             r"pacemaker-controld.*State transition .* S_RECOVERY",
             r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover",
             r"pacemaker-controld.*Could not recover from internal error",
             r"pacemakerd.*pacemaker-controld\[[0-9]+\] exited with status 1",
             r"pacemakerd.* Respawning subdaemon pacemaker-execd after unexpected exit",
             r"pacemakerd.* Respawning subdaemon pacemaker-controld after unexpected exit",
         ]
 
         self._components["pacemaker-execd-ignore"] = components_common_ignore + [
             r"pacemaker-(attrd|controld).*Connection to lrmd.* (failed|closed)",
             r"pacemaker-(attrd|controld).*Could not execute alert",
         ]
 
         self._components["pacemaker-controld"] = [
             r"State transition .* -> S_IDLE",
         ]
 
         self._components["pacemaker-controld-ignore"] = components_common_ignore
 
         self._components["pacemaker-attrd"] = []
         self._components["pacemaker-attrd-ignore"] = components_common_ignore + [
             r"pacemaker-controld.*Connection to attrd (IPC failed|closed)",
         ]
 
         self._components["pacemaker-schedulerd"] = [
             r"State transition .* S_RECOVERY",
             r"pacemakerd.* Respawning subdaemon pacemaker-controld after unexpected exit",
             r"pacemaker-controld\[[0-9]+\] exited with status 1 \(",
             r"pacemaker-controld.*Lost connection to the scheduler",
             r"pacemaker-controld.*I_ERROR.*save_cib_contents",
             r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover",
             r"pacemaker-controld.*Could not recover from internal error",
         ]
 
         self._components["pacemaker-schedulerd-ignore"] = components_common_ignore + [
             r"Connection to pengine.* (failed|closed)",
         ]
 
         self._components["pacemaker-fenced"] = [
             r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)",
             r"Lost fencer connection",
             r"pacemaker-controld.*Fencer successfully connected",
         ]
 
         self._components["pacemaker-fenced-ignore"] = components_common_ignore + [
             r"(error|warning):.*Connection to (fencer|stonith-ng).* (closed|failed|lost)",
             r"error:.*Lost fencer connection",
             r"error:.*Fencer connection failed \(will retry\)",
             r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error \(Lost connection to fencer\)",
         ]
 
 
 patternVariants = {
     "crm-base": BasePatterns,
     "crm-corosync": Corosync2Patterns
 }
 
 
 class PatternSelector:
     """Choose from among several Pattern objects and return the information from that object."""
 
     def __init__(self, name="crm-corosync"):
         """
         Create a new PatternSelector object.
 
         Instantiate whatever class is given by name.  Defaults to
         Corosync2Patterns for "crm-corosync" or None.  While other objects
         could be supported in the future, only this and the base object are
         supported at this time.
         """
         self._name = name
 
         # If no name was given, use the default.  Otherwise, look up the
         # appropriate class in patternVariants, instantiate it, and use that.
         if not name:
             self._base = Corosync2Patterns()
         else:
             self._base = patternVariants[name]()
 
     def __getitem__(self, key):
         """
         Return a single pattern from the previously instantiated pattern object.
 
         If no pattern exists for the given key, return None.
         """
         return self._base[key]
 
     def get_patterns(self, kind):
         """Call get_patterns on the previously instantiated pattern object."""
         return self._base.get_patterns(kind)
 
     def get_component(self, kind):
         """Call get_component on the previously instantiated pattern object."""
         return self._base.get_component(kind)
diff --git a/python/pacemaker/_cts/tests/componentfail.py b/python/pacemaker/_cts/tests/componentfail.py
index 2edb4aee83..e3c610193f 100644
--- a/python/pacemaker/_cts/tests/componentfail.py
+++ b/python/pacemaker/_cts/tests/componentfail.py
@@ -1,161 +1,161 @@
 """Kill a pacemaker daemon and test how the cluster recovers."""
 
 __all__ = ["ComponentFail"]
 __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors"
 __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
 
 import re
 
 from pacemaker._cts.audits import AuditResource
 from pacemaker._cts.tests.ctstest import CTSTest
 from pacemaker._cts.tests.simulstartlite import SimulStartLite
 
 # Disable various pylint warnings that occur in so many places throughout this
 # file it's easiest to just take care of them globally.  This does introduce the
 # possibility that we'll miss some other cause of the same warning, but we'll
 # just have to be careful.
 
 # pylint doesn't understand that self._rsh is callable.
 # pylint: disable=not-callable
 # pylint doesn't understand that self._env is subscriptable.
 # pylint: disable=unsubscriptable-object
 
 
 # @TODO Separate this into a separate test for each component, so the patterns
 # can be made specific to each component, investigating failures is a little
 # easier, and specific testing can be done for each component (for example,
 # set attributes before and after killing pacemaker-attrd and check values).
 class ComponentFail(CTSTest):
     """Kill a random pacemaker daemon and wait for the cluster to recover."""
 
     def __init__(self, cm):
         """
         Create a new ComponentFail instance.
 
         Arguments:
         cm -- A ClusterManager instance
         """
         CTSTest.__init__(self, cm)
 
         self.is_unsafe = True
         self.name = "ComponentFail"
 
         self._complist = cm.components
         self._okerrpatterns = []
         self._patterns = []
         self._startall = SimulStartLite(cm)
 
     def __call__(self, node):
         """Perform this test."""
         self.incr("calls")
         self._patterns = []
         self._okerrpatterns = []
 
         # start all nodes
         ret = self._startall(None)
         if not ret:
             return self.failure("Setup failed")
 
         if not self._cm.cluster_stable(self._env["stable_time"]):
             return self.failure("Setup failed - unstable")
 
         # select a component to kill
         chosen = self._env.random_gen.choice(self._complist)
         node_is_dc = self._cm.is_node_dc(node, None)
 
         self.debug(f"...component {chosen.name} (dc={node_is_dc})")
         self.incr(chosen.name)
 
         if chosen.name != "corosync":
             self._patterns.extend([
                 self._cm.templates["Pat:ChildKilled"] % (node, chosen.name),
                 self._cm.templates["Pat:ChildRespawn"] % (node, chosen.name),
             ])
 
         self._patterns.extend(chosen.pats)
 
         # @TODO this should be a flag in the Component
         if chosen.name in ["corosync", "pacemaker-based", "pacemaker-fenced"]:
             # Ignore actions for fence devices if fencer will respawn
             # (their registration will be lost, and probes will fail)
             self._okerrpatterns = [
-                self._cm.templates["Pat:Fencing_active"],
+                self._cm.templates["Pat:Resource_active"],
             ]
             (_, lines) = self._rsh(node, "crm_resource -c", verbose=1)
 
             for line in lines:
                 if re.search("^Resource", line):
                     r = AuditResource(self._cm, line)
 
                     if r.rclass == "stonith":
                         self._okerrpatterns.extend([
                             self._cm.templates["Pat:Fencing_recover"] % r.id,
                             self._cm.templates["Pat:Fencing_probe"] % r.id,
                         ])
 
         # supply a copy so self.patterns doesn't end up empty
         tmp_pats = self._patterns.copy()
         self._patterns.extend(chosen.badnews_ignore)
 
         # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status
         stonith_pats = [
             self._cm.templates["Pat:Fencing_ok"] % node
         ]
 
         stonith = self.create_watch(stonith_pats, 0)
         stonith.set_watch()
 
         # set the watch for stable
         watch = self.create_watch(
             tmp_pats, self._env["dead_time"] + self._env["stable_time"] + self._env["start_time"])
 
         watch.set_watch()
 
         # kill the component
         chosen.signal("KILL", node)
 
         self.debug("Waiting for the cluster to recover")
         self._cm.cluster_stable()
 
         self.debug("Waiting for any fenced node to come back up")
         self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600)
 
         self.debug("Waiting for the cluster to re-stabilize with all nodes")
         self._cm.cluster_stable(self._env["start_time"])
 
         self.debug(f"Checking if {node} was shot")
         shot = stonith.look(60)
 
         if shot:
             self.debug(f"Found: {shot!r}")
             self._okerrpatterns.append(self._cm.templates["Pat:Fencing_start"] % node)
 
             if not self._env["at-boot"]:
                 self._cm.expected_status[node] = "down"
 
             # If fencing occurred, chances are many (if not all) the expected logs
             # will not be sent - or will be lost when the node reboots
             return self.success()
 
         # check for logs indicating a graceful recovery
         matched = watch.look_for_all(allow_multiple_matches=True)
         if watch.unmatched:
             self._logger.log(f"Patterns not found: {watch.unmatched!r}")
 
         self.debug("Waiting for the cluster to re-stabilize with all nodes")
         is_stable = self._cm.cluster_stable(self._env["start_time"])
 
         if not matched:
             return self.failure(f"Didn't find all expected {chosen.name} patterns")
 
         if not is_stable:
             return self.failure(f"Cluster did not become stable after killing {chosen.name}")
 
         return self.success()
 
     @property
     def errors_to_ignore(self):
         """Return a list of errors which should be ignored."""
         # Note that okerrpatterns refers to the last time we ran this test
         # The good news is that this works fine for us...
         self._okerrpatterns.extend(self._patterns)
         return self._okerrpatterns
diff --git a/python/pacemaker/_cts/tests/splitbraintest.py b/python/pacemaker/_cts/tests/splitbraintest.py
index 102ac156b3..99b5dc5d7c 100644
--- a/python/pacemaker/_cts/tests/splitbraintest.py
+++ b/python/pacemaker/_cts/tests/splitbraintest.py
@@ -1,211 +1,212 @@
 """Create a split brain cluster and verify a resource is multiply managed."""
 
 __all__ = ["SplitBrainTest"]
 __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors"
 __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
 
 import time
 
 from pacemaker._cts.input import should_continue
 from pacemaker._cts.tests.ctstest import CTSTest
 from pacemaker._cts.tests.simulstartlite import SimulStartLite
 from pacemaker._cts.tests.starttest import StartTest
 
 # Disable various pylint warnings that occur in so many places throughout this
 # file it's easiest to just take care of them globally.  This does introduce the
 # possibility that we'll miss some other cause of the same warning, but we'll
 # just have to be careful.
 
 # pylint doesn't understand that self._rsh is callable.
 # pylint: disable=not-callable
 # pylint doesn't understand that self._env is subscriptable.
 # pylint: disable=unsubscriptable-object
 
 
 class SplitBrainTest(CTSTest):
     """
     Create a split brain cluster.
 
     This test verifies that one node in each partition takes over the
     resource, resulting in two nodes running the same resource.
     """
 
     def __init__(self, cm):
         """
         Create a new SplitBrainTest instance.
 
         Arguments:
         cm -- A ClusterManager instance
         """
         CTSTest.__init__(self, cm)
 
         self.is_unsafe = True
         self.name = "SplitBrain"
 
         self._start = StartTest(cm)
         self._startall = SimulStartLite(cm)
 
     def _isolate_partition(self, partition):
         """Create a new partition containing the given nodes."""
         other_nodes = self._env["nodes"].copy()
 
         for node in partition:
             try:
                 other_nodes.remove(node)
             except ValueError:
                 self._logger.log(f"Node {node} not in {self._env['nodes']!r} from {partition!r}")
 
         if not other_nodes:
             return
 
         self.debug(f"Creating partition: {partition!r}")
         self.debug(f"Everyone else: {other_nodes!r}")
 
         for node in partition:
             if not self._cm.isolate_node(node, other_nodes):
                 self._logger.log(f"Could not isolate {node}")
                 return
 
     def _heal_partition(self, partition):
         """Move the given nodes out of their own partition back into the cluster."""
         other_nodes = self._env["nodes"].copy()
 
         for node in partition:
             try:
                 other_nodes.remove(node)
             except ValueError:
                 self._logger.log(f"Node {node} not in {self._env['nodes']!r}")
 
         if len(other_nodes) == 0:
             return
 
         self.debug(f"Healing partition: {partition!r}")
         self.debug(f"Everyone else: {other_nodes!r}")
 
         for node in partition:
             self._cm.unisolate_node(node, other_nodes)
 
     def __call__(self, node):
         """Perform this test."""
         self.incr("calls")
         self.passed = True
         partitions = {}
 
         if not self._startall(None):
             return self.failure("Setup failed")
 
         while True:
             # Retry until we get multiple partitions
             partitions = {}
 
             p_max = len(self._env["nodes"])
 
             for n in self._env["nodes"]:
                 p = self._env.random_gen.randint(1, p_max)
 
                 if p not in partitions:
                     partitions[p] = []
 
                 partitions[p].append(n)
 
             p_max = len(partitions)
             if p_max > 1:
                 break
             # else, try again
 
         self.debug(f"Created {p_max} partitions")
         for (key, val) in partitions.items():
             self.debug(f"Partition[{key}]:\t{val!r}")
 
         # Disabling STONITH to reduce test complexity for now
         self._rsh(node, "crm_attribute -V -n stonith-enabled -v false")
 
         for val in partitions.values():
             self._isolate_partition(val)
 
         count = 30
         while count > 0:
             if len(self._cm.find_partitions()) != p_max:
                 time.sleep(10)
                 count -= 1
             else:
                 break
         else:
             self.failure("Expected partitions were not created")
 
         # Target number of partitions formed - wait for stability
         if not self._cm.cluster_stable():
             self.failure("Partitioned cluster not stable")
 
         # Now audit the cluster state
         self._cm.partitions_expected = p_max
         if not self.audit():
             self.failure("Audits failed")
 
         self._cm.partitions_expected = 1
 
         # And heal them again
         for val in partitions.values():
             self._heal_partition(val)
 
         # Wait for a single partition to form
         count = 30
         while count > 0:
             if len(self._cm.find_partitions()) != 1:
                 time.sleep(10)
                 count -= 1
             else:
                 break
         else:
             self.failure("Cluster did not reform")
 
         # Wait for it to have the right number of members
         count = 30
         while count > 0:
             members = []
 
             partitions = self._cm.find_partitions()
             if partitions:
                 members = partitions[0].split()
 
             if len(members) != len(self._env["nodes"]):
                 time.sleep(10)
                 count -= 1
             else:
                 break
         else:
             self.failure("Cluster did not completely reform")
 
         # Wait up to 20 minutes - the delay is more preferable than
         # trying to continue in a messed-up state
         if not self._cm.cluster_stable(1200):
             self.failure("Reformed cluster not stable")
 
             if not should_continue(self._env):
                 raise ValueError("Reformed cluster not stable")
 
         # Turn fencing back on
         if self._env["fencing_enabled"]:
             self._rsh(node, "crm_attribute -V -D -n stonith-enabled")
 
         self._cm.cluster_stable()
 
         if self.passed:
             return self.success()
 
         return self.failure("See previous errors")
 
     @property
     def errors_to_ignore(self):
         """Return a list of errors which should be ignored."""
         return [
             r"Another DC detected:",
             r"(ERROR|error).*: .*Application of an update diff failed",
             r"pacemaker-controld.*:.*not in our membership list",
-            r"CRIT:.*node.*returning after partition"
+            r"CRIT:.*node.*returning after partition",
+            self._cm.templates["Pat:Resource_active"],
         ]
 
     def is_applicable(self):
         """Return True if this test is applicable in the current test configuration."""
         if not CTSTest.is_applicable(self):
             return False
 
         return len(self._env["nodes"]) > 2
diff --git a/python/pacemaker/_cts/tests/stonithdtest.py b/python/pacemaker/_cts/tests/stonithdtest.py
index facc0133d5..f5f43e300f 100644
--- a/python/pacemaker/_cts/tests/stonithdtest.py
+++ b/python/pacemaker/_cts/tests/stonithdtest.py
@@ -1,134 +1,134 @@
 """Fence a running node and wait for it to restart."""
 
 __all__ = ["StonithdTest"]
 __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors"
 __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
 
 from pacemaker.exitstatus import ExitStatus
 from pacemaker._cts.tests.ctstest import CTSTest
 from pacemaker._cts.tests.simulstartlite import SimulStartLite
 from pacemaker._cts.timer import Timer
 
 # Disable various pylint warnings that occur in so many places throughout this
 # file it's easiest to just take care of them globally.  This does introduce the
 # possibility that we'll miss some other cause of the same warning, but we'll
 # just have to be careful.
 
 # pylint doesn't understand that self._rsh is callable.
 # pylint: disable=not-callable
 # pylint doesn't understand that self._env is subscriptable.
 # pylint: disable=unsubscriptable-object
 
 
 class StonithdTest(CTSTest):
     """Fence a running node and wait for it to restart."""
 
     def __init__(self, cm):
         """
         Create a new StonithdTest instance.
 
         Arguments:
         cm -- A ClusterManager instance
         """
         CTSTest.__init__(self, cm)
 
         self.benchmark = True
         self.name = "Stonithd"
 
         self._startall = SimulStartLite(cm)
 
     def __call__(self, node):
         """Perform this test."""
         self.incr("calls")
 
         if len(self._env["nodes"]) < 2:
             return self.skipped()
 
         ret = self._startall(None)
         if not ret:
             return self.failure("Setup failed")
 
         watchpats = [
             self._cm.templates["Pat:Fencing_ok"] % node,
             self._cm.templates["Pat:NodeFenced"] % node,
         ]
 
         if not self._env["at-boot"]:
             self.debug(f"Expecting {node} to stay down")
             self._cm.expected_status[node] = "down"
         else:
             self.debug(f"Expecting {node} to come up again {self._env['at-boot']}")
             watchpats.extend([
                 f"{node}.* S_STARTING -> S_PENDING",
                 f"{node}.* S_PENDING -> S_NOT_DC",
             ])
 
         watch = self.create_watch(watchpats,
                                   30 + self._env["dead_time"] + self._env["stable_time"] + self._env["start_time"])
         watch.set_watch()
 
         origin = self._env.random_gen.choice(self._env["nodes"])
 
         (rc, _) = self._rsh(origin, f"stonith_admin --reboot {node} -VVVVVV")
 
         if rc == ExitStatus.TIMEOUT:
             # Look for the patterns, usually this means the required
             # device was running on the node to be fenced - or that
             # the required devices were in the process of being loaded
             # and/or moved
             #
             # Effectively the node committed suicide so there will be
             # no confirmation, but pacemaker should be watching and
             # fence the node again
             self._logger.log(f"Fencing command on {origin} to fence {node} timed out")
         elif origin != node and rc != 0:
             self.debug("Waiting for the cluster to recover")
             self._cm.cluster_stable()
 
             self.debug("Waiting for fenced node to come back up")
             self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600)
 
             self._logger.log(f"Fencing command on {origin} failed to fence {node} (rc={rc})")
         elif origin == node and rc != 255:
             # 255 == broken pipe, i.e. the node was fenced as expected
             self._logger.log(f"Locally originated fencing returned {rc}")
 
         with Timer(self._logger, self.name, "fence"):
             matched = watch.look_for_all()
 
         self.set_timer("reform")
         if watch.unmatched:
             self._logger.log(f"Patterns not found: {watch.unmatched!r}")
 
         self.debug("Waiting for the cluster to recover")
         self._cm.cluster_stable()
 
         self.debug("Waiting for fenced node to come back up")
         self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600)
 
         self.debug("Waiting for the cluster to re-stabilize with all nodes")
         is_stable = self._cm.cluster_stable(self._env["start_time"])
 
         if not matched:
             return self.failure("Didn't find all expected patterns")
 
         if not is_stable:
             return self.failure("Cluster did not become stable")
 
         self.log_timer("reform")
         return self.success()
 
     @property
     def errors_to_ignore(self):
         """Return a list of errors which should be ignored."""
         return [
             self._cm.templates["Pat:Fencing_start"] % ".*",
             self._cm.templates["Pat:Fencing_ok"] % ".*",
-            self._cm.templates["Pat:Fencing_active"],
+            self._cm.templates["Pat:Resource_active"],
             r"error.*: Operation 'reboot' targeting .* by .* for stonith_admin.*: Timer expired"
         ]
 
     def is_applicable(self):
         """Return True if this test is applicable in the current test configuration."""
         return self._env["fencing_enabled"] and CTSTest.is_applicable(self)
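
One convention ties the three call sites together: templates that contain %s
placeholders (Pat:Fencing_start, Pat:Fencing_ok) are interpolated before use,
while Pat:Resource_active takes no arguments and is appended as-is. A
standalone illustration, with both template strings copied from patterns.py
above:

    templates = {
        # Excerpts copied from BasePatterns._search
        "Pat:Fencing_start": r"Requesting peer fencing .* targeting %s",
        "Pat:Resource_active": r"resource .* might be active on \d+ nodes \(attempting recovery\)",
    }

    # Parameterized patterns get a target filled in (".*" here, as in
    # StonithdTest.errors_to_ignore); Pat:Resource_active is used verbatim
    ignore = [
        templates["Pat:Fencing_start"] % ".*",
        templates["Pat:Resource_active"],
    ]

    assert ignore[0] == r"Requesting peer fencing .* targeting .*"
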