diff --git a/python/pacemaker/_cts/CTS.py b/python/pacemaker/_cts/CTS.py index 6c3fcc9fbb..81b50ab50e 100644 --- a/python/pacemaker/_cts/CTS.py +++ b/python/pacemaker/_cts/CTS.py @@ -1,227 +1,221 @@ """Main classes for Pacemaker's Cluster Test Suite (CTS).""" __all__ = ["CtsLab", "NodeStatus", "Process"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import sys import time import traceback from pacemaker.exitstatus import ExitStatus from pacemaker._cts.environment import EnvFactory from pacemaker._cts.input import should_continue from pacemaker._cts.logging import LogFactory from pacemaker._cts.remote import RemoteFactory class CtsLab: """ A class that defines the Lab Environment for the Cluster Test System. It defines those things which are expected to change from test environment to test environment for the same cluster manager. This is where you define the set of nodes that are in your test lab, what kind of reset mechanism you use, etc. All this data is stored as key/value pairs in an Environment instance constructed from arguments passed to this class. The CTS code ignores names it doesn't know about or need. Individual tests have access to this information, and it is perfectly acceptable to provide hints, tweaks, fine-tuning directions, or other information to the tests through this mechanism. """ def __init__(self, args=None): """ Create a new CtsLab instance. This class can be treated kind of like a dictionary due to the presence of typical dict functions like __contains__, __getitem__, and __setitem__. However, it is not a dictionary so do not rely on standard dictionary behavior. Arguments: args -- A list of command line parameters, minus the program name. """ self._env = EnvFactory().getInstance(args) self._logger = LogFactory() def dump(self): """Print the current environment.""" self._env.dump() def __contains__(self, key): """Return True if the given environment key exists.""" # pylint gets confused because of EnvFactory here. # pylint: disable=unsupported-membership-test return key in self._env def __getitem__(self, key): """Return the given environment key, or raise KeyError if it does not exist.""" # Throughout this file, pylint has trouble understanding that EnvFactory # and RemoteFactory are singleton instances that can be treated as callable # and subscriptable objects. Various warnings are disabled because of this. # See also a comment about self._rsh in environment.py. # pylint: disable=unsubscriptable-object return self._env[key] def __setitem__(self, key, value): """Set the given environment key to the given value, overriding any previous value.""" # pylint: disable=unsupported-assignment-operation self._env[key] = value def run(self, scenario, iterations): """ Run the given scenario the given number of times. Returns ExitStatus.OK on success, or ExitStatus.ERROR on error. """ if not scenario: self._logger.log("No scenario was defined") return ExitStatus.ERROR self._logger.log("Cluster nodes: ") # pylint: disable=unsubscriptable-object for node in self._env["nodes"]: self._logger.log(f" * {node}") if not scenario.setup(): return ExitStatus.ERROR # We want to alert on any exceptions caused by running a scenario, so # here it's okay to disable the pylint warning. # pylint: disable=bare-except try: scenario.run(iterations) except: # noqa: E722 self._logger.log(f"Exception by {sys.exc_info()[0]}") self._logger.traceback(traceback) scenario.summarize() scenario.teardown() return ExitStatus.ERROR scenario.teardown() scenario.summarize() if scenario.stats["failure"] > 0: return ExitStatus.ERROR if scenario.stats["success"] != iterations: self._logger.log("No failure count but success != requested iterations") return ExitStatus.ERROR return ExitStatus.OK class NodeStatus: """ A class for querying the status of cluster nodes. Are nodes up? Do they respond to SSH connections? """ def __init__(self, env): """ Create a new NodeStatus instance. Arguments: env -- An Environment instance """ self._env = env def _node_booted(self, node): """Return True if the given node is booted (responds to pings).""" # pylint: disable=not-callable (rc, _) = RemoteFactory().getInstance()("localhost", f"ping -nq -c1 -w1 {node}", verbose=0) return rc == 0 def _sshd_up(self, node): """Return true if sshd responds on the given node.""" # pylint: disable=not-callable (rc, _) = RemoteFactory().getInstance()(node, "true", verbose=0) return rc == 0 def wait_for_node(self, node, timeout=300): """ Wait for a node to become available. Should the timeout be reached, the user will be given a choice whether to continue or not. If not, ValueError will be raised. Returns True when the node is available, or False if the timeout is reached. """ initial_timeout = timeout anytimeouts = False while timeout > 0: if self._node_booted(node) and self._sshd_up(node): if anytimeouts: # Fudge to wait for the system to finish coming up time.sleep(30) LogFactory().debug(f"Node {node} now up") return True time.sleep(30) if not anytimeouts: LogFactory().debug(f"Waiting for node {node} to come up") anytimeouts = True timeout -= 1 LogFactory().log(f"{node} did not come up within {initial_timeout} tries") if not should_continue(self._env["continue"]): raise ValueError(f"{node} did not come up within {initial_timeout} tries") return False def wait_for_all_nodes(self, nodes, timeout=300): """Return True when all nodes come up, or False if the timeout is reached.""" for node in nodes: if not self.wait_for_node(node, timeout): return False return True class Process: """A class for managing a Pacemaker daemon.""" # pylint: disable=invalid-name - def __init__(self, cm, name, pats=None, dc_pats=None, badnews_ignore=None): + def __init__(self, cm, name, pats=None, badnews_ignore=None): """ Create a new Process instance. Arguments: cm -- A ClusterManager instance name -- The command being run pats -- Regexes we expect to find in log files - dc_pats -- Additional DC-specific regexes we expect to find - in log files badnews_ignore -- Regexes for lines in the log that can be ignored """ self._cm = cm self.badnews_ignore = badnews_ignore - self.dc_pats = dc_pats self.name = name self.pats = pats if self.badnews_ignore is None: self.badnews_ignore = [] - if self.dc_pats is None: - self.dc_pats = [] - if self.pats is None: self.pats = [] def kill(self, node): """Kill the instance of this process running on the given node.""" (rc, _) = self._cm.rsh(node, f"killall -9 {self.name}") if rc != 0: self._cm.log(f"ERROR: Kill {self.name} failed on node {node}") diff --git a/python/pacemaker/_cts/cmcorosync.py b/python/pacemaker/_cts/cmcorosync.py index 9430d9347b..3cd102c6cb 100644 --- a/python/pacemaker/_cts/cmcorosync.py +++ b/python/pacemaker/_cts/cmcorosync.py @@ -1,60 +1,53 @@ """Corosync-specific class for Pacemaker's Cluster Test Suite (CTS).""" __all__ = ["Corosync2"] __copyright__ = "Copyright 2007-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.CTS import Process from pacemaker._cts.clustermanager import ClusterManager # Throughout this file, pylint has trouble understanding that EnvFactory # is a singleton instance that can be treated as a subscriptable object. # Various warnings are disabled because of this. See also a comment about # self._rsh in environment.py. # pylint: disable=unsubscriptable-object class Corosync2(ClusterManager): """A subclass of ClusterManager specialized to handle corosync2 and later based clusters.""" def __init__(self): """Create a new Corosync2 instance.""" ClusterManager.__init__(self) self._components = {} @property def components(self): """Return a list of patterns that should be ignored for the cluster's components.""" if not self._components: common_ignore = self.templates.get_component("common-ignore") daemons = [ "pacemaker-based", "pacemaker-controld", "pacemaker-attrd", "pacemaker-execd", "pacemaker-fenced" ] for c in daemons: badnews = self.templates.get_component(f"{c}-ignore") + common_ignore proc = Process(self, c, pats=self.templates.get_component(c), badnews_ignore=badnews) self._components[c] = proc - # the scheduler uses dc_pats instead of pats - badnews = self.templates.get_component("pacemaker-schedulerd-ignore") + common_ignore - proc = Process(self, "pacemaker-schedulerd", - dc_pats=self.templates.get_component("pacemaker-schedulerd"), - badnews_ignore=badnews) - self._components["pacemaker-schedulerd"] = proc - # add (or replace) extra components badnews = self.templates.get_component("corosync-ignore") + common_ignore proc = Process(self, "corosync", pats=self.templates.get_component("corosync"), badnews_ignore=badnews) self._components["corosync"] = proc if self.env["DoFencing"]: return list(self._components.values()) return [v for k, v in self._components.items() if k != "pacemaker-fenced"] diff --git a/python/pacemaker/_cts/patterns.py b/python/pacemaker/_cts/patterns.py index 12414f551c..74e3c683ad 100644 --- a/python/pacemaker/_cts/patterns.py +++ b/python/pacemaker/_cts/patterns.py @@ -1,384 +1,388 @@ """Pattern-holding classes for Pacemaker's Cluster Test Suite (CTS).""" __all__ = ["PatternSelector"] __copyright__ = "Copyright 2008-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+)" from pacemaker.buildoptions import BuildOptions class BasePatterns: """ The base class for holding a stack-specific set of command and log file/stdout patterns. Stack-specific classes need to be built on top of this one. """ def __init__(self): """Create a new BasePatterns instance which holds a very minimal set of basic patterns.""" self._bad_news = [] self._components = {} self._name = "crm-base" self._ignore = [ "avoid confusing Valgrind", # Logging bug in some versions of libvirtd r"libvirtd.*: internal error: Failed to parse PCI config address", # pcs can log this when node is fenced, but fencing is OK in some # tests (and we will catch it in pacemaker logs when not OK) r"pcs.daemon:No response from: .* request: get_configs, error:", # This is overbroad, but there's no way to say that only certain # transition errors are acceptable. We have to rely on causes of a # transition error logging their own error message, which should # always be the case. r"pacemaker-schedulerd.* Calculated transition .*/pe-error", # This message comes up periodically but doesn't actually seem to # be related to any specific test failure, so just ignore it. r"pacemaker-based.* Local CIB .* differs from", ] self._commands = { "StatusCmd": "crmadmin -t 60 -S %s 2>/dev/null", "CibQuery": "cibadmin -Q", "CibAddXml": "cibadmin --modify -c --xml-text %s", "CibDelXpath": "cibadmin --delete --xpath %s", "RscRunning": BuildOptions.DAEMON_DIR + "/cts-exec-helper -R -r %s", "CIBfile": "%s:" + BuildOptions.CIB_DIR + "/cib.xml", "TmpDir": "/tmp", "BreakCommCmd": "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1", "FixCommCmd": "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1", "MaintenanceModeOn": "cibadmin --modify -c --xml-text ''", "MaintenanceModeOff": "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"", "StandbyCmd": "crm_attribute -Vq -U %s -n standby -l forever -v %s 2>/dev/null", "StandbyQueryCmd": "crm_attribute -qG -U %s -n standby -l forever -d off 2>/dev/null", } self._search = { "Pat:DC_IDLE": r"pacemaker-controld.*State transition.*-> S_IDLE", # This won't work if we have multiple partitions "Pat:Local_started": r"%s\W.*controller successfully started", "Pat:NonDC_started": r"%s\W.*State transition.*-> S_NOT_DC", "Pat:DC_started": r"%s\W.*State transition.*-> S_IDLE", "Pat:We_stopped": r"%s\W.*OVERRIDE THIS PATTERN", "Pat:They_stopped": r"%s\W.*LOST:.* %s ", "Pat:They_dead": r"node %s.*: is dead", "Pat:They_up": r"%s %s\W.*OVERRIDE THIS PATTERN", "Pat:TransitionComplete": "Transition status: Complete: complete", "Pat:Fencing_start": r"Requesting peer fencing .* targeting %s", "Pat:Fencing_ok": r"pacemaker-fenced.*:\s*Operation .* targeting %s by .* for .*@.*: OK", "Pat:Fencing_recover": r"pacemaker-schedulerd.*: Recover\s+%s", "Pat:Fencing_active": r"stonith resource .* is active on 2 nodes (attempting recovery)", "Pat:Fencing_probe": r"pacemaker-controld.* Result of probe operation for %s on .*: Error", "Pat:RscOpOK": r"pacemaker-controld.*:\s+Result of %s operation for %s.*: (0 \()?OK", "Pat:RscOpFail": r"pacemaker-schedulerd.*:.*Unexpected result .* recorded for %s of %s ", "Pat:CloneOpFail": r"pacemaker-schedulerd.*:.*Unexpected result .* recorded for %s of (%s|%s) ", "Pat:RscRemoteOpOK": r"pacemaker-controld.*:\s+Result of %s operation for %s on %s: (0 \()?OK", "Pat:NodeFenced": r"pacemaker-controld.*:\s* Peer %s was terminated \(.*\) by .* on behalf of .*: OK", } def get_component(self, key): """ Return the patterns for a single component as a list, given by key. This is typically the name of some subprogram (pacemaker-based, pacemaker-fenced, etc.) or various special purpose keys. If key is unknown, return an empty list. """ if key in self._components: return self._components[key] print(f"Unknown component '{key}' for {self._name}") return [] def get_patterns(self, key): """ Return various patterns supported by this object, given by key. Depending on the key, this could either be a list or a hash. If key is unknown, return None. """ if key == "BadNews": return self._bad_news if key == "BadNewsIgnore": return self._ignore if key == "Commands": return self._commands if key == "Search": return self._search if key == "Components": return self._components print(f"Unknown pattern '{key}' for {self._name}") return None def __getitem__(self, key): if key in self._commands: return self._commands[key] if key in self._search: return self._search[key] print(f"Unknown template '{key}' for {self._name}") return None class Corosync2Patterns(BasePatterns): """Patterns for Corosync version 2 cluster manager class.""" + # @FIXME Some of the templates here look like they start with + # incorrect daemon names. Also, many of them aren't Corosync- + # specific and should probably go in BasePatterns. + def __init__(self): BasePatterns.__init__(self) self._name = "crm-corosync" self._commands.update({ "StartCmd": "service corosync start && service pacemaker start", "StopCmd": "service pacemaker stop; [ ! -e /usr/sbin/pacemaker-remoted ] || service pacemaker_remote stop; service corosync stop", "EpochCmd": "crm_node -e", "QuorumCmd": "crm_node -q", "PartitionCmd": "crm_node -p", }) self._search.update({ # Close enough ... "Corosync Cluster Engine exiting normally" isn't # printed reliably. "Pat:We_stopped": r"%s\W.*Unloading all Corosync service engines", "Pat:They_stopped": r"%s\W.*pacemaker-controld.*Node %s(\[|\s).*state is now lost", "Pat:They_dead": r"pacemaker-controld.*Node %s(\[|\s).*state is now lost", "Pat:They_up": r"\W%s\W.*pacemaker-controld.*Node %s state is now member", "Pat:ChildExit": r"\[[0-9]+\] exited with status [0-9]+ \(", # "with signal 9" == pcmk_child_exit(), "$" == check_active_before_startup_processes() "Pat:ChildKilled": r"%s\W.*pacemakerd.*%s\[[0-9]+\] terminated( with signal 9|$)", "Pat:ChildRespawn": r"%s\W.*pacemakerd.*Respawning subdaemon %s after unexpected exit", "Pat:InfraUp": r"%s\W.*corosync.*Initializing transport", "Pat:PacemakerUp": r"%s\W.*pacemakerd.*Starting Pacemaker", }) self._ignore += [ r"crm_mon:", r"crmadmin:", r"update_trace_data", r"async_notify:.*strange, client not found", r"Parse error: Ignoring unknown option .*nodename", r"error.*: Operation 'reboot' .* using FencingFail returned ", r"getinfo response error: 1$", r"sbd.* error: inquisitor_child: DEBUG MODE IS ACTIVE", r"sbd.* pcmk:\s*error:.*Connection to cib_ro.* (failed|closed)", ] self._bad_news = [ r"[^(]error:", r"crit:", r"ERROR:", r"CRIT:", r"Shutting down...NOW", r"Timer I_TERMINATE just popped", r"input=I_ERROR", r"input=I_FAIL", r"input=I_INTEGRATED cause=C_TIMER_POPPED", r"input=I_FINALIZED cause=C_TIMER_POPPED", r"input=I_ERROR", r"(pacemakerd|pacemaker-execd|pacemaker-controld):.*, exiting", r"schedulerd.*Attempting recovery of resource", r"is taking more than 2x its timeout", r"Confirm not received from", r"Welcome reply not received from", r"Attempting to schedule .* after a stop", r"Resource .* was active at shutdown", r"duplicate entries for call_id", r"Search terminated:", r":global_timer_callback", r"Faking parameter digest creation", r"Parameters to .* action changed:", r"Parameters to .* changed", r"pacemakerd.*\[[0-9]+\] terminated( with signal|$)", r"pacemakerd.*\[[0-9]+\] .* will now be killed", r"pacemaker-schedulerd.*Recover\s+.*\(.* -\> .*\)", r"rsyslogd.* lost .* due to rate-limiting", r"Peer is not part of our cluster", r"We appear to be in an election loop", r"Unknown node -> we will not deliver message", r"(Blackbox dump requested|Problem detected)", r"pacemakerd.*Could not connect to Cluster Configuration Database API", r"Receiving messages from a node we think is dead", r"share the same cluster nodeid", r"share the same name", r"pacemaker-controld:.*Transition failed: terminated", r"Local CIB .* differs from .*:", r"warn.*:\s*Continuing but .* will NOT be used", r"warn.*:\s*Cluster configuration file .* is corrupt", r"Election storm", r"stalled the FSA with pending inputs", ] self._components["common-ignore"] = [ r"Pending action:", r"resource( was|s were) active at shutdown", r"pending LRM operations at shutdown", r"Lost connection to the CIB manager", r"pacemaker-controld.*:\s*Action A_RECOVER .* not supported", r"pacemaker-controld.*:\s*Exiting now due to errors", r".*:\s*Requesting fencing \([^)]+\) targeting node ", r"(Blackbox dump requested|Problem detected)", ] self._components["corosync-ignore"] = [ r"Could not connect to Corosync CFG: CS_ERR_LIBRARY", r"error:.*Connection to the CPG API failed: Library error", r"\[[0-9]+\] exited with status [0-9]+ \(", r"\[[0-9]+\] terminated with signal 15", r"pacemaker-based.*error:.*Corosync connection lost", r"pacemaker-fenced.*error:.*Corosync connection terminated", r"pacemaker-controld.*State transition .* S_RECOVERY", r"pacemaker-controld.*error:.*Input (I_ERROR|I_TERMINATE ) .*received in state", r"pacemaker-controld.*error:.*Could not recover from internal error", r"error:.*Connection to cib_(shm|rw).* (failed|closed)", r"error:.*cib_(shm|rw) IPC provider disconnected while waiting", r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"error: Lost fencer connection", ] self._components["corosync"] = [ # We expect each daemon to lose its cluster connection. # However, if the CIB manager loses its connection first, # it's possible for another daemon to lose that connection and # exit before losing the cluster connection. r"pacemakerd.*:\s*warning:.*Lost connection to cluster layer", r"pacemaker-attrd.*:\s*(crit|error):.*Lost connection to (Corosync process group|the CIB manager)", r"pacemaker-based.*:\s*crit:.*Exiting immediately after losing connection to cluster layer", r"pacemaker-controld.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", r"pacemaker-fenced.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", r"schedulerd.*Scheduling node .* for fencing", r"pacemaker-controld.*:\s*Peer .* was terminated \(.*\) by .* on behalf of .*:\s*OK", ] self._components["pacemaker-based"] = [ r"pacemakerd.* pacemaker-attrd\[[0-9]+\] exited with status 102", r"pacemakerd.* pacemaker-controld\[[0-9]+\] exited with status 1", r"pacemakerd.* Respawning subdaemon pacemaker-attrd after unexpected exit", r"pacemakerd.* Respawning subdaemon pacemaker-based after unexpected exit", r"pacemakerd.* Respawning subdaemon pacemaker-controld after unexpected exit", r"pacemakerd.* Respawning subdaemon pacemaker-fenced after unexpected exit", r"pacemaker-.* Connection to cib_.* (failed|closed)", r"pacemaker-attrd.*:.*Lost connection to the CIB manager", r"pacemaker-controld.*:.*Lost connection to the CIB manager", r"pacemaker-controld.*I_ERROR.*handle_cib_disconnect", r"pacemaker-controld.* State transition .* S_RECOVERY", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*Could not recover from internal error", ] self._components["pacemaker-based-ignore"] = [ r"pacemaker-execd.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error \(Lost connection to fencer\)", r"pacemaker-controld.*:Could not connect to attrd: Connection refused", ] self._components["pacemaker-execd"] = [ r"pacemaker-controld.*Lost connection to local executor", r"pacemaker-controld.*I_ERROR.*lrm_connection_destroy", r"pacemaker-controld.*State transition .* S_RECOVERY", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*Could not recover from internal error", r"pacemakerd.*pacemaker-controld\[[0-9]+\] exited with status 1", r"pacemakerd.* Respawning subdaemon pacemaker-execd after unexpected exit", r"pacemakerd.* Respawning subdaemon pacemaker-controld after unexpected exit", ] self._components["pacemaker-execd-ignore"] = [ r"pacemaker-(attrd|controld).*Connection to lrmd.* (failed|closed)", r"pacemaker-(attrd|controld).*Could not execute alert", ] self._components["pacemaker-controld"] = [ r"State transition .* -> S_IDLE", ] self._components["pacemaker-controld-ignore"] = [] self._components["pacemaker-attrd"] = [] self._components["pacemaker-attrd-ignore"] = [ r"pacemaker-controld.*Connection to attrd (IPC failed|closed)", ] self._components["pacemaker-schedulerd"] = [ r"State transition .* S_RECOVERY", r"pacemakerd.* Respawning subdaemon pacemaker-controld after unexpected exit", r"pacemaker-controld\[[0-9]+\] exited with status 1 \(", r"pacemaker-controld.*Lost connection to the scheduler", r"pacemaker-controld.*I_ERROR.*save_cib_contents", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*Could not recover from internal error", ] self._components["pacemaker-schedulerd-ignore"] = [ r"Connection to pengine.* (failed|closed)", ] self._components["pacemaker-fenced"] = [ r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"Lost fencer connection", r"pacemaker-controld.*Fencer successfully connected", ] self._components["pacemaker-fenced-ignore"] = [ r"(error|warning):.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"error:.*Lost fencer connection", r"error:.*Fencer connection failed \(will retry\)", r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error \(Lost connection to fencer\)", ] self._components["pacemaker-fenced-ignore"].extend(self._components["common-ignore"]) patternVariants = { "crm-base": BasePatterns, "crm-corosync": Corosync2Patterns } class PatternSelector: """Choose from among several Pattern objects and return the information from that object.""" def __init__(self, name="crm-corosync"): """ Create a new PatternSelector object. Instantiate whatever class is given by name. Defaults to Corosync2Patterns for "crm-corosync" or None. While other objects could be supported in the future, only this and the base object are supported at this time. """ self._name = name # If no name was given, use the default. Otherwise, look up the appropriate # class in patternVariants, instantiate it, and use that. if not name: self._base = Corosync2Patterns() else: self._base = patternVariants[name]() def __getitem__(self, key): """ Return a single pattern from the previously instantiated pattern object. If no pattern exists for the given key, return None. """ return self._base[key] def get_patterns(self, kind): """Call get_patterns on the previously instantiated pattern object.""" return self._base.get_patterns(kind) def get_component(self, kind): """Call get_component on the previously instantiated pattern object.""" return self._base.get_component(kind) diff --git a/python/pacemaker/_cts/tests/componentfail.py b/python/pacemaker/_cts/tests/componentfail.py index ba06124084..c39782ab45 100644 --- a/python/pacemaker/_cts/tests/componentfail.py +++ b/python/pacemaker/_cts/tests/componentfail.py @@ -1,164 +1,161 @@ """Kill a pacemaker daemon and test how the cluster recovers.""" __all__ = ["ComponentFail"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import re from pacemaker._cts.audits import AuditResource from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object # @TODO Separate this into a separate test for each component, so the patterns # can be made specific to each component, investigating failures is a little # easier, and specific testing can be done for each component (for example, # set attributes before and after killing pacemaker-attrd and check values). class ComponentFail(CTSTest): """Kill a random pacemaker daemon and wait for the cluster to recover.""" def __init__(self, cm): """ Create a new ComponentFail instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.is_unsafe = True self.name = "ComponentFail" self._complist = cm.components self._okerrpatterns = [] self._patterns = [] self._startall = SimulStartLite(cm) def __call__(self, node): """Perform this test.""" self.incr("calls") self._patterns = [] self._okerrpatterns = [] # start all nodes ret = self._startall(None) if not ret: return self.failure("Setup failed") if not self._cm.cluster_stable(self._env["StableTime"]): return self.failure("Setup failed - unstable") - node_is_dc = self._cm.is_node_dc(node, None) - # select a component to kill chosen = self._env.random_gen.choice(self._complist) + node_is_dc = self._cm.is_node_dc(node, None) self.debug(f"...component {chosen.name} (dc={node_is_dc})") self.incr(chosen.name) if chosen.name != "corosync": self._patterns.extend([ self._cm.templates["Pat:ChildKilled"] % (node, chosen.name), self._cm.templates["Pat:ChildRespawn"] % (node, chosen.name), ]) self._patterns.extend(chosen.pats) - if node_is_dc: - self._patterns.extend(chosen.dc_pats) # @TODO this should be a flag in the Component if chosen.name in ["corosync", "pacemaker-based", "pacemaker-fenced"]: # Ignore actions for fence devices if fencer will respawn # (their registration will be lost, and probes will fail) self._okerrpatterns = [ self._cm.templates["Pat:Fencing_active"], ] (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) for line in lines: if re.search("^Resource", line): r = AuditResource(self._cm, line) if r.rclass == "stonith": self._okerrpatterns.extend([ self._cm.templates["Pat:Fencing_recover"] % r.id, self._cm.templates["Pat:Fencing_probe"] % r.id, ]) # supply a copy so self.patterns doesn't end up empty tmp_pats = self._patterns.copy() self._patterns.extend(chosen.badnews_ignore) # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status stonith_pats = [ self._cm.templates["Pat:Fencing_ok"] % node ] stonith = self.create_watch(stonith_pats, 0) stonith.set_watch() # set the watch for stable watch = self.create_watch( tmp_pats, self._env["DeadTime"] + self._env["StableTime"] + self._env["StartTime"]) watch.set_watch() # kill the component chosen.kill(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() self.debug("Waiting for any fenced node to come back up") self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) self.debug("Waiting for the cluster to re-stabilize with all nodes") self._cm.cluster_stable(self._env["StartTime"]) self.debug(f"Checking if {node} was shot") shot = stonith.look(60) if shot: self.debug(f"Found: {shot!r}") self._okerrpatterns.append(self._cm.templates["Pat:Fencing_start"] % node) if not self._env["at-boot"]: self._cm.expected_status[node] = "down" # If fencing occurred, chances are many (if not all) the expected logs # will not be sent - or will be lost when the node reboots return self.success() # check for logs indicating a graceful recovery matched = watch.look_for_all(allow_multiple_matches=True) if watch.unmatched: self._logger.log(f"Patterns not found: {watch.unmatched!r}") self.debug("Waiting for the cluster to re-stabilize with all nodes") is_stable = self._cm.cluster_stable(self._env["StartTime"]) if not matched: return self.failure(f"Didn't find all expected {chosen.name} patterns") if not is_stable: return self.failure(f"Cluster did not become stable after killing {chosen.name}") return self.success() @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" # Note that okerrpatterns refers to the last time we ran this test # The good news is that this works fine for us... self._okerrpatterns.extend(self._patterns) return self._okerrpatterns