# =============================================================================
# python/pacemaker/_cts/CTS.py
#
# NOTE(review): this listing shows the post-patch contents of the two modules
# touched by the change (CTS.py, then scenarios.py further down).  They are
# separate files; they are listed back to back here only because the change
# touches both.
# =============================================================================
""" Main classes for Pacemaker's Cluster Test Suite (CTS) """

__all__ = ["CtsLab", "NodeStatus", "Process"]
__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"

import sys
import time
import traceback

from pacemaker.exitstatus import ExitStatus
from pacemaker._cts.environment import EnvFactory
from pacemaker._cts.input import should_continue
from pacemaker._cts.logging import LogFactory
from pacemaker._cts.remote import RemoteFactory


class CtsLab:
    """ A class that defines the Lab Environment for the Cluster Test System.
        It defines those things which are expected to change from test
        environment to test environment for the same cluster manager.

        This is where you define the set of nodes that are in your test lab,
        what kind of reset mechanism you use, etc.  All this data is stored
        as key/value pairs in an Environment instance constructed from
        arguments passed to this class.

        The CTS code ignores names it doesn't know about or need.  Individual
        tests have access to this information, and it is perfectly acceptable
        to provide hints, tweaks, fine-tuning directions, or other information
        to the tests through this mechanism.
    """

    def __init__(self, args=None):
        """ Create a new CtsLab instance.  This class can be treated kind
            of like a dictionary due to the presence of typical dict functions
            like __contains__, __getitem__, and __setitem__.  However, it is
            not a dictionary so do not rely on standard dictionary behavior.

            Arguments:

            args -- A list of command line parameters, minus the program name.
        """

        self._env = EnvFactory().getInstance(args)
        self._logger = LogFactory()

    def dump(self):
        """ Print the current environment """

        self._env.dump()

    def __contains__(self, key):
        """ Does the given environment key exist? """

        # pylint gets confused because of EnvFactory here.
        # pylint: disable=unsupported-membership-test
        return key in self._env

    def __getitem__(self, key):
        """ Return the given environment key, or raise KeyError if it does
            not exist
        """

        # Throughout this file, pylint has trouble understanding that EnvFactory
        # and RemoteFactory are singleton instances that can be treated as callable
        # and subscriptable objects.  Various warnings are disabled because of this.
        # See also a comment about self._rsh in environment.py.
        # pylint: disable=unsubscriptable-object
        return self._env[key]

    def __setitem__(self, key, value):
        """ Set the given environment key to the given value, overriding any
            previous value
        """

        # pylint: disable=unsupported-assignment-operation
        self._env[key] = value

    def run(self, scenario, iterations):
        """ Run the given scenario the given number of times.

            The scenario is set up, run, torn down and summarized.  If
            running it raises, the exception is logged and the scenario is
            summarized and torn down before returning.

            Arguments:

            scenario   -- The Scenario to run
            iterations -- How many successful tests are expected

            Returns:

            ExitStatus.OK on success, or ExitStatus.ERROR on error
        """

        if not scenario:
            self._logger.log("No scenario was defined")
            return ExitStatus.ERROR

        self._logger.log("Cluster nodes: ")
        # pylint: disable=unsubscriptable-object
        for node in self._env["nodes"]:
            self._logger.log(" * %s" % (node))

        if not scenario.setup():
            return ExitStatus.ERROR

        # We want to alert on any exceptions caused by running a scenario, so
        # here it's okay to disable the pylint warning.
        # pylint: disable=bare-except
        try:
            scenario.run(iterations)
        except:
            self._logger.log("Exception by %s" % sys.exc_info()[0])
            # The traceback *module* is handed to the logger here.
            # NOTE(review): presumably the logger formats the active
            # exception through it -- confirm against LogFactory.
            self._logger.traceback(traceback)

            scenario.summarize()
            scenario.teardown()
            return ExitStatus.ERROR

        scenario.teardown()
        scenario.summarize()

        if scenario.stats["failure"] > 0:
            return ExitStatus.ERROR

        if scenario.stats["success"] != iterations:
            self._logger.log("No failure count but success != requested iterations")
            return ExitStatus.ERROR

        return ExitStatus.OK


class NodeStatus:
    """ A class for querying the status of cluster nodes - are nodes up?  Do
        they respond to SSH connections?
    """

    def __init__(self, env):
        """ Create a new NodeStatus instance

            Arguments:

            env -- An Environment instance
        """

        self._env = env

    def _node_booted(self, node):
        """ Return True if the given node is booted (responds to pings) """

        # pylint: disable=not-callable
        (rc, _) = RemoteFactory().getInstance()("localhost", "ping -nq -c1 -w1 %s" % node, verbose=0)
        return rc == 0

    def _sshd_up(self, node):
        """ Return true if sshd responds on the given node """

        # pylint: disable=not-callable
        (rc, _) = RemoteFactory().getInstance()(node, "true", verbose=0)
        return rc == 0

    def wait_for_node(self, node, timeout=300):
        """ Wait for a node to become available.  Should the timeout be reached,
            the user will be given a choice whether to continue or not.  If not,
            ValueError will be raised.

            Arguments:

            node    -- The node to wait for
            timeout -- The number of attempts to make; there is a 30 second
                       sleep after every failed attempt

            Returns:

            True when the node is available, or False if the timeout is reached.
        """

        initial_timeout = timeout
        anytimeouts = False

        while timeout > 0:
            if self._node_booted(node) and self._sshd_up(node):
                if anytimeouts:
                    # Fudge to wait for the system to finish coming up
                    time.sleep(30)
                    LogFactory().debug("Node %s now up" % node)

                return True

            time.sleep(30)
            if not anytimeouts:
                # Only announce the wait once, on the first failed attempt
                LogFactory().debug("Waiting for node %s to come up" % node)

            anytimeouts = True
            timeout -= 1

        LogFactory().log("%s did not come up within %d tries" % (node, initial_timeout))

        # NOTE(review): scenarios.py passes the whole Env object to
        # should_continue(), while this call passes Env["continue"].  One of
        # the two call styles is presumably wrong -- confirm against
        # pacemaker._cts.input.should_continue.
        if not should_continue(self._env["continue"]):
            raise ValueError("%s did not come up within %d tries" % (node, initial_timeout))

        return False

    def wait_for_all_nodes(self, nodes, timeout=300):
        """ Return True when all nodes come up, or False if the timeout is
            reached.  Nodes are checked in order, stopping at the first one
            that does not come up.
        """

        return all(self.wait_for_node(node, timeout) for node in nodes)


class Process:
    """ A class for managing a Pacemaker daemon """

    # pylint: disable=invalid-name
    def __init__(self, cm, name, dc_only=False, pats=None, dc_pats=None,
                 badnews_ignore=None):
        """ Create a new Process instance.

            Arguments:

            cm             -- A ClusterManager instance
            name           -- The command being run
            dc_only        -- Should this daemon be killed only on the DC?
            pats           -- Regexes we expect to find in log files
            dc_pats        -- Additional DC-specific regexes we expect to find
                              in log files
            badnews_ignore -- Regexes for lines in the log that can be ignored
        """

        self._cm = cm
        self.dc_only = dc_only
        self.name = name

        # None means "no patterns"; give every instance its own empty list
        self.badnews_ignore = [] if badnews_ignore is None else badnews_ignore
        self.dc_pats = [] if dc_pats is None else dc_pats
        self.pats = [] if pats is None else pats

    def kill(self, node):
        """ Kill the instance of this process running on the given node.
            A failure to do so is logged but not raised.
        """

        (rc, _) = self._cm.rsh(node, "killall -9 %s" % self.name)

        if rc != 0:
            self._cm.log("ERROR: Kill %s failed on node %s" % (self.name, node))


# =============================================================================
# python/pacemaker/_cts/scenarios.py
# =============================================================================
""" Test scenario classes for Pacemaker's Cluster Test Suite (CTS) """

__all__ = [
    "AllOnce",
    "Boot",
    "BootCluster",
    "LeaveBooted",
    "RandomTests",
    "Sequence",
]
__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"

import re
import time

from pacemaker._cts.audits import ClusterAudit
from pacemaker._cts.input import should_continue
from pacemaker._cts.tests.ctstest import CTSTest
from pacemaker._cts.watcher import LogWatcher


class ScenarioComponent:
    """ Base class for one step of a Scenario.  Subclasses must implement
        is_applicable(), setup() and teardown().
    """

    def __init__(self, cm, env):
        """ Create a new ScenarioComponent instance

            Arguments:

            cm  -- A ClusterManager instance
            env -- An Environment instance
        """

        self._cm = cm
        self._env = env

    def is_applicable(self):
        """ Return True if the current ScenarioComponent is applicable
            in the given LabEnvironment given to the constructor.
        """

        raise NotImplementedError

    def setup(self):
        """ Set up the given ScenarioComponent """

        raise NotImplementedError

    def teardown(self):
        """ Tear down (undo) the given ScenarioComponent """

        raise NotImplementedError


class Scenario:
    """ The basic idea of a scenario is that of an ordered list of
        ScenarioComponent objects.  Each ScenarioComponent is setup() in turn,
        and then after the tests have been run, they are torn down using
        teardown() (in reverse order).

        A Scenario is applicable to a particular cluster manager iff each
        ScenarioComponent is applicable.

        A partially set up scenario is torn down if it fails during setup.
    """

    def __init__(self, cm, components, audits, tests):
        """ Initialize the Scenario from the list of ScenarioComponents

            Arguments:

            cm         -- A ClusterManager instance
            components -- A list of ScenarioComponent instances
            audits     -- A list of ClusterAudit instances
            tests      -- A list of CTSTest instances

            Raises ValueError if any list member has the wrong type.
        """

        # Public: read by CtsLab.run() and by the run_loop() implementations
        self.stats = {"success": 0, "failure": 0, "BadNews": 0, "skipped": 0}
        self.tests = tests

        self._audits = audits
        self._bad_news = None   # LogWatcher, created in setup()
        self._cm = cm
        self._components = components

        for comp in components:
            if not isinstance(comp, ScenarioComponent):
                raise ValueError("Init value must be subclass of ScenarioComponent")

        for audit in audits:
            if not isinstance(audit, ClusterAudit):
                raise ValueError("Init value must be subclass of ClusterAudit")

        for test in tests:
            if not isinstance(test, CTSTest):
                raise ValueError("Init value must be a subclass of CTSTest")

    def is_applicable(self):
        """ A Scenario is_applicable() iff each of its ScenarioComponents
            is_applicable()
        """

        return all(comp.is_applicable() for comp in self._components)

    def setup(self):
        """ Set up the Scenario.  Return TRUE on success.

            If a component fails to set up, the components up to and
            including the failed one are torn down and False is returned.
        """

        self._cm.prepare()
        self.audit()    # Also detects remote/local log config
        self._cm.ns.wait_for_all_nodes(self._cm.Env["nodes"])

        self.audit()
        self._cm.install_support()

        self._bad_news = LogWatcher(self._cm.Env["LogFileName"],
                                    self._cm.templates.get_patterns("BadNews"),
                                    self._cm.Env["nodes"],
                                    self._cm.Env["LogWatcher"],
                                    "BadNews", 0)
        # Call after we've figured out what type of log watching to do in LogAudit
        self._bad_news.set_watch()

        for j, comp in enumerate(self._components):
            if not comp.setup():
                # OOPS!  We failed.  Tear partial setups down.
                self.audit()
                self._cm.log("Tearing down partial setup")
                self.teardown(j)
                return False

        self.audit()
        return True

    def teardown(self, n_components=None):
        """ Tear Down the Scenario - in reverse order.

            Arguments:

            n_components -- Index of the last component to tear down;
                            components n_components..0 are torn down.  If
                            None, every component is torn down.
        """

        # Compare against None explicitly: 0 is a valid index (setup() passes
        # it when the very first component fails) and must not be mistaken
        # for "tear down everything".
        if n_components is None:
            n_components = len(self._components) - 1

        for j in range(n_components, -1, -1):
            self._components[j].teardown()

        self.audit()
        self._cm.install_support("uninstall")

    def incr(self, name):
        """ Increment (or initialize) the value associated with the given name """

        if name not in self.stats:
            self.stats[name] = 0

        self.stats[name] += 1

    def run(self, Iterations):
        """ Run the scenario's test loop, with the cluster manager's
            oprofileStart()/oprofileStop() calls around it.  oprofileStop()
            is called even if the loop raises; the exception then propagates.
        """

        self._cm.oprofileStart()

        try:
            self.run_loop(Iterations)
        finally:
            self._cm.oprofileStop()

    def run_loop(self, Iterations):
        """ Run the tests.  Must be overridden by subclasses. """

        raise ValueError("Abstract Class member (run_loop)")

    def run_test(self, test, testcount):
        """ Run a single test on a randomly chosen node, record its timing
            and outcome in the statistics, and audit the cluster afterwards.

            Arguments:

            test      -- The CTSTest instance to run
            testcount -- The sequence number of this test, used for logging

            Returns:

            True if the test ran or was counted as a failure, False
            otherwise.  Raises ValueError if test teardown fails and the
            user chooses not to continue.
        """

        nodechoice = self._cm.Env.random_node()

        ret = True
        did_run = False

        self._cm.instance_errorstoignore_clear()
        choice = "(%s)" % nodechoice
        self._cm.log("Running test {:<22} {:<15} [{:>3}]".format(test.name, choice, testcount))

        starttime = test.set_timer()

        if not test.setup(nodechoice):
            self._cm.log("Setup failed")
            ret = False

        elif not test.can_run_now(nodechoice):
            self._cm.log("Skipped")
            test.skipped()

        else:
            did_run = True
            ret = test(nodechoice)

        if not test.teardown(nodechoice):
            self._cm.log("Teardown failed")

            if not should_continue(self._cm.Env):
                raise ValueError("Teardown of %s on %s failed" % (test.name, nodechoice))

            ret = False

        stoptime = time.time()
        self._cm.oprofileSave(testcount)

        elapsed_time = stoptime - starttime
        test_time = stoptime - test.get_timer()

        if "min_time" not in test.stats:
            # First run of this test: seed the timing statistics
            test.stats["elapsed_time"] = elapsed_time
            test.stats["min_time"] = test_time
            test.stats["max_time"] = test_time
        else:
            test.stats["elapsed_time"] += elapsed_time

            if test_time < test.stats["min_time"]:
                test.stats["min_time"] = test_time

            if test_time > test.stats["max_time"]:
                test.stats["max_time"] = test_time

        if ret:
            self.incr("success")
            test.log_timer()
        else:
            self.incr("failure")
            self._cm.statall()
            # Force the test count to be incremented anyway so test extraction works
            did_run = True

        self.audit(test.errors_to_ignore)
        return did_run

    def summarize(self):
        """ Log the overall statistics, a short per-test summary, and (at
            debug level) each test's full statistics.
        """

        self._cm.log("****************")
        self._cm.log("Overall Results:%r" % self.stats)
        self._cm.log("****************")

        summary_keys = ("calls", "failure", "skipped", "auditfail")

        self._cm.log("Test Summary")
        for test in self.tests:
            stat_filter = {key: test.stats[key] for key in summary_keys}

            name = "Test %s:" % test.name
            self._cm.log("{:<25} {!r}".format(name, stat_filter))

        self._cm.debug("Detailed Results")
        for test in self.tests:
            name = "Test %s:" % test.name
            # Log this test's own, unfiltered statistics.  Reusing the
            # summary dict here would repeat the last test's summary for
            # every test.
            self._cm.debug("{:<25} {!r}".format(name, test.stats))

        self._cm.log("<<<<<<<<<<<<<<<< TESTS COMPLETED")

    def audit(self, local_ignore=None):
        """ Run every audit, then drain the BadNews log watcher, logging and
            counting each match that is not covered by an ignore pattern.

            Arguments:

            local_ignore -- Additional regexes for log lines to ignore

            Returns:

            The number of failed audits.  Raises ValueError if 1000
            unignored BadNews matches are seen and the user chooses not to
            continue.
        """

        errcount = 0

        ignorelist = ["CTS:"]
        if local_ignore:
            ignorelist.extend(local_ignore)

        ignorelist.extend(self._cm.errorstoignore())
        ignorelist.extend(self._cm.instance_errorstoignore())

        # This makes sure everything is stabilized before starting...
        failed = 0
        for check in self._audits:
            if not check():
                self._cm.log("Audit %s FAILED." % check.name)
                failed += 1
            else:
                self._cm.debug("Audit %s passed." % check.name)

        while errcount < 1000:
            match = None
            if self._bad_news:
                match = self._bad_news.look(0)

            if not match:
                # Nothing more to read (or no watcher yet)
                break

            if not any(re.search(ignore, match) for ignore in ignorelist):
                self._cm.log("BadNews: %s" % match)
                self.incr("BadNews")
                errcount += 1
        else:
            # Only reached when the loop ran out without hitting "break",
            # i.e. errcount reached 1000
            print("Big problems")
            if not should_continue(self._cm.Env):
                self._cm.log("Shutting down.")
                self.summarize()
                self.teardown()
                raise ValueError("Looks like we hit a BadNews jackpot!")

        if self._bad_news:
            self._bad_news.end()

        return failed


class AllOnce(Scenario):
    """ Every Test Once """

    def run_loop(self, Iterations):
        """ Run each test exactly once, in order.  Iterations is not used. """

        testcount = 1
        for test in self.tests:
            self.run_test(test, testcount)
            testcount += 1


class RandomTests(Scenario):
    """ Random Test Execution """

    def run_loop(self, Iterations):
        """ Run Iterations tests, each picked at random from the test list """

        testcount = 1
        while testcount <= Iterations:
            test = self._cm.Env.random_gen.choice(self.tests)
            self.run_test(test, testcount)
            testcount += 1


class Sequence(Scenario):
    """ Named Tests in Sequence """

    def run_loop(self, Iterations):
        """ Run the tests in order, starting over from the first one until at
            least Iterations tests have been run.  A pass over the list that
            has been started is always completed.
        """

        # NOTE(review): with an empty test list this loop never terminates,
        # since testcount is only advanced per test -- confirm callers never
        # build a Sequence without tests.
        testcount = 1
        while testcount <= Iterations:
            for test in self.tests:
                self.run_test(test, testcount)
                testcount += 1


class Boot(Scenario):
    """ Start the Cluster """

    def run_loop(self, Iterations):
        """ Run no tests.  Iterations is not used. """


class BootCluster(ScenarioComponent):
    """ BootCluster is the most basic of ScenarioComponents.  This
        ScenarioComponent simply starts the cluster manager on all the nodes.
        It is fairly robust as it waits for all nodes to come up before
        starting as they might have been rebooted or crashed for some reason
        beforehand.
    """

    def is_applicable(self):
        """ BootCluster is so generic it is always Applicable """

        return True

    def setup(self):
        """ Basic Cluster Manager startup.  Start everything.  Returns
            whatever the cluster manager's startall() returns.
        """

        self._cm.prepare()

        # Clear out the cobwebs ;-)
        self._cm.stopall(verbose=True, force=True)

        # Now start the Cluster Manager on all the nodes.
        self._cm.log("Starting Cluster Manager on all nodes.")
        return self._cm.startall(verbose=True, quick=True)

    def teardown(self):
        """ Tear down the given ScenarioComponent by stopping the cluster
            manager on all nodes
        """

        # Stop the cluster manager everywhere
        self._cm.log("Stopping Cluster Manager on all nodes")
        self._cm.stopall(verbose=True, force=False)


class LeaveBooted(BootCluster):
    """ A BootCluster variant whose teardown leaves the cluster running """

    def teardown(self):
        """ Tear down the given ScenarioComponent without stopping the
            cluster; only a log message is written
        """

        self._cm.log("Leaving Cluster running on all nodes")