# ======================================================================
# File: python/pacemaker/_cts/scenarios.py
# (state after the patch: the CTSTest.can_run_now() hook is gone, so
#  Scenario.run_test() no longer has a "Skipped" branch)
# ======================================================================
""" Test scenario classes for Pacemaker's Cluster Test Suite (CTS) """

__all__ = [
    "AllOnce",
    "Boot",
    "BootCluster",
    "LeaveBooted",
    "RandomTests",
    "Sequence",
]
__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"

import re
import time

from pacemaker._cts.audits import ClusterAudit
from pacemaker._cts.input import should_continue
from pacemaker._cts.tests.ctstest import CTSTest
from pacemaker._cts.watcher import LogWatcher


class ScenarioComponent:
    """ The base class for all scenario components.  A scenario component is
        one single step in a scenario.  Each component is basically just a setup
        and teardown method.
    """

    def __init__(self, cm, env):
        """ Create a new ScenarioComponent instance

            Arguments:

            cm  -- A ClusterManager instance
            env -- An Environment instance
        """

        # pylint: disable=invalid-name
        self._cm = cm
        self._env = env

    def is_applicable(self):
        """ Return True if this component is applicable in the given Environment.
            This method must be provided by all subclasses.
        """

        raise NotImplementedError

    def setup(self):
        """ Set up the component, returning True on success.  This method must be
            provided by all subclasses.
        """

        raise NotImplementedError

    def teardown(self):
        """ Tear down the given component.  This method must be provided by all
            subclasses.
        """

        raise NotImplementedError


class Scenario:
    """ The base class for scenario.  A scenario is an ordered list of
        ScenarioComponent objects.  A scenario proceeds by setting up all its
        components in sequence, running a list of tests and audits, and then
        tearing down its components in reverse.
    """

    def __init__(self, cm, components, audits, tests):
        """ Create a new Scenario instance

            Arguments:

            cm         -- A ClusterManager instance
            components -- A list of ScenarioComponents comprising this Scenario
            audits     -- A list of ClusterAudits that will be performed as
                          part of this Scenario
            tests      -- A list of CTSTests that will be run

            Raises ValueError if any element of components, audits, or tests
            is not an instance of the expected base class.
        """

        # pylint: disable=invalid-name

        self.stats = {
            "success": 0,
            "failure": 0,
            "BadNews": 0,
            "skipped": 0
        }
        self.tests = tests

        self._audits = audits
        self._bad_news = None
        self._cm = cm
        self._components = components

        for comp in components:
            if not isinstance(comp, ScenarioComponent):
                raise ValueError("Init value must be subclass of ScenarioComponent")

        for audit in audits:
            if not isinstance(audit, ClusterAudit):
                raise ValueError("Init value must be subclass of ClusterAudit")

        for test in tests:
            if not isinstance(test, CTSTest):
                raise ValueError("Init value must be a subclass of CTSTest")

    def is_applicable(self):
        """ Return True if all ScenarioComponents are applicable """

        return all(comp.is_applicable() for comp in self._components)

    def setup(self):
        """ Set up the scenario, returning True on success.  If setup fails at
            some point, tear down those components that did successfully set up.
        """

        self._cm.prepare()
        self.audit()  # Also detects remote/local log config
        self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"])

        self.audit()
        self._cm.install_support()

        self._bad_news = LogWatcher(self._cm.env["LogFileName"],
                                    self._cm.templates.get_patterns("BadNews"),
                                    self._cm.env["nodes"],
                                    self._cm.env["LogWatcher"],
                                    "BadNews", 0)
        # Call after we've figured out what type of log watching to do in LogAudit
        self._bad_news.set_watch()

        for j, comp in enumerate(self._components):
            if not comp.setup():
                # OOPS!  We failed.  Tear partial setups down.
                self.audit()
                self._cm.log("Tearing down partial setup")
                self.teardown(j)
                return False

        self.audit()
        return True

    def teardown(self, n_components=None):
        """ Tear down the scenario in the reverse order it was set up.

            n_components is the index of the last component to tear down;
            that component and every one before it are torn down, in reverse
            order.  If it is None, all components are torn down.
        """

        # Compare against None explicitly: index 0 is a valid argument (setup()
        # passes it when the very first component fails) and must not be
        # mistaken for "tear down everything".
        if n_components is None:
            n_components = len(self._components) - 1

        for j in range(n_components, -1, -1):
            self._components[j].teardown()

        self.audit()
        self._cm.install_support("uninstall")

    def incr(self, name):
        """ Increment the given stats key, creating it if necessary """

        if name not in self.stats:
            self.stats[name] = 0

        self.stats[name] += 1

    def run(self, iterations):
        """ Run all tests in the scenario the given number of times.  Profiling
            is stopped again whether or not the run raises an exception.
        """

        self._cm.oprofile_start()

        try:
            self._run_loop(iterations)
        finally:
            self._cm.oprofile_stop()

    def _run_loop(self, iterations):
        """ Do the hard part of the run method - actually run all the tests the
            given number of times.  This method must be provided by all
            subclasses.
        """

        raise NotImplementedError

    def run_test(self, test, testcount):
        """ Run the given test.  testcount is the number of tests (including
            this one) that have been run across all iterations.

            Returns True if the test was run (or failed, in which case the
            count is forced so test extraction works), False otherwise.
            Raises ValueError if teardown fails and the user chooses not to
            continue.
        """

        nodechoice = self._cm.env.random_node()

        ret = True
        did_run = False

        self._cm.clear_instance_errors_to_ignore()
        choice = "(%s)" % nodechoice
        self._cm.log("Running test {:<22} {:<15} [{:>3}]".format(test.name, choice, testcount))

        starttime = test.set_timer()

        if not test.setup(nodechoice):
            self._cm.log("Setup failed")
            ret = False
        else:
            did_run = True
            ret = test(nodechoice)

        if not test.teardown(nodechoice):
            self._cm.log("Teardown failed")

            if not should_continue(self._cm.env):
                raise ValueError("Teardown of %s on %s failed" % (test.name, nodechoice))

            ret = False

        stoptime = time.time()
        self._cm.oprofile_save(testcount)

        # elapsed_time covers setup + test + teardown; test_time is measured
        # from whatever the test's own "test" timer currently reports.
        elapsed_time = stoptime - starttime
        test_time = stoptime - test.get_timer()

        if "min_time" not in test.stats:
            test.stats["elapsed_time"] = elapsed_time
            test.stats["min_time"] = test_time
            test.stats["max_time"] = test_time
        else:
            test.stats["elapsed_time"] += elapsed_time
            test.stats["min_time"] = min(test.stats["min_time"], test_time)
            test.stats["max_time"] = max(test.stats["max_time"], test_time)

        if ret:
            self.incr("success")
            test.log_timer()
        else:
            self.incr("failure")
            self._cm.statall()
            did_run = True  # Force the test count to be incremented anyway so test extraction works

        self.audit(test.errors_to_ignore)
        return did_run

    def summarize(self):
        """ Output scenario results """

        self._cm.log("****************")
        self._cm.log("Overall Results:%r" % self.stats)
        self._cm.log("****************")

        # Only these per-test counters are shown in the short summary
        summary_keys = ("calls", "failure", "skipped", "auditfail")

        self._cm.log("Test Summary")
        for test in self.tests:
            summary = {key: test.stats[key] for key in summary_keys}

            name = "Test %s:" % test.name
            self._cm.log("{:<25} {!r}".format(name, summary))

        self._cm.debug("Detailed Results")
        for test in self.tests:
            name = "Test %s:" % test.name
            # Log each test's complete stats here, not the filtered summary
            self._cm.debug("{:<25} {!r}".format(name, test.stats))

        self._cm.log("<<<<<<<<<<<<<<<< TESTS COMPLETED")

    def audit(self, local_ignore=None):
        """ Perform all scenario audits and log results.  If there are too many
            failures, prompt the user to confirm that the scenario should continue
            running.

            local_ignore is an optional list of regexes; BadNews log lines
            matching any of them are not counted.  Returns the number of
            audits that failed.  Raises ValueError if 1000 BadNews lines are
            found and the user chooses not to continue.
        """

        errcount = 0

        ignorelist = ["CTS:"]

        if local_ignore:
            ignorelist.extend(local_ignore)

        ignorelist.extend(self._cm.errors_to_ignore)
        ignorelist.extend(self._cm.instance_errors_to_ignore)

        # This makes sure everything is stabilized before starting...
        failed = 0
        for audit in self._audits:
            if not audit():
                self._cm.log("Audit %s FAILED." % audit.name)
                failed += 1
            else:
                self._cm.debug("Audit %s passed." % audit.name)

        while errcount < 1000:
            match = None
            if self._bad_news:
                match = self._bad_news.look(0)

            if not match:
                break

            if not any(re.search(ignore, match) for ignore in ignorelist):
                self._cm.log("BadNews: %s" % match)
                self.incr("BadNews")
                errcount += 1
        else:
            # Only reached when the loop ran out (1000 errors), not on break
            print("Big problems")
            if not should_continue(self._cm.env):
                self._cm.log("Shutting down.")
                self.summarize()
                self.teardown()
                raise ValueError("Looks like we hit a BadNews jackpot!")

        if self._bad_news:
            self._bad_news.end()

        return failed


class AllOnce(Scenario):
    """ Every Test Once """

    def _run_loop(self, iterations):
        """ Run every test exactly once; iterations is ignored """

        for testcount, test in enumerate(self.tests, 1):
            self.run_test(test, testcount)


class RandomTests(Scenario):
    """ Random Test Execution """

    def _run_loop(self, iterations):
        """ Run a randomly chosen test, the given number of times """

        testcount = 1

        while testcount <= iterations:
            test = self._cm.env.random_gen.choice(self.tests)
            self.run_test(test, testcount)
            testcount += 1


class Sequence(Scenario):
    """ Named Tests in Sequence """

    def _run_loop(self, iterations):
        """ Run the tests in order, repeating the whole list until at least
            the given number of tests have been run.
        """

        testcount = 1

        # The limit is only checked between passes, so a pass over the list
        # is always completed even if that exceeds iterations.
        while testcount <= iterations:
            for test in self.tests:
                self.run_test(test, testcount)
                testcount += 1


class Boot(Scenario):
    """ Start the Cluster """

    def _run_loop(self, iterations):
        """ Run no tests; setting up the scenario is all this one does """

        return


class BootCluster(ScenarioComponent):
    """ The BootCluster component simply starts the cluster manager on all
        nodes, waiting for each to come up before starting given that a node
        might have been rebooted or crashed beforehand.
    """

    def is_applicable(self):
        """ BootCluster is always applicable """

        return True

    def setup(self):
        """ Set up the component, returning True on success """

        self._cm.prepare()

        # Clear out the cobwebs ;-)
        self._cm.stopall(verbose=True, force=True)

        # Now start the Cluster Manager on all the nodes.
        self._cm.log("Starting Cluster Manager on all nodes.")
        return self._cm.startall(verbose=True, quick=True)

    def teardown(self):
        """ Tear down the component """

        self._cm.log("Stopping Cluster Manager on all nodes")
        self._cm.stopall(verbose=True, force=False)


class LeaveBooted(BootCluster):
    """ The LeaveBooted component leaves all nodes up when the scenario
        is complete.
    """

    def teardown(self):
        """ Tear down the component """

        self._cm.log("Leaving Cluster running on all nodes")


# ======================================================================
# File: python/pacemaker/_cts/tests/ctstest.py
# (state after the patch: CTSTest.can_run_now() has been removed)
# ======================================================================
""" Base classes for CTS tests """

__all__ = ["CTSTest"]
__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"

import re

from pacemaker._cts.environment import EnvFactory
from pacemaker._cts.logging import LogFactory
from pacemaker._cts.patterns import PatternSelector
from pacemaker._cts.remote import RemoteFactory
from pacemaker._cts.timer import Timer
from pacemaker._cts.watcher import LogWatcher

# Disable various pylint warnings that occur in so many places throughout this
# file it's easiest to just take care of them globally.  This does introduce the
# possibility that we'll miss some other cause of the same warning, but we'll
# just have to be careful.

# pylint doesn't understand that self._rsh is callable.
# pylint: disable=not-callable


class CTSTest:
    """ The base class for all cluster tests.  This implements a basic set of
        properties and behaviors like setup, tear down, time keeping, and
        statistics tracking.  It is up to specific tests to implement their own
        specialized behavior on top of this class.
    """

    def __init__(self, cm):
        """ Create a new CTSTest instance

            Arguments:

            cm -- A ClusterManager instance
        """

        # pylint: disable=invalid-name

        self.audits = []
        self.name = None
        self.templates = PatternSelector(cm["Name"])

        self.stats = {
            "auditfail": 0,
            "calls": 0,
            "failure": 0,
            "skipped": 0,
            "success": 0
        }

        self._cm = cm
        self._env = EnvFactory().getInstance()
        self._r_o2cb = None
        self._r_ocfs2 = []
        self._rsh = RemoteFactory().getInstance()
        self._logger = LogFactory()
        self._timers = {}

        self.benchmark = True  # which tests to benchmark
        self.failed = False
        self.is_experimental = False
        self.is_loop = False
        self.is_unsafe = False
        self.is_valgrind = False
        self.passed = True

    def log(self, args):
        """ Log a message """

        self._logger.log(args)

    def debug(self, args):
        """ Log a debug message """

        self._logger.debug(args)

    def get_timer(self, key="test"):
        """ Get the start time of the given timer, or 0 if there is no such
            timer
        """

        try:
            return self._timers[key].start_time
        except KeyError:
            return 0

    def set_timer(self, key="test"):
        """ Set the start time of the given timer to now, and return
            that time
        """

        if key not in self._timers:
            self._timers[key] = Timer(self._logger, self.name, key)

        self._timers[key].start()
        return self._timers[key].start_time

    def log_timer(self, key="test"):
        """ Log the elapsed time of the given timer and discard the timer.
            Do nothing if there is no such timer.
        """

        if key not in self._timers:
            return

        elapsed = self._timers[key].elapsed
        self.debug("%s:%s runtime: %.2f" % (self.name, key, elapsed))
        del self._timers[key]

    def incr(self, name):
        """ Increment the given stats key, creating it if necessary """

        if name not in self.stats:
            self.stats[name] = 0

        self.stats[name] += 1

        # Reset the test passed boolean
        if name == "calls":
            self.passed = True

    def failure(self, reason="none"):
        """ Increment the failure count, with an optional failure reason.
            Always returns False so callers can "return self.failure(...)".
        """

        self.passed = False
        self.incr("failure")
        self._logger.log(("Test %s" % self.name).ljust(35) + " FAILED: %s" % reason)

        return False

    def success(self):
        """ Increment the success count.  Always returns True. """

        self.incr("success")
        return True

    def skipped(self):
        """ Increment the skipped count.  Always returns True. """

        self.incr("skipped")
        return True

    def __call__(self, node):
        """ Perform this test.  This method must be provided by all
            subclasses.
        """

        raise NotImplementedError

    def audit(self):
        """ Perform all the relevant audits (see ClusterAudit), returning
            whether or not they all passed.
        """

        passed = True

        for audit in self.audits:
            if not audit():
                self._logger.log("Internal %s Audit %s FAILED." % (self.name, audit.name))
                self.incr("auditfail")
                passed = False

        return passed

    def setup(self, node):
        """ Setup this test """

        # node is used in subclasses
        # pylint: disable=unused-argument

        return self.success()

    def teardown(self, node):
        """ Tear down this test """

        # node is used in subclasses
        # pylint: disable=unused-argument

        return self.success()

    def create_watch(self, patterns, timeout, name=None):
        """ Create a new LogWatcher object with the given patterns, timeout,
            and optional name.  This object can be used to search log files
            for matching patterns during this test's run.
        """

        if not name:
            name = self.name

        return LogWatcher(self._env["LogFileName"], patterns,
                          self._env["nodes"], self._env["LogWatcher"], name,
                          timeout)

    def local_badnews(self, prefix, watch, local_ignore=None):
        """ Use the given watch object to search through log files for messages
            starting with the given prefix.  If no prefix is given, use
            "LocalBadNews:" by default.  The optional local_ignore list should
            be a list of regexes that, if found in a line, will cause that line
            to be ignored.

            Return the number of matches found.  At most 100 are counted.
        """

        errcount = 0
        if not prefix:
            prefix = "LocalBadNews:"

        ignorelist = [" CTS: ", prefix]

        if local_ignore:
            ignorelist += local_ignore

        while errcount < 100:
            match = watch.look(0)
            if not match:
                break

            if not any(re.search(ignore, match) for ignore in ignorelist):
                self._logger.log("%s %s" % (prefix, match))
                errcount += 1
        else:
            # Only reached when the loop ran out (100 errors), not on break
            self._logger.log("Too many errors!")

        watch.end()
        return errcount

    def is_applicable(self):
        """ Return True if this test is applicable in the current test
            configuration.  The default implementation checks the test's
            is_loop, is_unsafe, is_valgrind, is_experimental, and benchmark
            flags against the corresponding Environment settings.
        """

        if self.is_loop and not self._env["loop-tests"]:
            return False

        if self.is_unsafe and not self._env["unsafe-tests"]:
            return False

        if self.is_valgrind and not self._env["valgrind-tests"]:
            return False

        if self.is_experimental and not self._env["experimental-tests"]:
            return False

        if self._env["benchmark"] and not self.benchmark:
            return False

        return True

    @property
    def errors_to_ignore(self):
        """ Return list of errors which should be ignored """

        return []


# ======================================================================
# File: python/pacemaker/_cts/tests/reattach.py
# (state after the patch: Reattach.can_run_now() has been removed)
# ======================================================================
""" Restart the cluster and verify resources remain running """

__all__ = ["Reattach"]
__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"

import re
import time

from pacemaker.exitstatus import ExitStatus
from pacemaker._cts.audits import AuditResource
from pacemaker._cts.tests.ctstest import CTSTest
from pacemaker._cts.tests.simulstartlite import SimulStartLite
from pacemaker._cts.tests.simulstoplite import SimulStopLite
from pacemaker._cts.tests.starttest import StartTest

# Disable various pylint warnings that occur in so many places throughout this
# file it's easiest to just take care of them globally.  This does introduce the
# possibility that we'll miss some other cause of the same warning, but we'll
# just have to be careful.

# pylint doesn't understand that self._rsh is callable.
# pylint: disable=not-callable


class Reattach(CTSTest):
    """ A concrete test that restarts the cluster and verifies that resources
        remain running throughout
    """

    def __init__(self, cm):
        """ Create a new Reattach instance

            Arguments:

            cm -- A ClusterManager instance
        """

        CTSTest.__init__(self, cm)

        self.name = "Reattach"

        self._startall = SimulStartLite(cm)
        self._stopall = SimulStopLite(cm)

    def _is_managed(self, node):
        """ Are resources managed by the cluster? """

        (_, is_managed) = self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", verbose=1)
        is_managed = is_managed[0].strip()
        return is_managed == "true"

    def _set_unmanaged(self, node):
        """ Disable resource management """

        self.debug("Disable resource management")
        self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false")

    def _set_managed(self, node):
        """ Enable resource management """

        self.debug("Re-enable resource management")
        self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D")

    def _disable_incompatible_rscs(self, node):
        """ Disable resources that are incompatible with this test

            Starts and stops of stonith-class resources are implemented internally
            by Pacemaker, which means that they must stop when Pacemaker is
            stopped, even if unmanaged. Disable them before running the Reattach
            test so they don't affect resource placement.

            OCFS2 resources must be disabled too for some reason.

            Set target-role to "Stopped" for any of these resources in the CIB.
        """

        self.debug("Disable incompatible (stonith/OCFS2) resources")
        # NOTE(review): the XML payload between the single quotes looks like it
        # was stripped from this copy of the file -- restore it from the
        # upstream source before use.
        xml = """' ' --scope rsc_defaults"""
        return self._rsh(node, self._cm.templates['CibAddXml'] % xml)

    def _enable_incompatible_rscs(self, node):
        """ Re-enable resources that were incompatible with this test """

        self.debug("Re-enable incompatible (stonith/OCFS2) resources")
        # NOTE(review): this XML payload also appears to have been stripped
        # from this copy of the file -- restore it from the upstream source.
        xml = """"""
        return self._rsh(node, """cibadmin --delete --xml-text '%s'""" % xml)

    def _reprobe(self, node):
        """ Reprobe all resources

            The placement of some resources (such as promotable-1 in the
            lab-generated CIB) is affected by constraints using node-attribute-based
            rules. An earlier test may have erased the relevant node attribute, so
            do a reprobe, which should add the attribute back.
        """

        return self._rsh(node, """crm_resource --refresh""")

    def setup(self, node):
        """ Setup this test """

        if not self._startall(None):
            return self.failure("Startall failed")

        (rc, _) = self._disable_incompatible_rscs(node)
        if rc != ExitStatus.OK:
            return self.failure("Couldn't modify CIB to stop incompatible resources")

        (rc, _) = self._reprobe(node)
        if rc != ExitStatus.OK:
            return self.failure("Couldn't reprobe resources")

        if not self._cm.cluster_stable(double_check=True):
            return self.failure("Cluster did not stabilize after setup")

        return self.success()

    def teardown(self, node):
        """ Tear down this test """

        # Make sure 'node' is up
        start = StartTest(self._cm)
        start(node)

        if not self._is_managed(node):
            self._set_managed(node)

        (rc, _) = self._enable_incompatible_rscs(node)
        if rc != ExitStatus.OK:
            return self.failure("Couldn't modify CIB to re-enable incompatible resources")

        if not self._cm.cluster_stable():
            return self.failure("Cluster did not stabilize after teardown")

        if not self._is_managed(node):
            return self.failure("Could not re-enable resource management")

        return self.success()

    def __call__(self, node):
        """ Perform this test """

        self.incr("calls")

        # Conveniently, the scheduler will display this message when disabling
        # management, even if fencing is not enabled, so we can rely on it.
        managed = self.create_watch(["No fencing will be done"], 60)
        managed.set_watch()

        self._set_unmanaged(node)

        if not managed.look_for_all():
            self._logger.log("Patterns not found: %r" % managed.unmatched)
            return self.failure("Resource management not disabled")

        # Any of these resource operations succeeding while management is
        # disabled counts as unexpected activity.
        pats = [
            self.templates["Pat:RscOpOK"] % ("start", ".*"),
            self.templates["Pat:RscOpOK"] % ("stop", ".*"),
            self.templates["Pat:RscOpOK"] % ("promote", ".*"),
            self.templates["Pat:RscOpOK"] % ("demote", ".*"),
            self.templates["Pat:RscOpOK"] % ("migrate", ".*")
        ]

        watch = self.create_watch(pats, 60, "ShutdownActivity")
        watch.set_watch()

        self.debug("Shutting down the cluster")
        ret = self._stopall(None)
        if not ret:
            self._set_managed(node)
            return self.failure("Couldn't shut down the cluster")

        self.debug("Bringing the cluster back up")
        ret = self._startall(None)
        time.sleep(5)  # allow ping to update the CIB
        if not ret:
            self._set_managed(node)
            return self.failure("Couldn't restart the cluster")

        if self.local_badnews("ResourceActivity:", watch):
            self._set_managed(node)
            return self.failure("Resources stopped or started during cluster restart")

        watch = self.create_watch(pats, 60, "StartupActivity")
        watch.set_watch()

        # Re-enable resource management (and verify it happened).
        self._set_managed(node)
        self._cm.cluster_stable()
        if not self._is_managed(node):
            return self.failure("Could not re-enable resource management")

        # Ignore actions for STONITH resources
        ignore = []
        (_, lines) = self._rsh(node, "crm_resource -c", verbose=1)
        for line in lines:
            if re.search("^Resource", line):
                r = AuditResource(self._cm, line)

                if r.rclass == "stonith":
                    self.debug("Ignoring start actions for %s" % r.id)
                    ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id))

        if self.local_badnews("ResourceActivity:", watch, ignore):
            return self.failure("Resources stopped or started after resource management was re-enabled")

        return ret

    @property
    def errors_to_ignore(self):
        """ Return list of errors which should be ignored """

        return [
            r"resource( was|s were) active at shutdown"
        ]