diff --git a/python/pacemaker/_cts/scenarios.py b/python/pacemaker/_cts/scenarios.py
index bfc2839304..769b2d0968 100644
--- a/python/pacemaker/_cts/scenarios.py
+++ b/python/pacemaker/_cts/scenarios.py
@@ -1,425 +1,422 @@
""" Test scenario classes for Pacemaker's Cluster Test Suite (CTS) """
__all__ = [
"AllOnce",
"Boot",
"BootCluster",
"LeaveBooted",
"RandomTests",
"Sequence",
]
__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
import re
import time
from pacemaker._cts.audits import ClusterAudit
from pacemaker._cts.input import should_continue
from pacemaker._cts.tests.ctstest import CTSTest
from pacemaker._cts.watcher import LogWatcher
class ScenarioComponent:
    """ Common parent of every scenario component.  A component is a single
        step of a scenario and boils down to a pair of hooks: one that sets
        the step up and one that tears it down again.
    """

    def __init__(self, cm, env):
        """ Create a new ScenarioComponent instance

            Arguments:

            cm  -- A ClusterManager instance
            env -- An Environment instance
        """

        # pylint: disable=invalid-name
        self._cm, self._env = cm, env

    def is_applicable(self):
        """ Report whether this component applies in the given Environment.
            Every subclass has to supply its own implementation.
        """

        raise NotImplementedError()

    def setup(self):
        """ Set the component up and return True when that worked.  Every
            subclass has to supply its own implementation.
        """

        raise NotImplementedError()

    def teardown(self):
        """ Undo whatever setup() did.  Every subclass has to supply its own
            implementation.
        """

        raise NotImplementedError()
class Scenario:
""" The base class for scenario. A scenario is an ordered list of
ScenarioComponent objects. A scenario proceeds by setting up all its
components in sequence, running a list of tests and audits, and then
tearing down its components in reverse.
"""
def __init__(self, cm, components, audits, tests):
    """ Create a new Scenario instance

        Arguments:

        cm         -- A ClusterManager instance
        components -- The ScenarioComponents making up this Scenario, in
                      setup order
        audits     -- The ClusterAudits to perform as part of this Scenario
        tests      -- The CTSTests that will be run

        Raises ValueError if any element of the three lists is not an
        instance of the expected base class.
    """

    # pylint: disable=invalid-name

    self.stats = dict(success=0, failure=0, BadNews=0, skipped=0)
    self.tests = tests

    self._audits = audits
    self._bad_news = None
    self._cm = cm
    self._components = components

    # Validate each list in turn: components, then audits, then tests
    if any(not issubclass(item.__class__, ScenarioComponent) for item in components):
        raise ValueError("Init value must be subclass of ScenarioComponent")

    if any(not issubclass(item.__class__, ClusterAudit) for item in audits):
        raise ValueError("Init value must be subclass of ClusterAudit")

    if any(not issubclass(item.__class__, CTSTest) for item in tests):
        raise ValueError("Init value must be a subclass of CTSTest")
def is_applicable(self):
    """ Return True only when every ScenarioComponent reports that it is
        applicable.  Checking stops at the first component that is not.
    """

    return all(comp.is_applicable() for comp in self._components)
def setup(self):
    """ Prepare the cluster and set up each component in order, returning
        True on success.  As soon as one component fails to set up, the
        scenario is audited, teardown() is called with the index of the
        failing component, and False is returned.
    """

    cm = self._cm

    cm.prepare()
    self.audit()    # Also detects remote/local log config
    cm.ns.wait_for_all_nodes(cm.env["nodes"])

    self.audit()
    cm.install_support()

    # The watcher is only created and armed at this point, after LogAudit
    # has figured out what type of log watching to do
    self._bad_news = LogWatcher(cm.env["LogFileName"],
                                cm.templates.get_patterns("BadNews"),
                                cm.env["nodes"],
                                cm.env["LogWatcher"],
                                "BadNews", 0)
    self._bad_news.set_watch()

    for idx, comp in enumerate(self._components):
        if comp.setup():
            continue

        # OOPS!  We failed.  Tear partial setups down.
        self.audit()
        cm.log("Tearing down partial setup")
        self.teardown(idx)
        return False

    self.audit()
    return True
def teardown(self, n_components=None):
""" Tear down the scenario in the reverse order it was set up. If
n_components is not None, only tear down that many components.
"""
if not n_components:
n_components = len(self._components)-1
j = n_components
while j >= 0:
self._components[j].teardown()
j -= 1
self.audit()
self._cm.install_support("uninstall")
def incr(self, name):
    """ Add one to the named stats counter, starting from zero when the
        counter does not exist yet
    """

    self.stats[name] = self.stats.get(name, 0) + 1
def run(self, iterations):
    """ Run all tests in the scenario the given number of times.

        Profiling is started through the cluster manager before the tests
        run and is stopped exactly once afterwards, whether the run loop
        returns normally or raises.  Any exception from the run loop
        propagates to the caller unchanged.
    """

    self._cm.oprofile_start()

    # "finally" replaces the former bare "except:" + duplicated stop call
    try:
        self._run_loop(iterations)
    finally:
        self._cm.oprofile_stop()
def _run_loop(self, iterations):
    """ The workhorse behind run(): execute the tests the requested number
        of times.  The base class has no policy for that, so this always
        raises NotImplementedError.
    """

    raise NotImplementedError()
def run_test(self, test, testcount):
    """ Run the given test.  testcount is the number of tests (including
        this one) that have been run across all iterations.

        The test runs against a node picked at random from the environment:
        its setup() is called first, then (if setup succeeded) the test
        itself, then its teardown().  Timing figures are accumulated in
        test.stats, the scenario's "success" or "failure" counter is bumped,
        and the scenario audits are run with the test's own ignore list.

        Raises ValueError when teardown fails and should_continue() says not
        to carry on.

        Returns did_run.  With the can_run_now() branch removed by this
        patch, that is True whenever the method returns.
    """

    nodechoice = self._cm.env.random_node()

    ret = True
    did_run = False

    # Start from a clean per-instance ignore list; audit() reads it later
    self._cm.clear_instance_errors_to_ignore()
    choice = "(%s)" % nodechoice
    self._cm.log("Running test {:<22} {:<15} [{:>3}]".format(test.name, choice, testcount))

    starttime = test.set_timer()

    if not test.setup(nodechoice):
        self._cm.log("Setup failed")
        ret = False
-    elif not test.can_run_now(nodechoice):
-        self._cm.log("Skipped")
-        test.skipped()
    else:
        did_run = True
        ret = test(nodechoice)

    # Teardown is attempted regardless of how setup and the test went
    if not test.teardown(nodechoice):
        self._cm.log("Teardown failed")

        if not should_continue(self._cm.env):
            raise ValueError("Teardown of %s on %s failed" % (test.name, nodechoice))

        ret = False

    stoptime = time.time()
    self._cm.oprofile_save(testcount)

    # elapsed_time is measured from the value set_timer() returned above;
    # test_time is measured from whatever the "test" timer reports now
    elapsed_time = stoptime - starttime
    test_time = stoptime - test.get_timer()

    # "min_time" missing means this is the first recorded run of this test
    if "min_time" not in test.stats:
        test.stats["elapsed_time"] = elapsed_time
        test.stats["min_time"] = test_time
        test.stats["max_time"] = test_time
    else:
        test.stats["elapsed_time"] += elapsed_time

        if test_time < test.stats["min_time"]:
            test.stats["min_time"] = test_time

        if test_time > test.stats["max_time"]:
            test.stats["max_time"] = test_time

    if ret:
        self.incr("success")
        test.log_timer()
    else:
        self.incr("failure")
        self._cm.statall()
        did_run = True  # Force the test count to be incremented anyway so test extraction works

    self.audit(test.errors_to_ignore)
    return did_run
def summarize(self):
    """ Output scenario results

        Logs the scenario-wide counters, then one summary line per test
        holding its "calls", "failure", "skipped" and "auditfail" counters.
        Each test's complete stats dictionary is additionally written to the
        debug log.  A KeyError is raised if a test lacks one of the four
        summary counters.
    """

    self._cm.log("****************")
    self._cm.log("Overall Results:%r" % self.stats)
    self._cm.log("****************")

    summary_keys = ("calls", "failure", "skipped", "auditfail")

    self._cm.log("Test Summary")
    for test in self.tests:
        # A fresh dict per test, so nothing carries over between tests
        summary = {key: test.stats[key] for key in summary_keys}
        name = "Test %s:" % test.name
        self._cm.log("{:<25} {!r}".format(name, summary))

    self._cm.debug("Detailed Results")
    for test in self.tests:
        # Report the test's own, unfiltered statistics here; previously the
        # last test's summary dict was printed for every test.
        name = "Test %s:" % test.name
        self._cm.debug("{:<25} {!r}".format(name, test.stats))

    self._cm.log("<<<<<<<<<<<<<<<< TESTS COMPLETED")
def audit(self, local_ignore=None):
    """ Run every scenario audit, log the outcome of each, and then drain
        the BadNews log watcher (if one exists).  BadNews lines matching any
        ignore pattern are dropped; the rest are logged and counted.  Once
        1000 lines have been counted in a single call, the user is asked
        whether to continue; if not, the scenario is summarized and torn
        down and ValueError is raised.

        Arguments:

        local_ignore -- Optional extra regexes to ignore for this call only

        Returns the number of audits that failed.
    """

    ignorelist = (["CTS:"]
                  + list(local_ignore or [])
                  + list(self._cm.errors_to_ignore)
                  + list(self._cm.instance_errors_to_ignore))

    # This makes sure everything is stabilized before starting...
    failed = 0
    for check in self._audits:
        if check():
            self._cm.debug("Audit %s passed." % check.name)
        else:
            self._cm.log("Audit %s FAILED." % check.name)
            failed += 1

    errcount = 0
    while errcount < 1000:
        match = self._bad_news.look(0) if self._bad_news else None
        if not match:
            break

        if any(re.search(ignore, match) for ignore in ignorelist):
            continue

        self._cm.log("BadNews: %s" % match)
        self.incr("BadNews")
        errcount += 1
    else:
        # Only reached when the loop ran out without hitting "break"
        print("Big problems")
        if not should_continue(self._cm.env):
            self._cm.log("Shutting down.")
            self.summarize()
            self.teardown()
            raise ValueError("Looks like we hit a BadNews jackpot!")

    if self._bad_news:
        self._bad_news.end()

    return failed
class AllOnce(Scenario):
    """ Every Test Once """

    def _run_loop(self, iterations):
        """ Run each test exactly once, in list order, numbering them from
            1.  The iterations argument is not consulted.
        """

        for testcount, test in enumerate(self.tests, start=1):
            self.run_test(test, testcount)
class RandomTests(Scenario):
    """ Random Test Execution """

    def _run_loop(self, iterations):
        """ Run `iterations` tests, each one drawn at random from the list
            of tests using the environment's random generator
        """

        count = 1

        while count <= iterations:
            picked = self._cm.env.random_gen.choice(self.tests)
            self.run_test(picked, count)
            count += 1
class Sequence(Scenario):
    """ Named Tests in Sequence """

    def _run_loop(self, iterations):
        """ Run the whole list of tests, in order, over and over.

            The iteration limit is only checked between passes over the
            list, so a pass that has been started always runs to completion
            and the total number of tests run can exceed `iterations`.
        """

        # Without any tests the counter below could never advance and the
        # loop would spin forever, so there is nothing to do.
        if not self.tests:
            return

        testcount = 1

        while testcount <= iterations:
            for test in self.tests:
                self.run_test(test, testcount)
                testcount += 1
class Boot(Scenario):
    """ Start the Cluster """

    def _run_loop(self, iterations):
        """ Deliberately run no tests at all """

        return None
class BootCluster(ScenarioComponent):
    """ A component that starts the cluster manager on all nodes.  Before
        starting, it force-stops the cluster manager everywhere, because a
        node might have been rebooted or crashed beforehand.
    """

    def is_applicable(self):
        """ BootCluster applies to every environment """

        return True

    def setup(self):
        """ Stop and then start the cluster manager on all nodes, returning
            the result of the start-all operation
        """

        cm = self._cm
        cm.prepare()

        # Clear out the cobwebs ;-)
        cm.stopall(verbose=True, force=True)

        # Now start the Cluster Manager on all the nodes.
        cm.log("Starting Cluster Manager on all nodes.")
        started = cm.startall(verbose=True, quick=True)
        return started

    def teardown(self):
        """ Stop the cluster manager on all nodes (without forcing) """

        cm = self._cm
        cm.log("Stopping Cluster Manager on all nodes")
        cm.stopall(verbose=True, force=False)
class LeaveBooted(BootCluster):
    """ A BootCluster variant whose teardown does not stop anything, so all
        nodes stay up once the scenario is complete
    """

    def teardown(self):
        """ Only log that the cluster is being left running """

        message = "Leaving Cluster running on all nodes"
        self._cm.log(message)
diff --git a/python/pacemaker/_cts/tests/ctstest.py b/python/pacemaker/_cts/tests/ctstest.py
index cf9eaa5e4d..8669e48b5b 100644
--- a/python/pacemaker/_cts/tests/ctstest.py
+++ b/python/pacemaker/_cts/tests/ctstest.py
@@ -1,260 +1,252 @@
""" Base classes for CTS tests """
__all__ = ["CTSTest"]
__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
import re
from pacemaker._cts.environment import EnvFactory
from pacemaker._cts.logging import LogFactory
from pacemaker._cts.patterns import PatternSelector
from pacemaker._cts.remote import RemoteFactory
from pacemaker._cts.timer import Timer
from pacemaker._cts.watcher import LogWatcher
# Disable various pylint warnings that occur in so many places throughout this
# file it's easiest to just take care of them globally. This does introduce the
# possibility that we'll miss some other cause of the same warning, but we'll
# just have to be careful.
# pylint doesn't understand that self._rsh is callable.
# pylint: disable=not-callable
class CTSTest:
""" The base class for all cluster tests. This implements a basic set of
properties and behaviors like setup, tear down, time keeping, and
statistics tracking. It is up to specific tests to implement their own
specialized behavior on top of this class.
"""
def __init__(self, cm):
    """ Create a new CTSTest instance

        Arguments:

        cm -- A ClusterManager instance
    """

    # pylint: disable=invalid-name

    # What the test is and what it has recorded so far
    self.audits = []
    self.name = None
    self.templates = PatternSelector(cm["Name"])
    self.stats = dict(auditfail=0, calls=0, failure=0, skipped=0, success=0)

    # Handles on the surrounding test infrastructure
    self._cm = cm
    self._env = EnvFactory().getInstance()
    self._r_o2cb = None
    self._r_ocfs2 = []
    self._rsh = RemoteFactory().getInstance()
    self._logger = LogFactory()
    self._timers = {}

    # Flags read by is_applicable()
    self.benchmark = True   # which tests to benchmark
    self.is_experimental = self.is_loop = self.is_unsafe = self.is_valgrind = False

    # Outcome tracking
    self.failed = False
    self.passed = True
def log(self, args):
    """ Hand a message to this test's logger """

    logger = self._logger
    logger.log(args)
def debug(self, args):
    """ Hand a debug message to this test's logger """

    logger = self._logger
    logger.debug(args)
def get_timer(self, key="test"):
    """ Return the start time of the named timer, or 0 when no timer with
        that name exists
    """

    if key in self._timers:
        return self._timers[key].start_time

    return 0
def set_timer(self, key="test"):
    """ (Re)start the named timer, creating it on first use, and return its
        new start time
    """

    try:
        timer = self._timers[key]
    except KeyError:
        timer = self._timers[key] = Timer(self._logger, self.name, key)

    timer.start()
    return timer.start_time
def log_timer(self, key="test"):
    """ Write the named timer's elapsed time to the debug log and then
        discard the timer.  Does nothing for an unknown timer name.
    """

    if key in self._timers:
        runtime = self._timers[key].elapsed
        self.debug("%s:%s runtime: %.2f" % (self.name, key, runtime))
        del self._timers[key]
def incr(self, name):
    """ Add one to the named stats counter, starting from zero when the
        counter does not exist yet.  Counting a new call ("calls") also
        resets the passed flag to True.
    """

    self.stats[name] = self.stats.get(name, 0) + 1

    # Reset the test passed boolean
    if name == "calls":
        self.passed = True
def failure(self, reason="none"):
    """ Record a failed test: clear the passed flag, bump the failure
        counter and log the (optional) reason.  Always returns False.
    """

    self.passed = False
    self.incr("failure")

    label = ("Test %s" % self.name).ljust(35)
    self._logger.log(label + " FAILED: %s" % reason)

    return False
def success(self):
    """ Bump the success counter; always returns True """

    counter = "success"
    self.incr(counter)
    return True
def skipped(self):
    """ Bump the skipped counter; always returns True """

    counter = "skipped"
    self.incr(counter)
    return True
def __call__(self, node):
    """ Run the test against the given node.  The base class has nothing
        to run, so this always raises NotImplementedError.
    """

    raise NotImplementedError()
def audit(self):
    """ Run every audit attached to this test (see ClusterAudit).  Each
        failing audit is logged and counted under "auditfail"; all audits
        are run even after one has failed.  Returns True only if every
        audit passed.
    """

    all_ok = True

    for check in self.audits:
        if check():
            continue

        self._logger.log("Internal %s Audit %s FAILED." % (self.name, check.name))
        self.incr("auditfail")
        all_ok = False

    return all_ok
def setup(self, node):
""" Setup this test """
# node is used in subclasses
# pylint: disable=unused-argument
return self.success()
def teardown(self, node):
""" Tear down this test """
# node is used in subclasses
# pylint: disable=unused-argument
return self.success()
def create_watch(self, patterns, timeout, name=None):
    """ Build a LogWatcher for the given patterns and timeout, configured
        from this test's environment (log file name, nodes, and log watcher
        kind).  The watcher is named after this test unless a name is given.
        The returned object can be used to search the logs for matching
        patterns while this test runs.
    """

    env = self._env
    watch_name = name or self.name

    return LogWatcher(env["LogFileName"], patterns, env["nodes"], env["LogWatcher"], watch_name, timeout)
def local_badnews(self, prefix, watch, local_ignore=None):
    """ Drain the given watch object and log every line it yields that is
        not ignored, each one prefixed with `prefix` ("LocalBadNews:" when
        no prefix is given).

        Lines matching " CTS: ", the prefix itself, or any regex in the
        optional local_ignore list are ignored.  At most 100 lines are
        logged; reaching that limit logs "Too many errors!".  The watch is
        ended before returning.

        Return the number of lines that were logged.
    """

    prefix = prefix or "LocalBadNews:"

    ignorelist = [" CTS: ", prefix]
    if local_ignore:
        ignorelist += local_ignore

    errcount = 0
    while errcount < 100:
        match = watch.look(0)
        if not match:
            break

        if any(re.search(ignore, match) for ignore in ignorelist):
            continue

        self._logger.log("%s %s" % (prefix, match))
        errcount += 1
    else:
        # Only reached when the loop ran out without hitting "break"
        self._logger.log("Too many errors!")

    watch.end()
    return errcount
def is_applicable(self):
    """ Return True if this test is applicable in the current test
        configuration.

        A test flagged as loop, unsafe, valgrind or experimental only
        applies when the matching "...-tests" option is enabled in the
        environment, and while benchmarking only tests with the benchmark
        flag set apply.
    """

    gates = (
        (self.is_loop, "loop-tests"),
        (self.is_unsafe, "unsafe-tests"),
        (self.is_valgrind, "valgrind-tests"),
        (self.is_experimental, "experimental-tests"),
    )

    # The environment is only consulted for flags that are actually set
    for flagged, option in gates:
        if flagged and not self._env[option]:
            return False

    return not (self._env["benchmark"] and not self.benchmark)
- def can_run_now(self, node):
- """ Return True if we can meaningfully run right now """
-
- # node is used in subclasses
- # pylint: disable=unused-argument
-
- return True
-
@property
def errors_to_ignore(self):
    """ The error patterns that should be ignored for this test; the base
        class has none and returns a new empty list
    """

    return list()
diff --git a/python/pacemaker/_cts/tests/reattach.py b/python/pacemaker/_cts/tests/reattach.py
index dbf23e5fca..4452bc0396 100644
--- a/python/pacemaker/_cts/tests/reattach.py
+++ b/python/pacemaker/_cts/tests/reattach.py
@@ -1,226 +1,221 @@
""" Restart the cluster and verify resources remain running """
__all__ = ["Reattach"]
__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
import re
import time
from pacemaker.exitstatus import ExitStatus
from pacemaker._cts.audits import AuditResource
from pacemaker._cts.tests.ctstest import CTSTest
from pacemaker._cts.tests.simulstartlite import SimulStartLite
from pacemaker._cts.tests.simulstoplite import SimulStopLite
from pacemaker._cts.tests.starttest import StartTest
# Disable various pylint warnings that occur in so many places throughout this
# file it's easiest to just take care of them globally. This does introduce the
# possibility that we'll miss some other cause of the same warning, but we'll
# just have to be careful.
# pylint doesn't understand that self._rsh is callable.
# pylint: disable=not-callable
class Reattach(CTSTest):
""" A concrete test that restarts the cluster and verifies that resources
remain running throughout
"""
def __init__(self, cm):
    """ Create a new Reattach instance

        Arguments:

        cm -- A ClusterManager instance
    """

    CTSTest.__init__(self, cm)

    self.name = "Reattach"

    # Helper tests used to bring the whole cluster up and down
    self._startall = SimulStartLite(cm)
    self._stopall = SimulStopLite(cm)
def _is_managed(self, node):
    """ Ask crm_attribute on the given node for the is-managed resource
        default and return True if the first line of output is "true"
    """

    query = "crm_attribute -t rsc_defaults -n is-managed -q -G -d true"
    (_, output) = self._rsh(node, query, verbose=1)

    return output[0].strip() == "true"
def _set_unmanaged(self, node):
    """ Turn resource management off by setting the is-managed resource
        default to false from the given node
    """

    self.debug("Disable resource management")

    command = "crm_attribute -t rsc_defaults -n is-managed -v false"
    self._rsh(node, command)
def _set_managed(self, node):
    """ Turn resource management back on by deleting the is-managed
        resource default from the given node
    """

    self.debug("Re-enable resource management")

    command = "crm_attribute -t rsc_defaults -n is-managed -D"
    self._rsh(node, command)
def _disable_incompatible_rscs(self, node):
    """ Disable resources that are incompatible with this test

        Starts and stops of stonith-class resources are implemented internally
        by Pacemaker, which means that they must stop when Pacemaker is
        stopped, even if unmanaged. Disable them before running the Reattach
        test so they don't affect resource placement.

        OCFS2 resources must be disabled too for some reason.

        Set target-role to "Stopped" for any of these resources in the CIB.

        Returns whatever self._rsh returns for the CIB update command; the
        caller in setup() unpacks it as (rc, output).
    """

    self.debug("Disable incompatible (stonith/OCFS2) resources")

    # NOTE(review): the docstring describes setting target-role to "Stopped",
    # but the literal below carries no XML between the quotes -- it looks as
    # if the payload was lost somewhere.  Confirm against the upstream file.
    xml = """'
' --scope rsc_defaults"""

    # The text above is substituted into the cluster manager's 'CibAddXml'
    # command template
    return self._rsh(node, self._cm.templates['CibAddXml'] % xml)
def _enable_incompatible_rscs(self, node):
    """ Re-enable resources that were incompatible with this test

        Returns whatever self._rsh returns for the cibadmin --delete command;
        the caller in teardown() unpacks it as (rc, output).
    """

    self.debug("Re-enable incompatible (stonith/OCFS2) resources")

    # NOTE(review): the XML text to delete is empty here, which presumably is
    # not intended -- it looks as if the payload was lost somewhere.  Confirm
    # against the upstream file.
    xml = """"""
    return self._rsh(node, """cibadmin --delete --xml-text '%s'""" % xml)
def _reprobe(self, node):
    """ Reprobe all resources by running "crm_resource --refresh" on the
        given node, returning the result of that remote command

        Constraints that use node-attribute-based rules affect where some
        resources are placed (promotable-1 in the lab-generated CIB is one
        example).  An earlier test may have erased the relevant node
        attribute; a reprobe should add it back.
    """

    command = "crm_resource --refresh"
    return self._rsh(node, command)
def setup(self, node):
""" Setup this test """
if not self._startall(None):
return self.failure("Startall failed")
(rc, _) = self._disable_incompatible_rscs(node)
if rc != ExitStatus.OK:
return self.failure("Couldn't modify CIB to stop incompatible resources")
(rc, _) = self._reprobe(node)
if rc != ExitStatus.OK:
return self.failure("Couldn't reprobe resources")
if not self._cm.cluster_stable(double_check=True):
return self.failure("Cluster did not stabilize after setup")
return self.success()
def teardown(self, node):
    """ Put the cluster back the way it was: start the node, restore
        resource management if it is still off, re-enable the incompatible
        resources, and wait for the cluster to stabilize.  Returns the
        result of self.failure() for the first step that goes wrong, or
        self.success() otherwise.
    """

    # Make sure 'node' is up
    StartTest(self._cm)(node)

    if not self._is_managed(node):
        self._set_managed(node)

    (status, _) = self._enable_incompatible_rscs(node)
    if status != ExitStatus.OK:
        return self.failure("Couldn't modify CIB to re-enable incompatible resources")

    if not self._cm.cluster_stable():
        return self.failure("Cluster did not stabilize after teardown")

    # Check once more, now that the cluster has settled
    if not self._is_managed(node):
        return self.failure("Could not re-enable resource management")

    return self.success()
- def can_run_now(self, node):
- """ Return True if we can meaningfully run right now """
-
- return True
-
def __call__(self, node):
    """ Perform this test

        Resource management is disabled, the cluster is stopped and started
        again, and resource management is re-enabled.  The test fails if any
        resource start/stop/promote/demote/migrate operation is seen in the
        logs during the restart, or (other than starts of stonith resources)
        after management has been re-enabled.

        Returns the result of self.failure() on any failure.  On success the
        truthy result of the start-all helper is returned.
    """

    self.incr("calls")

    # Conveniently, the scheduler will display this message when disabling
    # management, even if fencing is not enabled, so we can rely on it.
    managed = self.create_watch(["No fencing will be done"], 60)
    managed.set_watch()

    self._set_unmanaged(node)

    if not managed.look_for_all():
        self._logger.log("Patterns not found: %r" % managed.unmatched)
        return self.failure("Resource management not disabled")

    # Any successful resource operation of these kinds counts as activity
    pats = [
        self.templates["Pat:RscOpOK"] % ("start", ".*"),
        self.templates["Pat:RscOpOK"] % ("stop", ".*"),
        self.templates["Pat:RscOpOK"] % ("promote", ".*"),
        self.templates["Pat:RscOpOK"] % ("demote", ".*"),
        self.templates["Pat:RscOpOK"] % ("migrate", ".*")
    ]

    # Arm the watch before shutting down so activity during the restart is
    # captured
    watch = self.create_watch(pats, 60, "ShutdownActivity")
    watch.set_watch()

    self.debug("Shutting down the cluster")
    ret = self._stopall(None)
    if not ret:
        # Restore resource management before bailing out
        self._set_managed(node)
        return self.failure("Couldn't shut down the cluster")

    self.debug("Bringing the cluster back up")
    ret = self._startall(None)
    time.sleep(5)   # allow ping to update the CIB
    if not ret:
        self._set_managed(node)
        return self.failure("Couldn't restart the cluster")

    # A non-zero count means resource operations were logged during the restart
    if self.local_badnews("ResourceActivity:", watch):
        self._set_managed(node)
        return self.failure("Resources stopped or started during cluster restart")

    # Fresh watch for activity caused by turning management back on
    watch = self.create_watch(pats, 60, "StartupActivity")
    watch.set_watch()

    # Re-enable resource management (and verify it happened).
    self._set_managed(node)
    self._cm.cluster_stable()
    if not self._is_managed(node):
        return self.failure("Could not re-enable resource management")

    # Ignore actions for STONITH resources
    ignore = []
    (_, lines) = self._rsh(node, "crm_resource -c", verbose=1)
    for line in lines:
        if re.search("^Resource", line):
            r = AuditResource(self._cm, line)

            if r.rclass == "stonith":
                self.debug("Ignoring start actions for %s" % r.id)
                ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id))

    if self.local_badnews("ResourceActivity:", watch, ignore):
        return self.failure("Resources stopped or started after resource management was re-enabled")

    # ret still holds the result of the start-all call above
    return ret
@property
def errors_to_ignore(self):
    """ The error patterns that should be ignored for this test: messages
        about resources that were active at shutdown
    """

    active_at_shutdown = r"resource( was|s were) active at shutdown"
    return [active_at_shutdown]