diff --git a/cts/cts-exec.in b/cts/cts-exec.in index dceab9ee36..bcf2dd72bd 100644 --- a/cts/cts-exec.in +++ b/cts/cts-exec.in @@ -1,924 +1,929 @@ #!@PYTHON@ """Regression tests for Pacemaker's pacemaker-execd.""" # pylint doesn't like the module name "cts-exec" which is an invalid complaint for this file # but probably something we want to continue warning about elsewhere # pylint: disable=invalid-name # pacemaker imports need to come after we modify sys.path, which pylint will complain about. # pylint: disable=wrong-import-position __copyright__ = "Copyright 2012-2024 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import argparse import os import stat import sys import subprocess import shutil import tempfile # Where to find test binaries # Prefer the source tree if available TEST_DIR = sys.path[0] # These imports allow running from a source checkout after running `make`. # Note that this doesn't necessarily mean the tests will run successfully, but # being able to see --help output can be useful. if os.path.exists("@abs_top_srcdir@/python"): sys.path.insert(0, "@abs_top_srcdir@/python") # pylint: disable=comparison-of-constants,comparison-with-itself,condition-evals-to-constant if os.path.exists("@abs_top_builddir@/python") and "@abs_top_builddir@" != "@abs_top_srcdir@": sys.path.insert(0, "@abs_top_builddir@/python") from pacemaker.buildoptions import BuildOptions from pacemaker.exitstatus import ExitStatus from pacemaker._cts.corosync import Corosync from pacemaker._cts.process import killall, exit_if_proc_running, stdout_from_command from pacemaker._cts.test import Test, Tests # File permissions for executable scripts we create EXECMODE = stat.S_IRUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH def update_path(): # pylint: disable=protected-access """Set the PATH environment variable appropriately for the tests.""" new_path = os.environ['PATH'] if os.path.exists("%s/cts-exec.in" % TEST_DIR): print("Running tests from the source tree: %s (%s)" % (BuildOptions._BUILD_DIR, TEST_DIR)) # For pacemaker-execd, cts-exec-helper, and pacemaker-remoted new_path = "%s/daemons/execd:%s" % (BuildOptions._BUILD_DIR, new_path) new_path = "%s/tools:%s" % (BuildOptions._BUILD_DIR, new_path) # For crm_resource # For pacemaker-fenced new_path = "%s/daemons/fenced:%s" % (BuildOptions._BUILD_DIR, new_path) # For cts-support new_path = "%s/cts/support:%s" % (BuildOptions._BUILD_DIR, new_path) else: print("Running tests from the install tree: %s (not %s)" % (BuildOptions.DAEMON_DIR, TEST_DIR)) # For cts-exec-helper, cts-support, pacemaker-execd, pacemaker-fenced, # and pacemaker-remoted new_path = "%s:%s" % (BuildOptions.DAEMON_DIR, new_path) print('Using PATH="%s"' % new_path) os.environ['PATH'] = new_path class ExecTest(Test): """Executor for a single pacemaker-execd regression test.""" def __init__(self, name, description, **kwargs): """Create a new ExecTest instance. Arguments: name -- A unique name for this test. This can be used on the command line to specify that only a specific test should be executed. description -- A meaningful description for the test. Keyword arguments: tls -- Enable pacemaker-remoted. """ Test.__init__(self, name, description, **kwargs) self.tls = kwargs.get("tls", False) # If we are going to run the stonith resource tests, we will need to # launch and track Corosync and pacemaker-fenced.
self._corosync = None self._fencer = None self._is_stonith_test = "stonith" in self.name if self.tls: self._daemon_location = "pacemaker-remoted" else: self._daemon_location = "pacemaker-execd" if self._is_stonith_test: self._corosync = Corosync(self.verbose, self.logdir, "cts-exec") self._test_tool_location = "cts-exec-helper" def _kill_daemons(self): killall([ "corosync", "pacemaker-fenced", "lt-pacemaker-fenced", "pacemaker-execd", "lt-pacemaker-execd", "cts-exec-helper", "lt-cts-exec-helper", "pacemaker-remoted", ]) def _start_daemons(self): if self._corosync: self._corosync.start(kill_first=True) # pylint: disable=consider-using-with self._fencer = subprocess.Popen(["pacemaker-fenced", "-s"]) cmd = [self._daemon_location, "-l", self.logpath] if self.verbose: cmd += ["-V"] # pylint: disable=consider-using-with self._daemon_process = subprocess.Popen(cmd) def clean_environment(self): """Clean up the host after running a test.""" if self._daemon_process: self._daemon_process.terminate() self._daemon_process.wait() if self.verbose: print("Daemon Output Start") with open(self.logpath, "rt", errors="replace", encoding="utf-8") as logfile: for line in logfile: print(line.strip()) print("Daemon Output End") if self._corosync: self._fencer.terminate() self._fencer.wait() self._corosync.stop() self._daemon_process = None self._fencer = None self._corosync = None def add_cmd(self, cmd=None, **kwargs): """Add a cts-exec-helper command to be executed as part of this test.""" if cmd is None: cmd = self._test_tool_location if cmd == self._test_tool_location: if self.verbose: kwargs["args"] += " -V " if self.tls: kwargs["args"] += " -S " kwargs["validate"] = False kwargs["check_rng"] = False kwargs["check_stderr"] = False Test.add_cmd(self, cmd, **kwargs) def run(self): """Execute this test.""" if self.tls and self._is_stonith_test: self._result_txt = "SKIPPED - '%s' - disabled when testing pacemaker_remote" % (self.name) print(self._result_txt) return Test.run(self) class ExecTests(Tests): """Collection of all pacemaker-execd regression tests.""" def __init__(self, **kwargs): """ Create a new ExecTests instance. Keyword arguments: tls -- Enable pacemaker-remoted. 
""" Tests.__init__(self, **kwargs) self.tls = kwargs.get("tls", False) self._action_timeout = " -t 9000 " self._installed_files = [] self._rsc_classes = self._setup_rsc_classes() print("Testing resource classes %r" % self._rsc_classes) if "lsb" in self._rsc_classes: service_agent = "LSBDummy" elif "systemd" in self._rsc_classes: service_agent = "pacemaker-cts-dummyd@3" else: service_agent = "unsupported" self._common_cmds = { "ocf_reg_line": '-c register_rsc -r ocf_test_rsc ' + self._action_timeout + ' -C ocf -P pacemaker -T Dummy', "ocf_reg_event": '-l "NEW_EVENT event_type:register rsc_id:ocf_test_rsc action:none rc:ok op_status:Done"', "ocf_unreg_line": '-c unregister_rsc -r ocf_test_rsc ' + self._action_timeout, "ocf_unreg_event": '-l "NEW_EVENT event_type:unregister rsc_id:ocf_test_rsc action:none rc:ok op_status:Done"', "ocf_start_line": '-c exec -r ocf_test_rsc -a start ' + self._action_timeout, "ocf_start_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:start rc:ok op_status:Done" ', "ocf_stop_line": '-c exec -r ocf_test_rsc -a stop ' + self._action_timeout, "ocf_stop_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:stop rc:ok op_status:Done" ', "ocf_monitor_line": '-c exec -r ocf_test_rsc -a monitor -i 2s ' + self._action_timeout, "ocf_monitor_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:monitor rc:ok op_status:Done" ' + self._action_timeout, "ocf_cancel_line": '-c cancel -r ocf_test_rsc -a monitor -i 2s ' + self._action_timeout, "ocf_cancel_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:monitor rc:ok op_status:Cancelled" ', "systemd_reg_line": '-c register_rsc -r systemd_test_rsc ' + self._action_timeout + ' -C systemd -T pacemaker-cts-dummyd@3', "systemd_reg_event": '-l "NEW_EVENT event_type:register rsc_id:systemd_test_rsc action:none rc:ok op_status:Done"', "systemd_unreg_line": '-c unregister_rsc -r systemd_test_rsc ' + self._action_timeout, "systemd_unreg_event": '-l "NEW_EVENT event_type:unregister rsc_id:systemd_test_rsc action:none rc:ok op_status:Done"', "systemd_start_line": '-c exec -r systemd_test_rsc -a start ' + self._action_timeout, "systemd_start_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:start rc:ok op_status:Done" ', "systemd_stop_line": '-c exec -r systemd_test_rsc -a stop ' + self._action_timeout, "systemd_stop_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:stop rc:ok op_status:Done" ', "systemd_monitor_line": '-c exec -r systemd_test_rsc -a monitor -i 2s ' + self._action_timeout, "systemd_monitor_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:monitor rc:ok op_status:Done" -t 15000 ', "systemd_cancel_line": '-c cancel -r systemd_test_rsc -a monitor -i 2s ' + self._action_timeout, "systemd_cancel_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:monitor rc:ok op_status:Cancelled" ', "service_reg_line": '-c register_rsc -r service_test_rsc ' + self._action_timeout + ' -C service -T %s' % service_agent, "service_reg_event": '-l "NEW_EVENT event_type:register rsc_id:service_test_rsc action:none rc:ok op_status:Done"', "service_unreg_line": '-c unregister_rsc -r service_test_rsc ' + self._action_timeout, "service_unreg_event": '-l "NEW_EVENT event_type:unregister rsc_id:service_test_rsc action:none rc:ok op_status:Done"', "service_start_line": '-c exec -r service_test_rsc -a start ' + self._action_timeout, "service_start_event": 
'-l "NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:start rc:ok op_status:Done" ', "service_stop_line": '-c exec -r service_test_rsc -a stop ' + self._action_timeout, "service_stop_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:stop rc:ok op_status:Done" ', "service_monitor_line": '-c exec -r service_test_rsc -a monitor -i 2s ' + self._action_timeout, "service_monitor_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:monitor rc:ok op_status:Done" ' + self._action_timeout, "service_cancel_line": '-c cancel -r service_test_rsc -a monitor -i 2s ' + self._action_timeout, "service_cancel_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:monitor rc:ok op_status:Cancelled" ', "lsb_reg_line": '-c register_rsc -r lsb_test_rsc ' + self._action_timeout + ' -C lsb -T LSBDummy', "lsb_reg_event": '-l "NEW_EVENT event_type:register rsc_id:lsb_test_rsc action:none rc:ok op_status:Done" ', "lsb_unreg_line": '-c unregister_rsc -r lsb_test_rsc ' + self._action_timeout, "lsb_unreg_event": '-l "NEW_EVENT event_type:unregister rsc_id:lsb_test_rsc action:none rc:ok op_status:Done"', "lsb_start_line": '-c exec -r lsb_test_rsc -a start ' + self._action_timeout, "lsb_start_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:start rc:ok op_status:Done" ', "lsb_stop_line": '-c exec -r lsb_test_rsc -a stop ' + self._action_timeout, "lsb_stop_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:stop rc:ok op_status:Done" ', "lsb_monitor_line": '-c exec -r lsb_test_rsc -a status -i 2s ' + self._action_timeout, "lsb_monitor_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:status rc:ok op_status:Done" ' + self._action_timeout, "lsb_cancel_line": '-c cancel -r lsb_test_rsc -a status -i 2s ' + self._action_timeout, "lsb_cancel_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:status rc:ok op_status:Cancelled" ', "stonith_reg_line": '-c register_rsc -r stonith_test_rsc ' + self._action_timeout + ' -C stonith -P pacemaker -T fence_dummy', "stonith_reg_event": '-l "NEW_EVENT event_type:register rsc_id:stonith_test_rsc action:none rc:ok op_status:Done" ', "stonith_unreg_line": '-c unregister_rsc -r stonith_test_rsc ' + self._action_timeout, "stonith_unreg_event": '-l "NEW_EVENT event_type:unregister rsc_id:stonith_test_rsc action:none rc:ok op_status:Done"', "stonith_start_line": '-c exec -r stonith_test_rsc -a start ' + self._action_timeout, "stonith_start_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:start rc:ok op_status:Done" ', "stonith_stop_line": '-c exec -r stonith_test_rsc -a stop ' + self._action_timeout, "stonith_stop_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:stop rc:ok op_status:Done" ', "stonith_monitor_line": '-c exec -r stonith_test_rsc -a monitor -i 2s ' + self._action_timeout, "stonith_monitor_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:ok op_status:Done" ' + self._action_timeout, "stonith_cancel_line": '-c cancel -r stonith_test_rsc -a monitor -i 2s ' + self._action_timeout, "stonith_cancel_event": '-l "NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:ok op_status:Cancelled" ', } def _setup_rsc_classes(self): """Determine which resource classes are supported.""" classes = stdout_from_command(["crm_resource", "--list-standards"]) # Strip trailing empty line classes = 
classes[:-1] if self.tls: classes.remove("stonith") if "systemd" in classes: try: # This code doesn't need this import, but pacemaker-cts-dummyd # does, so ensure the dependency is available rather than cause # all systemd tests to fail. # pylint: disable=import-outside-toplevel,unused-import import systemd.daemon except ImportError: print("Python systemd bindings not found.") print("The tests for systemd class are not going to be run.") classes.remove("systemd") return classes def new_test(self, name, description): """Create a named test.""" test = ExecTest(name, description, verbose=self.verbose, tls=self.tls, timeout=self.timeout, force_wait=self.force_wait, logdir=self.logdir) self._tests.append(test) return test def setup_environment(self): """Prepare the host before executing any tests.""" + if BuildOptions.REMOTE_ENABLED: + # @TODO Use systemctl when available, and use the subprocess module + # with an argument array instead of os.system() os.system("service pacemaker_remote stop") self.cleanup_environment() # @TODO Support the option of using specified existing certificates authkey = "%s/authkey" % BuildOptions.PACEMAKER_CONFIG_DIR if self.tls and not os.path.isfile(authkey): print("Installing %s ..." % authkey) + # @TODO Use os.mkdir() instead os.system("mkdir -p %s" % BuildOptions.PACEMAKER_CONFIG_DIR) + # @TODO Use the subprocess module with an argument array instead os.system("dd if=/dev/urandom of=%s bs=4096 count=1" % authkey) self._installed_files.append(authkey) # If we're in build directory, install agents if not already installed # pylint: disable=protected-access if os.path.exists("%s/cts/cts-exec.in" % BuildOptions._BUILD_DIR): if not os.path.exists("%s/pacemaker" % BuildOptions.OCF_RA_INSTALL_DIR): # @TODO remember which components were created and remove them os.makedirs("%s/pacemaker" % BuildOptions.OCF_RA_INSTALL_DIR, 0o755) for agent in ["Dummy", "Stateful", "ping"]: agent_source = "%s/extra/resources/%s" % (BuildOptions._BUILD_DIR, agent) agent_dest = "%s/pacemaker/%s" % (BuildOptions.OCF_RA_INSTALL_DIR, agent) if not os.path.exists(agent_dest): print("Installing %s ..." % agent_dest) shutil.copyfile(agent_source, agent_dest) os.chmod(agent_dest, EXECMODE) self._installed_files.append(agent_dest) subprocess.call(["cts-support", "install"]) def cleanup_environment(self): """Clean up the host after executing desired tests.""" for installed_file in self._installed_files: print("Removing %s ..." 
% installed_file) os.remove(installed_file) subprocess.call(["cts-support", "uninstall"]) def _build_cmd_str(self, rsc, ty): """Construct a command string for the given resource and type.""" return "%s %s" % (self._common_cmds["%s_%s_line" % (rsc, ty)], self._common_cmds["%s_%s_event" % (rsc, ty)]) def build_generic_tests(self): """Register tests that apply to all resource classes.""" common_cmds = self._common_cmds # register/unregister tests for rsc in self._rsc_classes: test = self.new_test("generic_registration_%s" % rsc, "Simple resource registration test for %s standard" % rsc) test.add_cmd(args=self._build_cmd_str(rsc, "reg")) test.add_cmd(args=self._build_cmd_str(rsc, "unreg")) # start/stop tests for rsc in self._rsc_classes: test = self.new_test("generic_start_stop_%s" % rsc, "Simple start and stop test for %s standard" % rsc) test.add_cmd(args=self._build_cmd_str(rsc, "reg")) test.add_cmd(args=self._build_cmd_str(rsc, "start")) test.add_cmd(args=self._build_cmd_str(rsc, "stop")) test.add_cmd(args=self._build_cmd_str(rsc, "unreg")) # monitor cancel test for rsc in self._rsc_classes: test = self.new_test("generic_monitor_cancel_%s" % rsc, "Simple monitor cancel test for %s standard" % rsc) test.add_cmd(args=self._build_cmd_str(rsc, "reg")) test.add_cmd(args=self._build_cmd_str(rsc, "start")) test.add_cmd(args=self._build_cmd_str(rsc, "monitor")) # If this fails, that means the monitor may not be getting rescheduled test.add_cmd(args=common_cmds["%s_monitor_event" % rsc]) # If this fails, that means the monitor may not be getting rescheduled test.add_cmd(args=common_cmds["%s_monitor_event" % rsc]) test.add_cmd(args=self._build_cmd_str(rsc, "cancel")) # If this happens the monitor did not actually cancel correctly test.add_cmd(args=common_cmds["%s_monitor_event" % rsc], expected_exitcode=ExitStatus.TIMEOUT) # If this happens the monitor did not actually cancel correctly test.add_cmd(args=common_cmds["%s_monitor_event" % rsc], expected_exitcode=ExitStatus.TIMEOUT) test.add_cmd(args=self._build_cmd_str(rsc, "stop")) test.add_cmd(args=self._build_cmd_str(rsc, "unreg")) # monitor duplicate test for rsc in self._rsc_classes: test = self.new_test("generic_monitor_duplicate_%s" % rsc, "Test creation and canceling of duplicate monitors for %s standard" % rsc) test.add_cmd(args=self._build_cmd_str(rsc, "reg")) test.add_cmd(args=self._build_cmd_str(rsc, "start")) test.add_cmd(args=self._build_cmd_str(rsc, "monitor")) # If this fails, that means the monitor may not be getting rescheduled test.add_cmd(args=common_cmds["%s_monitor_event" % rsc]) # If this fails, that means the monitor may not be getting rescheduled test.add_cmd(args=common_cmds["%s_monitor_event" % rsc]) # Add the duplicate monitors test.add_cmd(args=self._build_cmd_str(rsc, "monitor")) test.add_cmd(args=self._build_cmd_str(rsc, "monitor")) test.add_cmd(args=self._build_cmd_str(rsc, "monitor")) test.add_cmd(args=self._build_cmd_str(rsc, "monitor")) # verify we still get update events # If this fails, that means the monitor may not be getting rescheduled test.add_cmd(args=common_cmds["%s_monitor_event" % rsc]) # cancel the monitor, if the duplicate merged with the original, we should no longer see monitor updates test.add_cmd(args=self._build_cmd_str(rsc, "cancel")) # If this happens the monitor did not actually cancel correctly test.add_cmd(args=common_cmds["%s_monitor_event" % rsc], expected_exitcode=ExitStatus.TIMEOUT) # If this happens the monitor did not actually cancel correctly 
test.add_cmd(args=common_cmds["%s_monitor_event" % rsc], expected_exitcode=ExitStatus.TIMEOUT) test.add_cmd(args=self._build_cmd_str(rsc, "stop")) test.add_cmd(args=self._build_cmd_str(rsc, "unreg")) # stop implies cancel test for rsc in self._rsc_classes: test = self.new_test("generic_stop_implies_cancel_%s" % rsc, "Verify stopping a resource implies cancel of recurring ops for %s standard" % rsc) test.add_cmd(args=self._build_cmd_str(rsc, "reg")) test.add_cmd(args=self._build_cmd_str(rsc, "start")) test.add_cmd(args=self._build_cmd_str(rsc, "monitor")) # If this fails, that means the monitor may not be getting rescheduled test.add_cmd(args=common_cmds["%s_monitor_event" % rsc]) # If this fails, that means the monitor may not be getting rescheduled test.add_cmd(args=common_cmds["%s_monitor_event" % rsc]) test.add_cmd(args=self._build_cmd_str(rsc, "stop")) # If this happens the monitor did not actually cancel correctly test.add_cmd(args=common_cmds["%s_monitor_event" % rsc], expected_exitcode=ExitStatus.TIMEOUT) # If this happens the monitor did not actually cancel correctly test.add_cmd(args=common_cmds["%s_monitor_event" % rsc], expected_exitcode=ExitStatus.TIMEOUT) test.add_cmd(args=self._build_cmd_str(rsc, "unreg")) def build_multi_rsc_tests(self): """Register complex tests that involve managing multiple resources of different types.""" common_cmds = self._common_cmds # do not use service and systemd at the same time; they are the same resource. # register start monitor stop unregister resources of each type at the same time test = self.new_test("multi_rsc_start_stop_all_including_stonith", "Start, monitor, and stop resources of multiple types and classes") for rsc in self._rsc_classes: test.add_cmd(args=self._build_cmd_str(rsc, "reg")) for rsc in self._rsc_classes: test.add_cmd(args=self._build_cmd_str(rsc, "start")) for rsc in self._rsc_classes: test.add_cmd(args=self._build_cmd_str(rsc, "monitor")) for rsc in self._rsc_classes: # If this fails, that means the monitor is not being rescheduled test.add_cmd(args=common_cmds["%s_monitor_event" % rsc]) for rsc in self._rsc_classes: test.add_cmd(args=self._build_cmd_str(rsc, "cancel")) for rsc in self._rsc_classes: test.add_cmd(args=self._build_cmd_str(rsc, "stop")) for rsc in self._rsc_classes: test.add_cmd(args=self._build_cmd_str(rsc, "unreg")) def build_negative_tests(self): """Register tests related to how pacemaker-execd handles failures.""" # ocf start timeout test test = self.new_test("ocf_start_timeout", "Force start timeout to occur, verify start failure.") test.add_cmd(args='-c register_rsc -r test_rsc -C ocf -P pacemaker -T Dummy ' + self._action_timeout + '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:Done" ') # -t must be less than self._action_timeout test.add_cmd(args='-c exec -r test_rsc -a start -k op_sleep -v 5 -t 1000 -w') test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:Error occurred op_status:Timed out" ' + self._action_timeout) test.add_cmd(args='-c exec -r test_rsc -a stop ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:Done" ') test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done" ') # stonith start timeout test test = self.new_test("stonith_start_timeout", "Force start timeout to occur, verify start failure.") test.add_cmd(args='-c register_rsc -r test_rsc -C stonith -P
pacemaker -T fence_dummy ' + self._action_timeout + '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:Done"') # -t must be less than self._action_timeout test.add_cmd(args='-c exec -r test_rsc -a start -k monitor_delay -v 30 -t 1000 -w') test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:Error occurred op_status:Timed out" ' + self._action_timeout) test.add_cmd(args='-c exec -r test_rsc -a stop ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:Done" ') test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done" ') # stonith component fail test = self.new_test("stonith_component_fail", "Kill stonith component after pacemaker-execd connects") test.add_cmd(args=self._build_cmd_str("stonith", "reg")) test.add_cmd(args=self._build_cmd_str("stonith", "start")) test.add_cmd(args='-c exec -r stonith_test_rsc -a monitor -i 600s ' '-l "NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:ok op_status:Done" ' + self._action_timeout) test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:Error occurred op_status:error" -t 15000', kill="killall -9 -q pacemaker-fenced lt-pacemaker-fenced") test.add_cmd(args=self._build_cmd_str("stonith", "unreg")) # monitor fail for ocf resources test = self.new_test("monitor_fail_ocf", "Force ocf monitor to fail, verify failure is reported.") test.add_cmd(args='-c register_rsc -r test_rsc -C ocf -P pacemaker -T Dummy ' + self._action_timeout + '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a start ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a start ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a monitor -i 1s ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done"') test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done"' + self._action_timeout) test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done"' + self._action_timeout) test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Done" ' + self._action_timeout, kill="rm -f %s/run/Dummy-test_rsc.state" % BuildOptions.LOCAL_STATE_DIR) test.add_cmd(args='-c cancel -r test_rsc -a monitor -i 1s ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled" ') test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Done" ' + self._action_timeout, expected_exitcode=ExitStatus.TIMEOUT) test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done" ' + self._action_timeout, expected_exitcode=ExitStatus.TIMEOUT) test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done" ') # verify notify changes only for monitor operation 
test = self.new_test("monitor_changes_only", "Verify when flag is set, only monitor changes are notified.") test.add_cmd(args='-c register_rsc -r test_rsc -C ocf -P pacemaker -T Dummy ' + self._action_timeout + '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a start ' + self._action_timeout + ' -o ' '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a monitor -i 1s ' + self._action_timeout + ' -o -l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done" ') test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done" ' + self._action_timeout, expected_exitcode=ExitStatus.TIMEOUT) test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Done"' + self._action_timeout, kill='rm -f %s/run/Dummy-test_rsc.state' % BuildOptions.LOCAL_STATE_DIR) test.add_cmd(args='-c cancel -r test_rsc -a monitor -i 1s' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled" ') test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Done" ' + self._action_timeout, expected_exitcode=ExitStatus.TIMEOUT) test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done" ' + self._action_timeout, expected_exitcode=ExitStatus.TIMEOUT) test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done"') # monitor fail for systemd resource if "systemd" in self._rsc_classes: test = self.new_test("monitor_fail_systemd", "Force systemd monitor to fail, verify failure is reported..") test.add_cmd(args='-c register_rsc -r test_rsc -C systemd -T pacemaker-cts-dummyd@3 ' + self._action_timeout + '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a start ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a start ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a monitor -i 1s ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done" ') test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done" ' + self._action_timeout) test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done" ' + self._action_timeout) test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Done"' + self._action_timeout, kill="pkill -9 -f pacemaker-cts-dummyd") test.add_cmd(args='-c cancel -r test_rsc -a monitor -i 1s' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled" ') test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Done" ' + self._action_timeout, expected_exitcode=ExitStatus.TIMEOUT) test.add_cmd(args='-l "NEW_EVENT 
event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done" ' + self._action_timeout, expected_exitcode=ExitStatus.TIMEOUT) test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done" ') # Cancel non-existent operation on a resource test = self.new_test("cancel_non_existent_op", "Attempt to cancel the wrong monitor operation, verify expected failure") test.add_cmd(args='-c register_rsc -r test_rsc -C ocf -P pacemaker -T Dummy ' + self._action_timeout + '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a start ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a start ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a monitor -i 1s ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done" ') test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done" ' + self._action_timeout) # interval is wrong, should fail test.add_cmd(args='-c cancel -r test_rsc -a monitor -i 2s' + self._action_timeout + ' -l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled" ', expected_exitcode=ExitStatus.ERROR) # action name is wrong, should fail test.add_cmd(args='-c cancel -r test_rsc -a stop -i 1s' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled" ', expected_exitcode=ExitStatus.ERROR) test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done" ') # Attempt to invoke non-existent rsc id test = self.new_test("invoke_non_existent_rsc", "Attempt to perform operations on a non-existent rsc id.") test.add_cmd(args='-c exec -r test_rsc -a start ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:error op_status:Done" ', expected_exitcode=ExitStatus.ERROR) test.add_cmd(args='-c exec -r test_rsc -a stop ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:Done" ', expected_exitcode=ExitStatus.ERROR) test.add_cmd(args='-c exec -r test_rsc -a monitor -i 6s ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done" ', expected_exitcode=ExitStatus.ERROR) test.add_cmd(args='-c cancel -r test_rsc -a start ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Cancelled" ', expected_exitcode=ExitStatus.ERROR) test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done" ') # Register and start a resource that doesn't exist, systemd if "systemd" in self._rsc_classes: test = self.new_test("start_uninstalled_systemd", "Register uninstalled systemd agent, try to start, verify expected failure") test.add_cmd(args='-c register_rsc -r test_rsc -C systemd -T this_is_fake1234 ' + self._action_timeout + '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none 
rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a start ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed" ') test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done" ') # Register and start a resource that doesn't exist, ocf test = self.new_test("start_uninstalled_ocf", "Register uninstalled ocf agent, try to start, verify expected failure.") test.add_cmd(args='-c register_rsc -r test_rsc -C ocf -P pacemaker -T this_is_fake1234 ' + self._action_timeout + '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a start ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed" ') test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done" ') # Register ocf with non-existent provider test = self.new_test("start_ocf_bad_provider", "Register ocf agent with a non-existent provider, verify expected failure.") test.add_cmd(args='-c register_rsc -r test_rsc -C ocf -P pancakes -T Dummy ' + self._action_timeout + '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a start ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed" ') test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done" ') # Register ocf with empty provider field test = self.new_test("start_ocf_no_provider", "Register ocf agent with a no provider, verify expected failure.") test.add_cmd(args='-c register_rsc -r test_rsc -C ocf -T Dummy ' + self._action_timeout + '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:Done" ', expected_exitcode=ExitStatus.ERROR) test.add_cmd(args='-c exec -r test_rsc -a start ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Error" ', expected_exitcode=ExitStatus.ERROR) test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done" ') def build_stress_tests(self): """Register stress tests.""" timeout = "-t 20000" iterations = 25 test = self.new_test("ocf_stress", "Verify OCF agent handling works under load") for i in range(iterations): test.add_cmd(args='-c register_rsc -r rsc_%s %s -C ocf -P heartbeat -T Dummy -l "NEW_EVENT event_type:register rsc_id:rsc_%s action:none rc:ok op_status:Done"' % (i, timeout, i)) test.add_cmd(args='-c exec -r rsc_%s -a start %s -l "NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:start rc:ok op_status:Done"' % (i, timeout, i)) test.add_cmd(args='-c exec -r rsc_%s -a monitor %s -i 1s ' '-l "NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:monitor rc:ok op_status:Done"' % (i, timeout, i)) for i in range(iterations): test.add_cmd(args='-c exec -r rsc_%s -a stop %s -l "NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:stop rc:ok op_status:Done"' % (i, timeout, i)) test.add_cmd(args='-c unregister_rsc -r rsc_%s %s -l "NEW_EVENT event_type:unregister 
rsc_id:rsc_%s action:none rc:ok op_status:Done"' % (i, timeout, i)) if "systemd" in self._rsc_classes: test = self.new_test("systemd_stress", "Verify systemd dbus connection works under load") for i in range(iterations): test.add_cmd(args='-c register_rsc -r rsc_%s %s -C systemd -T pacemaker-cts-dummyd@3 -l "NEW_EVENT event_type:register rsc_id:rsc_%s action:none rc:ok op_status:Done"' % (i, timeout, i)) test.add_cmd(args='-c exec -r rsc_%s -a start %s -l "NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:start rc:ok op_status:Done"' % (i, timeout, i)) test.add_cmd(args='-c exec -r rsc_%s -a monitor %s -i 1s ' '-l "NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:monitor rc:ok op_status:Done"' % (i, timeout, i)) for i in range(iterations): test.add_cmd(args='-c exec -r rsc_%s -a stop %s -l "NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:stop rc:ok op_status:Done"' % (i, timeout, i)) test.add_cmd(args='-c unregister_rsc -r rsc_%s %s -l "NEW_EVENT event_type:unregister rsc_id:rsc_%s action:none rc:ok op_status:Done"' % (i, timeout, i)) iterations = 9 timeout = "-t 30000" # Verify recurring op in-flight collision is handled in series properly test = self.new_test("rsc_inflight_collision", "Verify recurring ops do not collide with other operations for the same rsc.") test.add_cmd(args='-c register_rsc -r test_rsc -P pacemaker -C ocf -T Dummy ' '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:Done" ' + self._action_timeout) test.add_cmd(args='-c exec -r test_rsc -a start %s -k op_sleep -v 1 -l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Done"' % timeout) for i in range(iterations): test.add_cmd(args='-c exec -r test_rsc -a monitor %s -i 100%dms -k op_sleep -v 2 ' '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done"' % (timeout, i)) test.add_cmd(args='-c exec -r test_rsc -a stop %s -l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:Done"' % timeout) test.add_cmd(args='-c unregister_rsc -r test_rsc %s -l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done"' % timeout) def build_custom_tests(self): """Register tests that target specific cases.""" # verify resource temporary folder is created and used by OCF agents test = self.new_test("rsc_tmp_dir", "Verify creation and use of rsc temporary state directory") test.add_cmd("ls", args="-al %s" % BuildOptions.RSC_TMP_DIR) test.add_cmd(args='-c register_rsc -r test_rsc -P heartbeat -C ocf -T Dummy ' '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:Done" ' + self._action_timeout) test.add_cmd(args='-c exec -r test_rsc -a start -t 4000') test.add_cmd("ls", args="-al %s" % BuildOptions.RSC_TMP_DIR) test.add_cmd("ls", args="%s/Dummy-test_rsc.state" % BuildOptions.RSC_TMP_DIR) test.add_cmd(args='-c exec -r test_rsc -a stop -t 4000') test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done" ') # start delay then stop test test = self.new_test("start_delay", "Verify start delay works as expected.") test.add_cmd(args='-c register_rsc -r test_rsc -P pacemaker -C ocf -T Dummy ' '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:Done" ' + self._action_timeout) test.add_cmd(args='-c exec -r test_rsc -s 6000 -a start -w -t 6000') test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start 
rc:ok op_status:Done" -t 2000', expected_exitcode=ExitStatus.TIMEOUT) test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Done" -t 6000') test.add_cmd(args='-c exec -r test_rsc -a stop ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:Done" ') test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done" ') # start delay, but cancel before it gets a chance to start test = self.new_test("start_delay_cancel", "Using start_delay, start a rsc, but cancel the start op before execution.") test.add_cmd(args='-c register_rsc -r test_rsc -P pacemaker -C ocf -T Dummy ' '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:Done" ' + self._action_timeout) test.add_cmd(args='-c exec -r test_rsc -s 5000 -a start -w -t 4000') test.add_cmd(args='-c cancel -r test_rsc -a start ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Cancelled" ') test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Done" -t 5000', expected_exitcode=ExitStatus.TIMEOUT) test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done" ') # Register a bunch of resources, verify we can get info on them test = self.new_test("verify_get_rsc_info", "Register multiple resources, verify retrieval of rsc info.") if "systemd" in self._rsc_classes: test.add_cmd(args='-c register_rsc -r rsc1 -C systemd -T pacemaker-cts-dummyd@3 ' + self._action_timeout) test.add_cmd(args='-c get_rsc_info -r rsc1 ') test.add_cmd(args='-c unregister_rsc -r rsc1 ' + self._action_timeout) test.add_cmd(args='-c get_rsc_info -r rsc1 ', expected_exitcode=ExitStatus.ERROR) test.add_cmd(args='-c register_rsc -r rsc2 -C ocf -T Dummy -P pacemaker ' + self._action_timeout) test.add_cmd(args='-c get_rsc_info -r rsc2 ') test.add_cmd(args='-c unregister_rsc -r rsc2 ' + self._action_timeout) test.add_cmd(args='-c get_rsc_info -r rsc2 ', expected_exitcode=ExitStatus.ERROR) # Register duplicate, verify only one entry exists and can still be removed test = self.new_test("duplicate_registration", "Register resource multiple times, verify only one entry exists and can be removed.") test.add_cmd(args='-c register_rsc -r rsc2 -C ocf -T Dummy -P pacemaker ' + self._action_timeout) test.add_cmd(args="-c get_rsc_info -r rsc2 ", stdout_match="id:rsc2 class:ocf provider:pacemaker type:Dummy") test.add_cmd(args='-c register_rsc -r rsc2 -C ocf -T Dummy -P pacemaker ' + self._action_timeout) test.add_cmd(args="-c get_rsc_info -r rsc2 ", stdout_match="id:rsc2 class:ocf provider:pacemaker type:Dummy") test.add_cmd(args='-c register_rsc -r rsc2 -C ocf -T Stateful -P pacemaker ' + self._action_timeout) test.add_cmd(args="-c get_rsc_info -r rsc2 ", stdout_match="id:rsc2 class:ocf provider:pacemaker type:Stateful") test.add_cmd(args='-c unregister_rsc -r rsc2 ' + self._action_timeout) test.add_cmd(args='-c get_rsc_info -r rsc2 ', expected_exitcode=ExitStatus.ERROR) # verify the option to only send notification to the original client test = self.new_test("notify_orig_client_only", "Verify option to only send notifications to the client originating the action.") test.add_cmd(args='-c register_rsc -r test_rsc -C ocf -P pacemaker -T Dummy ' + 
self._action_timeout + '-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a start ' + self._action_timeout + '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Done" ') test.add_cmd(args='-c exec -r test_rsc -a monitor -i 1s ' + self._action_timeout + ' -n ' '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done"') # this will fail because the monitor notifications should only go to the original caller, which no longer exists. test.add_cmd(args='-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:Done" ' + self._action_timeout, expected_exitcode=ExitStatus.TIMEOUT) test.add_cmd(args='-c cancel -r test_rsc -a monitor -i 1s -t 6000 ') test.add_cmd(args='-c unregister_rsc -r test_rsc ' + self._action_timeout + '-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:Done" ') # get metadata test = self.new_test("get_ocf_metadata", "Retrieve metadata for a resource") test.add_cmd(args="-c metadata -C ocf -P pacemaker -T Dummy", stdout_match="resource-agent name=\"Dummy\"") test.add_cmd(args="-c metadata -C ocf -P pacemaker -T Stateful") test.add_cmd(args="-c metadata -P pacemaker -T Stateful", expected_exitcode=ExitStatus.ERROR) test.add_cmd(args="-c metadata -C ocf -P pacemaker -T fake_agent", expected_exitcode=ExitStatus.ERROR) # get stonith metadata test = self.new_test("get_stonith_metadata", "Retrieve stonith metadata for a resource") test.add_cmd(args="-c metadata -C stonith -P pacemaker -T fence_dummy", stdout_match="resource-agent name=\"fence_dummy\"") # get lsb metadata if "lsb" in self._rsc_classes: test = self.new_test("get_lsb_metadata", "Retrieve metadata for an LSB resource") test.add_cmd(args="-c metadata -C lsb -T LSBDummy", stdout_match="resource-agent name='LSBDummy'") # get metadata if "systemd" in self._rsc_classes: test = self.new_test("get_systemd_metadata", "Retrieve metadata for a resource") test.add_cmd(args="-c metadata -C systemd -T pacemaker-cts-dummyd@", stdout_match="resource-agent name=\"pacemaker-cts-dummyd@\"") # get ocf providers test = self.new_test("list_ocf_providers", "Retrieve list of available resource providers, verifies pacemaker is a provider.") test.add_cmd(args="-c list_ocf_providers ", stdout_match="pacemaker") test.add_cmd(args="-c list_ocf_providers -T ping", stdout_match="pacemaker") # Verify agents only exist in their lists test = self.new_test("verify_agent_lists", "Verify the agent lists contain the right data.") if "ocf" in self._rsc_classes: test.add_cmd(args="-c list_agents ", stdout_match="Stateful") test.add_cmd(args="-c list_agents -C ocf", stdout_match="Stateful", stdout_no_match="pacemaker-cts-dummyd@|fence_dummy") if "service" in self._rsc_classes: test.add_cmd(args="-c list_agents -C service", stdout_match="", stdout_no_match="Stateful|fence_dummy") if "lsb" in self._rsc_classes: test.add_cmd(args="-c list_agents", stdout_match="LSBDummy") test.add_cmd(args="-c list_agents -C lsb", stdout_match="LSBDummy", stdout_no_match="pacemaker-cts-dummyd@|Stateful|fence_dummy") test.add_cmd(args="-c list_agents -C service", stdout_match="LSBDummy") if "systemd" in self._rsc_classes: test.add_cmd(args="-c list_agents ", stdout_match="pacemaker-cts-dummyd@") # systemd test.add_cmd(args="-c list_agents -C systemd", stdout_match="", stdout_no_match="Stateful") # should not exist test.add_cmd(args="-c list_agents -C systemd", 
stdout_match="pacemaker-cts-dummyd@") test.add_cmd(args="-c list_agents -C systemd", stdout_match="", stdout_no_match="fence_dummy") # should not exist if "stonith" in self._rsc_classes: test.add_cmd(args="-c list_agents -C stonith", stdout_match="fence_dummy") # stonith test.add_cmd(args="-c list_agents -C stonith", stdout_match="", # should not exist stdout_no_match="pacemaker-cts-dummyd@") test.add_cmd(args="-c list_agents -C stonith", stdout_match="", stdout_no_match="Stateful") # should not exist test.add_cmd(args="-c list_agents ", stdout_match="fence_dummy") def build_options(): """Handle command line arguments.""" parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description="Run pacemaker-execd regression tests", epilog="Example: Run only the test 'start_stop'\n" "\t " + sys.argv[0] + " --run-only start_stop\n\n" "Example: Run only the tests with the string 'systemd' present in them\n" "\t " + sys.argv[0] + " --run-only-pattern systemd") parser.add_argument("-l", "--list-tests", action="store_true", help="Print out all registered tests") parser.add_argument("-p", "--run-only-pattern", metavar='PATTERN', help="Run only tests matching the given pattern") parser.add_argument("-r", "--run-only", metavar='TEST', help="Run a specific test") parser.add_argument("-t", "--timeout", type=float, default=2, help="Up to how many seconds each test case waits for the daemon to " "be initialized. Defaults to 2. The value 0 means no limit.") parser.add_argument("-w", "--force-wait", action="store_true", help="Each test case waits the default/specified --timeout for the " "daemon without tracking the log") if BuildOptions.REMOTE_ENABLED: parser.add_argument("-R", "--pacemaker-remote", action="store_true", help="Test pacemaker-remoted binary instead of pacemaker-execd") parser.add_argument("-V", "--verbose", action="store_true", help="Verbose output") args = parser.parse_args() return args def main(): """Run pacemaker-execd regression tests as specified by arguments.""" update_path() # Ensure all command output is in portable locale for comparison os.environ['LC_ALL'] = "C" opts = build_options() if opts.pacemaker_remote: exit_if_proc_running("pacemaker-remoted") else: exit_if_proc_running("corosync") exit_if_proc_running("pacemaker-execd") exit_if_proc_running("pacemaker-fenced") # Create a temporary directory for log files (the directory will # automatically be erased when done) with tempfile.TemporaryDirectory(prefix="cts-exec-") as logdir: tests = ExecTests(verbose=opts.verbose, tls=opts.pacemaker_remote, timeout=opts.timeout, force_wait=opts.force_wait, logdir=logdir) tests.build_generic_tests() tests.build_multi_rsc_tests() tests.build_negative_tests() tests.build_custom_tests() tests.build_stress_tests() if opts.list_tests: tests.print_list() sys.exit(ExitStatus.OK) print("Starting ...") tests.setup_environment() if opts.run_only_pattern: tests.run_tests_matching(opts.run_only_pattern) tests.print_results() elif opts.run_only: tests.run_single(opts.run_only) tests.print_results() else: tests.run_tests() tests.print_results() tests.cleanup_environment() tests.exit() if __name__ == "__main__": main() diff --git a/python/pacemaker/_cts/environment.py b/python/pacemaker/_cts/environment.py index 9538da1faf..7b5f47df33 100644 --- a/python/pacemaker/_cts/environment.py +++ b/python/pacemaker/_cts/environment.py @@ -1,642 +1,644 @@ """Test environment classes for Pacemaker's Cluster Test Suite (CTS).""" __all__ = ["EnvFactory"] __copyright__ = "Copyright 
2014-2024 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import argparse from contextlib import suppress import os import random import socket import sys import time from pacemaker.buildoptions import BuildOptions from pacemaker._cts.logging import LogFactory from pacemaker._cts.remote import RemoteFactory from pacemaker._cts.watcher import LogKind class Environment: """ A class for managing the CTS environment. This consists largely of processing and storing command line parameters. """ # pylint doesn't understand that self._rsh is callable (it stores the # singleton instance of RemoteExec, as returned by the getInstance method # of RemoteFactory). It's possible we could fix this with type annotations, # but those were introduced with python 3.5 and we only support python 3.4. # I think we could also fix this by getting rid of the getInstance methods, # but that's a project for another day. For now, just disable the warning. # pylint: disable=not-callable def __init__(self, args): """ Create a new Environment instance. This class can be treated kind of like a dictionary due to the presence of typical dict functions like __contains__, __getitem__, and __setitem__. However, it is not a dictionary so do not rely on standard dictionary behavior. Arguments: args -- A list of command line parameters, minus the program name. If None, sys.argv will be used. """ self.data = {} self._nodes = [] # Set some defaults before processing command line arguments. These are # either not set by any command line parameter, or they need a default # that can't be set in add_argument. self["DeadTime"] = 300 self["StartTime"] = 300 self["StableTime"] = 30 self["tests"] = [] self["IPagent"] = "IPaddr2" self["DoFencing"] = True self["ClobberCIB"] = False self["CIBfilename"] = None self["CIBResource"] = False self["log_kind"] = None self["node-limit"] = 0 self["scenario"] = "random" self.random_gen = random.Random() self._logger = LogFactory() self._rsh = RemoteFactory().getInstance() self._target = "localhost" self._seed_random() self._parse_args(args) if not self["ListTests"]: self._validate() self._discover() def _seed_random(self, seed=None): """ Initialize the random number generator. Arguments: seed -- Use this to seed the random number generator, or use the current time if None.
""" if not seed: seed = int(time.time()) self["RandSeed"] = seed self.random_gen.seed(str(seed)) def dump(self): """Print the current environment.""" keys = [] for key in list(self.data.keys()): keys.append(key) keys.sort() for key in keys: s = "Environment[%s]" % key self._logger.debug("{key:35}: {val}".format(key=s, val=str(self[key]))) def keys(self): """Return a list of all environment keys stored in this instance.""" return list(self.data.keys()) def __contains__(self, key): """Return True if the given key exists in the environment.""" if key == "nodes": return True return key in self.data def __getitem__(self, key): """Return the given environment key, or None if it does not exist.""" if str(key) == "0": raise ValueError("Bad call to 'foo in X', should reference 'foo in X.keys()' instead") if key == "nodes": return self._nodes if key == "Name": return self._get_stack_short() return self.data.get(key) def __setitem__(self, key, value): """Set the given environment key to the given value, overriding any previous value.""" if key == "Stack": self._set_stack(value) elif key == "node-limit": self.data[key] = value self._filter_nodes() elif key == "nodes": self._nodes = [] for node in value: # I don't think I need the IP address, etc. but this validates # the node name against /etc/hosts and/or DNS, so it's a # GoodThing(tm). try: n = node.strip() + # @TODO This only handles IPv4, use getaddrinfo() instead + # (here and in _discover()) socket.gethostbyname_ex(n) self._nodes.append(n) except socket.herror: self._logger.log("%s not found in DNS... aborting" % node) raise self._filter_nodes() else: self.data[key] = value def random_node(self): """Choose a random node from the cluster.""" return self.random_gen.choice(self["nodes"]) def get(self, key, default=None): """Return the value for key if key is in the environment, else default.""" if key == "nodes": return self._nodes return self.data.get(key, default) def _set_stack(self, name): """Normalize the given cluster stack name.""" if name in ["corosync", "cs", "mcp"]: self.data["Stack"] = "corosync 2+" else: raise ValueError("Unknown stack: %s" % name) def _get_stack_short(self): """Return the short name for the currently set cluster stack.""" if "Stack" not in self.data: return "unknown" if self.data["Stack"] == "corosync 2+": return "crm-corosync" LogFactory().log("Unknown stack: %s" % self["stack"]) raise ValueError("Unknown stack: %s" % self["stack"]) def _detect_systemd(self): """Detect whether systemd is in use on the target node.""" if "have_systemd" not in self.data: (rc, _) = self._rsh(self._target, "systemctl list-units", verbose=0) self["have_systemd"] = rc == 0 def _detect_syslog(self): """Detect the syslog variant in use on the target node (if any).""" if "syslogd" in self.data: return if self["have_systemd"]: # Systemd (_, lines) = self._rsh(self._target, r"systemctl list-units | grep syslog.*\.service.*active.*running | sed 's:.service.*::'", verbose=1) else: # SYS-V (_, lines) = self._rsh(self._target, "chkconfig --list | grep syslog.*on | awk '{print $1}' | head -n 1", verbose=1) with suppress(IndexError): self["syslogd"] = lines[0].strip() def disable_service(self, node, service): """Disable the given service on the given node.""" if self["have_systemd"]: # Systemd (rc, _) = self._rsh(node, "systemctl disable %s" % service) return rc # SYS-V (rc, _) = self._rsh(node, "chkconfig %s off" % service) return rc def enable_service(self, node, service): """Enable the given service on the given node.""" if self["have_systemd"]: # 
Systemd (rc, _) = self._rsh(node, "systemctl enable %s" % service) return rc # SYS-V (rc, _) = self._rsh(node, "chkconfig %s on" % service) return rc def service_is_enabled(self, node, service): """Return True if the given service is enabled on the given node.""" if self["have_systemd"]: # Systemd # With "systemctl is-enabled", we should check if the service is # explicitly "enabled" instead of the return code. For example it returns # 0 if the service is "static" or "indirect", but they don't really count # as "enabled". (rc, _) = self._rsh(node, "systemctl is-enabled %s | grep enabled" % service) return rc == 0 # SYS-V (rc, _) = self._rsh(node, "chkconfig --list | grep -e %s.*on" % service) return rc == 0 def _detect_at_boot(self): """Detect if the cluster starts at boot.""" if "at-boot" not in self.data: self["at-boot"] = self.service_is_enabled(self._target, "corosync") \ or self.service_is_enabled(self._target, "pacemaker") def _detect_ip_offset(self): """Detect the offset for IPaddr resources.""" if self["CIBResource"] and "IPBase" not in self.data: (_, lines) = self._rsh(self._target, "ip addr | grep inet | grep -v -e link -e inet6 -e '/32' -e ' lo' | awk '{print $2}'", verbose=0) network = lines[0].strip() (_, lines) = self._rsh(self._target, "nmap -sn -n %s | grep 'scan report' | awk '{print $NF}' | sed 's:(::' | sed 's:)::' | sort -V | tail -n 1" % network, verbose=0) try: self["IPBase"] = lines[0].strip() except (IndexError, TypeError): self["IPBase"] = None if not self["IPBase"]: self["IPBase"] = " fe80::1234:56:7890:1000" self._logger.log("Could not determine an offset for IPaddr resources. Perhaps nmap is not installed on the nodes.") self._logger.log("Defaulting to '%s', use --test-ip-base to override" % self["IPBase"]) return # pylint thinks self["IPBase"] is a list, not a string, which causes it # to error out because a list doesn't have split(). # pylint: disable=no-member if int(self["IPBase"].split('.')[3]) >= 240: self._logger.log("Could not determine an offset for IPaddr resources. Upper bound is too high: %s %s" % (self["IPBase"], self["IPBase"].split('.')[3])) self["IPBase"] = " fe80::1234:56:7890:1000" self._logger.log("Defaulting to '%s', use --test-ip-base to override" % self["IPBase"]) def _filter_nodes(self): """ Filter the list of cluster nodes. If --limit-nodes is given, keep that many nodes from the front of the list of cluster nodes and drop the rest. """ if self["node-limit"] > 0: if len(self["nodes"]) > self["node-limit"]: # pylint thinks self["node-limit"] is a list even though we initialize # it as an int in __init__ and treat it as an int everywhere. 
# pylint: disable=bad-string-format-type self._logger.log("Limiting the number of nodes configured=%d (max=%d)" % (len(self["nodes"]), self["node-limit"])) while len(self["nodes"]) > self["node-limit"]: self["nodes"].pop(len(self["nodes"]) - 1) def _validate(self): """Check that we were given all required command line parameters.""" if not self["nodes"]: raise ValueError("No nodes specified!") def _discover(self): """Probe cluster nodes to figure out how to log and manage services.""" self._target = random.Random().choice(self["nodes"]) exerciser = socket.gethostname() # Use the IP where possible to avoid name lookup failures for ip in socket.gethostbyname_ex(exerciser)[2]: if ip != "127.0.0.1": exerciser = ip break self["cts-exerciser"] = exerciser self._detect_systemd() self._detect_syslog() self._detect_at_boot() self._detect_ip_offset() def _parse_args(self, argv): """ Parse and validate command line parameters. Set the appropriate values in the environment dictionary. If argv is None, use sys.argv instead. """ if not argv: argv = sys.argv[1:] parser = argparse.ArgumentParser(epilog="%s -g virt1 -r --stonith ssh --schema pacemaker-2.0 500" % sys.argv[0]) grp1 = parser.add_argument_group("Common options") grp1.add_argument("-g", "--dsh-group", "--group", metavar="GROUP", dest="group", help="Use the nodes listed in the named DSH group (~/.dsh/groups/$name)") grp1.add_argument("-l", "--limit-nodes", type=int, default=0, metavar="MAX", help="Only use the first MAX cluster nodes supplied with --nodes") grp1.add_argument("--benchmark", action="store_true", help="Add timing information") grp1.add_argument("--list", "--list-tests", action="store_true", dest="list_tests", help="List the valid tests") grp1.add_argument("--nodes", metavar="NODES", help="List of cluster nodes separated by whitespace") grp1.add_argument("--stack", default="corosync", metavar="STACK", help="Which cluster stack is installed") grp2 = parser.add_argument_group("Options that CTS will usually auto-detect correctly") grp2.add_argument("-L", "--logfile", metavar="PATH", help="Where to look for logs from cluster nodes (or 'journal' for systemd journal)") grp2.add_argument("--at-boot", "--cluster-starts-at-boot", choices=["1", "0", "yes", "no"], help="Does the cluster software start at boot time?") grp2.add_argument("--facility", "--syslog-facility", default="daemon", metavar="NAME", help="Which syslog facility to log to") grp2.add_argument("--ip", "--test-ip-base", metavar="IP", help="Offset for generated IP address resources") grp3 = parser.add_argument_group("Options for release testing") grp3.add_argument("-r", "--populate-resources", action="store_true", help="Generate a sample configuration") grp3.add_argument("--choose", metavar="NAME", help="Run only the named tests, separated by whitespace") grp3.add_argument("--fencing", "--stonith", choices=["1", "0", "yes", "no", "lha", "openstack", "rhcs", "rhevm", "scsi", "ssh", "virt", "xvm"], default="1", help="What fencing agent to use") grp3.add_argument("--once", action="store_true", help="Run all valid tests once") grp4 = parser.add_argument_group("Additional (less common) options") grp4.add_argument("-c", "--clobber-cib", action="store_true", help="Erase any existing configuration") grp4.add_argument("-y", "--yes", action="store_true", dest="always_continue", help="Continue to run whenever prompted") grp4.add_argument("--boot", action="store_true", help="") grp4.add_argument("--cib-filename", metavar="PATH", help="Install the given CIB file to the cluster") 
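# A minimal sketch of the getaddrinfo()-based lookup suggested by the @TODO
# comments in __setitem__() and _discover() above. The helper name
# (_node_resolves) is hypothetical and not part of CTS:
#
#     def _node_resolves(name):
#         """Return True if name resolves via IPv4 or IPv6."""
#         try:
#             socket.getaddrinfo(name.strip(), None)
#             return True
#         except socket.gaierror:
#             return False
#
# Unlike gethostbyname_ex(), getaddrinfo() also returns AAAA records, so
# IPv6-only node names would validate correctly.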
grp4.add_argument("--experimental-tests", action="store_true", help="Include experimental tests") grp4.add_argument("--loop-minutes", type=int, default=60, help="") grp4.add_argument("--no-loop-tests", action="store_true", help="Don't run looping/time-based tests") grp4.add_argument("--no-unsafe-tests", action="store_true", help="Don't run tests that are unsafe for use with ocfs2/drbd") grp4.add_argument("--notification-agent", metavar="PATH", default="/var/lib/pacemaker/notify.sh", help="Script to configure for Pacemaker alerts") grp4.add_argument("--notification-recipient", metavar="R", default="/var/lib/pacemaker/notify.log", help="Recipient to pass to alert script") grp4.add_argument("--oprofile", metavar="NODES", help="List of cluster nodes to run oprofile on") grp4.add_argument("--outputfile", metavar="PATH", help="Location to write logs to") grp4.add_argument("--qarsh", action="store_true", help="Use QARSH to access nodes instead of SSH") grp4.add_argument("--schema", metavar="SCHEMA", default="pacemaker-%s" % BuildOptions.CIB_SCHEMA_VERSION, help="Create a CIB conforming to the given schema") grp4.add_argument("--seed", metavar="SEED", help="Use the given string as the random number seed") grp4.add_argument("--set", action="append", metavar="ARG", default=[], help="Set key=value pairs (can be specified multiple times)") grp4.add_argument("--stonith-args", metavar="ARGS", default="hostlist=all,livedangerously=yes", help="") grp4.add_argument("--stonith-type", metavar="TYPE", default="external/ssh", help="") grp4.add_argument("--trunc", action="store_true", dest="truncate", help="Truncate log file before starting") grp4.add_argument("--valgrind-procs", metavar="PROCS", default="pacemaker-attrd pacemaker-based pacemaker-controld pacemaker-execd pacemaker-fenced pacemaker-schedulerd", help="Run valgrind against the given space-separated list of processes") grp4.add_argument("--valgrind-tests", action="store_true", help="Include tests using valgrind") grp4.add_argument("--warn-inactive", action="store_true", help="Warn if a resource is assigned to an inactive node") parser.add_argument("iterations", nargs='?', type=int, default=1, help="Number of tests to run") args = parser.parse_args(args=argv) # Set values on this object based on what happened with command line # processing. This has to be done in several blocks. # These values can always be set. They get a default from the add_argument # calls, only do one thing, and they do not have any side effects. self["ClobberCIB"] = args.clobber_cib self["ListTests"] = args.list_tests self["Schema"] = args.schema self["Stack"] = args.stack self["SyslogFacility"] = args.facility self["TruncateLog"] = args.truncate self["at-boot"] = args.at_boot in ["1", "yes"] self["benchmark"] = args.benchmark self["continue"] = args.always_continue self["experimental-tests"] = args.experimental_tests self["iterations"] = args.iterations self["loop-minutes"] = args.loop_minutes self["loop-tests"] = not args.no_loop_tests self["notification-agent"] = args.notification_agent self["notification-recipient"] = args.notification_recipient self["node-limit"] = args.limit_nodes self["stonith-params"] = args.stonith_args self["stonith-type"] = args.stonith_type self["unsafe-tests"] = not args.no_unsafe_tests self["valgrind-procs"] = args.valgrind_procs self["valgrind-tests"] = args.valgrind_tests self["warn-inactive"] = args.warn_inactive # Nodes and groups are mutually exclusive, so their defaults cannot be # set in their add_argument calls. 
Additionally, groups does more than # just set a value. Here, set nodes first and then if a group is # specified, override the previous nodes value. if args.nodes: self["nodes"] = args.nodes.split(" ") else: self["nodes"] = [] if args.group: self["OutputFile"] = "%s/cluster-%s.log" % (os.environ['HOME'], args.dsh_group) LogFactory().add_file(self["OutputFile"], "CTS") dsh_file = "%s/.dsh/group/%s" % (os.environ['HOME'], args.dsh_group) if os.path.isfile(dsh_file): self["nodes"] = [] with open(dsh_file, "r", encoding="utf-8") as f: for line in f: stripped = line.strip() if not stripped.startswith('#'): self["nodes"].append(stripped) else: print("Unknown DSH group: %s" % args.dsh_group) # Everything else either can't have a default set in an add_argument # call (likely because we don't want to always have a value set for it) # or it does something fancier than just set a single value. However, # order does not matter for these as long as the user doesn't provide # conflicting arguments on the command line. So just do Everything # alphabetically. if args.boot: self["scenario"] = "boot" if args.cib_filename: self["CIBfilename"] = args.cib_filename else: self["CIBfilename"] = None if args.choose: self["scenario"] = "sequence" self["tests"].extend(args.choose.split()) self["iterations"] = len(self["tests"]) if args.fencing: if args.fencing in ["0", "no"]: self["DoFencing"] = False else: self["DoFencing"] = True if args.fencing in ["rhcs", "virt", "xvm"]: self["stonith-type"] = "fence_xvm" elif args.fencing == "scsi": self["stonith-type"] = "fence_scsi" elif args.fencing in ["lha", "ssh"]: self["stonith-params"] = "hostlist=all,livedangerously=yes" self["stonith-type"] = "external/ssh" elif args.fencing == "openstack": self["stonith-type"] = "fence_openstack" print("Obtaining OpenStack credentials from the current environment") self["stonith-params"] = "region=%s,tenant=%s,auth=%s,user=%s,password=%s" % ( os.environ['OS_REGION_NAME'], os.environ['OS_TENANT_NAME'], os.environ['OS_AUTH_URL'], os.environ['OS_USERNAME'], os.environ['OS_PASSWORD'] ) elif args.fencing == "rhevm": self["stonith-type"] = "fence_rhevm" print("Obtaining RHEV-M credentials from the current environment") self["stonith-params"] = "login=%s,passwd=%s,ipaddr=%s,ipport=%s,ssl=1,shell_timeout=10" % ( os.environ['RHEVM_USERNAME'], os.environ['RHEVM_PASSWORD'], os.environ['RHEVM_SERVER'], os.environ['RHEVM_PORT'], ) if args.ip: self["CIBResource"] = True self["ClobberCIB"] = True self["IPBase"] = args.ip if args.logfile == "journal": self["LogAuditDisabled"] = True self["log_kind"] = LogKind.JOURNAL elif args.logfile: self["LogAuditDisabled"] = True self["LogFileName"] = args.logfile self["log_kind"] = LogKind.REMOTE_FILE else: # We can't set this as the default on the parser.add_argument call # for this option because then args.logfile will be set, which means # the above branch will be taken and those other values will also be # set. 
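# In short, the three logfile cases above work out as follows (invocations are
# illustrative only, assuming the cts-lab entry point):
#
#     cts-lab --nodes "n1 n2 n3" -L journal          -> read the systemd journal
#     cts-lab --nodes "n1 n2 n3" -L /var/log/ha.log  -> watch that file on each node
#     cts-lab --nodes "n1 n2 n3"                     -> fall back to the default below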
self["LogFileName"] = "/var/log/messages" if args.once: self["scenario"] = "all-once" if args.oprofile: self["oprofile"] = args.oprofile.split(" ") else: self["oprofile"] = [] if args.outputfile: self["OutputFile"] = args.outputfile LogFactory().add_file(self["OutputFile"]) if args.populate_resources: self["CIBResource"] = True self["ClobberCIB"] = True if args.qarsh: self._rsh.enable_qarsh() for kv in args.set: (name, value) = kv.split("=") self[name] = value print("Setting %s = %s" % (name, value)) class EnvFactory: """A class for constructing a singleton instance of an Environment object.""" instance = None # pylint: disable=invalid-name def getInstance(self, args=None): """ Return the previously created instance of Environment. If no instance exists, create a new instance and return that. """ if not EnvFactory.instance: EnvFactory.instance = Environment(args) return EnvFactory.instance diff --git a/python/pacemaker/_cts/remote.py b/python/pacemaker/_cts/remote.py index ba5b878f13..ac70f8aeb8 100644 --- a/python/pacemaker/_cts/remote.py +++ b/python/pacemaker/_cts/remote.py @@ -1,279 +1,281 @@ """Remote command runner for Pacemaker's Cluster Test Suite (CTS).""" __all__ = ["RemoteExec", "RemoteFactory"] __copyright__ = "Copyright 2014-2024 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import re import os from subprocess import Popen, PIPE from threading import Thread from pacemaker._cts.logging import LogFactory def convert2string(lines): """ Convert byte strings to UTF-8 strings. Lists of byte strings are converted to a list of UTF-8 strings. All other text formats are passed through. """ if isinstance(lines, bytes): return lines.decode("utf-8") if isinstance(lines, list): lst = [] for line in lines: if isinstance(line, bytes): line = line.decode("utf-8") lst.append(line) return lst return lines class AsyncCmd(Thread): """A class for doing the hard work of running a command on another machine.""" def __init__(self, node, command, proc=None, delegate=None): """ Create a new AsyncCmd instance. Arguments: node -- The remote machine to run on command -- The ssh command string to use for remote execution proc -- If not None, a process object previously created with Popen. Instead of spawning a new process, we will then wait on this process to finish and handle its output. 
delegate -- When the command completes, call the async_complete method on this object """ self._command = command self._delegate = delegate self._logger = LogFactory() self._node = node self._proc = proc Thread.__init__(self) def run(self): """Run the previously instantiated AsyncCmd object.""" out = None err = None if not self._proc: # pylint: disable=consider-using-with self._proc = Popen(self._command, stdout=PIPE, stderr=PIPE, close_fds=True, shell=True) self._logger.debug("cmd: async: target=%s, pid=%d: %s" % (self._node, self._proc.pid, self._command)) self._proc.wait() if self._delegate: self._logger.debug("cmd: pid %d returned %d to %r" % (self._proc.pid, self._proc.returncode, self._delegate)) else: self._logger.debug("cmd: pid %d returned %d" % (self._proc.pid, self._proc.returncode)) if self._proc.stderr: err = self._proc.stderr.readlines() self._proc.stderr.close() for line in err: self._logger.debug("cmd: stderr[%d]: %s" % (self._proc.pid, line)) err = convert2string(err) if self._proc.stdout: out = self._proc.stdout.readlines() self._proc.stdout.close() out = convert2string(out) if self._delegate: self._delegate.async_complete(self._proc.pid, self._proc.returncode, out, err) class RemoteExec: """ An abstract class for remote execution. It runs a command on another machine using ssh and scp. """ def __init__(self, command, cp_command, silent=False): """ Create a new RemoteExec instance. Arguments: command -- The ssh command string to use for remote execution cp_command -- The scp command string to use for copying files silent -- Should we log command status? """ self._command = command self._cp_command = cp_command self._logger = LogFactory() self._silent = silent self._our_node = os.uname()[1].lower() def _fixcmd(self, cmd): """Perform shell escapes on certain characters in the input cmd string.""" return re.sub("\'", "'\\''", cmd) def _cmd(self, args): """Given a list of arguments, return the string that will be run on the remote system.""" sysname = args[0] command = args[1] if sysname is None or sysname.lower() in [self._our_node, "localhost"]: ret = command else: ret = "%s %s '%s'" % (self._command, sysname, self._fixcmd(command)) return ret def _log(self, args): """Log a message.""" if not self._silent: self._logger.log(args) def _debug(self, args): """Log a message at the debug level.""" if not self._silent: self._logger.debug(args) def call_async(self, node, command, delegate=None): """ Run the given command on the given remote system and do not wait for it to complete. Arguments: node -- The remote machine to run on command -- The command to run, as a string delegate -- When the command completes, call the async_complete method on this object Returns the running process object. """ aproc = AsyncCmd(node, self._cmd([node, command]), delegate=delegate) aproc.start() return aproc def __call__(self, node, command, synchronous=True, verbose=2): """ Run the given command on the given remote system. If you call this class like a function, this is what gets called. It's approximately the same as a system() call on the remote machine. Arguments: node -- The remote machine to run on command -- The command to run, as a string synchronous -- Should we wait for the command to complete? verbose -- If 0, do not lo:g anything. If 1, log the command and its return code but not its output. If 2, additionally log command output. Returns a tuple of (return code, command output). 
""" rc = 0 result = None # pylint: disable=consider-using-with proc = Popen(self._cmd([node, command]), stdout=PIPE, stderr=PIPE, close_fds=True, shell=True) if not synchronous and proc.pid > 0 and not self._silent: aproc = AsyncCmd(node, command, proc=proc) aproc.start() return (rc, result) if proc.stdout: result = proc.stdout.readlines() proc.stdout.close() else: self._log("No stdout stream") rc = proc.wait() if verbose > 0: self._debug("cmd: target=%s, rc=%d: %s" % (node, rc, command)) result = convert2string(result) if proc.stderr: errors = proc.stderr.readlines() proc.stderr.close() for err in errors: self._debug("cmd: stderr: %s" % err) if verbose == 2: for line in result: self._debug("cmd: stdout: %s" % line) return (rc, result) def copy(self, source, target, silent=False): """ Perform a copy of the source file to the remote target. This function uses the cp_command provided when the RemoteExec object was created. Returns the return code of the cp_command. """ + # @TODO Use subprocess module with argument array instead + # (self._cp_command should be an array too) cmd = "%s '%s' '%s'" % (self._cp_command, source, target) rc = os.system(cmd) if not silent: self._debug("cmd: rc=%d: %s" % (rc, cmd)) return rc def exists_on_all(self, filename, hosts): """Return True if specified file exists on all specified hosts.""" for host in hosts: rc = self(host, "test -r %s" % filename) if rc != 0: return False return True class RemoteFactory: """A class for constructing a singleton instance of a RemoteExec object.""" # Class variables # -n: no stdin, -x: no X11, # -o ServerAliveInterval=5: disconnect after 3*5s if the server # stops responding command = ("ssh -l root -n -x -o ServerAliveInterval=5 " "-o ConnectTimeout=10 -o TCPKeepAlive=yes " "-o ServerAliveCountMax=3 ") # -B: batch mode, -q: no stats (quiet) cp_command = "scp -B -q" instance = None # pylint: disable=invalid-name def getInstance(self): """ Return the previously created instance of RemoteExec. If no instance exists, create one and then return that. """ if not RemoteFactory.instance: RemoteFactory.instance = RemoteExec(RemoteFactory.command, RemoteFactory.cp_command, False) return RemoteFactory.instance def enable_qarsh(self): """Enable the QA remote shell.""" # http://nstraz.wordpress.com/2008/12/03/introducing-qarsh/ print("Using QARSH for connections to cluster nodes") RemoteFactory.command = "qarsh -t 300 -l root" RemoteFactory.cp_command = "qacp -q" diff --git a/python/pacemaker/_cts/tests/componentfail.py b/python/pacemaker/_cts/tests/componentfail.py index 0832407fee..08fae3add8 100644 --- a/python/pacemaker/_cts/tests/componentfail.py +++ b/python/pacemaker/_cts/tests/componentfail.py @@ -1,162 +1,166 @@ """Kill a pacemaker daemon and test how the cluster recovers.""" __all__ = ["ComponentFail"] __copyright__ = "Copyright 2000-2024 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import re from pacemaker._cts.audits import AuditResource from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. 
# pylint: disable=not-callable # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object +# @TODO Separate this into a separate test for each component, so the patterns +# can be made specific to each component, investigating failures is a little +# easier, and specific testing can be done for each component (for example, +# set attributes before and after killing pacemaker-attrd and check values). class ComponentFail(CTSTest): """Kill a random pacemaker daemon and wait for the cluster to recover.""" def __init__(self, cm): """ Create a new ComponentFail instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.is_unsafe = True self.name = "ComponentFail" self._complist = cm.components self._okerrpatterns = [] self._patterns = [] self._startall = SimulStartLite(cm) def __call__(self, node): """Perform this test.""" self.incr("calls") self._patterns = [] self._okerrpatterns = [] # start all nodes ret = self._startall(None) if not ret: return self.failure("Setup failed") if not self._cm.cluster_stable(self._env["StableTime"]): return self.failure("Setup failed - unstable") node_is_dc = self._cm.is_node_dc(node, None) # select a component to kill chosen = self._env.random_gen.choice(self._complist) while chosen.dc_only and not node_is_dc: chosen = self._env.random_gen.choice(self._complist) self.debug("...component %s (dc=%s)" % (chosen.name, node_is_dc)) self.incr(chosen.name) if chosen.name != "corosync": self._patterns.extend([ self.templates["Pat:ChildKilled"] % (node, chosen.name), self.templates["Pat:ChildRespawn"] % (node, chosen.name), ]) self._patterns.extend(chosen.pats) if node_is_dc: self._patterns.extend(chosen.dc_pats) # @TODO this should be a flag in the Component if chosen.name in ["corosync", "pacemaker-based", "pacemaker-fenced"]: # Ignore actions for fence devices if fencer will respawn # (their registration will be lost, and probes will fail) self._okerrpatterns = [ self.templates["Pat:Fencing_active"], ] (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) for line in lines: if re.search("^Resource", line): r = AuditResource(self._cm, line) if r.rclass == "stonith": self._okerrpatterns.extend([ self.templates["Pat:Fencing_recover"] % r.id, self.templates["Pat:Fencing_probe"] % r.id, ]) # supply a copy so self.patterns doesn't end up empty tmp_pats = self._patterns.copy() self._patterns.extend(chosen.badnews_ignore) # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status stonith_pats = [ self.templates["Pat:Fencing_ok"] % node ] stonith = self.create_watch(stonith_pats, 0) stonith.set_watch() # set the watch for stable watch = self.create_watch( tmp_pats, self._env["DeadTime"] + self._env["StableTime"] + self._env["StartTime"]) watch.set_watch() # kill the component chosen.kill(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() self.debug("Waiting for any fenced node to come back up") self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) self.debug("Waiting for the cluster to re-stabilize with all nodes") self._cm.cluster_stable(self._env["StartTime"]) self.debug("Checking if %s was shot" % node) shot = stonith.look(60) if shot: self.debug("Found: %r" % shot) self._okerrpatterns.append(self.templates["Pat:Fencing_start"] % node) if not self._env["at-boot"]: self._cm.expected_status[node] = "down" # If fencing occurred, chances are many (if not all) the expected logs # will not be sent - or will be lost when the node reboots 
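            # In that case, skip the log-pattern checks below and treat the test as passed.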
            return self.success()

        # check for logs indicating a graceful recovery
        matched = watch.look_for_all(allow_multiple_matches=True)
        if watch.unmatched:
            self._logger.log("Patterns not found: %r" % watch.unmatched)

        self.debug("Waiting for the cluster to re-stabilize with all nodes")
        is_stable = self._cm.cluster_stable(self._env["StartTime"])

        if not matched:
            return self.failure("Didn't find all expected %s patterns" % chosen.name)

        if not is_stable:
            return self.failure("Cluster did not become stable after killing %s" % chosen.name)

        return self.success()

    @property
    def errors_to_ignore(self):
        """Return a list of errors which should be ignored."""
        # Note that okerrpatterns refers to the last time we ran this test
        # The good news is that this works fine for us...
        self._okerrpatterns.extend(self._patterns)
        return self._okerrpatterns
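# A minimal sketch of the per-component split suggested by the @TODO at the top
# of this file. The class name, and restricting _complist to one component, are
# hypothetical illustrations rather than existing CTS code:
#
#     class AttrdFail(ComponentFail):
#         """Kill pacemaker-attrd specifically and verify recovery."""
#
#         def __init__(self, cm):
#             ComponentFail.__init__(self, cm)
#             self.name = "AttrdFail"
#             # Only ever pick pacemaker-attrd from the component list
#             self._complist = [c for c in cm.components if c.name == "pacemaker-attrd"]
#
# A component-specific subclass like this could then carry log patterns (and,
# for pacemaker-attrd, before/after attribute checks) tailored to that one
# daemon, as the @TODO suggests.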