diff --git a/cts/cts-fencing.in b/cts/cts-fencing.in index 034304a974..30fb39f298 100644 --- a/cts/cts-fencing.in +++ b/cts/cts-fencing.in @@ -1,954 +1,925 @@ #!@PYTHON@ """Regression tests for Pacemaker's fencer.""" # pylint doesn't like the module name "cts-fencing" which is an invalid complaint for this file # but probably something we want to continue warning about elsewhere # pylint: disable=invalid-name # pacemaker imports need to come after we modify sys.path, which pylint will complain about. # pylint: disable=wrong-import-position __copyright__ = "Copyright 2012-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import argparse import os import sys import subprocess import tempfile # These imports allow running from a source checkout after running `make`. # Note that while this doesn't necessarily mean it will successfully run tests, # but being able to see --help output can be useful. if os.path.exists("@abs_top_srcdir@/python"): sys.path.insert(0, "@abs_top_srcdir@/python") # pylint: disable=comparison-of-constants,comparison-with-itself,condition-evals-to-constant if os.path.exists("@abs_top_builddir@/python") and "@abs_top_builddir@" != "@abs_top_srcdir@": sys.path.insert(0, "@abs_top_builddir@/python") from pacemaker.buildoptions import BuildOptions from pacemaker.exitstatus import ExitStatus from pacemaker._cts.corosync import Corosync, localname from pacemaker._cts.process import killall, exit_if_proc_running from pacemaker._cts.test import Test, Tests TEST_DIR = sys.path[0] def update_path(): """Set the PATH environment variable appropriately for the tests.""" new_path = os.environ['PATH'] if os.path.exists(f"{TEST_DIR}/cts-fencing.in"): # pylint: disable=protected-access print(f"Running tests from the source tree: {BuildOptions._BUILD_DIR} ({TEST_DIR})") # For pacemaker-fenced and cts-fence-helper new_path = f"{BuildOptions._BUILD_DIR}/daemons/fenced:{new_path}" 
def build_api_sanity_tests(self):
    """
    Register the client API sanity tests.

    Two tests are created through self.new_test(); each one runs
    cts-fence-helper exactly once with validation disabled:

    low_level_api_test          -- helper invoked with -t
    low_level_api_mainloop_test -- helper invoked with -m

    When self.verbose is set, -V is appended to the helper's arguments.
    """
    # An empty suffix still leaves the separating space in the argument
    # string (e.g. "-t "), exactly as the helper has always been invoked.
    verbosity = "-V" if self.verbose else ""

    # (test name, test description, helper mode flag)
    helper_cases = (
        ("low_level_api_test", "Sanity-test client API", "-t"),
        ("low_level_api_mainloop_test", "Sanity-test client API using mainloop", "-m"),
    )

    for case_name, case_description, mode_flag in helper_cases:
        helper_test = self.new_test(case_name, case_description)
        helper_test.add_cmd("cts-fence-helper", args=f"{mode_flag} {verbosity}", validate=False)
test = self.new_test("custom_merge_single", "Verify overlapping identical fencing operations are merged, no fencing levels used") test.add_cmd("stonith_admin", args="--output-as=xml -R false1 -a fence_dummy -o mode=fail -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -R false2 -a fence_dummy -o mode=fail -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10") # one merger will happen test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") # the pattern below signifies that both the original and duplicate operation completed test.add_log_pattern("Operation 'off' targeting node3 by ") test.add_log_pattern("Operation 'off' targeting node3 by ") # Test that multiple mergers occur test = self.new_test("custom_merge_multiple", "Verify multiple overlapping identical fencing operations are merged") test.add_cmd("stonith_admin", args="--output-as=xml -R false1 -a fence_dummy -o mode=fail -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o delay=2 -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -R false2 -a fence_dummy -o mode=fail -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10") # 4 mergers should occur test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") 
test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") # the pattern below signifies that both the original and duplicate operation completed test.add_log_pattern("Operation 'off' targeting node3 by ") test.add_log_pattern("Operation 'off' targeting node3 by ") test.add_log_pattern("Operation 'off' targeting node3 by ") test.add_log_pattern("Operation 'off' targeting node3 by ") test.add_log_pattern("Operation 'off' targeting node3 by ") # Test that multiple mergers occur with topologies used test = self.new_test("custom_merge_with_topology", "Verify multiple overlapping identical fencing operations are merged with fencing levels") test.add_cmd("stonith_admin", args="--output-as=xml -R false1 -a fence_dummy -o mode=fail -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -R false2 -a fence_dummy -o mode=fail -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false2") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true1") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10") # 4 mergers should occur test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_log_pattern("Merging 
def build_fence_no_merge_tests(self):
    """
    Register the test checking that differing fence operations stay separate.

    A single test, custom_no_merge, is created.  It registers three
    fence_dummy devices, places them in two topology levels for node3, then
    requests fencing of node2 (without waiting) followed by fencing of node3.
    The test requires that no merge message for node3 shows up in the log.
    """
    no_merge = self.new_test("custom_no_merge", "Verify differing fencing operations are not merged")

    # NOTE(review): other tests in this file quote multi-host lists
    # ("pcmk_host_list=a b").  The value here is unquoted, so if the argument
    # string is split shell-style, node2 presumably ends up as a separate
    # argument rather than part of the host list -- confirm the intent
    # before changing it.
    device_registrations = (
        "--output-as=xml -R false1 -a fence_dummy -o mode=fail -o pcmk_host_list=node3 node2",
        "--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3 node2",
        "--output-as=xml -R false2 -a fence_dummy -o mode=fail -o pcmk_host_list=node3 node2",
    )

    # Level 1 holds both failing devices, level 2 the passing one.
    level_assignments = (
        "--output-as=xml -r node3 -i 1 -v false1",
        "--output-as=xml -r node3 -i 1 -v false2",
        "--output-as=xml -r node3 -i 2 -v true1",
    )

    for admin_args in device_registrations + level_assignments:
        no_merge.add_cmd("stonith_admin", args=admin_args)

    # The node2 request is started without waiting so that it overlaps with
    # the node3 request that follows.
    no_merge.add_cmd("stonith_admin", args="--output-as=xml -F node2 -t 10", no_wait=True)
    no_merge.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10")

    no_merge.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client", negative=True)
fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R false3 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 2", expected_exitcode=ExitStatus.TIMEOUT) test.add_log_pattern("Total timeout set to 7s") test.add_log_pattern("targeting node3 using false1 returned ") test.add_log_pattern("targeting node3 using false2 returned ") test.add_log_pattern("targeting node3 using false3 returned ") # test what happens when multiple devices can fence a node, but the first device fails test = self.new_test("fence_device_failure_rollover", "Verify that when one fence device fails for a node, the others are tried") test.add_cmd("stonith_admin", args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") test.add_log_pattern("Total timeout set to 18s") # test what happens when we try to use a missing fence-agent test = self.new_test("fence_missing_agent", "Verify proper error-handling when using a non-existent fence-agent") test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_missing -o mode=pass -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -R true2 -a fence_dummy -o mode=pass -o pcmk_host_list=node2") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5", expected_exitcode=ExitStatus.NOSUCH) test.add_cmd("stonith_admin", args="--output-as=xml -F node2 -t 5") # simple topology test for 
one device test = self.new_test("topology_simple", "Verify all fencing devices at a level are used") test.add_cmd("stonith_admin", args='--output-as=xml -R true -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") test.add_log_pattern("Total timeout set to 6s") test.add_log_pattern("targeting node3 using true returned 0") # add topology, delete topology, verify fencing still works test = self.new_test("topology_add_remove", "Verify fencing occurrs after all topology levels are removed") test.add_cmd("stonith_admin", args='--output-as=xml -R true -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true") test.add_cmd("stonith_admin", args="--output-as=xml -d node3 -i 1") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") test.add_log_pattern("Total timeout set to 6s") test.add_log_pattern("targeting node3 using true returned 0") # test what happens when the first fencing level has multiple devices test = self.new_test("topology_device_fails", "Verify if one device in a level fails, the other is tried") test.add_cmd("stonith_admin", args='--output-as=xml -R false -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R true -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 20") test.add_log_pattern("Total timeout set to 48s") test.add_log_pattern("targeting node3 using false returned 1") test.add_log_pattern("targeting node3 using true returned 0") # test what happens when the first fencing level fails test = 
self.new_test("topology_multi_level_fails", "Verify if one level fails, the next leve is tried") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R true4 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true1") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true2") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v false2") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true3") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true4") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 3") test.add_log_pattern("Total timeout set to 21s") test.add_log_pattern("targeting node3 using false1 returned 1") test.add_log_pattern("targeting node3 using false2 returned 1") test.add_log_pattern("targeting node3 using true3 returned 0") test.add_log_pattern("targeting node3 using true4 returned 0") # test what happens when the first fencing level had devices that no one has registered test = self.new_test("topology_missing_devices", "Verify topology can continue with missing devices") test.add_cmd("stonith_admin", args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o 
"pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R true4 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true1") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true2") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v false2") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true3") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true4") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") # Test what happens if multiple fencing levels are defined, and then the first one is removed test = self.new_test("topology_level_removal", "Verify level removal works") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R true4 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") 
test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true1") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true2") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v false2") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true3") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true4") # Now remove level 2, verify none of the devices in level two are hit test.add_cmd("stonith_admin", args="--output-as=xml -d node3 -i 2") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 20") test.add_log_pattern("Total timeout set to 96s") test.add_log_pattern("targeting node3 using false1 returned 1") test.add_log_pattern("targeting node3 using false2 returned ", negative=True) test.add_log_pattern("targeting node3 using true3 returned 0") test.add_log_pattern("targeting node3 using true4 returned 0") # Test targeting a topology level by node name pattern test = self.new_test("topology_level_pattern", "Verify targeting topology by node name pattern works") test.add_cmd("stonith_admin", args='--output-as=xml -R true -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args="--output-as=xml -r '@node.*' -i 1 -v true") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") test.add_log_pattern("targeting node3 using true returned 0") # test allowing commas and semicolons as delimiters in pcmk_host_list test = self.new_test("host_list_delimiters", "Verify commas and semicolons can be used as pcmk_host_list delimiters") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1,node2,node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=pcmk1;pcmk2;pcmk3"') test.add_cmd("stonith_admin", args="stonith_admin --output-as=xml -F node2 -t 5") test.add_cmd("stonith_admin", args="stonith_admin --output-as=xml 
-F pcmk3 -t 5") test.add_log_pattern("targeting node2 using true1 returned 0") test.add_log_pattern("targeting pcmk3 using true2 returned 0") # test the stonith builds the correct list of devices that can fence a node test = self.new_test("list_devices", "Verify list of devices that can fence a node is correct") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args="--output-as=xml -l node1 -V", stdout_match="true2", stdout_no_match="true1") test.add_cmd("stonith_admin", args="--output-as=xml -l node1 -V", stdout_match="true3", stdout_no_match="true1") # simple test of device monitor test = self.new_test("monitor", "Verify device is reachable") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node3"') test.add_cmd("stonith_admin", args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node3"') test.add_cmd("stonith_admin", args="--output-as=xml -Q true1") test.add_cmd("stonith_admin", args="--output-as=xml -Q false1") test.add_cmd("stonith_admin", args="--output-as=xml -Q true2", expected_exitcode=ExitStatus.NOSUCH) # Verify monitor occurs for duration of timeout period on failure test = self.new_test("monitor_timeout", "Verify monitor uses duration of timeout period given") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=fail -o monitor_mode=fail -o pcmk_host_list=node3') test.add_cmd("stonith_admin", args="--output-as=xml -Q true1 -t 5", expected_exitcode=ExitStatus.ERROR) test.add_log_pattern("Attempt 2 to execute") # Verify monitor occurs for duration of timeout period on failure, but stops at 
max retries test = self.new_test("monitor_timeout_max_retries", "Verify monitor retries until max retry value or timeout is hit") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=fail -o monitor_mode=fail -o pcmk_host_list=node3') test.add_cmd("stonith_admin", args="--output-as=xml -Q true1 -t 15", expected_exitcode=ExitStatus.ERROR) test.add_log_pattern("Attempted to execute agent fence_dummy (list) the maximum number of times") # simple register test test = self.new_test("register", "Verify devices can be registered and un-registered") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3') test.add_cmd("stonith_admin", args="--output-as=xml -Q true1") test.add_cmd("stonith_admin", args="--output-as=xml -D true1") test.add_cmd("stonith_admin", args="--output-as=xml -Q true1", expected_exitcode=ExitStatus.NOSUCH) # simple reboot test test = self.new_test("reboot", "Verify devices can be rebooted") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3') test.add_cmd("stonith_admin", args="--output-as=xml -B node3 -t 5") test.add_cmd("stonith_admin", args="--output-as=xml -D true1") test.add_cmd("stonith_admin", args="--output-as=xml -Q true1", expected_exitcode=ExitStatus.NOSUCH) # test fencing history test = self.new_test("fence_history", "Verify last fencing operation is returned") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3') test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5 -V") test.add_cmd("stonith_admin", args="--output-as=xml -H node3", stdout_match='action="off" target="node3" .* status="success"') # simple test of dynamic list query test = self.new_test("dynamic_list_query", "Verify dynamic list of fencing devices can be retrieved") test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_dummy -o mode=pass 
-o mock_dynamic_hosts=fake_port_1") test.add_cmd("stonith_admin", args="--output-as=xml -R true2 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") test.add_cmd("stonith_admin", args="--output-as=xml -R true3 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") test.add_cmd("stonith_admin", args="--output-as=xml -l fake_port_1", stdout_match='count="3"') # fence using dynamic list query test = self.new_test("fence_dynamic_list_query", "Verify dynamic list of fencing devices can be retrieved") test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") test.add_cmd("stonith_admin", args="--output-as=xml -R true2 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") test.add_cmd("stonith_admin", args="--output-as=xml -R true3 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") test.add_cmd("stonith_admin", args="--output-as=xml -F fake_port_1 -t 5 -V") # simple test of query using status action test = self.new_test("status_query", "Verify dynamic list of fencing devices can be retrieved") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_check=status') test.add_cmd("stonith_admin", args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o pcmk_host_check=status') test.add_cmd("stonith_admin", args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o pcmk_host_check=status') test.add_cmd("stonith_admin", args="--output-as=xml -l fake_port_1", stdout_match='count="3"') # test what happens when no reboot action is advertised test = self.new_test("no_reboot_support", "Verify reboot action defaults to off when no reboot action is advertised by agent") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy_no_reboot -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args="--output-as=xml -B node1 -t 5 -V") test.add_log_pattern("does not support reboot") 
test.add_log_pattern("using true1 returned 0") # make sure reboot is used when reboot action is advertised test = self.new_test("with_reboot_support", "Verify reboot action can be used when metadata advertises it") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args="--output-as=xml -B node1 -t 5 -V") test.add_log_pattern("does not advertise support for 'reboot', performing 'off'", negative=True) test.add_log_pattern("using true1 returned 0") # make sure all fencing delays are applied correctly and taken into account by fencing timeouts with topology test = self.new_test("topology_delays", "Verify all fencing delays are applied correctly and taken into account by fencing timeouts with topology") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3" -o pcmk_delay_base=1') test.add_cmd("stonith_admin", args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3" -o pcmk_delay_base=1') # Resulting "random" delay will always be 1 since (rand() % (delay_max - delay_base)) is always 0 here test.add_cmd("stonith_admin", args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3" -o pcmk_delay_base=1 -o pcmk_delay_max=2') test.add_cmd("stonith_admin", args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true1") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true2") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true3") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 --delay 1") # Total fencing timeout takes all fencing delays into account test.add_log_pattern("Total timeout 
set to 582s") # Fencing timeout for the first device takes the requested fencing delay # and pcmk_delay_base into account test.add_log_pattern(r"Requesting that .* perform 'off' action targeting node3 using true1 .*146s.*", regex=True) # Requested fencing delay is applied only for the first device in the # first level, with the static delay from pcmk_delay_base added test.add_log_pattern("Delaying 'off' action targeting node3 using true1 for 2s | timeout=120s requested_delay=1s base=1s max=1s") # Fencing timeout no longer takes the requested fencing delay into account for further devices test.add_log_pattern(r"Requesting that .* perform 'off' action targeting node3 using false1 .*145s.*", regex=True) # Requested fencing delay is no longer applied for further devices test.add_log_pattern("Delaying 'off' action targeting node3 using false1 for 1s | timeout=120s requested_delay=0s base=1s max=1s") # Fencing timeout takes pcmk_delay_max into account test.add_log_pattern(r"Requesting that .* perform 'off' action targeting node3 using true2 .*146s.*", regex=True) test.add_log_pattern("Delaying 'off' action targeting node3 using true2 for 1s | timeout=120s requested_delay=0s base=1s max=2s") test.add_log_pattern("Delaying 'off' action targeting node3 using true3", negative=True) - def build_nodeid_tests(self): - """Register tests that use a corosync node id.""" - our_uname = localname() - - # verify nodeid is supplied when nodeid is in the metadata parameters - test = self.new_test("supply_nodeid", - "Verify nodeid is given when fence agent has nodeid as parameter") - - test.add_cmd("stonith_admin", - args=f'--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list={our_uname}"') - test.add_cmd("stonith_admin", args=f"--output-as=xml -F {our_uname} -t 3") - test.add_log_pattern(f"as nodeid with fence action 'off' targeting {our_uname}") - - # verify nodeid is _NOT_ supplied when nodeid is not in the metadata parameters - test = 
def build_unfence_tests(self):
    """
    Register the tests that exercise unfencing (stonith_admin -U).

    Four tests are created, all targeting the name returned by localname():

    unfence_required_1 -- two passing fence_dummy_auto_unfence devices; both
                          must report success in the log
    unfence_required_2 -- one passing and one failing auto-unfence device;
                          the unfence request must exit with ExitStatus.ERROR
    unfence_required_3 -- two passing auto-unfence devices placed in
                          different topology levels; both must report success
    unfence_required_4 -- four passing auto-unfence devices mixed with four
                          failing fence_dummy devices across four topology
                          levels; all four passing devices must report success
    """
    target = localname()

    auto_agent = "fence_dummy_auto_unfence"
    plain_agent = "fence_dummy"
    only_target = f"{target}"
    target_and_node3 = f"{target} node3"

    def register(case, device, agent, mode, hosts):
        # Register one fence device with the given agent, mode and host list.
        case.add_cmd("stonith_admin",
                     args=f'--output-as=xml -R {device} -a {agent} -o mode={mode} -o "pcmk_host_list={hosts}"')

    def assign_level(case, level, device):
        # Add a device to the given topology level for the local node.
        case.add_cmd("stonith_admin", args=f"--output-as=xml -r {target} -i {level} -v {device}")

    def expect_success(case, devices):
        # Require a successful result line in the log for each device.
        for device in devices:
            case.add_log_pattern(f"using {device} returned 0")

    # verify unfencing using automatic unfencing
    case = self.new_test("unfence_required_1",
                         "Verify require unfencing on all devices when automatic=true in agent's metadata")
    for device in ("true1", "true2"):
        register(case, device, auto_agent, "pass", only_target)
    case.add_cmd("stonith_admin", args=f"--output-as=xml -U {target} -t 3")
    # both devices should be executed
    expect_success(case, ("true1", "true2"))

    # verify unfencing using automatic unfencing fails if any of the required agents fail
    case = self.new_test("unfence_required_2",
                         "Verify require unfencing on all devices when automatic=true in agent's metadata")
    register(case, "true1", auto_agent, "pass", only_target)
    register(case, "true2", auto_agent, "fail", only_target)
    case.add_cmd("stonith_admin", args=f"--output-as=xml -U {target} -t 6",
                 expected_exitcode=ExitStatus.ERROR)

    # verify unfencing using automatic devices with topology
    case = self.new_test("unfence_required_3",
                         "Verify require unfencing on all devices even when at different topology levels")
    for device in ("true1", "true2"):
        register(case, device, auto_agent, "pass", target_and_node3)
    for level, device in ((1, "true1"), (2, "true2")):
        assign_level(case, level, device)
    case.add_cmd("stonith_admin", args=f"--output-as=xml -U {target} -t 3")
    expect_success(case, ("true1", "true2"))

    # verify unfencing using automatic devices with topology levels that fail
    case = self.new_test("unfence_required_4",
                         "Verify all required devices are executed even with topology levels fail")
    for device in ("true1", "true2", "true3", "true4"):
        register(case, device, auto_agent, "pass", target_and_node3)
    for device in ("false1", "false2", "false3", "false4"):
        register(case, device, plain_agent, "fail", target_and_node3)
    # The order of these assignments is kept exactly as the test has always
    # issued them.
    topology = (
        (1, "true1"),
        (1, "false1"),
        (2, "false2"),
        (2, "true2"),
        (2, "false3"),
        (2, "true3"),
        (3, "false4"),
        (4, "true4"),
    )
    for level, device in topology:
        assign_level(case, level, device)
    case.add_cmd("stonith_admin", args=f"--output-as=xml -U {target} -t 3")
    expect_success(case, ("true1", "true2", "true3", "true4"))
test.add_log_pattern("(on) to be executed on target") # verify unfencing using on_target device with topology test = self.new_test("unfence_on_target_3", "Verify unfencing with on_target = true using topology") test.add_cmd("stonith_admin", args=f'--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list={our_uname} node3"') test.add_cmd("stonith_admin", args=f'--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list={our_uname} node3"') test.add_cmd("stonith_admin", args=f"--output-as=xml -r {our_uname} -i 1 -v true1") test.add_cmd("stonith_admin", args=f"--output-as=xml -r {our_uname} -i 2 -v true2") test.add_cmd("stonith_admin", args=f"--output-as=xml -U {our_uname} -t 3") test.add_log_pattern("(on) to be executed on target") # verify unfencing using on_target device with topology fails when target node doesn't exist test = self.new_test("unfence_on_target_4", "Verify unfencing failure with on_target = true using topology") test.add_cmd("stonith_admin", args=f'--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list={our_uname} node_fake"') test.add_cmd("stonith_admin", args=f'--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list={our_uname} node_fake"') test.add_cmd("stonith_admin", args="--output-as=xml -r node_fake -i 1 -v true1") test.add_cmd("stonith_admin", args="--output-as=xml -r node_fake -i 2 -v true2") test.add_cmd("stonith_admin", args="--output-as=xml -U node_fake -t 3", expected_exitcode=ExitStatus.NOSUCH) test.add_log_pattern("(on) to be executed on target") def build_remap_tests(self): """Register tests that verify remapping of reboots to off-on.""" test = self.new_test("remap_simple", "Verify sequential topology reboot is remapped to all-off-then-all-on") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node_fake ' '-o pcmk_off_timeout=1 -o pcmk_reboot_timeout=10') test.add_cmd("stonith_admin", args='--output-as=xml -R true2 -a 
fence_dummy -o mode=pass -o pcmk_host_list=node_fake ' '-o pcmk_off_timeout=2 -o pcmk_reboot_timeout=20') test.add_cmd("stonith_admin", args="--output-as=xml -r node_fake -i 1 -v true1 -v true2") test.add_cmd("stonith_admin", args="--output-as=xml -B node_fake -t 5") test.add_log_pattern("Remapping multiple-device reboot targeting node_fake") # timeout should be sum of off timeouts (1+2=3), not reboot timeouts (10+20=30) test.add_log_pattern("Total timeout set to 3s for peer's fencing targeting node_fake") test.add_log_pattern("perform 'off' action targeting node_fake using true1") test.add_log_pattern("perform 'off' action targeting node_fake using true2") test.add_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'") # fence_dummy sets "on" as an on_target action test.add_log_pattern("Ignoring true1 'on' failure (no capable peers) targeting node_fake") test.add_log_pattern("Ignoring true2 'on' failure (no capable peers) targeting node_fake") test.add_log_pattern("Undoing remap of reboot targeting node_fake") test = self.new_test("remap_simple_off", "Verify sequential topology reboot skips 'on' if " "pcmk_reboot_action=off or agent doesn't support " "'on'") test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_dummy -o mode=pass " "-o pcmk_host_list=node_fake -o pcmk_off_timeout=1 " "-o pcmk_reboot_timeout=10 -o pcmk_reboot_action=off") test.add_cmd("stonith_admin", args="--output-as=xml -R true2 -a fence_dummy_no_on " "-o mode=pass -o pcmk_host_list=node_fake " "-o pcmk_off_timeout=2 -o pcmk_reboot_timeout=20") test.add_cmd("stonith_admin", args="--output-as=xml -r node_fake -i 1 -v true1 -v true2") test.add_cmd("stonith_admin", args="--output-as=xml -B node_fake -t 5") test.add_log_pattern("Remapping multiple-device reboot targeting node_fake") # timeout should be sum of off timeouts (1+2=3), not reboot timeouts (10+20=30) test.add_log_pattern("Total timeout set to 3s for peer's fencing targeting node_fake") 
test.add_log_pattern("perform 'off' action targeting node_fake using true1") test.add_log_pattern("perform 'off' action targeting node_fake using true2") test.add_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'") # "on" should be skipped test.add_log_pattern("Not turning node_fake back on using " "true1 because the device is configured " "to stay off") test.add_log_pattern("Not turning node_fake back on using true2" " because the agent doesn't support 'on'") test.add_log_pattern("Undoing remap of reboot targeting node_fake") test = self.new_test("remap_automatic", "Verify remapped topology reboot skips automatic 'on'") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy_auto_unfence ' '-o mode=pass -o pcmk_host_list=node_fake') test.add_cmd("stonith_admin", args='--output-as=xml -R true2 -a fence_dummy_auto_unfence ' '-o "mode=pass" -o pcmk_host_list=node_fake') test.add_cmd("stonith_admin", args="--output-as=xml -r node_fake -i 1 -v true1 -v true2") test.add_cmd("stonith_admin", args="--output-as=xml -B node_fake -t 5") test.add_log_pattern("Remapping multiple-device reboot targeting node_fake") test.add_log_pattern("perform 'off' action targeting node_fake using true1") test.add_log_pattern("perform 'off' action targeting node_fake using true2") test.add_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'") test.add_log_pattern("Undoing remap of reboot targeting node_fake") test.add_log_pattern("perform 'on' action targeting node_fake using", negative=True) test.add_log_pattern("'on' failure", negative=True) test = self.new_test("remap_complex_1", "Verify remapped topology reboot in second level works if non-remapped first level fails") test.add_cmd("stonith_admin", args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o pcmk_host_list=node_fake') test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node_fake') 
test.add_cmd("stonith_admin", args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o pcmk_host_list=node_fake') test.add_cmd("stonith_admin", args="--output-as=xml -r node_fake -i 1 -v false1") test.add_cmd("stonith_admin", args="--output-as=xml -r node_fake -i 2 -v true1 -v true2") test.add_cmd("stonith_admin", args="--output-as=xml -B node_fake -t 5") test.add_log_pattern("perform 'reboot' action targeting node_fake using false1") test.add_log_pattern("Remapping multiple-device reboot targeting node_fake") test.add_log_pattern("perform 'off' action targeting node_fake using true1") test.add_log_pattern("perform 'off' action targeting node_fake using true2") test.add_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'") test.add_log_pattern("Ignoring true1 'on' failure (no capable peers) targeting node_fake") test.add_log_pattern("Ignoring true2 'on' failure (no capable peers) targeting node_fake") test.add_log_pattern("Undoing remap of reboot targeting node_fake") test = self.new_test("remap_complex_2", "Verify remapped topology reboot failure in second level proceeds to third level") test.add_cmd("stonith_admin", args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o pcmk_host_list=node_fake') test.add_cmd("stonith_admin", args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o pcmk_host_list=node_fake') test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node_fake') test.add_cmd("stonith_admin", args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o pcmk_host_list=node_fake') test.add_cmd("stonith_admin", args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o pcmk_host_list=node_fake') test.add_cmd("stonith_admin", args="--output-as=xml -r node_fake -i 1 -v false1") test.add_cmd("stonith_admin", args="--output-as=xml -r node_fake -i 2 -v true1 -v false2 -v true3") test.add_cmd("stonith_admin", args="--output-as=xml -r node_fake -i 3 -v true2") 
test.add_cmd("stonith_admin", args="--output-as=xml -B node_fake -t 5") test.add_log_pattern("perform 'reboot' action targeting node_fake using false1") test.add_log_pattern("Remapping multiple-device reboot targeting node_fake") test.add_log_pattern("perform 'off' action targeting node_fake using true1") test.add_log_pattern("perform 'off' action targeting node_fake using false2") test.add_log_pattern("Attempted to execute agent fence_dummy (off) the maximum number of times") test.add_log_pattern("Undoing remap of reboot targeting node_fake") test.add_log_pattern("perform 'reboot' action targeting node_fake using true2") test.add_log_pattern("node_fake with true3", negative=True) def build_query_tests(self): """Run stonith_admin --metadata for the fence_dummy agent and check command output.""" test = self.new_test("get_metadata", "Run stonith_admin --metadata for the fence_dummy agent") test.add_cmd("stonith_admin", args="--output-as=xml -a fence_dummy --metadata", stdout_match=' %s %s """ % (agent(), SHORT_DESC, AGENT_VERSION, OCF_VERSION, LONG_DESC)) for (option, _) in sorted_options(avail_opt): if "shortdesc" not in ALL_OPT[option]: continue print(' ' % (option, ALL_OPT[option]["required"])) default = "" default_name_arg = "-%s" % ALL_OPT[option]["getopt"][:-1] default_name_no_arg = "-%s" % ALL_OPT[option]["getopt"] if "default" in ALL_OPT[option]: default = 'default="%s"' % ALL_OPT[option]["default"] elif options.get(default_name_arg) is not None: try: default = 'default="%s"' % options[default_name_arg] except TypeError: # @todo/@note: Currently there is no clean way how to handle lists # we can create a string from it but we can't set it on command line default = 'default="%s"' % str(options[default_name_arg]) elif default_name_no_arg in options: default = 'default="true"' mixed = ALL_OPT[option]["help"] # split it between option and help text res = re.compile(r"^(.*--\S+)\s+", re.IGNORECASE | re.S).search(mixed) if res is not None: mixed = res.group(1) 
mixed = mixed.replace("<", "<").replace(">", ">") print(' ' % mixed) if ALL_OPT[option]["getopt"].count(":") > 0: print(' ' % default) else: print(' ' % default) print(' %s' % ALL_OPT[option]["shortdesc"]) print(' ') print(' \n ') if not no_on: if auto_unfence: attr_name = 'automatic' else: attr_name = 'on_target' print(' ' % attr_name) print(' ') if not no_reboot: print(' ') print(' ') print(' ') print(' ') print(' ') print(' ') print('') def option_longopt(option): """Return the getopt-compatible long-option name of the given option.""" if ALL_OPT[option]["getopt"].endswith(":"): return ALL_OPT[option]["longopt"] + "=" return ALL_OPT[option]["longopt"] def opts_from_command_line(argv, avail_opt): """Read options from command-line arguments.""" # Prepare list of options for getopt getopt_string = "" longopt_list = [] for k in avail_opt: if k in ALL_OPT: getopt_string += ALL_OPT[k]["getopt"] else: fail_usage("Parse error: unknown option '%s'" % k) if k in ALL_OPT and "longopt" in ALL_OPT[k]: longopt_list.append(option_longopt(k)) try: (opt, _) = getopt.gnu_getopt(argv, getopt_string, longopt_list) except getopt.GetoptError as error: fail_usage("Parse error: %s" % error.msg) # Transform longopt to short one which are used in fencing agents old_opt = opt opt = {} for old_option in dict(old_opt): if old_option.startswith("--"): for rec in ALL_OPT.values(): if rec.get("longopt") is None: continue long = "--%s" % rec["longopt"] if long == old_option: short = "-%s" % rec["getopt"][0] opt[short] = dict(old_opt)[old_option] else: opt[old_option] = dict(old_opt)[old_option] # Compatibility Layer (with what? 
probably not needed for fence_dummy) new_opt = dict(opt) if "-T" in new_opt: new_opt["-o"] = "status" if "-n" in new_opt: new_opt["-m"] = new_opt["-n"] opt = new_opt return opt def opts_from_stdin(avail_opt): """Read options from standard input.""" opt = {} name = "" for line in sys.stdin.readlines(): line = line.strip() if line.startswith("#") or (len(line) == 0): continue (name, value) = (line + "=").split("=", 1) value = value[:-1] # Compatibility Layer (with what? probably not needed for fence_dummy) if name == "option": name = "action" if name not in avail_opt: print("Parse error: Ignoring unknown option '%s'" % line, file=sys.stderr) continue if ALL_OPT[name]["getopt"].endswith(":"): short = "-%s" % ALL_OPT[name]["getopt"][0] opt[short] = value elif value.lower() in ["1", "yes", "on", "true"]: short = "-%s" % ALL_OPT[name]["getopt"] opt[short] = "1" return opt def process_input(avail_opt): """Set standard environment variables, and parse all options.""" # Set standard environment os.putenv("LANG", "C") os.putenv("LC_ALL", "C") # Read options from command line or standard input if len(sys.argv) > 1: return opts_from_command_line(sys.argv[1:], avail_opt) return opts_from_stdin(avail_opt) def atexit_handler(): """Close stdout on exit.""" try: sys.stdout.close() os.close(1) except IOError: sys.exit("%s failed to close standard output" % agent()) def success_mode(options, option, default_value): """Return exit code specified by option.""" if option in options: test_value = options[option] else: test_value = default_value if test_value == "pass": exitcode = 0 elif test_value == "fail": exitcode = 1 else: exitcode = random.randint(0, 1) return exitcode def write_options(options): """Write out all options to debug file.""" with contextlib.suppress(IOError): with io.open(options["-D"], "at", encoding="utf-8") as debugfile: debugfile.write("### %s ###\n" % time.strftime("%Y-%m-%d %H:%M:%S")) for option in sorted(options): debugfile.write("%s=%s\n" % (option, 
options[option])) debugfile.write("###\n") def main(): """Run the dummy fencing agent.""" auto_unfence = False no_reboot = False no_on = False # Meta-data can't take parameters, so we simulate different meta-data # behavior based on the executable name (which can be a symbolic link). if sys.argv[0].endswith("_auto_unfence"): auto_unfence = True elif sys.argv[0].endswith("_no_reboot"): no_reboot = True elif sys.argv[0].endswith("_no_on"): no_on = True - elif sys.argv[0].endswith("_no_nodeid"): - del ALL_OPT["nodeid"] device_opt = ALL_OPT.keys() # Defaults for fence agent atexit.register(atexit_handler) options = process_input(device_opt) options["device_opt"] = device_opt show_docs(options, auto_unfence, no_reboot, no_on) action = options.get("-o", "reboot") # dump input to file if "-D" in options and action != "validate-all": write_options(options) if "-f" in options and action != "validate-all": val = int(options["-f"]) print("delay sleep for %d seconds" % val, file=sys.stderr) time.sleep(val) # random sleep for testing if "-R" in options and action != "validate-all": val = int(options["-R"]) ran = random.randint(1, val) print("random sleep for %d seconds" % ran, file=sys.stderr) time.sleep(ran) if action == "monitor": if "-d" in options: time.sleep(int(options["-d"])) exitcode = success_mode(options, "-m", "pass") elif action == "list": print("fence_dummy action (list) called", file=sys.stderr) if "-H" in options: print(options["-H"]) exitcode = 0 else: print("dynamic hostlist requires mock_dynamic_hosts to be set", file=sys.stderr) exitcode = 1 elif action == "validate-all": if "-f" in options: val = int(options["-f"]) if val > 10: exitcode = 1 else: exitcode = 0 else: exitcode = 1 elif action == "off": if "-F" in options: time.sleep(int(options["-F"])) exitcode = success_mode(options, "-M", "random") else: exitcode = success_mode(options, "-M", "random") # Ensure we generate some error output on failure exit. 
if exitcode == 1: print("simulated %s failure" % action, file=sys.stderr) sys.exit(exitcode) if __name__ == "__main__": main() diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c index d085ff8ab3..e1535b5529 100644 --- a/lib/fencing/st_actions.c +++ b/lib/fencing/st_actions.c @@ -1,730 +1,729 @@ /* * Copyright 2004-2025 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include // xmlNode #include #include #include #include #include #include "fencing_private.h" struct stonith_action_s { /*! user defined data */ char *agent; char *action; GHashTable *args; int timeout; bool async; void *userdata; void (*done_cb) (int pid, const pcmk__action_result_t *result, void *user_data); void (*fork_cb) (int pid, void *user_data); svc_action_t *svc_action; /*! internal timing information */ time_t initial_start_time; int tries; int remaining_timeout; int max_retries; int pid; pcmk__action_result_t result; }; static int internal_stonith_action_execute(stonith_action_t *action); static void log_action(stonith_action_t *action, pid_t pid); /*! * \internal * \brief Set an action's result based on services library result * * \param[in,out] action Fence action to set result for * \param[in,out] svc_action Service action to get result from */ static void set_result_from_svc_action(stonith_action_t *action, svc_action_t *svc_action) { services__copy_result(svc_action, &(action->result)); pcmk__set_result_output(&(action->result), services__grab_stdout(svc_action), services__grab_stderr(svc_action)); } static void log_action(stonith_action_t *action, pid_t pid) { /* The services library has already logged the output at info or debug * level, so just raise to warning for stderr. 
*/ if (action->result.action_stderr != NULL) { /* Logging the whole string confuses syslog when the string is xml */ char *prefix = crm_strdup_printf("%s[%d] stderr:", action->agent, pid); crm_log_output(LOG_WARNING, prefix, action->result.action_stderr); free(prefix); } } static void append_config_arg(gpointer key, gpointer value, gpointer user_data) { /* Filter out parameters handled directly by Pacemaker. * * STONITH_ATTR_ACTION_OP is added elsewhere and should never be part of the * fencing resource's parameter list. We should ignore its value if it is * configured there. */ if (!pcmk__str_eq(key, STONITH_ATTR_ACTION_OP, pcmk__str_casei) && !pcmk_stonith_param(key) && (strstr(key, CRM_META) == NULL) && !pcmk__str_eq(key, PCMK_XA_CRM_FEATURE_SET, pcmk__str_none)) { crm_trace("Passing %s=%s with fence action", (const char *) key, (const char *) (value? value : "")); pcmk__insert_dup((GHashTable *) user_data, key, pcmk__s(value, "")); } } /*! * \internal * \brief Create a table of arguments for a fencing action * * \param[in] agent Fencing agent name * \param[in] action Name of fencing action * \param[in] target Name of target node for fencing action * \param[in] target_nodeid Node ID of target node for fencing action * \param[in] device_args Fence device parameters * \param[in] port_map Target node-to-port mapping for fence device * \param[in] host_arg Argument name for passing target * * \return Newly created hash table of arguments for fencing action */ static GHashTable * make_args(const char *agent, const char *action, const char *target, uint32_t target_nodeid, GHashTable *device_args, GHashTable *port_map, const char *host_arg) { GHashTable *arg_list = NULL; const char *value = NULL; CRM_CHECK(action != NULL, return NULL); arg_list = pcmk__strkey_table(free, free); // Add action to arguments (using an alias if requested) if (device_args) { char buffer[512]; snprintf(buffer, sizeof(buffer), "pcmk_%s_action", action); value = g_hash_table_lookup(device_args, 
buffer); if (value) { crm_debug("Substituting '%s' for fence action %s targeting %s", value, action, pcmk__s(target, "no node")); action = value; } } // Tell the fence agent what action to perform pcmk__insert_dup(arg_list, STONITH_ATTR_ACTION_OP, action); /* If this is a fencing operation against another node, add more standard * arguments. */ if ((target != NULL) && (device_args != NULL)) { const char *param = NULL; /* Always pass the target's name, per * https://github.com/ClusterLabs/fence-agents/blob/main/doc/FenceAgentAPI.md */ pcmk__insert_dup(arg_list, "nodename", target); // If the target's node ID was specified, pass it, too if (target_nodeid != 0) { char *nodeid = crm_strdup_printf("%" PRIu32, target_nodeid); - // cts-fencing looks for this log message crm_info("Passing '%s' as nodeid with fence action '%s' targeting %s", nodeid, action, pcmk__s(target, "no node")); g_hash_table_insert(arg_list, strdup("nodeid"), nodeid); } // Check whether target should be specified as some other argument param = g_hash_table_lookup(device_args, PCMK_STONITH_HOST_ARGUMENT); if (param == NULL) { // Use caller's default (likely from agent metadata) param = host_arg; } if ((param != NULL) && !pcmk__str_eq(agent, "fence_legacy", pcmk__str_none) && !pcmk__str_eq(param, PCMK_VALUE_NONE, pcmk__str_casei)) { value = g_hash_table_lookup(device_args, param); if (pcmk__str_eq(value, "dynamic", pcmk__str_casei|pcmk__str_null_matches)) { /* If the host argument is "dynamic" or not configured, * reset it to the target */ const char *alias = NULL; if (port_map) { alias = g_hash_table_lookup(port_map, target); } if (alias == NULL) { alias = target; } crm_debug("Passing %s='%s' with fence action %s targeting %s", param, alias, action, pcmk__s(target, "no node")); pcmk__insert_dup(arg_list, param, alias); } } } if (device_args) { g_hash_table_foreach(device_args, append_config_arg, arg_list); } return arg_list; } /*! 
* \internal * \brief Free all memory used by a stonith action * * \param[in,out] action Action to free */ void stonith__destroy_action(stonith_action_t *action) { if (action) { free(action->agent); if (action->args) { g_hash_table_destroy(action->args); } free(action->action); if (action->svc_action) { services_action_free(action->svc_action); } pcmk__reset_result(&(action->result)); free(action); } } /*! * \internal * \brief Get the result of an executed stonith action * * \param[in] action Executed action * * \return Pointer to action's result (or NULL if \p action is NULL) */ pcmk__action_result_t * stonith__action_result(stonith_action_t *action) { return (action == NULL)? NULL : &(action->result); } #define FAILURE_MAX_RETRIES 2 /*! * \internal * \brief Create a new fencing action to be executed * * \param[in] agent Fence agent to use * \param[in] action_name Fencing action to be executed * \param[in] target Name of target of fencing action (if known) * \param[in] target_nodeid Node ID of target of fencing action (if known) * \param[in] timeout_sec Timeout to be used when executing action * \param[in] device_args Parameters to pass to fence agent * \param[in] port_map Mapping of target names to device ports * \param[in] host_arg Agent parameter used to pass target name * * \return Newly created fencing action (asserts on error, never NULL) */ stonith_action_t * stonith__action_create(const char *agent, const char *action_name, const char *target, uint32_t target_nodeid, int timeout_sec, GHashTable *device_args, GHashTable *port_map, const char *host_arg) { stonith_action_t *action = pcmk__assert_alloc(1, sizeof(stonith_action_t)); action->args = make_args(agent, action_name, target, target_nodeid, device_args, port_map, host_arg); crm_debug("Preparing '%s' action targeting %s using agent %s", action_name, pcmk__s(target, "no node"), agent); action->agent = strdup(agent); action->action = strdup(action_name); action->timeout = action->remaining_timeout = 
timeout_sec; action->max_retries = FAILURE_MAX_RETRIES; pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN, "Initialization bug in fencing library"); if (device_args) { char buffer[512]; const char *value = NULL; snprintf(buffer, sizeof(buffer), "pcmk_%s_retries", action_name); value = g_hash_table_lookup(device_args, buffer); if (value) { action->max_retries = atoi(value); } } return action; } static gboolean update_remaining_timeout(stonith_action_t * action) { int diff = time(NULL) - action->initial_start_time; if (action->tries >= action->max_retries) { crm_info("Attempted to execute agent %s (%s) the maximum number of times (%d) allowed", action->agent, action->action, action->max_retries); action->remaining_timeout = 0; } else if ((action->result.execution_status != PCMK_EXEC_TIMEOUT) && (diff < (action->timeout * 0.7))) { /* only set remaining timeout period if there is 30% * or greater of the original timeout period left */ action->remaining_timeout = action->timeout - diff; } else { action->remaining_timeout = 0; } return action->remaining_timeout ? TRUE : FALSE; } /*! * \internal * \brief Map a fencing action result to a standard return code * * \param[in] result Fencing action result to map * * \return Standard Pacemaker return code that best corresponds to \p result */ int stonith__result2rc(const pcmk__action_result_t *result) { if (pcmk__result_ok(result)) { return pcmk_rc_ok; } switch (result->execution_status) { case PCMK_EXEC_PENDING: return EINPROGRESS; case PCMK_EXEC_CANCELLED: return ECANCELED; case PCMK_EXEC_TIMEOUT: return ETIME; case PCMK_EXEC_NOT_INSTALLED: return ENOENT; case PCMK_EXEC_NOT_SUPPORTED: return EOPNOTSUPP; case PCMK_EXEC_NOT_CONNECTED: return ENOTCONN; case PCMK_EXEC_NO_FENCE_DEVICE: return ENODEV; case PCMK_EXEC_NO_SECRETS: return EACCES; /* For the fencing API, PCMK_EXEC_INVALID is used with fencer API * operations that don't involve executing an agent (for example, * registering devices). 
This allows us to use the CRM_EX_* codes in the * exit status for finer-grained responses. */ case PCMK_EXEC_INVALID: switch (result->exit_status) { case CRM_EX_INVALID_PARAM: return EINVAL; case CRM_EX_INSUFFICIENT_PRIV: return EACCES; case CRM_EX_PROTOCOL: return EPROTO; /* CRM_EX_EXPIRED is used for orphaned fencing operations left * over from a previous instance of the fencer. For API backward * compatibility, this is mapped to the previously used code for * this case, EHOSTUNREACH. */ case CRM_EX_EXPIRED: return EHOSTUNREACH; default: break; } break; default: break; } // Try to provide useful error code based on result's error output if (result->action_stderr == NULL) { return ENODATA; } else if (strcasestr(result->action_stderr, "timed out") || strcasestr(result->action_stderr, "timeout")) { return ETIME; } else if (strcasestr(result->action_stderr, "unrecognised action") || strcasestr(result->action_stderr, "unrecognized action") || strcasestr(result->action_stderr, "unsupported action")) { return EOPNOTSUPP; } // Oh well, we tried return pcmk_rc_error; } /*! * \internal * \brief Determine execution status equivalent of legacy fencer return code * * Fence action notifications, and fence action callbacks from older fencers * (<=2.1.2) in a rolling upgrade, will have only a legacy return code. Map this * to an execution status as best as possible (essentially, the inverse of * stonith__result2rc()). 
* * \param[in] rc Legacy return code from fencer * * \return Execution status best corresponding to \p rc */ int stonith__legacy2status(int rc) { if (rc >= 0) { return PCMK_EXEC_DONE; } switch (-rc) { case EACCES: return PCMK_EXEC_NO_SECRETS; case ECANCELED: return PCMK_EXEC_CANCELLED; case EHOSTUNREACH: return PCMK_EXEC_INVALID; case EINPROGRESS: return PCMK_EXEC_PENDING; case ENODEV: return PCMK_EXEC_NO_FENCE_DEVICE; case ENOENT: return PCMK_EXEC_NOT_INSTALLED; case ENOTCONN: return PCMK_EXEC_NOT_CONNECTED; case EOPNOTSUPP: return PCMK_EXEC_NOT_SUPPORTED; case EPROTO: return PCMK_EXEC_INVALID; case EPROTONOSUPPORT: return PCMK_EXEC_NOT_SUPPORTED; case ETIME: return PCMK_EXEC_TIMEOUT; case ETIMEDOUT: return PCMK_EXEC_TIMEOUT; default: return PCMK_EXEC_ERROR; } } /*! * \internal * \brief Add a fencing result to an XML element as attributes * * \param[in,out] xml XML element to add result to * \param[in] result Fencing result to add (assume success if NULL) */ void stonith__xe_set_result(xmlNode *xml, const pcmk__action_result_t *result) { int exit_status = CRM_EX_OK; enum pcmk_exec_status execution_status = PCMK_EXEC_DONE; const char *exit_reason = NULL; const char *action_stdout = NULL; int rc = pcmk_ok; CRM_CHECK(xml != NULL, return); if (result != NULL) { exit_status = result->exit_status; execution_status = result->execution_status; exit_reason = result->exit_reason; action_stdout = result->action_stdout; rc = pcmk_rc2legacy(stonith__result2rc(result)); } crm_xml_add_int(xml, PCMK__XA_OP_STATUS, (int) execution_status); crm_xml_add_int(xml, PCMK__XA_RC_CODE, exit_status); crm_xml_add(xml, PCMK_XA_EXIT_REASON, exit_reason); crm_xml_add(xml, PCMK__XA_ST_OUTPUT, action_stdout); /* @COMPAT Peers in rolling upgrades, Pacemaker Remote nodes, and external * code that use libstonithd <=2.1.2 don't check for the full result, and * need a legacy return code instead. */ crm_xml_add_int(xml, PCMK__XA_ST_RC, rc); } /*! 
* \internal * \brief Find a fencing result beneath an XML element * * \param[in] xml XML element to search * * \return \p xml or descendant of it that contains a fencing result, else NULL */ xmlNode * stonith__find_xe_with_result(xmlNode *xml) { xmlNode *match = pcmk__xpath_find_one(xml->doc, "//*[@" PCMK__XA_RC_CODE "]", LOG_NEVER); if (match == NULL) { /* @COMPAT Peers <=2.1.2 in a rolling upgrade provide only a legacy * return code, not a full result, so check for that. */ match = pcmk__xpath_find_one(xml->doc, "//*[@" PCMK__XA_ST_RC "]", LOG_ERR); } return match; } /*! * \internal * \brief Get a fencing result from an XML element's attributes * * \param[in] xml XML element with fencing result * \param[out] result Where to store fencing result */ void stonith__xe_get_result(const xmlNode *xml, pcmk__action_result_t *result) { int exit_status = CRM_EX_OK; int execution_status = PCMK_EXEC_DONE; const char *exit_reason = NULL; char *action_stdout = NULL; CRM_CHECK((xml != NULL) && (result != NULL), return); exit_reason = crm_element_value(xml, PCMK_XA_EXIT_REASON); action_stdout = crm_element_value_copy(xml, PCMK__XA_ST_OUTPUT); // A result must include an exit status and execution status if ((crm_element_value_int(xml, PCMK__XA_RC_CODE, &exit_status) < 0) || (crm_element_value_int(xml, PCMK__XA_OP_STATUS, &execution_status) < 0)) { int rc = pcmk_ok; exit_status = CRM_EX_ERROR; /* @COMPAT Peers <=2.1.2 in rolling upgrades provide only a legacy * return code, not a full result, so check for that. 
*/ if (crm_element_value_int(xml, PCMK__XA_ST_RC, &rc) == 0) { if ((rc == pcmk_ok) || (rc == -EINPROGRESS)) { exit_status = CRM_EX_OK; } execution_status = stonith__legacy2status(rc); exit_reason = pcmk_strerror(rc); } else { execution_status = PCMK_EXEC_ERROR; exit_reason = "Fencer reply contained neither a full result " "nor a legacy return code (bug?)"; } } pcmk__set_result(result, exit_status, execution_status, exit_reason); pcmk__set_result_output(result, action_stdout, NULL); } static void stonith_action_async_done(svc_action_t *svc_action) { stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; set_result_from_svc_action(action, svc_action); svc_action->params = NULL; log_action(action, action->pid); if (!pcmk__result_ok(&(action->result)) && update_remaining_timeout(action)) { int rc = internal_stonith_action_execute(action); if (rc == pcmk_ok) { return; } } if (action->done_cb) { action->done_cb(action->pid, &(action->result), action->userdata); } action->svc_action = NULL; // don't remove our caller stonith__destroy_action(action); } static void stonith_action_async_forked(svc_action_t *svc_action) { stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; action->pid = svc_action->pid; action->svc_action = svc_action; if (action->fork_cb) { (action->fork_cb) (svc_action->pid, action->userdata); } pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_PENDING, NULL); crm_trace("Child process %d performing action '%s' successfully forked", action->pid, action->action); } /*! 
* \internal * \brief Convert a fencing library action to a services library action * * \param[in,out] action Fencing library action to convert * * \return Services library action equivalent to \p action on success; on error, * NULL will be returned and \p action's result will be set */ static svc_action_t * stonith_action_to_svc(stonith_action_t *action) { static int stonith_sequence = 0; char *path = crm_strdup_printf(PCMK__FENCE_BINDIR "/%s", action->agent); svc_action_t *svc_action = services_action_create_generic(path, NULL); free(path); if (svc_action->rc != PCMK_OCF_UNKNOWN) { set_result_from_svc_action(action, svc_action); services_action_free(svc_action); return NULL; } svc_action->timeout = action->remaining_timeout * 1000; svc_action->standard = pcmk__str_copy(PCMK_RESOURCE_CLASS_STONITH); svc_action->id = crm_strdup_printf("%s_%s_%dof%d", action->agent, action->action, action->tries, action->max_retries); svc_action->agent = pcmk__str_copy(action->agent); svc_action->sequence = stonith_sequence++; svc_action->params = action->args; svc_action->cb_data = (void *) action; svc_action->flags = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, "Action", svc_action->id, svc_action->flags, SVC_ACTION_NON_BLOCKED, "SVC_ACTION_NON_BLOCKED"); return svc_action; } static int internal_stonith_action_execute(stonith_action_t * action) { int rc = pcmk_ok; int is_retry = 0; svc_action_t *svc_action = NULL; CRM_CHECK(action != NULL, return -EINVAL); if ((action->action == NULL) || (action->args == NULL) || (action->agent == NULL)) { pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR_FATAL, "Bug in fencing library"); return -EINVAL; } if (action->tries++ == 0) { // First attempt of the desired action action->initial_start_time = time(NULL); } else { // Later attempt after earlier failure crm_info("Attempt %d to execute '%s' action of agent %s " "(%ds timeout remaining)", action->tries, action->action, action->agent, action->remaining_timeout); 
is_retry = 1; } svc_action = stonith_action_to_svc(action); if (svc_action == NULL) { // The only possible errors are out-of-memory and too many arguments return -E2BIG; } /* keep retries from executing out of control and free previous results */ if (is_retry) { pcmk__reset_result(&(action->result)); // @TODO This should be nonblocking via timer if mainloop is used sleep(1); } if (action->async) { // We never create a recurring action, so this should always return TRUE CRM_LOG_ASSERT(services_action_async_fork_notify(svc_action, &stonith_action_async_done, &stonith_action_async_forked)); return pcmk_ok; } else if (!services_action_sync(svc_action)) { rc = -ECONNABORTED; // @TODO Update API to return more useful error } set_result_from_svc_action(action, svc_action); svc_action->params = NULL; services_action_free(svc_action); return rc; } /*! * \internal * \brief Kick off execution of an async stonith action * * \param[in,out] action Action to be executed * \param[in,out] userdata Datapointer to be passed to callbacks * \param[in] done Callback to notify action has failed/succeeded * \param[in] fork_callback Callback to notify successful fork of child * * \return pcmk_ok if ownership of action has been taken, -errno otherwise */ int stonith__execute_async(stonith_action_t * action, void *userdata, void (*done) (int pid, const pcmk__action_result_t *result, void *user_data), void (*fork_cb) (int pid, void *user_data)) { if (!action) { return -EINVAL; } action->userdata = userdata; action->done_cb = done; action->fork_cb = fork_cb; action->async = true; return internal_stonith_action_execute(action); } /*! 
* \internal
 * \brief Execute a stonith action synchronously, retrying on failure
 *
 * \param[in,out] action  Action to execute
 *
 * \return pcmk_ok on success, -errno otherwise
 */
int
stonith__execute(stonith_action_t *action)
{
    int rc = pcmk_ok;

    CRM_CHECK(action != NULL, return -EINVAL);

    // Keep trying until success, max retries, or timeout
    for (;;) {
        rc = internal_stonith_action_execute(action);

        // The retry check is only consulted after a failed attempt
        if ((rc == pcmk_ok) || !update_remaining_timeout(action)) {
            break;
        }
    }
    return rc;
}