diff --git a/cts/cts-fencing.in b/cts/cts-fencing.in index 8ce354e035..8ddd4fc68a 100644 --- a/cts/cts-fencing.in +++ b/cts/cts-fencing.in @@ -1,1102 +1,954 @@ #!@PYTHON@ """ Regression tests for Pacemaker's fencer """ -__copyright__ = "Copyright 2012-2023 the Pacemaker project contributors" +__copyright__ = "Copyright 2012-2024 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import argparse import os import sys import subprocess import tempfile # These imports allow running from a source checkout after running `make`. # Note that while this doesn't necessarily mean it will successfully run tests, # but being able to see --help output can be useful. if os.path.exists("@abs_top_srcdir@/python"): sys.path.insert(0, "@abs_top_srcdir@/python") if os.path.exists("@abs_top_builddir@/python") and "@abs_top_builddir@" != "@abs_top_srcdir@": sys.path.insert(0, "@abs_top_builddir@/python") from pacemaker.buildoptions import BuildOptions from pacemaker.exitstatus import ExitStatus from pacemaker._cts.corosync import Corosync, localname from pacemaker._cts.errors import ExitCodeError, OutputFoundError, OutputNotFoundError, XmlValidationError from pacemaker._cts.process import killall, exit_if_proc_running from pacemaker._cts.test import Test, Tests TEST_DIR = sys.path[0] def update_path(): """ Set the PATH environment variable appropriately for the tests """ new_path = os.environ['PATH'] if os.path.exists("%s/cts-fencing.in" % TEST_DIR): print("Running tests from the source tree: %s (%s)" % (BuildOptions._BUILD_DIR, TEST_DIR)) # For pacemaker-fenced and cts-fence-helper new_path = "%s/daemons/fenced:%s" % (BuildOptions._BUILD_DIR, new_path) new_path = "%s/tools:%s" % (BuildOptions._BUILD_DIR, new_path) # For stonith_admin new_path = "%s/cts/support:%s" % (BuildOptions._BUILD_DIR, new_path) # For cts-support else: print("Running tests from the install tree: %s (not %s)" % (BuildOptions.DAEMON_DIR, 
TEST_DIR)) # For pacemaker-fenced, cts-fence-helper, and cts-support new_path = "%s:%s" % (BuildOptions.DAEMON_DIR, new_path) print('Using PATH="%s"' % new_path) os.environ['PATH'] = new_path class FenceTest(Test): """ Executor for a single test """ def __init__(self, name, description, **kwargs): Test.__init__(self, name, description, **kwargs) - if kwargs.get("with_cpg", False): - self._enable_corosync = True - self._daemon_options = ["-c"] - else: - self._enable_corosync = False - self._daemon_options = ["-s"] - self._daemon_location = "pacemaker-fenced" def _kill_daemons(self): killall(["pacemakerd", "pacemaker-fenced"]) def _start_daemons(self): + cmd = ["pacemaker-fenced", "-c", "-l", self.logpath] if self.verbose: - self._daemon_options += ["-V"] - print("Starting %s with %s" % (self._daemon_location, self._daemon_options)) + cmd += ["-V"] + print("Starting %s" % " ".join(cmd)) - cmd = ["pacemaker-fenced", "-l", self.logpath] + self._daemon_options self._daemon_process = subprocess.Popen(cmd) class FenceTests(Tests): """ Collection of all fencing regression tests """ def __init__(self, **kwargs): Tests.__init__(self, **kwargs) self._corosync = Corosync(self.verbose, self.logdir, "cts-fencing") - def new_test(self, name, description, with_cpg=False): + def new_test(self, name, description): """ Create a named test """ - test = FenceTest(name, description, verbose=self.verbose, with_cpg=with_cpg, + test = FenceTest(name, description, verbose=self.verbose, timeout=self.timeout, force_wait=self.force_wait, logdir=self.logdir) self._tests.append(test) return test - def run_cpg_only(self): - """ Run all corosync-enabled tests """ - - for test in self._tests: - if test._enable_corosync: - test.run() - - def run_no_cpg(self): - """ Run all standalone tests """ - - for test in self._tests: - if not test._enable_corosync: - test.run() - def build_api_sanity_tests(self): """ Register tests to verify basic API usage """ verbose_arg = "" if self.verbose: verbose_arg = 
"-V" - test = self.new_test("standalone_low_level_api_test", "Sanity test client api in standalone mode.") + test = self.new_test("low_level_api_test", "Sanity-test client API") test.add_cmd("cts-fence-helper", args="-t %s" % verbose_arg, validate=False) - test = self.new_test("cpg_low_level_api_test", "Sanity test client api using mainloop and cpg.", True) + test = self.new_test("low_level_api_mainloop_test", + "Sanity-test client API using mainloop") test.add_cmd("cts-fence-helper", args="-m %s" % verbose_arg, validate=False) def build_custom_timeout_tests(self): """ Register tests to verify custom timeout usage """ # custom timeout without topology - test = self.new_test("cpg_custom_timeout_1", - "Verify per device timeouts work as expected without using topology.", True) + test = self.new_test("custom_timeout_1", + "Verify per device timeouts work as expected without using topology") test.add_cmd('stonith_admin', args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') test.add_cmd('stonith_admin', args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3 -o pcmk_off_timeout=1') test.add_cmd('stonith_admin', args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o pcmk_host_list=node3 -o pcmk_off_timeout=4') test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") # timeout is 5+1+4 = 10 test.add_log_pattern("Total timeout set to 12s") # custom timeout _WITH_ topology - test = self.new_test("cpg_custom_timeout_2", - "Verify per device timeouts work as expected _WITH_ topology.", True) + test = self.new_test("custom_timeout_2", + "Verify per device timeouts work as expected _WITH_ topology") test.add_cmd('stonith_admin', args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') test.add_cmd('stonith_admin', args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3 -o pcmk_off_timeout=1000ms') 
test.add_cmd('stonith_admin', args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o pcmk_host_list=node3 -o pcmk_off_timeout=4000s') test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true1") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v false2") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") # timeout is 5+1+4000 = 4006 test.add_log_pattern("Total timeout set to 4807s") def build_fence_merge_tests(self): """ Register tests to verify when fence operations should be merged """ ### Simple test that overlapping fencing operations get merged - test = self.new_test("cpg_custom_merge_single", - "Verify overlapping identical fencing operations are merged, no fencing levels used.", True) + test = self.new_test("custom_merge_single", + "Verify overlapping identical fencing operations are merged, no fencing levels used") test.add_cmd("stonith_admin", args="--output-as=xml -R false1 -a fence_dummy -o mode=fail -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -R false2 -a fence_dummy -o mode=fail -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10") ### one merger will happen test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") ### the pattern below signifies that both the original and duplicate operation completed test.add_log_pattern("Operation 'off' targeting node3 by ") test.add_log_pattern("Operation 'off' targeting node3 by ") ### Test that multiple mergers occur - test = self.new_test("cpg_custom_merge_multiple", - "Verify multiple overlapping identical fencing operations are merged", True) + test = 
self.new_test("custom_merge_multiple", + "Verify multiple overlapping identical fencing operations are merged") test.add_cmd("stonith_admin", args="--output-as=xml -R false1 -a fence_dummy -o mode=fail -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o delay=2 -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -R false2 -a fence_dummy -o mode=fail -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10") ### 4 mergers should occur test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") ### the pattern below signifies that both the original and duplicate operation completed test.add_log_pattern("Operation 'off' targeting node3 by ") test.add_log_pattern("Operation 'off' targeting node3 by ") test.add_log_pattern("Operation 'off' targeting node3 by ") test.add_log_pattern("Operation 'off' targeting node3 by ") test.add_log_pattern("Operation 'off' targeting node3 by ") ### Test that multiple mergers occur with topologies used - test = self.new_test("cpg_custom_merge_with_topology", - "Verify multiple overlapping identical fencing operations are merged with fencing levels.", - True) + test = self.new_test("custom_merge_with_topology", + "Verify multiple overlapping identical fencing operations are 
merged with fencing levels") test.add_cmd("stonith_admin", args="--output-as=xml -R false1 -a fence_dummy -o mode=fail -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -R false2 -a fence_dummy -o mode=fail -o pcmk_host_list=node3") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false2") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true1") test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10") ### 4 mergers should occur test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client") ### the pattern below signifies that both the original and duplicate operation completed test.add_log_pattern("Operation 'off' targeting node3 by ") test.add_log_pattern("Operation 'off' targeting node3 by ") test.add_log_pattern("Operation 'off' targeting node3 by ") test.add_log_pattern("Operation 'off' targeting node3 by ") test.add_log_pattern("Operation 'off' targeting node3 by ") def build_fence_no_merge_tests(self): """ Register tests to verify when fence operations should not be merged """ - test = self.new_test("cpg_custom_no_merge", - "Verify differing fencing operations 
are not merged", True) + test = self.new_test("custom_no_merge", + "Verify differing fencing operations are not merged") test.add_cmd("stonith_admin", args="--output-as=xml -R false1 -a fence_dummy -o mode=fail -o pcmk_host_list=node3 node2") test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3 node2") test.add_cmd("stonith_admin", args="--output-as=xml -R false2 -a fence_dummy -o mode=fail -o pcmk_host_list=node3 node2") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false2") test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true1") test.add_cmd("stonith_admin", args="--output-as=xml -F node2 -t 10", no_wait=True) test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 10") test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client", negative=True) def build_standalone_tests(self): - """ Register a grab bag of tests that can be executed in standalone or corosync mode """ - - test_types = [ - { - "prefix" : "standalone", - "use_cpg" : False, - }, - { - "prefix" : "cpg", - "use_cpg" : True, - }, - ] + """ Register a grab bag of tests """ # test what happens when all devices timeout - for test_type in test_types: - test = self.new_test("%s_fence_multi_device_failure" % test_type["prefix"], - "Verify that all devices timeout, a fencing failure is returned.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R false3 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') - if test_type["use_cpg"]: - test.add_cmd("stonith_admin", 
args="--output-as=xml -F node3 -t 2", expected_exitcode=ExitStatus.TIMEOUT) - test.add_log_pattern("Total timeout set to 7s") - else: - test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 2", expected_exitcode=ExitStatus.ERROR) - - test.add_log_pattern("targeting node3 using false1 returned ") - test.add_log_pattern("targeting node3 using false2 returned ") - test.add_log_pattern("targeting node3 using false3 returned ") - - # test what happens when multiple devices can fence a node, but the first device fails. - for test_type in test_types: - test = self.new_test("%s_fence_device_failure_rollover" % test_type["prefix"], - "Verify that when one fence device fails for a node, the others are tried.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") - - if test_type["use_cpg"]: - test.add_log_pattern("Total timeout set to 18s") - - # test what happens when we try to use a missing fence-agent. 
- for test_type in test_types: - test = self.new_test("%s_fence_missing_agent" % test_type["prefix"], - "Verify proper error-handling when using a non-existent fence-agent.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args="--output-as=xml -R true1 -a fence_missing -o mode=pass -o pcmk_host_list=node3") - test.add_cmd("stonith_admin", - args="--output-as=xml -R true2 -a fence_dummy -o mode=pass -o pcmk_host_list=node2") - - test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5", expected_exitcode=ExitStatus.NOSUCH) - test.add_cmd("stonith_admin", args="--output-as=xml -F node2 -t 5") + test = self.new_test("fence_multi_device_failure", + "Verify that all devices timeout, a fencing failure is returned") + test.add_cmd("stonith_admin", + args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R false3 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 2", expected_exitcode=ExitStatus.TIMEOUT) + test.add_log_pattern("Total timeout set to 7s") + test.add_log_pattern("targeting node3 using false1 returned ") + test.add_log_pattern("targeting node3 using false2 returned ") + test.add_log_pattern("targeting node3 using false3 returned ") + + # test what happens when multiple devices can fence a node, but the first device fails + test = self.new_test("fence_device_failure_rollover", + "Verify that when one fence device fails for a node, the others are tried") + test.add_cmd("stonith_admin", + args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + 
test.add_cmd("stonith_admin", + args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") + test.add_log_pattern("Total timeout set to 18s") - # simple topology test for one device - for test_type in test_types: - if not test_type["use_cpg"]: - continue + # test what happens when we try to use a missing fence-agent + test = self.new_test("fence_missing_agent", + "Verify proper error-handling when using a non-existent fence-agent") + test.add_cmd("stonith_admin", + args="--output-as=xml -R true1 -a fence_missing -o mode=pass -o pcmk_host_list=node3") + test.add_cmd("stonith_admin", + args="--output-as=xml -R true2 -a fence_dummy -o mode=pass -o pcmk_host_list=node2") - test = self.new_test("%s_topology_simple" % test_type["prefix"], - "Verify all fencing devices at a level are used.", test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R true -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true") - test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") + test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5", expected_exitcode=ExitStatus.NOSUCH) + test.add_cmd("stonith_admin", args="--output-as=xml -F node2 -t 5") - test.add_log_pattern("Total timeout set to 6s") - test.add_log_pattern("targeting node3 using true returned 0") + # simple topology test for one device + test = self.new_test("topology_simple", + "Verify all fencing devices at a level are used") + test.add_cmd("stonith_admin", + args='--output-as=xml -R true -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true") + test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") + test.add_log_pattern("Total timeout set to 6s") + test.add_log_pattern("targeting 
node3 using true returned 0") # add topology, delete topology, verify fencing still works - for test_type in test_types: - if not test_type["use_cpg"]: - continue - - test = self.new_test("%s_topology_add_remove" % test_type["prefix"], - "Verify fencing occurrs after all topology levels are removed", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R true -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true") - test.add_cmd("stonith_admin", args="--output-as=xml -d node3 -i 1") - test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") - - test.add_log_pattern("Total timeout set to 6s") - test.add_log_pattern("targeting node3 using true returned 0") - - # test what happens when the first fencing level has multiple devices. - for test_type in test_types: - if not test_type["use_cpg"]: - continue - - test = self.new_test("%s_topology_device_fails" % test_type["prefix"], - "Verify if one device in a level fails, the other is tried.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R false -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R true -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true") - test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 20") - - test.add_log_pattern("Total timeout set to 48s") - test.add_log_pattern("targeting node3 using false returned 1") - test.add_log_pattern("targeting node3 using true returned 0") - - # test what happens when the first fencing level fails. 
- for test_type in test_types: - if not test_type["use_cpg"]: - continue - - test = self.new_test("%s_topology_multi_level_fails" % test_type["prefix"], - "Verify if one level fails, the next leve is tried.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R true4 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') - - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true1") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true2") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v false2") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true3") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true4") - - test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 3") - - test.add_log_pattern("Total timeout set to 21s") - test.add_log_pattern("targeting node3 using false1 returned 1") - test.add_log_pattern("targeting node3 using false2 returned 1") - test.add_log_pattern("targeting node3 using true3 returned 0") - test.add_log_pattern("targeting node3 using true4 returned 0") + test = self.new_test("topology_add_remove", + "Verify fencing occurrs after all topology levels are 
removed") + test.add_cmd("stonith_admin", + args='--output-as=xml -R true -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true") + test.add_cmd("stonith_admin", args="--output-as=xml -d node3 -i 1") + test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") + + test.add_log_pattern("Total timeout set to 6s") + test.add_log_pattern("targeting node3 using true returned 0") + + # test what happens when the first fencing level has multiple devices + test = self.new_test("topology_device_fails", + "Verify if one device in a level fails, the other is tried") + test.add_cmd("stonith_admin", + args='--output-as=xml -R false -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R true -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true") + test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 20") + test.add_log_pattern("Total timeout set to 48s") + test.add_log_pattern("targeting node3 using false returned 1") + test.add_log_pattern("targeting node3 using true returned 0") + + # test what happens when the first fencing level fails + test = self.new_test("topology_multi_level_fails", + "Verify if one level fails, the next leve is tried") + test.add_cmd("stonith_admin", + args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R true4 -a fence_dummy -o mode=pass -o 
"pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') + + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true1") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true2") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v false2") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true3") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true4") + + test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 3") + + test.add_log_pattern("Total timeout set to 21s") + test.add_log_pattern("targeting node3 using false1 returned 1") + test.add_log_pattern("targeting node3 using false2 returned 1") + test.add_log_pattern("targeting node3 using true3 returned 0") + test.add_log_pattern("targeting node3 using true4 returned 0") # test what happens when the first fencing level had devices that no one has registered - for test_type in test_types: - if not test_type["use_cpg"]: - continue - - test = self.new_test("%s_topology_missing_devices" % test_type["prefix"], - "Verify topology can continue with missing devices.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R true4 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o 
"pcmk_host_list=node1 node2 node3"') - - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true1") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true2") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v false2") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true3") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true4") - - test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") - - # Test what happens if multiple fencing levels are defined, and then the first one is removed. - for test_type in test_types: - if not test_type["use_cpg"]: - continue - - test = self.new_test("%s_topology_level_removal" % test_type["prefix"], - "Verify level removal works.", test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R true4 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') - - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true1") - - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true2") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 
-v false2") - - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true3") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true4") - - # Now remove level 2, verify none of the devices in level two are hit. - test.add_cmd("stonith_admin", args="--output-as=xml -d node3 -i 2") - - test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 20") - - test.add_log_pattern("Total timeout set to 96s") - test.add_log_pattern("targeting node3 using false1 returned 1") - test.add_log_pattern("targeting node3 using false2 returned ", - negative=True) - test.add_log_pattern("targeting node3 using true3 returned 0") - test.add_log_pattern("targeting node3 using true4 returned 0") - - # Test targeting a topology level by node name pattern. - for test_type in test_types: - if not test_type["use_cpg"]: - continue - - test = self.new_test("%s_topology_level_pattern" % test_type["prefix"], - "Verify targeting topology by node name pattern works.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R true -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", args="--output-as=xml -r '@node.*' -i 1 -v true") - test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") - test.add_log_pattern("targeting node3 using true returned 0") + test = self.new_test("topology_missing_devices", + "Verify topology can continue with missing devices") + test.add_cmd("stonith_admin", + args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R true4 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') 
+ + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true1") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true2") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v false2") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true3") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true4") + + test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") + + # Test what happens if multiple fencing levels are defined, and then the first one is removed + test = self.new_test("topology_level_removal", + "Verify level removal works") + test.add_cmd("stonith_admin", + args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R true4 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3"') + + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true1") + + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true2") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v false2") + + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 -v true3") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 3 
-v true4") + + # Now remove level 2, verify none of the devices in level two are hit + test.add_cmd("stonith_admin", args="--output-as=xml -d node3 -i 2") + + test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 20") + + test.add_log_pattern("Total timeout set to 96s") + test.add_log_pattern("targeting node3 using false1 returned 1") + test.add_log_pattern("targeting node3 using false2 returned ", + negative=True) + test.add_log_pattern("targeting node3 using true3 returned 0") + test.add_log_pattern("targeting node3 using true4 returned 0") + + # Test targeting a topology level by node name pattern + test = self.new_test("topology_level_pattern", + "Verify targeting topology by node name pattern works") + test.add_cmd("stonith_admin", + args='--output-as=xml -R true -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", args="--output-as=xml -r '@node.*' -i 1 -v true") + test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5") + test.add_log_pattern("targeting node3 using true returned 0") # test allowing commas and semicolons as delimiters in pcmk_host_list - for test_type in test_types: - test = self.new_test("%s_host_list_delimiters" % test_type["prefix"], - "Verify commas and semicolons can be used as pcmk_host_list delimiters", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1,node2,node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=pcmk1;pcmk2;pcmk3"') - test.add_cmd("stonith_admin", args="stonith_admin --output-as=xml -F node2 -t 5") - test.add_cmd("stonith_admin", args="stonith_admin --output-as=xml -F pcmk3 -t 5") - test.add_log_pattern("targeting node2 using true1 returned 0") - test.add_log_pattern("targeting pcmk3 using true2 returned 0") - - # test the stonith builds the correct list of devices that can fence a node. 
- for test_type in test_types: - test = self.new_test("%s_list_devices" % test_type["prefix"], - "Verify list of devices that can fence a node is correct", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", - args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", args="--output-as=xml -l node1 -V", - stdout_match="true2", stdout_no_match="true1") - test.add_cmd("stonith_admin", args="--output-as=xml -l node1 -V", - stdout_match="true3", stdout_no_match="true1") + test = self.new_test("host_list_delimiters", + "Verify commas and semicolons can be used as pcmk_host_list delimiters") + test.add_cmd("stonith_admin", + args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1,node2,node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=pcmk1;pcmk2;pcmk3"') + test.add_cmd("stonith_admin", args="stonith_admin --output-as=xml -F node2 -t 5") + test.add_cmd("stonith_admin", args="stonith_admin --output-as=xml -F pcmk3 -t 5") + test.add_log_pattern("targeting node2 using true1 returned 0") + test.add_log_pattern("targeting pcmk3 using true2 returned 0") + + # test the stonith builds the correct list of devices that can fence a node + test = self.new_test("list_devices", + "Verify list of devices that can fence a node is correct") + test.add_cmd("stonith_admin", + args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", + args='--output-as=xml -R true3 -a 
fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", args="--output-as=xml -l node1 -V", + stdout_match="true2", stdout_no_match="true1") + test.add_cmd("stonith_admin", args="--output-as=xml -l node1 -V", + stdout_match="true3", stdout_no_match="true1") # simple test of device monitor - for test_type in test_types: - test = self.new_test("%s_monitor" % test_type["prefix"], - "Verify device is reachable", test_type["use_cpg"]) - test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node3"') - test.add_cmd("stonith_admin", args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node3"') + test = self.new_test("monitor", "Verify device is reachable") + test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node3"') + test.add_cmd("stonith_admin", args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node3"') - test.add_cmd("stonith_admin", args="--output-as=xml -Q true1") - test.add_cmd("stonith_admin", args="--output-as=xml -Q false1") - test.add_cmd("stonith_admin", args="--output-as=xml -Q true2", expected_exitcode=ExitStatus.NOSUCH) + test.add_cmd("stonith_admin", args="--output-as=xml -Q true1") + test.add_cmd("stonith_admin", args="--output-as=xml -Q false1") + test.add_cmd("stonith_admin", args="--output-as=xml -Q true2", expected_exitcode=ExitStatus.NOSUCH) # Verify monitor occurs for duration of timeout period on failure - for test_type in test_types: - test = self.new_test("%s_monitor_timeout" % test_type["prefix"], - "Verify monitor uses duration of timeout period given.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R true1 -a fence_dummy -o mode=fail -o monitor_mode=fail -o pcmk_host_list=node3') - test.add_cmd("stonith_admin", args="--output-as=xml -Q true1 -t 5", expected_exitcode=ExitStatus.ERROR) - 
test.add_log_pattern("Attempt 2 to execute") + test = self.new_test("monitor_timeout", + "Verify monitor uses duration of timeout period given") + test.add_cmd("stonith_admin", + args='--output-as=xml -R true1 -a fence_dummy -o mode=fail -o monitor_mode=fail -o pcmk_host_list=node3') + test.add_cmd("stonith_admin", args="--output-as=xml -Q true1 -t 5", expected_exitcode=ExitStatus.ERROR) + test.add_log_pattern("Attempt 2 to execute") # Verify monitor occurs for duration of timeout period on failure, but stops at max retries - for test_type in test_types: - test = self.new_test("%s_monitor_timeout_max_retries" % test_type["prefix"], - "Verify monitor retries until max retry value or timeout is hit.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R true1 -a fence_dummy -o mode=fail -o monitor_mode=fail -o pcmk_host_list=node3') - test.add_cmd("stonith_admin", args="--output-as=xml -Q true1 -t 15", expected_exitcode=ExitStatus.ERROR) - test.add_log_pattern("Attempted to execute agent fence_dummy (list) the maximum number of times") + test = self.new_test("monitor_timeout_max_retries", + "Verify monitor retries until max retry value or timeout is hit") + test.add_cmd("stonith_admin", + args='--output-as=xml -R true1 -a fence_dummy -o mode=fail -o monitor_mode=fail -o pcmk_host_list=node3') + test.add_cmd("stonith_admin", args="--output-as=xml -Q true1 -t 15", expected_exitcode=ExitStatus.ERROR) + test.add_log_pattern("Attempted to execute agent fence_dummy (list) the maximum number of times") # simple register test - for test_type in test_types: - test = self.new_test("%s_register" % test_type["prefix"], - "Verify devices can be registered and un-registered", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3') - test.add_cmd("stonith_admin", args="--output-as=xml -Q true1") - test.add_cmd("stonith_admin", args="--output-as=xml -D true1") - 
test.add_cmd("stonith_admin", args="--output-as=xml -Q true1", expected_exitcode=ExitStatus.NOSUCH) + test = self.new_test("register", + "Verify devices can be registered and un-registered") + test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3') + test.add_cmd("stonith_admin", args="--output-as=xml -Q true1") + test.add_cmd("stonith_admin", args="--output-as=xml -D true1") + test.add_cmd("stonith_admin", args="--output-as=xml -Q true1", expected_exitcode=ExitStatus.NOSUCH) # simple reboot test - for test_type in test_types: - test = self.new_test("%s_reboot" % test_type["prefix"], - "Verify devices can be rebooted", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3') - test.add_cmd("stonith_admin", args="--output-as=xml -B node3 -t 5") - test.add_cmd("stonith_admin", args="--output-as=xml -D true1") - test.add_cmd("stonith_admin", args="--output-as=xml -Q true1", expected_exitcode=ExitStatus.NOSUCH) - - # test fencing history. 
- for test_type in test_types: - if not test_type["use_cpg"]: - continue - test = self.new_test("%s_fence_history" % test_type["prefix"], - "Verify last fencing operation is returned.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3') - test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5 -V") - test.add_cmd("stonith_admin", args="--output-as=xml -H node3", - stdout_match='action="off" target="node3" .* status="success"') + test = self.new_test("reboot", "Verify devices can be rebooted") + test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3') + test.add_cmd("stonith_admin", args="--output-as=xml -B node3 -t 5") + test.add_cmd("stonith_admin", args="--output-as=xml -D true1") + test.add_cmd("stonith_admin", args="--output-as=xml -Q true1", expected_exitcode=ExitStatus.NOSUCH) + + # test fencing history + test = self.new_test("fence_history", + "Verify last fencing operation is returned") + test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=node3') + test.add_cmd("stonith_admin", args="--output-as=xml -F node3 -t 5 -V") + test.add_cmd("stonith_admin", args="--output-as=xml -H node3", + stdout_match='action="off" target="node3" .* status="success"') # simple test of dynamic list query - for test_type in test_types: - test = self.new_test("%s_dynamic_list_query" % test_type["prefix"], - "Verify dynamic list of fencing devices can be retrieved.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") - test.add_cmd("stonith_admin", args="--output-as=xml -R true2 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") - test.add_cmd("stonith_admin", args="--output-as=xml -R true3 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") + test = 
self.new_test("dynamic_list_query", + "Verify dynamic list of fencing devices can be retrieved") + test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") + test.add_cmd("stonith_admin", args="--output-as=xml -R true2 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") + test.add_cmd("stonith_admin", args="--output-as=xml -R true3 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") - test.add_cmd("stonith_admin", args="--output-as=xml -l fake_port_1", - stdout_match='count="3"') + test.add_cmd("stonith_admin", args="--output-as=xml -l fake_port_1", + stdout_match='count="3"') # fence using dynamic list query - for test_type in test_types: - test = self.new_test("%s_fence_dynamic_list_query" % test_type["prefix"], - "Verify dynamic list of fencing devices can be retrieved.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") - test.add_cmd("stonith_admin", args="--output-as=xml -R true2 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") - test.add_cmd("stonith_admin", args="--output-as=xml -R true3 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") + test = self.new_test("fence_dynamic_list_query", + "Verify dynamic list of fencing devices can be retrieved") + test.add_cmd("stonith_admin", args="--output-as=xml -R true1 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") + test.add_cmd("stonith_admin", args="--output-as=xml -R true2 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") + test.add_cmd("stonith_admin", args="--output-as=xml -R true3 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") - test.add_cmd("stonith_admin", args="--output-as=xml -F fake_port_1 -t 5 -V") + test.add_cmd("stonith_admin", args="--output-as=xml -F fake_port_1 -t 5 -V") # simple test of query using status action - for test_type in 
test_types: - test = self.new_test("%s_status_query" % test_type["prefix"], - "Verify dynamic list of fencing devices can be retrieved.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_check=status') - test.add_cmd("stonith_admin", args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o pcmk_host_check=status') - test.add_cmd("stonith_admin", args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o pcmk_host_check=status') + test = self.new_test("status_query", + "Verify dynamic list of fencing devices can be retrieved") + test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_check=status') + test.add_cmd("stonith_admin", args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o pcmk_host_check=status') + test.add_cmd("stonith_admin", args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o pcmk_host_check=status') - test.add_cmd("stonith_admin", args="--output-as=xml -l fake_port_1", - stdout_match='count="3"') + test.add_cmd("stonith_admin", args="--output-as=xml -l fake_port_1", + stdout_match='count="3"') # test what happens when no reboot action is advertised - for test_type in test_types: - test = self.new_test("%s_no_reboot_support" % test_type["prefix"], - "Verify reboot action defaults to off when no reboot action is advertised by agent.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R true1 -a fence_dummy_no_reboot -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", args="--output-as=xml -B node1 -t 5 -V") - test.add_log_pattern("does not support reboot") - test.add_log_pattern("using true1 returned 0") + test = self.new_test("no_reboot_support", + "Verify reboot action defaults to off when no reboot action is advertised by agent") + test.add_cmd("stonith_admin", + args='--output-as=xml -R true1 -a fence_dummy_no_reboot -o mode=pass -o 
"pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", args="--output-as=xml -B node1 -t 5 -V") + test.add_log_pattern("does not support reboot") + test.add_log_pattern("using true1 returned 0") # make sure reboot is used when reboot action is advertised - for test_type in test_types: - test = self.new_test("%s_with_reboot_support" % test_type["prefix"], - "Verify reboot action can be used when metadata advertises it.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - test.add_cmd("stonith_admin", args="--output-as=xml -B node1 -t 5 -V") - test.add_log_pattern("does not advertise support for 'reboot', performing 'off'", - negative=True) - test.add_log_pattern("using true1 returned 0") + test = self.new_test("with_reboot_support", + "Verify reboot action can be used when metadata advertises it") + test.add_cmd("stonith_admin", + args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + test.add_cmd("stonith_admin", args="--output-as=xml -B node1 -t 5 -V") + test.add_log_pattern("does not advertise support for 'reboot', performing 'off'", + negative=True) + test.add_log_pattern("using true1 returned 0") # make sure all fencing delays are applied correctly and taken into account by fencing timeouts with topology - for test_type in test_types: - if not test_type["use_cpg"]: - continue - - test = self.new_test("%s_topology_delays" % test_type["prefix"], - "Verify all fencing delays are applied correctly and taken into account by fencing timeouts with topology.", - test_type["use_cpg"]) - test.add_cmd("stonith_admin", - args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3" -o pcmk_delay_base=1') - test.add_cmd("stonith_admin", - args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3" -o pcmk_delay_base=1') - # Resulting 
"random" delay will always be 1 since (rand() % (delay_max - delay_base)) is always 0 here. - test.add_cmd("stonith_admin", - args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3" -o pcmk_delay_base=1 -o pcmk_delay_max=2') - test.add_cmd("stonith_admin", - args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') - - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true1") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true2") - test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true3") - - test.add_cmd("stonith_admin", args="--output-as=xml -F node3 --delay 1") - - # Total fencing timeout takes all fencing delays into account. - test.add_log_pattern("Total timeout set to 582s") - - # Fencing timeout for the first device takes the requested fencing delay into account. - # Fencing timeout also takes pcmk_delay_base into account. - test.add_log_pattern(r"Requesting that .* perform 'off' action targeting node3 using true1 .*146s.*", - regex=True) - # Requested fencing delay is applied only for the first device in the first level. - # Static delay from pcmk_delay_base is added. - test.add_log_pattern("Delaying 'off' action targeting node3 using true1 for 2s | timeout=120s requested_delay=1s base=1s max=1s") - - # Fencing timeout no longer takes the requested fencing delay into account for further devices. - test.add_log_pattern(r"Requesting that .* perform 'off' action targeting node3 using false1 .*145s.*", - regex=True) - # Requested fencing delay is no longer applied for further devices. - test.add_log_pattern("Delaying 'off' action targeting node3 using false1 for 1s | timeout=120s requested_delay=0s base=1s max=1s") - - # Fencing timeout takes pcmk_delay_max into account. 
- test.add_log_pattern(r"Requesting that .* perform 'off' action targeting node3 using true2 .*146s.*", - regex=True) - test.add_log_pattern("Delaying 'off' action targeting node3 using true2 for 1s | timeout=120s requested_delay=0s base=1s max=2s") - - test.add_log_pattern("Delaying 'off' action targeting node3 using true3", - negative=True) + test = self.new_test("topology_delays", + "Verify all fencing delays are applied correctly and taken into account by fencing timeouts with topology") + test.add_cmd("stonith_admin", + args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3" -o pcmk_delay_base=1') + test.add_cmd("stonith_admin", + args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=node1 node2 node3" -o pcmk_delay_base=1') + # Resulting "random" delay will always be 1 since (rand() % (delay_max - delay_base)) is always 0 here + test.add_cmd("stonith_admin", + args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3" -o pcmk_delay_base=1 -o pcmk_delay_max=2') + test.add_cmd("stonith_admin", + args='--output-as=xml -R true3 -a fence_dummy -o mode=pass -o "pcmk_host_list=node1 node2 node3"') + + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v true1") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 1 -v false1") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true2") + test.add_cmd("stonith_admin", args="--output-as=xml -r node3 -i 2 -v true3") + + test.add_cmd("stonith_admin", args="--output-as=xml -F node3 --delay 1") + + # Total fencing timeout takes all fencing delays into account + test.add_log_pattern("Total timeout set to 582s") + + # Fencing timeout for the first device takes the requested fencing delay + # and pcmk_delay_base into account + test.add_log_pattern(r"Requesting that .* perform 'off' action targeting node3 using true1 .*146s.*", + regex=True) + # Requested fencing delay is 
applied only for the first device in the + # first level, with the static delay from pcmk_delay_base added + test.add_log_pattern("Delaying 'off' action targeting node3 using true1 for 2s | timeout=120s requested_delay=1s base=1s max=1s") + + # Fencing timeout no longer takes the requested fencing delay into account for further devices + test.add_log_pattern(r"Requesting that .* perform 'off' action targeting node3 using false1 .*145s.*", + regex=True) + # Requested fencing delay is no longer applied for further devices + test.add_log_pattern("Delaying 'off' action targeting node3 using false1 for 1s | timeout=120s requested_delay=0s base=1s max=1s") + + # Fencing timeout takes pcmk_delay_max into account + test.add_log_pattern(r"Requesting that .* perform 'off' action targeting node3 using true2 .*146s.*", + regex=True) + test.add_log_pattern("Delaying 'off' action targeting node3 using true2 for 1s | timeout=120s requested_delay=0s base=1s max=2s") + + test.add_log_pattern("Delaying 'off' action targeting node3 using true3", + negative=True) def build_nodeid_tests(self): """ Register tests that use a corosync node id """ our_uname = localname() ### verify nodeid is supplied when nodeid is in the metadata parameters - test = self.new_test("cpg_supply_nodeid", - "Verify nodeid is given when fence agent has nodeid as parameter", True) + test = self.new_test("supply_nodeid", + "Verify nodeid is given when fence agent has nodeid as parameter") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=%s"' % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -F %s -t 3" % our_uname) test.add_log_pattern("as nodeid with fence action 'off' targeting %s" % (our_uname)) ### verify nodeid is _NOT_ supplied when nodeid is not in the metadata parameters - test = self.new_test("cpg_do_not_supply_nodeid", - "Verify nodeid is _NOT_ given when fence agent does not have nodeid as parameter", - True) + test = 
self.new_test("do_not_supply_nodeid", + "Verify nodeid is _NOT_ given when fence agent does not have nodeid as parameter") # use a host name that won't be in corosync.conf test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o pcmk_host_list=regr-test') test.add_cmd("stonith_admin", args="--output-as=xml -F regr-test -t 3") test.add_log_pattern("as nodeid with fence action 'off' targeting regr-test", negative=True) - ### verify nodeid use doesn't explode standalone mode - test = self.new_test("standalone_do_not_supply_nodeid", - "Verify nodeid in metadata parameter list doesn't kill standalone mode", - False) - test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=%s"' % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -F %s -t 3" % our_uname) test.add_log_pattern("as nodeid with fence action 'off' targeting %s" % our_uname, negative=True) def build_unfence_tests(self): """ Register tests that verify unfencing """ our_uname = localname() ### verify unfencing using automatic unfencing - test = self.new_test("cpg_unfence_required_1", - "Verify require unfencing on all devices when automatic=true in agent's metadata", - True) + test = self.new_test("unfence_required_1", + "Verify require unfencing on all devices when automatic=true in agent's metadata") test.add_cmd('stonith_admin', args='--output-as=xml -R true1 -a fence_dummy_auto_unfence -o mode=pass -o "pcmk_host_list=%s"' % our_uname) test.add_cmd('stonith_admin', args='--output-as=xml -R true2 -a fence_dummy_auto_unfence -o mode=pass -o "pcmk_host_list=%s"' % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -U %s -t 3" % our_uname) # both devices should be executed test.add_log_pattern("using true1 returned 0") test.add_log_pattern("using true2 returned 0") ### verify unfencing using automatic unfencing fails if any of the required agents fail - test = self.new_test("cpg_unfence_required_2", - 
"Verify require unfencing on all devices when automatic=true in agent's metadata", - True) + test = self.new_test("unfence_required_2", + "Verify require unfencing on all devices when automatic=true in agent's metadata") test.add_cmd('stonith_admin', args='--output-as=xml -R true1 -a fence_dummy_auto_unfence -o mode=pass -o "pcmk_host_list=%s"' % our_uname) test.add_cmd('stonith_admin', args='--output-as=xml -R true2 -a fence_dummy_auto_unfence -o mode=fail -o "pcmk_host_list=%s"' % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -U %s -t 6" % our_uname, expected_exitcode=ExitStatus.ERROR) ### verify unfencing using automatic devices with topology - test = self.new_test("cpg_unfence_required_3", - "Verify require unfencing on all devices even when at different topology levels", - True) + test = self.new_test("unfence_required_3", + "Verify require unfencing on all devices even when at different topology levels") test.add_cmd('stonith_admin', args='--output-as=xml -R true1 -a fence_dummy_auto_unfence -o mode=pass -o "pcmk_host_list=%s node3"' % our_uname) test.add_cmd('stonith_admin', args='--output-as=xml -R true2 -a fence_dummy_auto_unfence -o mode=pass -o "pcmk_host_list=%s node3"' % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -r %s -i 1 -v true1" % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -r %s -i 2 -v true2" % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -U %s -t 3" % our_uname) test.add_log_pattern("using true1 returned 0") test.add_log_pattern("using true2 returned 0") ### verify unfencing using automatic devices with topology - test = self.new_test("cpg_unfence_required_4", - "Verify all required devices are executed even with topology levels fail.", - True) + test = self.new_test("unfence_required_4", + "Verify all required devices are executed even with topology levels fail") test.add_cmd('stonith_admin', args='--output-as=xml -R true1 -a fence_dummy_auto_unfence -o mode=pass -o 
"pcmk_host_list=%s node3"' % our_uname) test.add_cmd('stonith_admin', args='--output-as=xml -R true2 -a fence_dummy_auto_unfence -o mode=pass -o "pcmk_host_list=%s node3"' % our_uname) test.add_cmd('stonith_admin', args='--output-as=xml -R true3 -a fence_dummy_auto_unfence -o mode=pass -o "pcmk_host_list=%s node3"' % our_uname) test.add_cmd('stonith_admin', args='--output-as=xml -R true4 -a fence_dummy_auto_unfence -o mode=pass -o "pcmk_host_list=%s node3"' % our_uname) test.add_cmd('stonith_admin', args='--output-as=xml -R false1 -a fence_dummy -o mode=fail -o "pcmk_host_list=%s node3"' % our_uname) test.add_cmd('stonith_admin', args='--output-as=xml -R false2 -a fence_dummy -o mode=fail -o "pcmk_host_list=%s node3"' % our_uname) test.add_cmd('stonith_admin', args='--output-as=xml -R false3 -a fence_dummy -o mode=fail -o "pcmk_host_list=%s node3"' % our_uname) test.add_cmd('stonith_admin', args='--output-as=xml -R false4 -a fence_dummy -o mode=fail -o "pcmk_host_list=%s node3"' % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -r %s -i 1 -v true1" % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -r %s -i 1 -v false1" % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -r %s -i 2 -v false2" % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -r %s -i 2 -v true2" % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -r %s -i 2 -v false3" % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -r %s -i 2 -v true3" % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -r %s -i 3 -v false4" % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -r %s -i 4 -v true4" % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -U %s -t 3" % our_uname) test.add_log_pattern("using true1 returned 0") test.add_log_pattern("using true2 returned 0") test.add_log_pattern("using true3 returned 0") test.add_log_pattern("using true4 returned 0") def 
build_unfence_on_target_tests(self): """ Register tests that verify unfencing that runs on the target """ our_uname = localname() ### verify unfencing using on_target device - test = self.new_test("cpg_unfence_on_target_1", - "Verify unfencing with on_target = true", True) + test = self.new_test("unfence_on_target_1", + "Verify unfencing with on_target = true") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=%s"' % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -U %s -t 3" % our_uname) test.add_log_pattern("(on) to be executed on target") ### verify failure of unfencing using on_target device - test = self.new_test("cpg_unfence_on_target_2", - "Verify failure unfencing with on_target = true", - True) + test = self.new_test("unfence_on_target_2", + "Verify failure unfencing with on_target = true") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=%s node_fake_1234"' % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -U node_fake_1234 -t 3", expected_exitcode=ExitStatus.NOSUCH) test.add_log_pattern("(on) to be executed on target") ### verify unfencing using on_target device with topology - test = self.new_test("cpg_unfence_on_target_3", - "Verify unfencing with on_target = true using topology", - True) + test = self.new_test("unfence_on_target_3", + "Verify unfencing with on_target = true using topology") test.add_cmd("stonith_admin", args='--output-as=xml -R true1 -a fence_dummy -o mode=pass -o "pcmk_host_list=%s node3"' % our_uname) test.add_cmd("stonith_admin", args='--output-as=xml -R true2 -a fence_dummy -o mode=pass -o "pcmk_host_list=%s node3"' % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -r %s -i 1 -v true1" % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -r %s -i 2 -v true2" % our_uname) test.add_cmd("stonith_admin", args="--output-as=xml -U %s -t 3" % our_uname) 
def build_remap_tests(self):
    """ Register tests that verify remapping of reboots to off-on """

    # Log messages that recur across the tests below
    remapping = "Remapping multiple-device reboot targeting node_fake"
    # timeout should be sum of off timeouts (1+2=3), not reboot timeouts (10+20=30)
    total_timeout = "Total timeout set to 3s for peer's fencing targeting node_fake"
    off_complete = "Remapped 'off' targeting node_fake complete, remapping to 'on'"
    undoing = "Undoing remap of reboot targeting node_fake"

    def admin(test, *arg_strings):
        """ Queue one stonith_admin invocation per argument string, in order """
        for arg_string in arg_strings:
            test.add_cmd("stonith_admin", args=arg_string)

    def expect(test, *patterns):
        """ Require each pattern to show up in the log, in order """
        for pattern in patterns:
            test.add_log_pattern(pattern)

    def register(device, mode):
        """ Arguments registering a fence_dummy device that targets node_fake """
        return ('--output-as=xml -R %s -a fence_dummy -o mode=%s '
                '-o pcmk_host_list=node_fake' % (device, mode))

    def performs(action, device):
        """ Log message for an action executed against node_fake using a device """
        return "perform '%s' action targeting node_fake using %s" % (action, device)

    def ignored_on(device):
        """ Log message for an ignored 'on' failure of a device """
        return "Ignoring %s 'on' failure (no capable peers) targeting node_fake" % device

    test = self.new_test("remap_simple",
                         "Verify sequential topology reboot is remapped to all-off-then-all-on")
    admin(test,
          register("true1", "pass") + " -o pcmk_off_timeout=1 -o pcmk_reboot_timeout=10",
          register("true2", "pass") + " -o pcmk_off_timeout=2 -o pcmk_reboot_timeout=20",
          "--output-as=xml -r node_fake -i 1 -v true1 -v true2",
          "--output-as=xml -B node_fake -t 5")
    expect(test,
           remapping,
           total_timeout,
           performs("off", "true1"),
           performs("off", "true2"),
           off_complete,
           # fence_dummy sets "on" as an on_target action
           ignored_on("true1"),
           ignored_on("true2"),
           undoing)

    test = self.new_test("remap_simple_off",
                         "Verify sequential topology reboot skips 'on' if "
                         "pcmk_reboot_action=off or agent doesn't support "
                         "'on'")
    admin(test,
          register("true1", "pass")
          + " -o pcmk_off_timeout=1 -o pcmk_reboot_timeout=10 -o pcmk_reboot_action=off",
          "--output-as=xml -R true2 -a fence_dummy_no_on "
          "-o mode=pass -o pcmk_host_list=node_fake "
          "-o pcmk_off_timeout=2 -o pcmk_reboot_timeout=20",
          "--output-as=xml -r node_fake -i 1 -v true1 -v true2",
          "--output-as=xml -B node_fake -t 5")
    expect(test,
           remapping,
           total_timeout,
           performs("off", "true1"),
           performs("off", "true2"),
           off_complete,
           # "on" should be skipped
           "Not turning node_fake back on using "
           "true1 because the device is configured "
           "to stay off",
           "Not turning node_fake back on using true2"
           " because the agent doesn't support 'on'",
           undoing)

    test = self.new_test("remap_automatic",
                         "Verify remapped topology reboot skips automatic 'on'")
    admin(test,
          '--output-as=xml -R true1 -a fence_dummy_auto_unfence '
          '-o mode=pass -o pcmk_host_list=node_fake',
          '--output-as=xml -R true2 -a fence_dummy_auto_unfence '
          '-o "mode=pass" -o pcmk_host_list=node_fake',
          "--output-as=xml -r node_fake -i 1 -v true1 -v true2",
          "--output-as=xml -B node_fake -t 5")
    expect(test,
           remapping,
           performs("off", "true1"),
           performs("off", "true2"),
           off_complete,
           undoing)
    # Neither an executed 'on' nor a reported 'on' failure may appear
    test.add_log_pattern("perform 'on' action targeting node_fake using", negative=True)
    test.add_log_pattern("'on' failure", negative=True)

    test = self.new_test("remap_complex_1",
                         "Verify remapped topology reboot in second level works if non-remapped first level fails")
    admin(test,
          register("false1", "fail"),
          register("true1", "pass"),
          register("true2", "pass"),
          "--output-as=xml -r node_fake -i 1 -v false1",
          "--output-as=xml -r node_fake -i 2 -v true1 -v true2",
          "--output-as=xml -B node_fake -t 5")
    expect(test,
           performs("reboot", "false1"),
           remapping,
           performs("off", "true1"),
           performs("off", "true2"),
           off_complete,
           ignored_on("true1"),
           ignored_on("true2"),
           undoing)

    test = self.new_test("remap_complex_2",
                         "Verify remapped topology reboot failure in second level proceeds to third level")
    admin(test,
          register("false1", "fail"),
          register("false2", "fail"),
          register("true1", "pass"),
          register("true2", "pass"),
          register("true3", "pass"),
          "--output-as=xml -r node_fake -i 1 -v false1",
          "--output-as=xml -r node_fake -i 2 -v true1 -v false2 -v true3",
          "--output-as=xml -r node_fake -i 3 -v true2",
          "--output-as=xml -B node_fake -t 5")
    expect(test,
           performs("reboot", "false1"),
           remapping,
           performs("off", "true1"),
           performs("off", "false2"),
           "Attempted to execute agent fence_dummy (off) the maximum number of times",
           undoing,
           performs("reboot", "true2"))
    test.add_log_pattern("node_fake with true3", negative=True)
*option_name, const gchar *optarg, gpointer data, GError **error) { if (pcmk__str_any_of(option_name, "--mainloop_api_test", "-m", NULL)) { options.mode = test_api_mainloop; } else if (pcmk__str_any_of(option_name, "--api_test", "-t", NULL)) { options.mode = test_api_sanity; } else if (pcmk__str_any_of(option_name, "--passive", "-p", NULL)) { options.mode = test_passive; } return TRUE; } static GOptionEntry entries[] = { { "mainloop_api_test", 'm', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, mode_cb, NULL, NULL, }, { "api_test", 't', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, mode_cb, NULL, NULL, }, { "passive", 'p', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, mode_cb, NULL, NULL, }, { NULL } }; static stonith_t *st = NULL; static struct pollfd pollfd; static const int st_opts = st_opt_sync_call; static int expected_notifications = 0; static int verbose = 0; static void mainloop_test_done(const char *origin, bool pass) { if (pass) { crm_info("SUCCESS - %s", origin); mainloop_iter++; mainloop_set_trigger(trig); result.execution_status = PCMK_EXEC_DONE; result.exit_status = CRM_EX_OK; } else { crm_err("FAILURE - %s (%d: %s)", origin, result.exit_status, pcmk_exec_status_str(result.execution_status)); crm_exit(CRM_EX_ERROR); } } static void dispatch_helper(int timeout) { int rc; crm_debug("Looking for notification"); pollfd.events = POLLIN; while (true) { rc = poll(&pollfd, 1, timeout); /* wait 10 minutes, -1 forever */ if (rc > 0) { if (!stonith_dispatch(st)) { break; } } else { break; } } } static void st_callback(stonith_t * st, stonith_event_t * e) { char *desc = NULL; if (st->state == stonith_disconnected) { crm_exit(CRM_EX_DISCONNECT); } desc = stonith__event_description(e); crm_notice("%s", desc); free(desc); if (expected_notifications) { expected_notifications--; } } static void st_global_callback(stonith_t * stonith, stonith_callback_data_t * data) { crm_notice("Call %d exited %d: %s (%s)", data->call_id, stonith__exit_status(data), 
stonith__execution_status(data), pcmk__s(stonith__exit_reason(data), "unspecified reason")); } static void passive_test(void) { int rc = 0; rc = st->cmds->connect(st, crm_system_name, &pollfd.fd); if (rc != pcmk_ok) { stonith_api_delete(st); crm_exit(CRM_EX_DISCONNECT); } st->cmds->register_notification(st, PCMK__VALUE_ST_NOTIFY_DISCONNECT, st_callback); st->cmds->register_notification(st, PCMK__VALUE_ST_NOTIFY_FENCE, st_callback); st->cmds->register_notification(st, STONITH_OP_DEVICE_ADD, st_callback); st->cmds->register_notification(st, STONITH_OP_DEVICE_DEL, st_callback); st->cmds->register_callback(st, 0, 120, st_opt_timeout_updates, NULL, "st_global_callback", st_global_callback); dispatch_helper(600 * 1000); } #define single_test(cmd, str, num_notifications, expected_rc) \ { \ int rc = 0; \ rc = cmd; \ expected_notifications = 0; \ if (num_notifications) { \ expected_notifications = num_notifications; \ dispatch_helper(500); \ } \ if (rc != expected_rc) { \ crm_err("FAILURE - expected rc %d != %d(%s) for cmd - %s", expected_rc, rc, pcmk_strerror(rc), str); \ crm_exit(CRM_EX_ERROR); \ } else if (expected_notifications) { \ crm_err("FAILURE - expected %d notifications, got only %d for cmd - %s", \ num_notifications, num_notifications - expected_notifications, str); \ crm_exit(CRM_EX_ERROR); \ } else { \ if (verbose) { \ crm_info("SUCCESS - %s: %d", str, rc); \ } else { \ crm_debug("SUCCESS - %s: %d", str, rc); \ } \ } \ }\ static void run_fence_failure_test(void) { stonith_key_value_t *params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "false_1_node1=1,2 false_1_node2=3,4"); params = stonith_key_value_add(params, "mode", "fail"); single_test(st-> cmds->register_device(st, st_opts, "test-id1", "stonith-ng", "fence_dummy", params), "Register device1 for failure test", 1, 0); single_test(st->cmds->fence(st, st_opts, "false_1_node2", PCMK_ACTION_OFF, 3, 0), "Fence failure results off", 1, -ENODATA); single_test(st->cmds->fence(st, st_opts, 
"false_1_node2", PCMK_ACTION_REBOOT, 3, 0), "Fence failure results reboot", 1, -ENODATA); single_test(st->cmds->remove_device(st, st_opts, "test-id1"), "Remove device1 for failure test", 1, 0); stonith_key_value_freeall(params, 1, 1); } static void run_fence_failure_rollover_test(void) { stonith_key_value_t *params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "false_1_node1=1,2 false_1_node2=3,4"); params = stonith_key_value_add(params, "mode", "fail"); single_test(st-> cmds->register_device(st, st_opts, "test-id1", "stonith-ng", "fence_dummy", params), "Register device1 for rollover test", 1, 0); stonith_key_value_freeall(params, 1, 1); params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "false_1_node1=1,2 false_1_node2=3,4"); params = stonith_key_value_add(params, "mode", "pass"); single_test(st-> cmds->register_device(st, st_opts, "test-id2", "stonith-ng", "fence_dummy", params), "Register device2 for rollover test", 1, 0); single_test(st->cmds->fence(st, st_opts, "false_1_node2", PCMK_ACTION_OFF, 3, 0), "Fence rollover results off", 1, 0); /* Expect -ENODEV because fence_dummy requires 'on' to be executed on target */ single_test(st->cmds->fence(st, st_opts, "false_1_node2", PCMK_ACTION_ON, 3, 0), "Fence rollover results on", 1, -ENODEV); single_test(st->cmds->remove_device(st, st_opts, "test-id1"), "Remove device1 for rollover tests", 1, 0); single_test(st->cmds->remove_device(st, st_opts, "test-id2"), "Remove device2 for rollover tests", 1, 0); stonith_key_value_freeall(params, 1, 1); } static void run_standard_test(void) { stonith_key_value_t *params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "false_1_node1=1,2 false_1_node2=3,4"); params = stonith_key_value_add(params, "mode", "pass"); params = stonith_key_value_add(params, "mock_dynamic_hosts", "false_1_node1 false_1_node2"); single_test(st-> cmds->register_device(st, st_opts, "test-id", "stonith-ng", "fence_dummy", params), 
"Register", 1, 0); stonith_key_value_freeall(params, 1, 1); params = NULL; single_test(st->cmds->list(st, st_opts, "test-id", NULL, 1), - PCMK_ACTION_LIST, 1, 0); + PCMK_ACTION_LIST, 0, 0); - single_test(st->cmds->monitor(st, st_opts, "test-id", 1), "Monitor", 1, 0); + single_test(st->cmds->monitor(st, st_opts, "test-id", 1), "Monitor", 0, 0); single_test(st->cmds->status(st, st_opts, "test-id", "false_1_node2", 1), - "Status false_1_node2", 1, 0); + "Status false_1_node2", 0, 0); single_test(st->cmds->status(st, st_opts, "test-id", "false_1_node1", 1), - "Status false_1_node1", 1, 0); + "Status false_1_node1", 0, 0); single_test(st->cmds->fence(st, st_opts, "unknown-host", PCMK_ACTION_OFF, 1, 0), "Fence unknown-host (expected failure)", 0, -ENODEV); single_test(st->cmds->fence(st, st_opts, "false_1_node1", PCMK_ACTION_OFF, 1, 0), "Fence false_1_node1", 1, 0); /* Expect -ENODEV because fence_dummy requires 'on' to be executed on target */ single_test(st->cmds->fence(st, st_opts, "false_1_node1", PCMK_ACTION_ON, 1, 0), "Unfence false_1_node1", 1, -ENODEV); /* Confirm that an invalid level index is rejected */ single_test(st->cmds->register_level(st, st_opts, "node1", 999, params), "Attempt to register an invalid level index", 0, -EINVAL); single_test(st->cmds->remove_device(st, st_opts, "test-id"), "Remove test-id", 1, 0); stonith_key_value_freeall(params, 1, 1); } static void sanity_tests(void) { int rc = 0; rc = st->cmds->connect(st, crm_system_name, &pollfd.fd); if (rc != pcmk_ok) { stonith_api_delete(st); crm_exit(CRM_EX_DISCONNECT); } st->cmds->register_notification(st, PCMK__VALUE_ST_NOTIFY_DISCONNECT, st_callback); st->cmds->register_notification(st, PCMK__VALUE_ST_NOTIFY_FENCE, st_callback); st->cmds->register_notification(st, STONITH_OP_DEVICE_ADD, st_callback); st->cmds->register_notification(st, STONITH_OP_DEVICE_DEL, st_callback); st->cmds->register_callback(st, 0, 120, st_opt_timeout_updates, NULL, "st_global_callback", st_global_callback); 
crm_info("Starting API Sanity Tests"); run_standard_test(); run_fence_failure_test(); run_fence_failure_rollover_test(); crm_info("Sanity Tests Passed"); } static void standard_dev_test(void) { int rc = 0; char *tmp = NULL; stonith_key_value_t *params = NULL; rc = st->cmds->connect(st, crm_system_name, &pollfd.fd); if (rc != pcmk_ok) { stonith_api_delete(st); crm_exit(CRM_EX_DISCONNECT); } params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "some-host=pcmk-7 true_1_node1=3,4"); rc = st->cmds->register_device(st, st_opts, "test-id", "stonith-ng", "fence_xvm", params); crm_debug("Register: %d", rc); rc = st->cmds->list(st, st_opts, "test-id", &tmp, 10); crm_debug("List: %d output: %s", rc, tmp ? tmp : ""); rc = st->cmds->monitor(st, st_opts, "test-id", 10); crm_debug("Monitor: %d", rc); rc = st->cmds->status(st, st_opts, "test-id", "false_1_node2", 10); crm_debug("Status false_1_node2: %d", rc); rc = st->cmds->status(st, st_opts, "test-id", "false_1_node1", 10); crm_debug("Status false_1_node1: %d", rc); rc = st->cmds->fence(st, st_opts, "unknown-host", PCMK_ACTION_OFF, 60, 0); crm_debug("Fence unknown-host: %d", rc); rc = st->cmds->status(st, st_opts, "test-id", "false_1_node1", 10); crm_debug("Status false_1_node1: %d", rc); rc = st->cmds->fence(st, st_opts, "false_1_node1", PCMK_ACTION_OFF, 60, 0); crm_debug("Fence false_1_node1: %d", rc); rc = st->cmds->status(st, st_opts, "test-id", "false_1_node1", 10); crm_debug("Status false_1_node1: %d", rc); rc = st->cmds->fence(st, st_opts, "false_1_node1", PCMK_ACTION_ON, 10, 0); crm_debug("Unfence false_1_node1: %d", rc); rc = st->cmds->status(st, st_opts, "test-id", "false_1_node1", 10); crm_debug("Status false_1_node1: %d", rc); rc = st->cmds->fence(st, st_opts, "some-host", PCMK_ACTION_OFF, 10, 0); crm_debug("Fence alias: %d", rc); rc = st->cmds->status(st, st_opts, "test-id", "some-host", 10); crm_debug("Status alias: %d", rc); rc = st->cmds->fence(st, st_opts, "false_1_node1", PCMK_ACTION_ON, 10, 0); 
crm_debug("Unfence false_1_node1: %d", rc); rc = st->cmds->remove_device(st, st_opts, "test-id"); crm_debug("Remove test-id: %d", rc); stonith_key_value_freeall(params, 1, 1); } static void iterate_mainloop_tests(gboolean event_ready); static void mainloop_callback(stonith_t * stonith, stonith_callback_data_t * data) { pcmk__set_result(&result, stonith__exit_status(data), stonith__execution_status(data), stonith__exit_reason(data)); iterate_mainloop_tests(TRUE); } static int register_callback_helper(int callid) { return st->cmds->register_callback(st, callid, MAINLOOP_DEFAULT_TIMEOUT, st_opt_timeout_updates, NULL, "callback", mainloop_callback); } static void test_async_fence_pass(int check_event) { int rc = 0; if (check_event) { mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK)); return; } rc = st->cmds->fence(st, 0, "true_1_node1", PCMK_ACTION_OFF, MAINLOOP_DEFAULT_TIMEOUT, 0); if (rc < 0) { crm_err("fence failed with rc %d", rc); mainloop_test_done(__func__, false); } register_callback_helper(rc); /* wait for event */ } #define CUSTOM_TIMEOUT_ADDITION 10 static void test_async_fence_custom_timeout(int check_event) { int rc = 0; static time_t begin = 0; if (check_event) { uint32_t diff = (time(NULL) - begin); if (result.execution_status != PCMK_EXEC_TIMEOUT) { mainloop_test_done(__func__, false); } else if (diff < CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT) { crm_err ("Custom timeout test failed, callback expiration should be updated to %d, actual timeout was %d", CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT, diff); mainloop_test_done(__func__, false); } else { mainloop_test_done(__func__, true); } return; } begin = time(NULL); rc = st->cmds->fence(st, 0, "custom_timeout_node1", PCMK_ACTION_OFF, MAINLOOP_DEFAULT_TIMEOUT, 0); if (rc < 0) { crm_err("fence failed with rc %d", rc); mainloop_test_done(__func__, false); } register_callback_helper(rc); /* wait for event */ } static void test_async_fence_timeout(int check_event) { int rc = 0; 
if (check_event) { mainloop_test_done(__func__, (result.execution_status == PCMK_EXEC_NO_FENCE_DEVICE)); return; } rc = st->cmds->fence(st, 0, "false_1_node2", PCMK_ACTION_OFF, MAINLOOP_DEFAULT_TIMEOUT, 0); if (rc < 0) { crm_err("fence failed with rc %d", rc); mainloop_test_done(__func__, false); } register_callback_helper(rc); /* wait for event */ } static void test_async_monitor(int check_event) { int rc = 0; if (check_event) { mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK)); return; } rc = st->cmds->monitor(st, 0, "false_1", MAINLOOP_DEFAULT_TIMEOUT); if (rc < 0) { crm_err("monitor failed with rc %d", rc); mainloop_test_done(__func__, false); } register_callback_helper(rc); /* wait for event */ } static void test_register_async_devices(int check_event) { char buf[16] = { 0, }; stonith_key_value_t *params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "false_1_node1=1,2"); params = stonith_key_value_add(params, "mode", "fail"); st->cmds->register_device(st, st_opts, "false_1", "stonith-ng", "fence_dummy", params); stonith_key_value_freeall(params, 1, 1); params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "true_1_node1=1,2"); params = stonith_key_value_add(params, "mode", "pass"); st->cmds->register_device(st, st_opts, "true_1", "stonith-ng", "fence_dummy", params); stonith_key_value_freeall(params, 1, 1); params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "custom_timeout_node1=1,2"); params = stonith_key_value_add(params, "mode", "fail"); params = stonith_key_value_add(params, "delay", "1000"); snprintf(buf, sizeof(buf) - 1, "%d", MAINLOOP_DEFAULT_TIMEOUT + CUSTOM_TIMEOUT_ADDITION); params = stonith_key_value_add(params, "pcmk_off_timeout", buf); st->cmds->register_device(st, st_opts, "false_custom_timeout", "stonith-ng", "fence_dummy", params); stonith_key_value_freeall(params, 1, 1); mainloop_test_done(__func__, true); } static void try_mainloop_connect(int check_event) { 
int rc = stonith_api_connect_retry(st, crm_system_name, 10); if (rc == pcmk_ok) { mainloop_test_done(__func__, true); return; } crm_err("API CONNECTION FAILURE"); mainloop_test_done(__func__, false); } static void iterate_mainloop_tests(gboolean event_ready) { static mainloop_test_iteration_cb callbacks[] = { try_mainloop_connect, test_register_async_devices, test_async_monitor, test_async_fence_pass, test_async_fence_timeout, test_async_fence_custom_timeout, }; if (mainloop_iter == (sizeof(callbacks) / sizeof(mainloop_test_iteration_cb))) { /* all tests ran, everything passed */ crm_info("ALL MAINLOOP TESTS PASSED!"); crm_exit(CRM_EX_OK); } callbacks[mainloop_iter] (event_ready); } static gboolean trigger_iterate_mainloop_tests(gpointer user_data) { iterate_mainloop_tests(FALSE); return TRUE; } static void test_shutdown(int nsig) { int rc = 0; if (st) { rc = st->cmds->disconnect(st); crm_info("Disconnect: %d", rc); crm_debug("Destroy"); stonith_api_delete(st); } if (rc) { crm_exit(CRM_EX_ERROR); } } static void mainloop_tests(void) { trig = mainloop_add_trigger(G_PRIORITY_HIGH, trigger_iterate_mainloop_tests, NULL); mainloop_set_trigger(trig); mainloop_add_signal(SIGTERM, test_shutdown); crm_info("Starting"); mainloop = g_main_loop_new(NULL, FALSE); g_main_loop_run(mainloop); } static GOptionContext * build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) { GOptionContext *context = NULL; context = pcmk__build_arg_context(args, NULL, group, NULL); pcmk__add_main_args(context, entries); return context; } int main(int argc, char **argv) { GError *error = NULL; crm_exit_t exit_code = CRM_EX_OK; pcmk__common_args_t *args = pcmk__new_common_args(SUMMARY); gchar **processed_args = pcmk__cmdline_preproc(argv, NULL); GOptionContext *context = build_arg_context(args, NULL); if (!g_option_context_parse_strv(context, &processed_args, &error)) { exit_code = CRM_EX_USAGE; goto done; } /* We have to use crm_log_init here to set up the logging because there's * 
different handling for daemons vs. command line programs, and * pcmk__cli_init_logging is set up to only handle the latter. */ crm_log_init(NULL, LOG_INFO, TRUE, (verbose? TRUE : FALSE), argc, argv, FALSE); for (int i = 0; i < args->verbosity; i++) { crm_bump_log_level(argc, argv); } st = stonith_api_new(); if (st == NULL) { exit_code = CRM_EX_DISCONNECT; g_set_error(&error, PCMK__EXITC_ERROR, exit_code, "Could not connect to fencer: API memory allocation failed"); goto done; } switch (options.mode) { case test_standard: standard_dev_test(); break; case test_passive: passive_test(); break; case test_api_sanity: sanity_tests(); break; case test_api_mainloop: mainloop_tests(); break; } test_shutdown(0); done: g_strfreev(processed_args); pcmk__free_arg_context(context); pcmk__output_and_clear_error(&error, NULL); crm_exit(exit_code); } diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c index 884699554a..e9f2086c33 100644 --- a/daemons/fenced/fenced_commands.c +++ b/daemons/fenced/fenced_commands.c @@ -1,3641 +1,3619 @@ /* * Copyright 2009-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. 
/* State for one search over the registered fencing devices for those able to
 * act against a target. Results are collected per device and handed to
 * `callback` once every expected reply has been received.
 */
struct device_search_s {
    /* target of fence action */
    char *host;
    /* requested fence action */
    char *action;
    /* timeout to use if a device is queried dynamically for possible targets */
    int per_device_timeout;
    /* number of registered fencing devices at time of request */
    int replies_needed;
    /* number of device replies received so far; compared against
     * replies_needed to tell when the search is complete */
    int replies_received;
    /* whether the target is eligible to perform requested action (or off) */
    bool allow_suicide;

    /* private data to pass to search callback function */
    void *user_data;
    /* function to call when all replies have been received */
    void (*callback) (GList * devices, void *user_data);
    /* devices capable of performing requested action (or off if remapping) */
    GList *capable;
    /* Whether to perform searches that support the action
     * NOTE(review): declared as uint32_t rather than bool; presumably holds
     * flag bits -- confirm against the code that sets it */
    uint32_t support_action_only;
};
*device_list; GList *next_device_iter; // device_list entry for next device to execute void *internal_user_data; void (*done_cb) (int pid, const pcmk__action_result_t *result, void *user_data); guint timer_sigterm; guint timer_sigkill; /*! If the operation timed out, this is the last signal * we sent to the process to get it to terminate */ int last_timeout_signo; stonith_device_t *active_on; stonith_device_t *activating_on; } async_command_t; static xmlNode *construct_async_reply(const async_command_t *cmd, const pcmk__action_result_t *result); static gboolean is_action_required(const char *action, const stonith_device_t *device) { return (device != NULL) && device->automatic_unfencing && pcmk__str_eq(action, PCMK_ACTION_ON, pcmk__str_none); } static int get_action_delay_max(const stonith_device_t *device, const char *action) { const char *value = NULL; guint delay_max = 0U; if (!pcmk__is_fencing_action(action)) { return 0; } value = g_hash_table_lookup(device->params, PCMK_STONITH_DELAY_MAX); if (value) { pcmk_parse_interval_spec(value, &delay_max); delay_max /= 1000; } return (int) delay_max; } static int get_action_delay_base(const stonith_device_t *device, const char *action, const char *target) { char *hash_value = NULL; guint delay_base = 0U; if (!pcmk__is_fencing_action(action)) { return 0; } hash_value = g_hash_table_lookup(device->params, PCMK_STONITH_DELAY_BASE); if (hash_value) { char *value = pcmk__str_copy(hash_value); char *valptr = value; if (target != NULL) { for (char *val = strtok(value, "; \t"); val != NULL; val = strtok(NULL, "; \t")) { char *mapval = strchr(val, ':'); if (mapval == NULL || mapval[1] == 0) { crm_err("pcmk_delay_base: empty value in mapping", val); continue; } if (mapval != val && strncasecmp(target, val, (size_t)(mapval - val)) == 0) { value = mapval + 1; crm_debug("pcmk_delay_base mapped to %s for %s", value, target); break; } } } if (strchr(value, ':') == 0) { pcmk_parse_interval_spec(value, &delay_base); delay_base /= 1000; 
} free(valptr); } return (int) delay_base; } /*! * \internal * \brief Override STONITH timeout with pcmk_*_timeout if available * * \param[in] device STONITH device to use * \param[in] action STONITH action name * \param[in] default_timeout Timeout to use if device does not have * a pcmk_*_timeout parameter for action * * \return Value of pcmk_(action)_timeout if available, otherwise default_timeout * \note For consistency, it would be nice if reboot/off/on timeouts could be * set the same way as start/stop/monitor timeouts, i.e. with an * entry in the fencing resource configuration. However that * is insufficient because fencing devices may be registered directly via * the fencer's register_device() API instead of going through the CIB * (e.g. stonith_admin uses it for its -R option, and the executor uses it * to ensure a device is registered when a command is issued). As device * properties, pcmk_*_timeout parameters can be grabbed by the fencer when * the device is registered, whether by CIB change or API call. */ static int get_action_timeout(const stonith_device_t *device, const char *action, int default_timeout) { if (action && device && device->params) { char buffer[64] = { 0, }; const char *value = NULL; /* If "reboot" was requested but the device does not support it, * we will remap to "off", so check timeout for "off" instead */ if (pcmk__str_eq(action, PCMK_ACTION_REBOOT, pcmk__str_none) && !pcmk_is_set(device->flags, st_device_supports_reboot)) { crm_trace("%s doesn't support reboot, using timeout for off instead", device->id); action = PCMK_ACTION_OFF; } /* If the device config specified an action-specific timeout, use it */ snprintf(buffer, sizeof(buffer), "pcmk_%s_timeout", action); value = g_hash_table_lookup(device->params, buffer); if (value) { long long timeout_ms = crm_get_msec(value); return (int) QB_MIN(timeout_ms / 1000, INT_MAX); } } return default_timeout; } /*! 
* \internal * \brief Get the currently executing device for a fencing operation * * \param[in] cmd Fencing operation to check * * \return Currently executing device for \p cmd if any, otherwise NULL */ static stonith_device_t * cmd_device(const async_command_t *cmd) { if ((cmd == NULL) || (cmd->device == NULL) || (device_list == NULL)) { return NULL; } return g_hash_table_lookup(device_list, cmd->device); } /*! * \internal * \brief Return the configured reboot action for a given device * * \param[in] device_id Device ID * * \return Configured reboot action for \p device_id */ const char * fenced_device_reboot_action(const char *device_id) { const char *action = NULL; if ((device_list != NULL) && (device_id != NULL)) { stonith_device_t *device = g_hash_table_lookup(device_list, device_id); if ((device != NULL) && (device->params != NULL)) { action = g_hash_table_lookup(device->params, "pcmk_reboot_action"); } } return pcmk__s(action, PCMK_ACTION_REBOOT); } /*! * \internal * \brief Check whether a given device supports the "on" action * * \param[in] device_id Device ID * * \return true if \p device_id supports "on", otherwise false */ bool fenced_device_supports_on(const char *device_id) { if ((device_list != NULL) && (device_id != NULL)) { stonith_device_t *device = g_hash_table_lookup(device_list, device_id); if (device != NULL) { return pcmk_is_set(device->flags, st_device_supports_on); } } return false; } static void free_async_command(async_command_t * cmd) { if (!cmd) { return; } if (cmd->delay_id) { g_source_remove(cmd->delay_id); } cmd_list = g_list_remove(cmd_list, cmd); g_list_free_full(cmd->device_list, free); free(cmd->device); free(cmd->action); free(cmd->target); free(cmd->remote_op_id); free(cmd->client); free(cmd->client_name); free(cmd->origin); free(cmd->op); free(cmd); } /*! 
* \internal * \brief Create a new asynchronous fencing operation from request XML * * \param[in] msg Fencing request XML (from IPC or CPG) * * \return Newly allocated fencing operation on success, otherwise NULL * * \note This asserts on memory errors, so a NULL return indicates an * unparseable message. */ static async_command_t * create_async_command(xmlNode *msg) { xmlNode *op = NULL; async_command_t *cmd = NULL; if (msg == NULL) { return NULL; } op = get_xpath_object("//@" PCMK__XE_ST_DEVICE_ACTION, msg, LOG_ERR); if (op == NULL) { return NULL; } cmd = pcmk__assert_alloc(1, sizeof(async_command_t)); // All messages must include these cmd->action = crm_element_value_copy(op, PCMK__XA_ST_DEVICE_ACTION); cmd->op = crm_element_value_copy(msg, PCMK__XA_ST_OP); cmd->client = crm_element_value_copy(msg, PCMK__XA_ST_CLIENTID); if ((cmd->action == NULL) || (cmd->op == NULL) || (cmd->client == NULL)) { free_async_command(cmd); return NULL; } crm_element_value_int(msg, PCMK__XA_ST_CALLID, &(cmd->id)); crm_element_value_int(msg, PCMK__XA_ST_CALLOPT, &(cmd->options)); crm_element_value_int(msg, PCMK__XA_ST_DELAY, &(cmd->start_delay)); crm_element_value_int(msg, PCMK__XA_ST_TIMEOUT, &(cmd->default_timeout)); cmd->timeout = cmd->default_timeout; cmd->origin = crm_element_value_copy(msg, PCMK__XA_SRC); cmd->remote_op_id = crm_element_value_copy(msg, PCMK__XA_ST_REMOTE_OP); cmd->client_name = crm_element_value_copy(msg, PCMK__XA_ST_CLIENTNAME); cmd->target = crm_element_value_copy(op, PCMK__XA_ST_TARGET); cmd->device = crm_element_value_copy(op, PCMK__XA_ST_DEVICE_ID); cmd->done_cb = st_child_done; // Track in global command list cmd_list = g_list_append(cmd_list, cmd); return cmd; } static int get_action_limit(stonith_device_t * device) { const char *value = NULL; int action_limit = 1; value = g_hash_table_lookup(device->params, PCMK_STONITH_ACTION_LIMIT); if ((value == NULL) || (pcmk__scan_min_int(value, &action_limit, INT_MIN) != pcmk_rc_ok) || (action_limit == 0)) { 
action_limit = 1; } return action_limit; } static int get_active_cmds(stonith_device_t * device) { int counter = 0; GList *gIter = NULL; GList *gIterNext = NULL; CRM_CHECK(device != NULL, return 0); for (gIter = cmd_list; gIter != NULL; gIter = gIterNext) { async_command_t *cmd = gIter->data; gIterNext = gIter->next; if (cmd->active_on == device) { counter++; } } return counter; } static void fork_cb(int pid, void *user_data) { async_command_t *cmd = (async_command_t *) user_data; stonith_device_t * device = /* in case of a retry we've done the move from activating_on to active_on already */ cmd->activating_on?cmd->activating_on:cmd->active_on; CRM_ASSERT(device); crm_debug("Operation '%s' [%d]%s%s using %s now running with %ds timeout", cmd->action, pid, ((cmd->target == NULL)? "" : " targeting "), pcmk__s(cmd->target, ""), device->id, cmd->timeout); cmd->active_on = device; cmd->activating_on = NULL; } static int get_agent_metadata_cb(gpointer data) { stonith_device_t *device = data; guint period_ms; switch (get_agent_metadata(device->agent, &device->agent_metadata)) { case pcmk_rc_ok: if (device->agent_metadata) { read_action_metadata(device); stonith__device_parameter_flags(&(device->flags), device->id, device->agent_metadata); } return G_SOURCE_REMOVE; case EAGAIN: period_ms = pcmk__mainloop_timer_get_period(device->timer); if (period_ms < 160 * 1000) { mainloop_timer_set_period(device->timer, 2 * period_ms); } return G_SOURCE_CONTINUE; default: return G_SOURCE_REMOVE; } } /*! 
* \internal * \brief Call a command's action callback for an internal (not library) result * * \param[in,out] cmd Command to report result for * \param[in] execution_status Execution status to use for result * \param[in] exit_status Exit status to use for result * \param[in] exit_reason Exit reason to use for result */ static void report_internal_result(async_command_t *cmd, int exit_status, int execution_status, const char *exit_reason) { pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; pcmk__set_result(&result, exit_status, execution_status, exit_reason); cmd->done_cb(0, &result, cmd); pcmk__reset_result(&result); } static gboolean stonith_device_execute(stonith_device_t * device) { int exec_rc = 0; const char *action_str = NULL; const char *host_arg = NULL; async_command_t *cmd = NULL; stonith_action_t *action = NULL; int active_cmds = 0; int action_limit = 0; GList *gIter = NULL; GList *gIterNext = NULL; CRM_CHECK(device != NULL, return FALSE); active_cmds = get_active_cmds(device); action_limit = get_action_limit(device); if (action_limit > -1 && active_cmds >= action_limit) { crm_trace("%s is over its action limit of %d (%u active action%s)", device->id, action_limit, active_cmds, pcmk__plural_s(active_cmds)); return TRUE; } for (gIter = device->pending_ops; gIter != NULL; gIter = gIterNext) { async_command_t *pending_op = gIter->data; gIterNext = gIter->next; if (pending_op && pending_op->delay_id) { crm_trace("Operation '%s'%s%s using %s was asked to run too early, " "waiting for start delay of %ds", pending_op->action, ((pending_op->target == NULL)? 
"" : " targeting "), pcmk__s(pending_op->target, ""), device->id, pending_op->start_delay); continue; } device->pending_ops = g_list_remove_link(device->pending_ops, gIter); g_list_free_1(gIter); cmd = pending_op; break; } if (cmd == NULL) { crm_trace("No actions using %s are needed", device->id); return TRUE; } if (pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT, STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) { if (pcmk__is_fencing_action(cmd->action)) { if (node_does_watchdog_fencing(fenced_get_local_node())) { pcmk__panic("Watchdog self-fencing required"); goto done; } } else { crm_info("Faking success for %s watchdog operation", cmd->action); report_internal_result(cmd, CRM_EX_OK, PCMK_EXEC_DONE, NULL); goto done; } } #if PCMK__ENABLE_CIBSECRETS exec_rc = pcmk__substitute_secrets(device->id, device->params); if (exec_rc != pcmk_rc_ok) { if (pcmk__str_eq(cmd->action, PCMK_ACTION_STOP, pcmk__str_none)) { crm_info("Proceeding with stop operation for %s " "despite being unable to load CIB secrets (%s)", device->id, pcmk_rc_str(exec_rc)); } else { crm_err("Considering %s unconfigured " "because unable to load CIB secrets: %s", device->id, pcmk_rc_str(exec_rc)); report_internal_result(cmd, CRM_EX_ERROR, PCMK_EXEC_NO_SECRETS, "Failed to get CIB secrets"); goto done; } } #endif action_str = cmd->action; if (pcmk__str_eq(cmd->action, PCMK_ACTION_REBOOT, pcmk__str_none) && !pcmk_is_set(device->flags, st_device_supports_reboot)) { crm_notice("Remapping 'reboot' action%s%s using %s to 'off' " "because agent '%s' does not support reboot", ((cmd->target == NULL)? 
"" : " targeting "), pcmk__s(cmd->target, ""), device->id, device->agent); action_str = PCMK_ACTION_OFF; } if (pcmk_is_set(device->flags, st_device_supports_parameter_port)) { host_arg = "port"; } else if (pcmk_is_set(device->flags, st_device_supports_parameter_plug)) { host_arg = "plug"; } action = stonith__action_create(device->agent, action_str, cmd->target, cmd->target_nodeid, cmd->timeout, device->params, device->aliases, host_arg); /* for async exec, exec_rc is negative for early error exit otherwise handling of success/errors is done via callbacks */ cmd->activating_on = device; exec_rc = stonith__execute_async(action, (void *)cmd, cmd->done_cb, fork_cb); if (exec_rc < 0) { cmd->activating_on = NULL; cmd->done_cb(0, stonith__action_result(action), cmd); stonith__destroy_action(action); } done: /* Device might get triggered to work by multiple fencing commands * simultaneously. Trigger the device again to make sure any * remaining concurrent commands get executed. */ if (device->pending_ops) { mainloop_set_trigger(device->work); } return TRUE; } static gboolean stonith_device_dispatch(gpointer user_data) { return stonith_device_execute(user_data); } static gboolean start_delay_helper(gpointer data) { async_command_t *cmd = data; stonith_device_t *device = cmd_device(cmd); cmd->delay_id = 0; if (device) { mainloop_set_trigger(device->work); } return FALSE; } static void schedule_stonith_command(async_command_t * cmd, stonith_device_t * device) { int delay_max = 0; int delay_base = 0; int requested_delay = cmd->start_delay; CRM_CHECK(cmd != NULL, return); CRM_CHECK(device != NULL, return); if (cmd->device) { free(cmd->device); } if (device->include_nodeid && (cmd->target != NULL)) { pcmk__node_status_t *node = pcmk__get_node(0, cmd->target, NULL, pcmk__node_search_cluster_member); cmd->target_nodeid = node->cluster_layer_id; } cmd->device = pcmk__str_copy(device->id); cmd->timeout = get_action_timeout(device, cmd->action, cmd->default_timeout); if 
(cmd->remote_op_id) { crm_debug("Scheduling '%s' action%s%s using %s for remote peer %s " "with op id %.8s and timeout %ds", cmd->action, (cmd->target == NULL)? "" : " targeting ", pcmk__s(cmd->target, ""), device->id, cmd->origin, cmd->remote_op_id, cmd->timeout); } else { crm_debug("Scheduling '%s' action%s%s using %s for %s with timeout %ds", cmd->action, (cmd->target == NULL)? "" : " targeting ", pcmk__s(cmd->target, ""), device->id, cmd->client, cmd->timeout); } device->pending_ops = g_list_append(device->pending_ops, cmd); mainloop_set_trigger(device->work); // Value -1 means disable any static/random fencing delays if (requested_delay < 0) { return; } delay_max = get_action_delay_max(device, cmd->action); delay_base = get_action_delay_base(device, cmd->action, cmd->target); if (delay_max == 0) { delay_max = delay_base; } if (delay_max < delay_base) { crm_warn(PCMK_STONITH_DELAY_BASE " (%ds) is larger than " PCMK_STONITH_DELAY_MAX " (%ds) for %s using %s " "(limiting to maximum delay)", delay_base, delay_max, cmd->action, device->id); delay_base = delay_max; } if (delay_max > 0) { // coverity[dontcall] It doesn't matter here if rand() is predictable cmd->start_delay += ((delay_max != delay_base)?(rand() % (delay_max - delay_base)):0) + delay_base; } if (cmd->start_delay > 0) { crm_notice("Delaying '%s' action%s%s using %s for %ds " QB_XS " timeout=%ds requested_delay=%ds base=%ds max=%ds", cmd->action, (cmd->target == NULL)? 
"" : " targeting ", pcmk__s(cmd->target, ""), device->id, cmd->start_delay, cmd->timeout, requested_delay, delay_base, delay_max); cmd->delay_id = g_timeout_add_seconds(cmd->start_delay, start_delay_helper, cmd); } } static void free_device(gpointer data) { GList *gIter = NULL; stonith_device_t *device = data; g_hash_table_destroy(device->params); g_hash_table_destroy(device->aliases); for (gIter = device->pending_ops; gIter != NULL; gIter = gIter->next) { async_command_t *cmd = gIter->data; crm_warn("Removal of device '%s' purged operation '%s'", device->id, cmd->action); report_internal_result(cmd, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, "Device was removed before action could be executed"); } g_list_free(device->pending_ops); g_list_free_full(device->targets, free); if (device->timer) { mainloop_timer_stop(device->timer); mainloop_timer_del(device->timer); } mainloop_destroy_trigger(device->work); pcmk__xml_free(device->agent_metadata); free(device->namespace); if (device->on_target_actions != NULL) { g_string_free(device->on_target_actions, TRUE); } free(device->agent); free(device->id); free(device); } void free_device_list(void) { if (device_list != NULL) { g_hash_table_destroy(device_list); device_list = NULL; } } void init_device_list(void) { if (device_list == NULL) { device_list = pcmk__strkey_table(NULL, free_device); } } static GHashTable * build_port_aliases(const char *hostmap, GList ** targets) { char *name = NULL; int last = 0, lpc = 0, max = 0, added = 0; GHashTable *aliases = pcmk__strikey_table(free, free); if (hostmap == NULL) { return aliases; } max = strlen(hostmap); for (; lpc <= max; lpc++) { switch (hostmap[lpc]) { /* Skip escaped chars */ case '\\': lpc++; break; /* Assignment chars */ case '=': case ':': if (lpc > last) { free(name); name = pcmk__assert_alloc(1, 1 + lpc - last); memcpy(name, hostmap + last, lpc - last); } last = lpc + 1; break; /* Delimeter chars */ /* case ',': Potentially used to specify multiple ports */ case 0: case 
';': case ' ': case '\t': if (name) { char *value = NULL; int k = 0; value = pcmk__assert_alloc(1, 1 + lpc - last); memcpy(value, hostmap + last, lpc - last); for (int i = 0; value[i] != '\0'; i++) { if (value[i] != '\\') { value[k++] = value[i]; } } value[k] = '\0'; crm_debug("Adding alias '%s'='%s'", name, value); g_hash_table_replace(aliases, name, value); if (targets) { *targets = g_list_append(*targets, pcmk__str_copy(value)); } value = NULL; name = NULL; added++; } else if (lpc > last) { crm_debug("Parse error at offset %d near '%s'", lpc - last, hostmap + last); } last = lpc + 1; break; } if (hostmap[lpc] == 0) { break; } } if (added == 0) { crm_info("No host mappings detected in '%s'", hostmap); } free(name); return aliases; } GHashTable *metadata_cache = NULL; void free_metadata_cache(void) { if (metadata_cache != NULL) { g_hash_table_destroy(metadata_cache); metadata_cache = NULL; } } static void init_metadata_cache(void) { if (metadata_cache == NULL) { metadata_cache = pcmk__strkey_table(free, free); } } int get_agent_metadata(const char *agent, xmlNode ** metadata) { char *buffer = NULL; if (metadata == NULL) { return EINVAL; } *metadata = NULL; if (pcmk__str_eq(agent, STONITH_WATCHDOG_AGENT_INTERNAL, pcmk__str_none)) { return pcmk_rc_ok; } init_metadata_cache(); buffer = g_hash_table_lookup(metadata_cache, agent); if (buffer == NULL) { stonith_t *st = stonith_api_new(); int rc; if (st == NULL) { crm_warn("Could not get agent meta-data: " "API memory allocation failed"); return EAGAIN; } rc = st->cmds->metadata(st, st_opt_sync_call, agent, NULL, &buffer, 10); stonith_api_delete(st); if (rc || !buffer) { crm_err("Could not retrieve metadata for fencing agent %s", agent); return EAGAIN; } g_hash_table_replace(metadata_cache, pcmk__str_copy(agent), buffer); } *metadata = pcmk__xml_parse(buffer); return pcmk_rc_ok; } static gboolean is_nodeid_required(xmlNode * xml) { xmlXPathObjectPtr xpath = NULL; - if (stand_alone) { - return FALSE; - } - if (!xml) { 
return FALSE; } xpath = xpath_search(xml, "//" PCMK_XE_PARAMETER "[@" PCMK_XA_NAME "='nodeid']"); if (numXpathResults(xpath) <= 0) { freeXpathObject(xpath); return FALSE; } freeXpathObject(xpath); return TRUE; } static void read_action_metadata(stonith_device_t *device) { xmlXPathObjectPtr xpath = NULL; int max = 0; int lpc = 0; if (device->agent_metadata == NULL) { return; } xpath = xpath_search(device->agent_metadata, "//action"); max = numXpathResults(xpath); if (max <= 0) { freeXpathObject(xpath); return; } for (lpc = 0; lpc < max; lpc++) { const char *action = NULL; xmlNode *match = getXpathResult(xpath, lpc); CRM_LOG_ASSERT(match != NULL); if(match == NULL) { continue; }; action = crm_element_value(match, PCMK_XA_NAME); if (pcmk__str_eq(action, PCMK_ACTION_LIST, pcmk__str_none)) { stonith__set_device_flags(device->flags, device->id, st_device_supports_list); } else if (pcmk__str_eq(action, PCMK_ACTION_STATUS, pcmk__str_none)) { stonith__set_device_flags(device->flags, device->id, st_device_supports_status); } else if (pcmk__str_eq(action, PCMK_ACTION_REBOOT, pcmk__str_none)) { stonith__set_device_flags(device->flags, device->id, st_device_supports_reboot); } else if (pcmk__str_eq(action, PCMK_ACTION_ON, pcmk__str_none)) { /* PCMK_XA_AUTOMATIC means the cluster will unfence a node when it * joins. * * @COMPAT PCMK__XA_REQUIRED is a deprecated synonym for * PCMK_XA_AUTOMATIC. 
*/ if (pcmk__xe_attr_is_true(match, PCMK_XA_AUTOMATIC) || pcmk__xe_attr_is_true(match, PCMK__XA_REQUIRED)) { device->automatic_unfencing = TRUE; } stonith__set_device_flags(device->flags, device->id, st_device_supports_on); } if ((action != NULL) && pcmk__xe_attr_is_true(match, PCMK_XA_ON_TARGET)) { pcmk__add_word(&(device->on_target_actions), 64, action); } } freeXpathObject(xpath); } static const char * target_list_type(stonith_device_t * dev) { const char *check_type = NULL; check_type = g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_CHECK); if (check_type == NULL) { if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_LIST)) { check_type = PCMK_VALUE_STATIC_LIST; } else if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_MAP)) { check_type = PCMK_VALUE_STATIC_LIST; } else if (pcmk_is_set(dev->flags, st_device_supports_list)) { check_type = PCMK_VALUE_DYNAMIC_LIST; } else if (pcmk_is_set(dev->flags, st_device_supports_status)) { check_type = PCMK_VALUE_STATUS; } else { check_type = PCMK_VALUE_NONE; } } return check_type; } static stonith_device_t * build_device_from_xml(xmlNode *dev) { const char *value; stonith_device_t *device = NULL; char *agent = crm_element_value_copy(dev, PCMK_XA_AGENT); CRM_CHECK(agent != NULL, return device); device = pcmk__assert_alloc(1, sizeof(stonith_device_t)); device->id = crm_element_value_copy(dev, PCMK_XA_ID); device->agent = agent; device->namespace = crm_element_value_copy(dev, PCMK__XA_NAMESPACE); device->params = xml2list(dev); value = g_hash_table_lookup(device->params, PCMK_STONITH_HOST_LIST); if (value) { device->targets = stonith__parse_targets(value); } value = g_hash_table_lookup(device->params, PCMK_STONITH_HOST_MAP); device->aliases = build_port_aliases(value, &(device->targets)); value = target_list_type(device); if (!pcmk__str_eq(value, PCMK_VALUE_STATIC_LIST, pcmk__str_casei) && (device->targets != NULL)) { // device->targets is necessary only with PCMK_VALUE_STATIC_LIST g_list_free_full(device->targets, 
free); device->targets = NULL; } switch (get_agent_metadata(device->agent, &device->agent_metadata)) { case pcmk_rc_ok: if (device->agent_metadata) { read_action_metadata(device); stonith__device_parameter_flags(&(device->flags), device->id, device->agent_metadata); } break; case EAGAIN: if (device->timer == NULL) { device->timer = mainloop_timer_add("get_agent_metadata", 10 * 1000, TRUE, get_agent_metadata_cb, device); } if (!mainloop_timer_running(device->timer)) { mainloop_timer_start(device->timer); } break; default: break; } value = g_hash_table_lookup(device->params, "nodeid"); if (!value) { device->include_nodeid = is_nodeid_required(device->agent_metadata); } value = crm_element_value(dev, PCMK__XA_RSC_PROVIDES); if (pcmk__str_eq(value, PCMK_VALUE_UNFENCING, pcmk__str_casei)) { device->automatic_unfencing = TRUE; } if (is_action_required(PCMK_ACTION_ON, device)) { crm_info("Fencing device '%s' requires unfencing", device->id); } if (device->on_target_actions != NULL) { crm_info("Fencing device '%s' requires actions (%s) to be executed " "on target", device->id, (const char *) device->on_target_actions->str); } device->work = mainloop_add_trigger(G_PRIORITY_HIGH, stonith_device_dispatch, device); /* TODO: Hook up priority */ return device; } static void schedule_internal_command(const char *origin, stonith_device_t * device, const char *action, const char *target, int timeout, void *internal_user_data, void (*done_cb) (int pid, const pcmk__action_result_t *result, void *user_data)) { async_command_t *cmd = NULL; cmd = pcmk__assert_alloc(1, sizeof(async_command_t)); cmd->id = -1; cmd->default_timeout = timeout ? 
timeout : 60; cmd->timeout = cmd->default_timeout; cmd->action = pcmk__str_copy(action); cmd->target = pcmk__str_copy(target); cmd->device = pcmk__str_copy(device->id); cmd->origin = pcmk__str_copy(origin); cmd->client = pcmk__str_copy(crm_system_name); cmd->client_name = pcmk__str_copy(crm_system_name); cmd->internal_user_data = internal_user_data; cmd->done_cb = done_cb; /* cmd, not internal_user_data, is passed to 'done_cb' as the userdata */ schedule_stonith_command(cmd, device); } // Fence agent status commands use custom exit status codes enum fence_status_code { fence_status_invalid = -1, fence_status_active = 0, fence_status_unknown = 1, fence_status_inactive = 2, }; static void status_search_cb(int pid, const pcmk__action_result_t *result, void *user_data) { async_command_t *cmd = user_data; struct device_search_s *search = cmd->internal_user_data; stonith_device_t *dev = cmd_device(cmd); gboolean can = FALSE; free_async_command(cmd); if (!dev) { search_devices_record_result(search, NULL, FALSE); return; } mainloop_set_trigger(dev->work); if (result->execution_status != PCMK_EXEC_DONE) { crm_warn("Assuming %s cannot fence %s " "because status could not be executed: %s%s%s%s", dev->id, search->host, pcmk_exec_status_str(result->execution_status), ((result->exit_reason == NULL)? "" : " ("), ((result->exit_reason == NULL)? "" : result->exit_reason), ((result->exit_reason == NULL)? 
"" : ")")); search_devices_record_result(search, dev->id, FALSE); return; } switch (result->exit_status) { case fence_status_unknown: crm_trace("%s reported it cannot fence %s", dev->id, search->host); break; case fence_status_active: case fence_status_inactive: crm_trace("%s reported it can fence %s", dev->id, search->host); can = TRUE; break; default: crm_warn("Assuming %s cannot fence %s " "(status returned unknown code %d)", dev->id, search->host, result->exit_status); break; } search_devices_record_result(search, dev->id, can); } static void dynamic_list_search_cb(int pid, const pcmk__action_result_t *result, void *user_data) { async_command_t *cmd = user_data; struct device_search_s *search = cmd->internal_user_data; stonith_device_t *dev = cmd_device(cmd); gboolean can_fence = FALSE; free_async_command(cmd); /* Host/alias must be in the list output to be eligible to be fenced * * Will cause problems if down'd nodes aren't listed or (for virtual nodes) * if the guest is still listed despite being moved to another machine */ if (!dev) { search_devices_record_result(search, NULL, FALSE); return; } mainloop_set_trigger(dev->work); if (pcmk__result_ok(result)) { crm_info("Refreshing target list for %s", dev->id); g_list_free_full(dev->targets, free); dev->targets = stonith__parse_targets(result->action_stdout); dev->targets_age = time(NULL); } else if (dev->targets != NULL) { if (result->execution_status == PCMK_EXEC_DONE) { crm_info("Reusing most recent target list for %s " "because list returned error code %d", dev->id, result->exit_status); } else { crm_info("Reusing most recent target list for %s " "because list could not be executed: %s%s%s%s", dev->id, pcmk_exec_status_str(result->execution_status), ((result->exit_reason == NULL)? "" : " ("), ((result->exit_reason == NULL)? "" : result->exit_reason), ((result->exit_reason == NULL)? 
"" : ")")); } } else { // We have never successfully executed list if (result->execution_status == PCMK_EXEC_DONE) { crm_warn("Assuming %s cannot fence %s " "because list returned error code %d", dev->id, search->host, result->exit_status); } else { crm_warn("Assuming %s cannot fence %s " "because list could not be executed: %s%s%s%s", dev->id, search->host, pcmk_exec_status_str(result->execution_status), ((result->exit_reason == NULL)? "" : " ("), ((result->exit_reason == NULL)? "" : result->exit_reason), ((result->exit_reason == NULL)? "" : ")")); } /* Fall back to pcmk_host_check=PCMK_VALUE_STATUS if the user didn't * explicitly specify PCMK_VALUE_DYNAMIC_LIST */ if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_CHECK) == NULL) { crm_notice("Switching to pcmk_host_check='status' for %s", dev->id); pcmk__insert_dup(dev->params, PCMK_STONITH_HOST_CHECK, PCMK_VALUE_STATUS); } } if (dev->targets) { const char *alias = g_hash_table_lookup(dev->aliases, search->host); if (!alias) { alias = search->host; } if (pcmk__str_in_list(alias, dev->targets, pcmk__str_casei)) { can_fence = TRUE; } } search_devices_record_result(search, dev->id, can_fence); } /*! * \internal * \brief Returns true if any key in first is not in second or second has a different value for key */ static int device_params_diff(GHashTable *first, GHashTable *second) { char *key = NULL; char *value = NULL; GHashTableIter gIter; g_hash_table_iter_init(&gIter, first); while (g_hash_table_iter_next(&gIter, (void **)&key, (void **)&value)) { if(strstr(key, "CRM_meta") == key) { continue; } else if (strcmp(key, PCMK_XA_CRM_FEATURE_SET) == 0) { continue; } else { char *other_value = g_hash_table_lookup(second, key); if (!other_value || !pcmk__str_eq(other_value, value, pcmk__str_casei)) { crm_trace("Different value for %s: %s != %s", key, other_value, value); return 1; } } } return 0; } /*! 
* \internal
 * \brief Checks to see if an identical device already exists in the device_list
 *
 * A registered device counts as identical when it has the same ID, the same
 * agent (compared case-insensitively), and device_params_diff() reports no
 * difference in either direction.
 *
 * \param[in] device  Device to look for
 *
 * \return The matching registered device, or NULL if there is none
 */
static stonith_device_t *
device_has_duplicate(const stonith_device_t *device)
{
    stonith_device_t *dup = g_hash_table_lookup(device_list, device->id);

    if (!dup) {
        crm_trace("No match for %s", device->id);
        return NULL;

    } else if (!pcmk__str_eq(dup->agent, device->agent, pcmk__str_casei)) {
        crm_trace("Different agent: %s != %s", dup->agent, device->agent);
        return NULL;
    }

    // Use pcmk__digest_operation() here?
    // The comparison is one-directional, so run it both ways
    if (device_params_diff(device->params, dup->params) ||
        device_params_diff(dup->params, device->params)) {
        return NULL;
    }

    crm_trace("Match");
    return dup;
}

/*!
 * \brief Register a fencing device described by XML
 *
 * \param[in] dev       XML describing the device
 * \param[in] from_cib  Whether the registration comes from the CIB (TRUE) or
 *                      from an API client (FALSE)
 *
 * \return -ENOMEM if no device could be built from \p dev; for a rejected or
 *         skipped watchdog device, -ENODEV or pcmk_ok after the new device
 *         has been freed; otherwise pcmk_ok
 */
int
stonith_device_register(xmlNode *dev, gboolean from_cib)
{
    stonith_device_t *dup = NULL;
    stonith_device_t *device = build_device_from_xml(dev);
    guint ndevices = 0;
    int rv = pcmk_ok;

    CRM_CHECK(device != NULL, return -ENOMEM);

    /* do we have a watchdog-device? */
    /* The do/while(0) body below normally ends in "free_device(); return rv";
     * the single "break" is the only path that continues with registration.
     */
    if (pcmk__str_eq(device->id, STONITH_WATCHDOG_ID, pcmk__str_none)
        || pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT,
                            STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) do {

        if (stonith_watchdog_timeout_ms <= 0) {
            crm_err("Ignoring watchdog fence device without "
                    PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " set.");
            rv = -ENODEV;
            /* fall through to cleanup & return */
        } else if (!pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT,
                                     STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) {
            crm_err("Ignoring watchdog fence device with unknown "
                    "agent '%s' unequal '" STONITH_WATCHDOG_AGENT "'.",
                    device->agent?device->agent:"");
            rv = -ENODEV;
            /* fall through to cleanup & return */
        } else if (!pcmk__str_eq(device->id, STONITH_WATCHDOG_ID,
                                 pcmk__str_none)) {
            crm_err("Ignoring watchdog fence device "
                    "named %s !='"STONITH_WATCHDOG_ID"'.",
                    device->id?device->id:"");
            rv = -ENODEV;
            /* fall through to cleanup & return */
        } else {
            const char *local_node_name = fenced_get_local_node();

            if (pcmk__str_eq(device->agent, STONITH_WATCHDOG_AGENT,
                             pcmk__str_none)) {
                /* this either has an empty list or the targets
                   configured for watchdog-fencing
                 */
                // Ownership of the target list moves to the global list
                g_list_free_full(stonith_watchdog_targets, free);
                stonith_watchdog_targets = device->targets;
                device->targets = NULL;
            }
            if (node_does_watchdog_fencing(local_node_name)) {
                // The registered watchdog device targets only the local node
                g_list_free_full(device->targets, free);
                device->targets = stonith__parse_targets(local_node_name);
                pcmk__insert_dup(device->params, PCMK_STONITH_HOST_LIST,
                                 local_node_name);
                /* proceed as with any other stonith-device */
                break;
            }

            crm_debug("Skip registration of watchdog fence device on node not in host-list.");
            /* cleanup and fall through to more cleanup and return */
            device->targets = NULL;
            stonith_device_remove(device->id, from_cib);
        }
        free_device(device);
        return rv;
    } while (0);

    dup = device_has_duplicate(device);
    if (dup) {
        // An identical device is already registered: keep it, drop the new one
        ndevices = g_hash_table_size(device_list);
        crm_debug("Device '%s' already in device list (%d active device%s)",
                  device->id, ndevices, pcmk__plural_s(ndevices));
        free_device(device);
        device = dup;
        dup = g_hash_table_lookup(device_list, device->id);
        dup->dirty = FALSE;

    } else {
        stonith_device_t *old = g_hash_table_lookup(device_list, device->id);

        if (from_cib && old && old->api_registered) {
            /* If the cib is writing over an entry that is shared with a stonith client,
             * copy any pending ops that currently exist on the old entry to the new one.
             * Otherwise the pending ops will be reported as failures
             */
            crm_info("Overwriting existing entry for %s from CIB", device->id);
            device->pending_ops = old->pending_ops;
            device->api_registered = TRUE;
            old->pending_ops = NULL;
            if (device->pending_ops) {
                mainloop_set_trigger(device->work);
            }
        }
        g_hash_table_replace(device_list, device->id, device);

        ndevices = g_hash_table_size(device_list);
        crm_notice("Added '%s' to device list (%d active device%s)",
                   device->id, ndevices, pcmk__plural_s(ndevices));
    }

    // Record which source (CIB or API) this registration came from
    if (from_cib) {
        device->cib_registered = TRUE;
    } else {
        device->api_registered = TRUE;
    }

    return pcmk_ok;
}

/*!
 * \brief Drop one registration source of a device, removing it when unused
 *
 * Clears the device's CIB registration flag (if \p from_cib) or its API
 * registration and verified flags (otherwise). The device is removed from the
 * device table only once it is registered by neither source.
 *
 * \param[in] id        ID of device to remove
 * \param[in] from_cib  Whether the removal comes from the CIB or an API client
 */
void
stonith_device_remove(const char *id, bool from_cib)
{
    stonith_device_t *device = g_hash_table_lookup(device_list, id);
    guint ndevices = 0;

    if (!device) {
        ndevices = g_hash_table_size(device_list);
        crm_info("Device '%s' not found (%d active device%s)",
                 id, ndevices, pcmk__plural_s(ndevices));
        return;
    }

    if (from_cib) {
        device->cib_registered = FALSE;
    } else {
        device->verified = FALSE;
        device->api_registered = FALSE;
    }

    if (!device->cib_registered && !device->api_registered) {
        g_hash_table_remove(device_list, id);
        ndevices = g_hash_table_size(device_list);
        crm_info("Removed '%s' from device list (%d active device%s)",
                 id, ndevices, pcmk__plural_s(ndevices));
    } else {
        crm_trace("Not removing '%s' from device list (%d active) because "
                  "still registered via:%s%s",
                  id, g_hash_table_size(device_list),
                  (device->cib_registered? " cib" : ""),
                  (device->api_registered? " api" : ""));
    }
}

/*!
 * \internal
 * \brief Return the number of stonith levels registered for a node
 *
 * \param[in] tp  Node's topology table entry
 *
 * \return Number of non-NULL levels in topology entry
 * \note This function is used only for log messages.
*/ static int count_active_levels(const stonith_topology_t *tp) { int lpc = 0; int count = 0; for (lpc = 0; lpc < ST__LEVEL_COUNT; lpc++) { if (tp->levels[lpc] != NULL) { count++; } } return count; } static void free_topology_entry(gpointer data) { stonith_topology_t *tp = data; int lpc = 0; for (lpc = 0; lpc < ST__LEVEL_COUNT; lpc++) { if (tp->levels[lpc] != NULL) { g_list_free_full(tp->levels[lpc], free); } } free(tp->target); free(tp->target_value); free(tp->target_pattern); free(tp->target_attribute); free(tp); } void free_topology_list(void) { if (topology != NULL) { g_hash_table_destroy(topology); topology = NULL; } } void init_topology_list(void) { if (topology == NULL) { topology = pcmk__strkey_table(NULL, free_topology_entry); } } char * stonith_level_key(const xmlNode *level, enum fenced_target_by mode) { if (mode == fenced_target_by_unknown) { mode = unpack_level_kind(level); } switch (mode) { case fenced_target_by_name: return crm_element_value_copy(level, PCMK_XA_TARGET); case fenced_target_by_pattern: return crm_element_value_copy(level, PCMK_XA_TARGET_PATTERN); case fenced_target_by_attribute: return crm_strdup_printf("%s=%s", crm_element_value(level, PCMK_XA_TARGET_ATTRIBUTE), crm_element_value(level, PCMK_XA_TARGET_VALUE)); default: return crm_strdup_printf("unknown-%s", pcmk__xe_id(level)); } } /*! 
* \internal * \brief Parse target identification from topology level XML * * \param[in] level Topology level XML to parse * * \return How to identify target of \p level */ static enum fenced_target_by unpack_level_kind(const xmlNode *level) { if (crm_element_value(level, PCMK_XA_TARGET) != NULL) { return fenced_target_by_name; } if (crm_element_value(level, PCMK_XA_TARGET_PATTERN) != NULL) { return fenced_target_by_pattern; } - if (!stand_alone /* if standalone, there's no attribute manager */ - && (crm_element_value(level, PCMK_XA_TARGET_ATTRIBUTE) != NULL) + if ((crm_element_value(level, PCMK_XA_TARGET_ATTRIBUTE) != NULL) && (crm_element_value(level, PCMK_XA_TARGET_VALUE) != NULL)) { return fenced_target_by_attribute; } return fenced_target_by_unknown; } static stonith_key_value_t * parse_device_list(const char *devices) { int lpc = 0; int max = 0; int last = 0; stonith_key_value_t *output = NULL; if (devices == NULL) { return output; } max = strlen(devices); for (lpc = 0; lpc <= max; lpc++) { if (devices[lpc] == ',' || devices[lpc] == 0) { char *line = strndup(devices + last, lpc - last); output = stonith_key_value_add(output, NULL, line); free(line); last = lpc + 1; } } return output; } /*! * \internal * \brief Unpack essential information from topology request XML * * \param[in] xml Request XML to search * \param[out] mode If not NULL, where to store level kind * \param[out] target If not NULL, where to store representation of target * \param[out] id If not NULL, where to store level number * \param[out] desc If not NULL, where to store log-friendly level description * * \return Topology level XML from within \p xml, or NULL if not found * \note The caller is responsible for freeing \p *target and \p *desc if set. 
*/
static xmlNode *
unpack_level_request(xmlNode *xml, enum fenced_target_by *mode, char **target,
                     int *id, char **desc)
{
    enum fenced_target_by kind = fenced_target_by_unknown;
    char *key = NULL;
    int index = 0;

    /* A request may either be the level element itself or contain it
     * somewhere below. Only search when it is not the top element: an XPath
     * search on a top-level match could hit several elements if the XML is
     * the CIB.
     */
    if ((xml != NULL) && !pcmk__xe_is(xml, PCMK_XE_FENCING_LEVEL)) {
        xml = get_xpath_object("//" PCMK_XE_FENCING_LEVEL, xml, LOG_WARNING);
    }

    if (xml != NULL) {
        kind = unpack_level_kind(xml);
        key = stonith_level_key(xml, kind);
        crm_element_value_int(xml, PCMK_XA_INDEX, &index);

        if (desc != NULL) {
            *desc = crm_strdup_printf("%s[%d]", key, index);
        }

    } else if (desc != NULL) {
        *desc = crm_strdup_printf("missing");
    }

    // Hand back whatever the caller asked for
    if (mode != NULL) {
        *mode = kind;
    }
    if (id != NULL) {
        *id = index;
    }
    if (target == NULL) {
        free(key);      // Nobody wants it, so don't leak it
    } else {
        *target = key;
    }

    return xml;
}

/*!
 * \internal
 * \brief Register a fencing topology level for a target
 *
 * Given an XML request specifying the target name, level index, and device IDs
 * for the level, this will create an entry for the target in the global topology
 * table if one does not already exist, then append the specified device IDs to
 * the entry's device list for the specified level.
* * \param[in] msg XML request for STONITH level registration * \param[out] desc If not NULL, set to string representation "TARGET[LEVEL]" * \param[out] result Where to set result of registration */ void fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) { int id = 0; xmlNode *level; enum fenced_target_by mode; char *target; stonith_topology_t *tp; stonith_key_value_t *dIter = NULL; stonith_key_value_t *devices = NULL; CRM_CHECK((msg != NULL) && (result != NULL), return); level = unpack_level_request(msg, &mode, &target, &id, desc); if (level == NULL) { fenced_set_protocol_error(result); return; } // Ensure an ID was given (even the client API adds an ID) if (pcmk__str_empty(pcmk__xe_id(level))) { crm_warn("Ignoring registration for topology level without ID"); free(target); crm_log_xml_trace(level, "Bad level"); pcmk__format_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, "Topology level is invalid without ID"); return; } // Ensure a valid target was specified if (mode == fenced_target_by_unknown) { crm_warn("Ignoring registration for topology level '%s' " "without valid target", pcmk__xe_id(level)); free(target); crm_log_xml_trace(level, "Bad level"); pcmk__format_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, "Invalid target for topology level '%s'", pcmk__xe_id(level)); return; } // Ensure level ID is in allowed range if ((id < ST__LEVEL_MIN) || (id > ST__LEVEL_MAX)) { crm_warn("Ignoring topology registration for %s with invalid level %d", target, id); free(target); crm_log_xml_trace(level, "Bad level"); pcmk__format_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, "Invalid level number '%s' for topology level '%s'", pcmk__s(crm_element_value(level, PCMK_XA_INDEX), ""), pcmk__xe_id(level)); return; } /* Find or create topology table entry */ tp = g_hash_table_lookup(topology, target); if (tp == NULL) { tp = pcmk__assert_alloc(1, sizeof(stonith_topology_t)); tp->kind = mode; tp->target = target; 
tp->target_value = crm_element_value_copy(level, PCMK_XA_TARGET_VALUE); tp->target_pattern = crm_element_value_copy(level, PCMK_XA_TARGET_PATTERN); tp->target_attribute = crm_element_value_copy(level, PCMK_XA_TARGET_ATTRIBUTE); g_hash_table_replace(topology, tp->target, tp); crm_trace("Added %s (%d) to the topology (%d active entries)", target, (int) mode, g_hash_table_size(topology)); } else { free(target); } if (tp->levels[id] != NULL) { crm_info("Adding to the existing %s[%d] topology entry", tp->target, id); } devices = parse_device_list(crm_element_value(level, PCMK_XA_DEVICES)); for (dIter = devices; dIter; dIter = dIter->next) { const char *device = dIter->value; crm_trace("Adding device '%s' for %s[%d]", device, tp->target, id); tp->levels[id] = g_list_append(tp->levels[id], pcmk__str_copy(device)); } stonith_key_value_freeall(devices, 1, 1); { int nlevels = count_active_levels(tp); crm_info("Target %s has %d active fencing level%s", tp->target, nlevels, pcmk__plural_s(nlevels)); } pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); } /*! * \internal * \brief Unregister a fencing topology level for a target * * Given an XML request specifying the target name and level index (or 0 for all * levels), this will remove any corresponding entry for the target from the * global topology table. 
* * \param[in] msg XML request for STONITH level registration * \param[out] desc If not NULL, set to string representation "TARGET[LEVEL]" * \param[out] result Where to set result of unregistration */ void fenced_unregister_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) { int id = -1; stonith_topology_t *tp; char *target; xmlNode *level = NULL; CRM_CHECK(result != NULL, return); level = unpack_level_request(msg, NULL, &target, &id, desc); if (level == NULL) { fenced_set_protocol_error(result); return; } // Ensure level ID is in allowed range if ((id < 0) || (id >= ST__LEVEL_COUNT)) { crm_warn("Ignoring topology unregistration for %s with invalid level %d", target, id); free(target); crm_log_xml_trace(level, "Bad level"); pcmk__format_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, "Invalid level number '%s' for topology level %s", pcmk__s(crm_element_value(level, PCMK_XA_INDEX), ""), // Client API doesn't add ID to unregistration XML pcmk__s(pcmk__xe_id(level), "")); return; } tp = g_hash_table_lookup(topology, target); if (tp == NULL) { guint nentries = g_hash_table_size(topology); crm_info("No fencing topology found for %s (%d active %s)", target, nentries, pcmk__plural_alt(nentries, "entry", "entries")); } else if (id == 0 && g_hash_table_remove(topology, target)) { guint nentries = g_hash_table_size(topology); crm_info("Removed all fencing topology entries related to %s " "(%d active %s remaining)", target, nentries, pcmk__plural_alt(nentries, "entry", "entries")); } else if (tp->levels[id] != NULL) { guint nlevels; g_list_free_full(tp->levels[id], free); tp->levels[id] = NULL; nlevels = count_active_levels(tp); crm_info("Removed level %d from fencing topology for %s " "(%d active level%s remaining)", id, target, nlevels, pcmk__plural_s(nlevels)); } free(target); pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); } static char * list_to_string(GList *list, const char *delim, gboolean terminate_with_delim) { int max = 
g_list_length(list); size_t delim_len = delim?strlen(delim):0; size_t alloc_size = 1 + (max?((max-1+(terminate_with_delim?1:0))*delim_len):0); char *rv; GList *gIter; char *pos = NULL; const char *lead_delim = ""; for (gIter = list; gIter != NULL; gIter = gIter->next) { const char *value = (const char *) gIter->data; alloc_size += strlen(value); } rv = pcmk__assert_alloc(alloc_size, sizeof(char)); pos = rv; for (gIter = list; gIter != NULL; gIter = gIter->next) { const char *value = (const char *) gIter->data; pos = &pos[sprintf(pos, "%s%s", lead_delim, value)]; lead_delim = delim; } if (max && terminate_with_delim) { sprintf(pos, "%s", delim); } return rv; } /*! * \internal * \brief Execute a fence agent action directly (and asynchronously) * * Handle a STONITH_OP_EXEC API message by scheduling a requested agent action * directly on a specified device. Only list, monitor, and status actions are * expected to use this call, though it should work with any agent command. * * \param[in] msg Request XML specifying action * \param[out] result Where to store result of action * * \note If the action is monitor, the device must be registered via the API * (CIB registration is not sufficient), because monitor should not be * possible unless the device is "started" (API registered). */ static void execute_agent_action(xmlNode *msg, pcmk__action_result_t *result) { xmlNode *dev = get_xpath_object("//" PCMK__XE_ST_DEVICE_ID, msg, LOG_ERR); xmlNode *op = get_xpath_object("//@" PCMK__XE_ST_DEVICE_ACTION, msg, LOG_ERR); const char *id = crm_element_value(dev, PCMK__XA_ST_DEVICE_ID); const char *action = crm_element_value(op, PCMK__XA_ST_DEVICE_ACTION); async_command_t *cmd = NULL; stonith_device_t *device = NULL; if ((id == NULL) || (action == NULL)) { crm_info("Malformed API action request: device %s, action %s", (id? id : "not specified"), (action? 
action : "not specified")); fenced_set_protocol_error(result); return; } if (pcmk__str_eq(id, STONITH_WATCHDOG_ID, pcmk__str_none)) { // Watchdog agent actions are implemented internally if (stonith_watchdog_timeout_ms <= 0) { pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, "Watchdog fence device not configured"); return; } else if (pcmk__str_eq(action, PCMK_ACTION_LIST, pcmk__str_none)) { pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); pcmk__set_result_output(result, list_to_string(stonith_watchdog_targets, "\n", TRUE), NULL); return; } else if (pcmk__str_eq(action, PCMK_ACTION_MONITOR, pcmk__str_none)) { pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); return; } } device = g_hash_table_lookup(device_list, id); if (device == NULL) { crm_info("Ignoring API '%s' action request because device %s not found", action, id); pcmk__format_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, "'%s' not found", id); return; } else if (!device->api_registered && (strcmp(action, PCMK_ACTION_MONITOR) == 0)) { // Monitors may run only on "started" (API-registered) devices crm_info("Ignoring API '%s' action request because device %s not active", action, id); pcmk__format_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, "'%s' not active", id); return; } cmd = create_async_command(msg); if (cmd == NULL) { crm_log_xml_warn(msg, "invalid"); fenced_set_protocol_error(result); return; } schedule_stonith_command(cmd, device); pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); } static void search_devices_record_result(struct device_search_s *search, const char *device, gboolean can_fence) { search->replies_received++; if (can_fence && device) { if (search->support_action_only != st_device_supports_none) { stonith_device_t *dev = g_hash_table_lookup(device_list, device); if (dev && !pcmk_is_set(dev->flags, search->support_action_only)) { return; } } search->capable = g_list_append(search->capable, pcmk__str_copy(device)); } if 
(search->replies_needed == search->replies_received) { guint ndevices = g_list_length(search->capable); crm_debug("Search found %d device%s that can perform '%s' targeting %s", ndevices, pcmk__plural_s(ndevices), (search->action? search->action : "unknown action"), (search->host? search->host : "any node")); search->callback(search->capable, search->user_data); free(search->host); free(search->action); free(search); } } /*! * \internal * \brief Check whether the local host is allowed to execute a fencing action * * \param[in] device Fence device to check * \param[in] action Fence action to check * \param[in] target Hostname of fence target * \param[in] allow_suicide Whether self-fencing is allowed for this operation * * \return TRUE if local host is allowed to execute action, FALSE otherwise */ static gboolean localhost_is_eligible(const stonith_device_t *device, const char *action, const char *target, gboolean allow_suicide) { gboolean localhost_is_target = pcmk__str_eq(target, fenced_get_local_node(), pcmk__str_casei); if ((device != NULL) && (action != NULL) && (device->on_target_actions != NULL) && (strstr((const char*) device->on_target_actions->str, action) != NULL)) { if (!localhost_is_target) { crm_trace("Operation '%s' using %s can only be executed for local " "host, not %s", action, device->id, target); return FALSE; } } else if (localhost_is_target && !allow_suicide) { crm_trace("'%s' operation does not support self-fencing", action); return FALSE; } return TRUE; } /*! 
* \internal * \brief Check if local node is allowed to execute (possibly remapped) action * * \param[in] device Fence device to check * \param[in] action Fence action to check * \param[in] target Node name of fence target * \param[in] allow_self Whether self-fencing is allowed for this operation * * \return true if local node is allowed to execute \p action or any actions it * might be remapped to, otherwise false */ static bool localhost_is_eligible_with_remap(const stonith_device_t *device, const char *action, const char *target, gboolean allow_self) { // Check exact action if (localhost_is_eligible(device, action, target, allow_self)) { return true; } // Check potential remaps if (pcmk__str_eq(action, PCMK_ACTION_REBOOT, pcmk__str_none)) { /* "reboot" might get remapped to "off" then "on", so even if reboot is * disallowed, return true if either of those is allowed. We'll report * the disallowed actions with the results. We never allow self-fencing * for remapped "on" actions because the target is off at that point. */ if (localhost_is_eligible(device, PCMK_ACTION_OFF, target, allow_self) || localhost_is_eligible(device, PCMK_ACTION_ON, target, FALSE)) { return true; } } return false; } static void can_fence_host_with_device(stonith_device_t *dev, struct device_search_s *search) { gboolean can = FALSE; const char *check_type = "Internal bug"; const char *target = NULL; const char *alias = NULL; const char *dev_id = "Unspecified device"; const char *action = (search == NULL)? 
NULL : search->action; CRM_CHECK((dev != NULL) && (action != NULL), goto search_report_results); if (dev->id != NULL) { dev_id = dev->id; } target = search->host; if (target == NULL) { can = TRUE; check_type = "No target"; goto search_report_results; } /* Answer immediately if the device does not support the action * or the local node is not allowed to perform it */ if (pcmk__str_eq(action, PCMK_ACTION_ON, pcmk__str_none) && !pcmk_is_set(dev->flags, st_device_supports_on)) { check_type = "Agent does not support 'on'"; goto search_report_results; } else if (!localhost_is_eligible_with_remap(dev, action, target, search->allow_suicide)) { check_type = "This node is not allowed to execute action"; goto search_report_results; } // Check eligibility as specified by pcmk_host_check check_type = target_list_type(dev); alias = g_hash_table_lookup(dev->aliases, target); if (pcmk__str_eq(check_type, PCMK_VALUE_NONE, pcmk__str_casei)) { can = TRUE; } else if (pcmk__str_eq(check_type, PCMK_VALUE_STATIC_LIST, pcmk__str_casei)) { if (pcmk__str_in_list(target, dev->targets, pcmk__str_casei)) { can = TRUE; } else if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_MAP) && g_hash_table_lookup(dev->aliases, target)) { can = TRUE; } } else if (pcmk__str_eq(check_type, PCMK_VALUE_DYNAMIC_LIST, pcmk__str_casei)) { time_t now = time(NULL); if (dev->targets == NULL || dev->targets_age + 60 < now) { int device_timeout = get_action_timeout(dev, PCMK_ACTION_LIST, search->per_device_timeout); if (device_timeout > search->per_device_timeout) { crm_notice("Since the pcmk_list_timeout (%ds) parameter of %s " "is larger than " PCMK_OPT_STONITH_TIMEOUT " (%ds), timeout may occur", device_timeout, dev_id, search->per_device_timeout); } crm_trace("Running '%s' to check whether %s is eligible to fence %s (%s)", check_type, dev_id, target, action); schedule_internal_command(__func__, dev, PCMK_ACTION_LIST, NULL, search->per_device_timeout, search, dynamic_list_search_cb); /* we'll respond to this 
search request async in the cb */ return; } if (pcmk__str_in_list(((alias == NULL)? target : alias), dev->targets, pcmk__str_casei)) { can = TRUE; } } else if (pcmk__str_eq(check_type, PCMK_VALUE_STATUS, pcmk__str_casei)) { int device_timeout = get_action_timeout(dev, check_type, search->per_device_timeout); if (device_timeout > search->per_device_timeout) { crm_notice("Since the pcmk_status_timeout (%ds) parameter of %s is " "larger than " PCMK_OPT_STONITH_TIMEOUT " (%ds), " "timeout may occur", device_timeout, dev_id, search->per_device_timeout); } crm_trace("Running '%s' to check whether %s is eligible to fence %s (%s)", check_type, dev_id, target, action); schedule_internal_command(__func__, dev, PCMK_ACTION_STATUS, target, search->per_device_timeout, search, status_search_cb); /* we'll respond to this search request async in the cb */ return; } else { crm_err("Invalid value for " PCMK_STONITH_HOST_CHECK ": %s", check_type); check_type = "Invalid " PCMK_STONITH_HOST_CHECK; } search_report_results: crm_info("%s is%s eligible to fence (%s) %s%s%s%s: %s", dev_id, (can? "" : " not"), pcmk__s(action, "unspecified action"), pcmk__s(target, "unspecified target"), (alias == NULL)? "" : " (as '", pcmk__s(alias, ""), (alias == NULL)? "" : "')", check_type); search_devices_record_result(search, ((dev == NULL)? 
NULL : dev_id), can); } static void search_devices(gpointer key, gpointer value, gpointer user_data) { stonith_device_t *dev = value; struct device_search_s *search = user_data; can_fence_host_with_device(dev, search); } #define DEFAULT_QUERY_TIMEOUT 20 static void get_capable_devices(const char *host, const char *action, int timeout, bool suicide, void *user_data, void (*callback) (GList * devices, void *user_data), uint32_t support_action_only) { struct device_search_s *search; guint ndevices = g_hash_table_size(device_list); if (ndevices == 0) { callback(NULL, user_data); return; } search = pcmk__assert_alloc(1, sizeof(struct device_search_s)); search->host = pcmk__str_copy(host); search->action = pcmk__str_copy(action); search->per_device_timeout = timeout; search->allow_suicide = suicide; search->callback = callback; search->user_data = user_data; search->support_action_only = support_action_only; /* We are guaranteed this many replies, even if a device is * unregistered while the search is in progress. */ search->replies_needed = ndevices; crm_debug("Searching %d device%s to see which can execute '%s' targeting %s", ndevices, pcmk__plural_s(ndevices), (search->action? search->action : "unknown action"), (search->host? search->host : "any node")); g_hash_table_foreach(device_list, search_devices, search); } struct st_query_data { xmlNode *reply; char *remote_peer; char *client_id; char *target; char *action; int call_options; }; /*! 
* \internal * \brief Add action-specific attributes to query reply XML * * \param[in,out] xml XML to add attributes to * \param[in] action Fence action * \param[in] device Fence device * \param[in] target Fence target */ static void add_action_specific_attributes(xmlNode *xml, const char *action, const stonith_device_t *device, const char *target) { int action_specific_timeout; int delay_max; int delay_base; CRM_CHECK(xml && action && device, return); // PCMK__XA_ST_REQUIRED is currently used only for unfencing if (is_action_required(action, device)) { crm_trace("Action '%s' is required using %s", action, device->id); crm_xml_add_int(xml, PCMK__XA_ST_REQUIRED, 1); } // pcmk__timeout if configured action_specific_timeout = get_action_timeout(device, action, 0); if (action_specific_timeout) { crm_trace("Action '%s' has timeout %ds using %s", action, action_specific_timeout, device->id); crm_xml_add_int(xml, PCMK__XA_ST_ACTION_TIMEOUT, action_specific_timeout); } delay_max = get_action_delay_max(device, action); if (delay_max > 0) { crm_trace("Action '%s' has maximum random delay %ds using %s", action, delay_max, device->id); crm_xml_add_int(xml, PCMK__XA_ST_DELAY_MAX, delay_max); } delay_base = get_action_delay_base(device, action, target); if (delay_base > 0) { crm_xml_add_int(xml, PCMK__XA_ST_DELAY_BASE, delay_base); } if ((delay_max > 0) && (delay_base == 0)) { crm_trace("Action '%s' has maximum random delay %ds using %s", action, delay_max, device->id); } else if ((delay_max == 0) && (delay_base > 0)) { crm_trace("Action '%s' has a static delay of %ds using %s", action, delay_base, device->id); } else if ((delay_max > 0) && (delay_base > 0)) { crm_trace("Action '%s' has a minimum delay of %ds and a randomly chosen " "maximum delay of %ds using %s", action, delay_base, delay_max, device->id); } } /*! 
* \internal * \brief Add "disallowed" attribute to query reply XML if appropriate * * \param[in,out] xml XML to add attribute to * \param[in] action Fence action * \param[in] device Fence device * \param[in] target Fence target * \param[in] allow_suicide Whether self-fencing is allowed */ static void add_disallowed(xmlNode *xml, const char *action, const stonith_device_t *device, const char *target, gboolean allow_suicide) { if (!localhost_is_eligible(device, action, target, allow_suicide)) { crm_trace("Action '%s' using %s is disallowed for local host", action, device->id); pcmk__xe_set_bool_attr(xml, PCMK__XA_ST_ACTION_DISALLOWED, true); } } /*! * \internal * \brief Add child element with action-specific values to query reply XML * * \param[in,out] xml XML to add attribute to * \param[in] action Fence action * \param[in] device Fence device * \param[in] target Fence target * \param[in] allow_suicide Whether self-fencing is allowed */ static void add_action_reply(xmlNode *xml, const char *action, const stonith_device_t *device, const char *target, gboolean allow_suicide) { xmlNode *child = pcmk__xe_create(xml, PCMK__XE_ST_DEVICE_ACTION); crm_xml_add(child, PCMK_XA_ID, action); add_action_specific_attributes(child, action, device, target); add_disallowed(child, action, device, target, allow_suicide); } /*! 
* \internal * \brief Send a reply to a CPG peer or IPC client * * \param[in] reply XML reply to send * \param[in] call_options Send synchronously if st_opt_sync_call is set * \param[in] remote_peer If not NULL, name of peer node to send CPG reply * \param[in,out] client If not NULL, client to send IPC reply */ static void stonith_send_reply(const xmlNode *reply, int call_options, const char *remote_peer, pcmk__client_t *client) { CRM_CHECK((reply != NULL) && ((remote_peer != NULL) || (client != NULL)), return); if (remote_peer == NULL) { do_local_reply(reply, client, call_options); } else { const pcmk__node_status_t *node = pcmk__get_node(0, remote_peer, NULL, pcmk__node_search_cluster_member); pcmk__cluster_send_message(node, pcmk_ipc_fenced, reply); } } static void stonith_query_capable_device_cb(GList * devices, void *user_data) { struct st_query_data *query = user_data; int available_devices = 0; xmlNode *wrapper = NULL; xmlNode *list = NULL; GList *lpc = NULL; pcmk__client_t *client = NULL; if (query->client_id != NULL) { client = pcmk__find_client_by_id(query->client_id); if ((client == NULL) && (query->remote_peer == NULL)) { crm_trace("Skipping reply to %s: no longer a client", query->client_id); goto done; } } // Pack the results into XML wrapper = pcmk__xe_create(query->reply, PCMK__XE_ST_CALLDATA); list = pcmk__xe_create(wrapper, __func__); crm_xml_add(list, PCMK__XA_ST_TARGET, query->target); for (lpc = devices; lpc != NULL; lpc = lpc->next) { stonith_device_t *device = g_hash_table_lookup(device_list, lpc->data); const char *action = query->action; xmlNode *dev = NULL; if (!device) { /* It is possible the device got unregistered while * determining who can fence the target */ continue; } available_devices++; dev = pcmk__xe_create(list, PCMK__XE_ST_DEVICE_ID); crm_xml_add(dev, PCMK_XA_ID, device->id); crm_xml_add(dev, PCMK__XA_NAMESPACE, device->namespace); crm_xml_add(dev, PCMK_XA_AGENT, device->agent); // Has had successful monitor, list, or status on 
this node crm_xml_add_int(dev, PCMK__XA_ST_MONITOR_VERIFIED, device->verified); crm_xml_add_int(dev, PCMK__XA_ST_DEVICE_SUPPORT_FLAGS, device->flags); /* If the originating fencer wants to reboot the node, and we have a * capable device that doesn't support "reboot", remap to "off" instead. */ if (!pcmk_is_set(device->flags, st_device_supports_reboot) && pcmk__str_eq(query->action, PCMK_ACTION_REBOOT, pcmk__str_none)) { crm_trace("%s doesn't support reboot, using values for off instead", device->id); action = PCMK_ACTION_OFF; } /* Add action-specific values if available */ add_action_specific_attributes(dev, action, device, query->target); if (pcmk__str_eq(query->action, PCMK_ACTION_REBOOT, pcmk__str_none)) { /* A "reboot" *might* get remapped to "off" then "on", so after * sending the "reboot"-specific values in the main element, we add * sub-elements for "off" and "on" values. * * We short-circuited earlier if "reboot", "off" and "on" are all * disallowed for the local host. However if only one or two are * disallowed, we send back the results and mark which ones are * disallowed. If "reboot" is disallowed, this might cause problems * with older fencer versions, which won't check for it. Older * versions will ignore "off" and "on", so they are not a problem. 
*/ add_disallowed(dev, action, device, query->target, pcmk_is_set(query->call_options, st_opt_allow_suicide)); add_action_reply(dev, PCMK_ACTION_OFF, device, query->target, pcmk_is_set(query->call_options, st_opt_allow_suicide)); add_action_reply(dev, PCMK_ACTION_ON, device, query->target, FALSE); } /* A query without a target wants device parameters */ if (query->target == NULL) { xmlNode *attrs = pcmk__xe_create(dev, PCMK__XE_ATTRIBUTES); g_hash_table_foreach(device->params, hash2field, attrs); } } crm_xml_add_int(list, PCMK__XA_ST_AVAILABLE_DEVICES, available_devices); if (query->target) { crm_debug("Found %d matching device%s for target '%s'", available_devices, pcmk__plural_s(available_devices), query->target); } else { crm_debug("%d device%s installed", available_devices, pcmk__plural_s(available_devices)); } crm_log_xml_trace(list, "query-result"); stonith_send_reply(query->reply, query->call_options, query->remote_peer, client); done: pcmk__xml_free(query->reply); free(query->remote_peer); free(query->client_id); free(query->target); free(query->action); free(query); g_list_free_full(devices, free); } /*! * \internal * \brief Log the result of an asynchronous command * * \param[in] cmd Command the result is for * \param[in] result Result of command * \param[in] pid Process ID of command, if available * \param[in] next Alternate device that will be tried if command failed * \param[in] op_merged Whether this command was merged with an earlier one */ static void log_async_result(const async_command_t *cmd, const pcmk__action_result_t *result, int pid, const char *next, bool op_merged) { int log_level = LOG_ERR; int output_log_level = LOG_NEVER; guint devices_remaining = g_list_length(cmd->next_device_iter); GString *msg = g_string_sized_new(80); // Reasonable starting size // Choose log levels appropriately if we have a result if (pcmk__result_ok(result)) { log_level = (cmd->target == NULL)? 
LOG_DEBUG : LOG_NOTICE; if ((result->action_stdout != NULL) && !pcmk__str_eq(cmd->action, PCMK_ACTION_METADATA, pcmk__str_none)) { output_log_level = LOG_DEBUG; } next = NULL; } else { log_level = (cmd->target == NULL)? LOG_NOTICE : LOG_ERR; if ((result->action_stdout != NULL) && !pcmk__str_eq(cmd->action, PCMK_ACTION_METADATA, pcmk__str_none)) { output_log_level = LOG_WARNING; } } // Build the log message piece by piece pcmk__g_strcat(msg, "Operation '", cmd->action, "' ", NULL); if (pid != 0) { g_string_append_printf(msg, "[%d] ", pid); } if (cmd->target != NULL) { pcmk__g_strcat(msg, "targeting ", cmd->target, " ", NULL); } if (cmd->device != NULL) { pcmk__g_strcat(msg, "using ", cmd->device, " ", NULL); } // Add exit status or execution status as appropriate if (result->execution_status == PCMK_EXEC_DONE) { g_string_append_printf(msg, "returned %d", result->exit_status); } else { pcmk__g_strcat(msg, "could not be executed: ", pcmk_exec_status_str(result->execution_status), NULL); } // Add exit reason and next device if appropriate if (result->exit_reason != NULL) { pcmk__g_strcat(msg, " (", result->exit_reason, ")", NULL); } if (next != NULL) { pcmk__g_strcat(msg, ", retrying with ", next, NULL); } if (devices_remaining > 0) { g_string_append_printf(msg, " (%u device%s remaining)", (unsigned int) devices_remaining, pcmk__plural_s(devices_remaining)); } g_string_append_printf(msg, " " QB_XS " %scall %d from %s", (op_merged? "merged " : ""), cmd->id, cmd->client_name); // Log the result do_crm_log(log_level, "%s", msg->str); g_string_free(msg, TRUE); // Log the output (which may have multiple lines), if appropriate if (output_log_level != LOG_NEVER) { char *prefix = crm_strdup_printf("%s[%d]", cmd->device, pid); crm_log_output(output_log_level, prefix, result->action_stdout); free(prefix); } } /*! 
* \internal
 * \brief Reply to requester after asynchronous command completion
 *
 * The result is logged first. If the requesting client has gone away and there
 * is no originating peer, no reply is sent. If the action is a fencing action
 * whose target is also the originator, the result is broadcast as a
 * notification on the target's behalf; otherwise the reply goes only to the
 * originator.
 *
 * \param[in] cmd      Command that completed
 * \param[in] result   Result of command
 * \param[in] pid      Process ID of command, if available
 * \param[in] merged   If true, command was merged with another, not executed
 *
 * \note NOTE(review): the lines prefixed with '-' and '+' below are unified
 *       diff markers that are part of this text; they are kept as-is.
 */
static void
send_async_reply(const async_command_t *cmd, const pcmk__action_result_t *result,
                 int pid, bool merged)
{
    xmlNode *reply = NULL;
    pcmk__client_t *client = NULL;

    CRM_CHECK((cmd != NULL) && (result != NULL), return);

    log_async_result(cmd, result, pid, NULL, merged);

    if (cmd->client != NULL) {
        client = pcmk__find_client_by_id(cmd->client);
        // Without a client or an originating peer, nobody can be replied to
        if ((client == NULL) && (cmd->origin == NULL)) {
            crm_trace("Skipping reply to %s: no longer a client", cmd->client);
            return;
        }
    }

    reply = construct_async_reply(cmd, result);
    if (merged) {
        pcmk__xe_set_bool_attr(reply, PCMK__XA_ST_OP_MERGED, true);
    }

-    if (!stand_alone && pcmk__is_fencing_action(cmd->action)
+    if (pcmk__is_fencing_action(cmd->action)
        && pcmk__str_eq(cmd->origin, cmd->target, pcmk__str_casei)) {
        /* The target was also the originator, so broadcast the result on its
         * behalf (since it will be unable to).
         */
        crm_trace("Broadcast '%s' result for %s (target was also originator)",
                  cmd->action, cmd->target);
        crm_xml_add(reply, PCMK__XA_SUBT, PCMK__VALUE_BROADCAST);
        crm_xml_add(reply, PCMK__XA_ST_OP, STONITH_OP_NOTIFY);
        pcmk__cluster_send_message(NULL, pcmk_ipc_fenced, reply);
    } else {
        // Reply only to the originator
        stonith_send_reply(reply, cmd->options, cmd->origin, client);
    }

    crm_log_xml_trace(reply, "Reply");
    pcmk__xml_free(reply);
-
-    if (stand_alone) {
-        /* Do notification with a clean data object */
-        xmlNode *notify_data = pcmk__xe_create(NULL, PCMK__XE_ST_NOTIFY_FENCE);
-
-        stonith__xe_set_result(notify_data, result);
-        crm_xml_add(notify_data, PCMK__XA_ST_TARGET, cmd->target);
-        crm_xml_add(notify_data, PCMK__XA_ST_OP, cmd->op);
-        crm_xml_add(notify_data, PCMK__XA_ST_DELEGATE, "localhost");
-        crm_xml_add(notify_data, PCMK__XA_ST_DEVICE_ID, cmd->device);
-        crm_xml_add(notify_data, PCMK__XA_ST_REMOTE_OP, cmd->remote_op_id);
-        crm_xml_add(notify_data, PCMK__XA_ST_ORIGIN, cmd->client);
-
-        fenced_send_notification(PCMK__VALUE_ST_NOTIFY_FENCE, result,
-                                 notify_data);
-        fenced_send_notification(PCMK__VALUE_ST_NOTIFY_HISTORY, NULL, NULL);
-    }
}

/* Remove a command from the pending operations of the device it is assigned
 * to, if that device can be found. The command itself is not freed here.
 */
static void
cancel_stonith_command(async_command_t * cmd)
{
    stonith_device_t *device = cmd_device(cmd);

    if (device) {
        crm_trace("Cancel scheduled '%s' action using %s",
                  cmd->action, device->id);
        device->pending_ops = g_list_remove(device->pending_ops, cmd);
    }
}

/*!
 * \internal
 * \brief Cancel and reply to any duplicates of a just-completed operation
 *
 * Check whether any fencing operations are scheduled to do the same thing as
 * one that just succeeded. If so, rather than performing the same operation
 * twice, return the result of this operation for all matching pending commands.
* * \param[in,out] cmd Fencing operation that just succeeded * \param[in] result Result of \p cmd * \param[in] pid If nonzero, process ID of agent invocation (for logs) * * \note Duplicate merging will do the right thing for either type of remapped * reboot. If the executing fencer remapped an unsupported reboot to off, * then cmd->action will be "reboot" and will be merged with any other * reboot requests. If the originating fencer remapped a topology reboot * to off then on, we will get here once with cmd->action "off" and once * with "on", and they will be merged separately with similar requests. */ static void reply_to_duplicates(async_command_t *cmd, const pcmk__action_result_t *result, int pid) { GList *next = NULL; for (GList *iter = cmd_list; iter != NULL; iter = next) { async_command_t *cmd_other = iter->data; next = iter->next; // We might delete this entry, so grab next now if (cmd == cmd_other) { continue; } /* A pending operation matches if: * 1. The client connections are different. * 2. The target is the same. * 3. The fencing action is the same. * 4. The device scheduled to execute the action is the same. */ if (pcmk__str_eq(cmd->client, cmd_other->client, pcmk__str_casei) || !pcmk__str_eq(cmd->target, cmd_other->target, pcmk__str_casei) || !pcmk__str_eq(cmd->action, cmd_other->action, pcmk__str_none) || !pcmk__str_eq(cmd->device, cmd_other->device, pcmk__str_casei)) { continue; } crm_notice("Merging fencing action '%s'%s%s originating from " "client %s with identical fencing request from client %s", cmd_other->action, (cmd_other->target == NULL)? "" : " targeting ", pcmk__s(cmd_other->target, ""), cmd_other->client_name, cmd->client_name); // Stop tracking the duplicate, send its result, and cancel it cmd_list = g_list_remove_link(cmd_list, iter); send_async_reply(cmd_other, result, pid, true); cancel_stonith_command(cmd_other); free_async_command(cmd_other); g_list_free_1(iter); } } /*! 
* \internal * \brief Return the next required device (if any) for an operation * * \param[in,out] cmd Fencing operation that just succeeded * * \return Next device required for action if any, otherwise NULL */ static stonith_device_t * next_required_device(async_command_t *cmd) { for (GList *iter = cmd->next_device_iter; iter != NULL; iter = iter->next) { stonith_device_t *next_device = g_hash_table_lookup(device_list, iter->data); if (is_action_required(cmd->action, next_device)) { /* This is only called for successful actions, so it's OK to skip * non-required devices. */ cmd->next_device_iter = iter->next; return next_device; } } return NULL; } static void st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) { async_command_t *cmd = user_data; stonith_device_t *device = NULL; stonith_device_t *next_device = NULL; CRM_CHECK(cmd != NULL, return); device = cmd_device(cmd); cmd->active_on = NULL; /* The device is ready to do something else now */ if (device) { if (!device->verified && pcmk__result_ok(result) && pcmk__strcase_any_of(cmd->action, PCMK_ACTION_LIST, PCMK_ACTION_MONITOR, PCMK_ACTION_STATUS, NULL)) { device->verified = TRUE; } mainloop_set_trigger(device->work); } if (pcmk__result_ok(result)) { next_device = next_required_device(cmd); } else if ((cmd->next_device_iter != NULL) && !is_action_required(cmd->action, device)) { /* if this device didn't work out, see if there are any others we can try. * if the failed device was 'required', we can't pick another device. 
*/ next_device = g_hash_table_lookup(device_list, cmd->next_device_iter->data); cmd->next_device_iter = cmd->next_device_iter->next; } if (next_device == NULL) { send_async_reply(cmd, result, pid, false); if (pcmk__result_ok(result)) { reply_to_duplicates(cmd, result, pid); } free_async_command(cmd); } else { // This operation requires more fencing log_async_result(cmd, result, pid, next_device->id, false); schedule_stonith_command(cmd, next_device); } } static gint sort_device_priority(gconstpointer a, gconstpointer b) { const stonith_device_t *dev_a = a; const stonith_device_t *dev_b = b; if (dev_a->priority > dev_b->priority) { return -1; } else if (dev_a->priority < dev_b->priority) { return 1; } return 0; } static void stonith_fence_get_devices_cb(GList * devices, void *user_data) { async_command_t *cmd = user_data; stonith_device_t *device = NULL; guint ndevices = g_list_length(devices); crm_info("Found %d matching device%s for target '%s'", ndevices, pcmk__plural_s(ndevices), cmd->target); if (devices != NULL) { /* Order based on priority */ devices = g_list_sort(devices, sort_device_priority); device = g_hash_table_lookup(device_list, devices->data); } if (device == NULL) { // No device found pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; pcmk__format_result(&result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, "No device configured for target '%s'", cmd->target); send_async_reply(cmd, &result, 0, false); pcmk__reset_result(&result); free_async_command(cmd); g_list_free_full(devices, free); } else { // Device found, schedule it for fencing cmd->device_list = devices; cmd->next_device_iter = devices->next; schedule_stonith_command(cmd, device); } } /*! 
* \internal * \brief Execute a fence action via the local node * * \param[in] msg Fencing request * \param[out] result Where to store result of fence action */ static void fence_locally(xmlNode *msg, pcmk__action_result_t *result) { const char *device_id = NULL; stonith_device_t *device = NULL; async_command_t *cmd = NULL; xmlNode *dev = NULL; CRM_CHECK((msg != NULL) && (result != NULL), return); dev = get_xpath_object("//@" PCMK__XA_ST_TARGET, msg, LOG_ERR); cmd = create_async_command(msg); if (cmd == NULL) { crm_log_xml_warn(msg, "invalid"); fenced_set_protocol_error(result); return; } device_id = crm_element_value(dev, PCMK__XA_ST_DEVICE_ID); if (device_id != NULL) { device = g_hash_table_lookup(device_list, device_id); if (device == NULL) { crm_err("Requested device '%s' is not available", device_id); pcmk__format_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, "Requested device '%s' not found", device_id); return; } schedule_stonith_command(cmd, device); } else { const char *host = crm_element_value(dev, PCMK__XA_ST_TARGET); if (pcmk_is_set(cmd->options, st_opt_cs_nodeid)) { int nodeid = 0; pcmk__node_status_t *node = NULL; pcmk__scan_min_int(host, &nodeid, 0); node = pcmk__search_node_caches(nodeid, NULL, pcmk__node_search_any |pcmk__node_search_cluster_cib); if (node != NULL) { host = node->name; } } /* If we get to here, then self-fencing is implicitly allowed */ get_capable_devices(host, cmd->action, cmd->default_timeout, TRUE, cmd, stonith_fence_get_devices_cb, fenced_support_flag(cmd->action)); } pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); } /*! * \internal * \brief Build an XML reply for a fencing operation * * \param[in] request Request that reply is for * \param[in] data If not NULL, add to reply as call data * \param[in] result Full result of fencing operation * * \return Newly created XML reply * \note The caller is responsible for freeing the result. 
* \note This has some overlap with construct_async_reply(), but that copies * values from an async_command_t, whereas this one copies them from the * request. */ xmlNode * fenced_construct_reply(const xmlNode *request, xmlNode *data, const pcmk__action_result_t *result) { xmlNode *reply = NULL; reply = pcmk__xe_create(NULL, PCMK__XE_ST_REPLY); crm_xml_add(reply, PCMK__XA_ST_ORIGIN, __func__); crm_xml_add(reply, PCMK__XA_T, PCMK__VALUE_STONITH_NG); stonith__xe_set_result(reply, result); if (request == NULL) { /* Most likely, this is the result of a stonith operation that was * initiated before we came up. Unfortunately that means we lack enough * information to provide clients with a full result. * * @TODO Maybe synchronize this information at start-up? */ crm_warn("Missing request information for client notifications for " "operation with result '%s' (initiated before we came up?)", pcmk_exec_status_str(result->execution_status)); } else { const char *name = NULL; const char *value = NULL; // Attributes to copy from request to reply const char *names[] = { PCMK__XA_ST_OP, PCMK__XA_ST_CALLID, PCMK__XA_ST_CLIENTID, PCMK__XA_ST_CLIENTNAME, PCMK__XA_ST_REMOTE_OP, PCMK__XA_ST_CALLOPT, }; for (int lpc = 0; lpc < PCMK__NELEM(names); lpc++) { name = names[lpc]; value = crm_element_value(request, name); crm_xml_add(reply, name, value); } if (data != NULL) { xmlNode *wrapper = pcmk__xe_create(reply, PCMK__XE_ST_CALLDATA); pcmk__xml_copy(wrapper, data); } } return reply; } /*! 
* \internal * \brief Build an XML reply to an asynchronous fencing command * * \param[in] cmd Fencing command that reply is for * \param[in] result Command result */ static xmlNode * construct_async_reply(const async_command_t *cmd, const pcmk__action_result_t *result) { xmlNode *reply = pcmk__xe_create(NULL, PCMK__XE_ST_REPLY); crm_xml_add(reply, PCMK__XA_ST_ORIGIN, __func__); crm_xml_add(reply, PCMK__XA_T, PCMK__VALUE_STONITH_NG); crm_xml_add(reply, PCMK__XA_ST_OP, cmd->op); crm_xml_add(reply, PCMK__XA_ST_DEVICE_ID, cmd->device); crm_xml_add(reply, PCMK__XA_ST_REMOTE_OP, cmd->remote_op_id); crm_xml_add(reply, PCMK__XA_ST_CLIENTID, cmd->client); crm_xml_add(reply, PCMK__XA_ST_CLIENTNAME, cmd->client_name); crm_xml_add(reply, PCMK__XA_ST_TARGET, cmd->target); crm_xml_add(reply, PCMK__XA_ST_DEVICE_ACTION, cmd->op); crm_xml_add(reply, PCMK__XA_ST_ORIGIN, cmd->origin); crm_xml_add_int(reply, PCMK__XA_ST_CALLID, cmd->id); crm_xml_add_int(reply, PCMK__XA_ST_CALLOPT, cmd->options); stonith__xe_set_result(reply, result); return reply; } bool fencing_peer_active(pcmk__node_status_t *peer) { return (peer != NULL) && (peer->name != NULL) && pcmk_is_set(peer->processes, crm_get_cluster_proc()); } void set_fencing_completed(remote_fencing_op_t *op) { struct timespec tv; qb_util_timespec_from_epoch_get(&tv); op->completed = tv.tv_sec; op->completed_nsec = tv.tv_nsec; } /*! 
* \internal * \brief Look for alternate node needed if local node shouldn't fence target * * \param[in] target Node that must be fenced * * \return Name of an alternate node that should fence \p target if any, * or NULL otherwise */ static const char * check_alternate_host(const char *target) { if (pcmk__str_eq(target, fenced_get_local_node(), pcmk__str_casei)) { GHashTableIter gIter; pcmk__node_status_t *entry = NULL; g_hash_table_iter_init(&gIter, pcmk__peer_cache); while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { if (fencing_peer_active(entry) && !pcmk__str_eq(entry->name, target, pcmk__str_casei)) { crm_notice("Forwarding self-fencing request to %s", entry->name); return entry->name; } } crm_warn("Will handle own fencing because no peer can"); } return NULL; } static void remove_relay_op(xmlNode * request) { xmlNode *dev = get_xpath_object("//@" PCMK__XE_ST_DEVICE_ACTION, request, LOG_TRACE); const char *relay_op_id = NULL; const char *op_id = NULL; const char *client_name = NULL; const char *target = NULL; remote_fencing_op_t *relay_op = NULL; if (dev) { target = crm_element_value(dev, PCMK__XA_ST_TARGET); } relay_op_id = crm_element_value(request, PCMK__XA_ST_REMOTE_OP_RELAY); op_id = crm_element_value(request, PCMK__XA_ST_REMOTE_OP); client_name = crm_element_value(request, PCMK__XA_ST_CLIENTNAME); /* Delete RELAY operation. */ if ((relay_op_id != NULL) && (target != NULL) && pcmk__str_eq(target, fenced_get_local_node(), pcmk__str_casei)) { relay_op = g_hash_table_lookup(stonith_remote_op_list, relay_op_id); if (relay_op) { GHashTableIter iter; remote_fencing_op_t *list_op = NULL; g_hash_table_iter_init(&iter, stonith_remote_op_list); /* If the operation to be deleted is registered as a duplicate, delete the registration. 
*/ while (g_hash_table_iter_next(&iter, NULL, (void **)&list_op)) { GList *dup_iter = NULL; if (list_op != relay_op) { for (dup_iter = list_op->duplicates; dup_iter != NULL; dup_iter = dup_iter->next) { remote_fencing_op_t *other = dup_iter->data; if (other == relay_op) { other->duplicates = g_list_remove(other->duplicates, relay_op); break; } } } } crm_debug("Deleting relay op %s ('%s'%s%s for %s), " "replaced by op %s ('%s'%s%s for %s)", relay_op->id, relay_op->action, (relay_op->target == NULL)? "" : " targeting ", pcmk__s(relay_op->target, ""), relay_op->client_name, op_id, relay_op->action, (target == NULL)? "" : " targeting ", pcmk__s(target, ""), client_name); g_hash_table_remove(stonith_remote_op_list, relay_op_id); } } } /*! * \internal * \brief Check whether an API request was sent by a privileged user * * API commands related to fencing configuration may be done only by privileged * IPC users (i.e. root or hacluster), because all other users should go through * the CIB to have ACLs applied. If no client was given, this is a peer request, * which is always allowed. 
* * \param[in] c IPC client that sent request (or NULL if sent by CPG peer) * \param[in] op Requested API operation (for logging only) * * \return true if sender is peer or privileged client, otherwise false */ static inline bool is_privileged(const pcmk__client_t *c, const char *op) { if ((c == NULL) || pcmk_is_set(c->flags, pcmk__client_privileged)) { return true; } else { crm_warn("Rejecting IPC request '%s' from unprivileged client %s", pcmk__s(op, ""), pcmk__client_name(c)); return false; } } // CRM_OP_REGISTER static xmlNode * handle_register_request(pcmk__request_t *request) { xmlNode *reply = pcmk__xe_create(NULL, "reply"); CRM_ASSERT(request->ipc_client != NULL); crm_xml_add(reply, PCMK__XA_ST_OP, CRM_OP_REGISTER); crm_xml_add(reply, PCMK__XA_ST_CLIENTID, request->ipc_client->id); pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); pcmk__set_request_flags(request, pcmk__request_reuse_options); return reply; } // STONITH_OP_EXEC static xmlNode * handle_agent_request(pcmk__request_t *request) { execute_agent_action(request->xml, &request->result); if (request->result.execution_status == PCMK_EXEC_PENDING) { return NULL; } return fenced_construct_reply(request->xml, NULL, &request->result); } // STONITH_OP_TIMEOUT_UPDATE static xmlNode * handle_update_timeout_request(pcmk__request_t *request) { const char *call_id = crm_element_value(request->xml, PCMK__XA_ST_CALLID); const char *client_id = crm_element_value(request->xml, PCMK__XA_ST_CLIENTID); int op_timeout = 0; crm_element_value_int(request->xml, PCMK__XA_ST_TIMEOUT, &op_timeout); do_stonith_async_timeout_update(client_id, call_id, op_timeout); pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); return NULL; } // STONITH_OP_QUERY static xmlNode * handle_query_request(pcmk__request_t *request) { int timeout = 0; xmlNode *dev = NULL; const char *action = NULL; const char *target = NULL; const char *client_id = crm_element_value(request->xml, PCMK__XA_ST_CLIENTID); struct 
st_query_data *query = NULL; if (request->peer != NULL) { // Record it for the future notification create_remote_stonith_op(client_id, request->xml, TRUE); } /* Delete the DC node RELAY operation. */ remove_relay_op(request->xml); pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); dev = get_xpath_object("//@" PCMK__XE_ST_DEVICE_ACTION, request->xml, LOG_NEVER); if (dev != NULL) { const char *device = crm_element_value(dev, PCMK__XA_ST_DEVICE_ID); if (pcmk__str_eq(device, "manual_ack", pcmk__str_casei)) { return NULL; // No query or reply necessary } target = crm_element_value(dev, PCMK__XA_ST_TARGET); action = crm_element_value(dev, PCMK__XA_ST_DEVICE_ACTION); } crm_log_xml_trace(request->xml, "Query"); query = pcmk__assert_alloc(1, sizeof(struct st_query_data)); query->reply = fenced_construct_reply(request->xml, NULL, &request->result); query->remote_peer = pcmk__str_copy(request->peer); query->client_id = pcmk__str_copy(client_id); query->target = pcmk__str_copy(target); query->action = pcmk__str_copy(action); query->call_options = request->call_options; crm_element_value_int(request->xml, PCMK__XA_ST_TIMEOUT, &timeout); get_capable_devices(target, action, timeout, pcmk_is_set(query->call_options, st_opt_allow_suicide), query, stonith_query_capable_device_cb, st_device_supports_none); return NULL; } // STONITH_OP_NOTIFY static xmlNode * handle_notify_request(pcmk__request_t *request) { const char *flag_name = NULL; CRM_ASSERT(request->ipc_client != NULL); flag_name = crm_element_value(request->xml, PCMK__XA_ST_NOTIFY_ACTIVATE); if (flag_name != NULL) { crm_debug("Enabling %s callbacks for client %s", flag_name, pcmk__request_origin(request)); pcmk__set_client_flags(request->ipc_client, get_stonith_flag(flag_name)); } flag_name = crm_element_value(request->xml, PCMK__XA_ST_NOTIFY_DEACTIVATE); if (flag_name != NULL) { crm_debug("Disabling %s callbacks for client %s", flag_name, pcmk__request_origin(request)); 
pcmk__clear_client_flags(request->ipc_client, get_stonith_flag(flag_name)); } pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); pcmk__set_request_flags(request, pcmk__request_reuse_options); return pcmk__ipc_create_ack(request->ipc_flags, PCMK__XE_ACK, NULL, CRM_EX_OK); } // STONITH_OP_RELAY static xmlNode * handle_relay_request(pcmk__request_t *request) { xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_TARGET, request->xml, LOG_TRACE); crm_notice("Received forwarded fencing request from " "%s %s to fence (%s) peer %s", pcmk__request_origin_type(request), pcmk__request_origin(request), crm_element_value(dev, PCMK__XA_ST_DEVICE_ACTION), crm_element_value(dev, PCMK__XA_ST_TARGET)); if (initiate_remote_stonith_op(NULL, request->xml, FALSE) == NULL) { fenced_set_protocol_error(&request->result); return fenced_construct_reply(request->xml, NULL, &request->result); } pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); return NULL; } // STONITH_OP_FENCE static xmlNode * handle_fence_request(pcmk__request_t *request) { - if ((request->peer != NULL) || stand_alone) { + if (request->peer != NULL) { fence_locally(request->xml, &request->result); } else if (pcmk_is_set(request->call_options, st_opt_manual_ack)) { switch (fenced_handle_manual_confirmation(request->ipc_client, request->xml)) { case pcmk_rc_ok: pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); break; case EINPROGRESS: pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); break; default: fenced_set_protocol_error(&request->result); break; } } else { const char *alternate_host = NULL; xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_TARGET, request->xml, LOG_TRACE); const char *target = crm_element_value(dev, PCMK__XA_ST_TARGET); const char *action = crm_element_value(dev, PCMK__XA_ST_DEVICE_ACTION); const char *device = crm_element_value(dev, PCMK__XA_ST_DEVICE_ID); if (request->ipc_client != NULL) { int tolerance = 0; 
crm_notice("Client %s wants to fence (%s) %s using %s", pcmk__request_origin(request), action, target, (device? device : "any device")); crm_element_value_int(dev, PCMK__XA_ST_TOLERANCE, &tolerance); if (stonith_check_fence_tolerance(tolerance, target, action)) { pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); return fenced_construct_reply(request->xml, NULL, &request->result); } alternate_host = check_alternate_host(target); } else { crm_notice("Peer %s wants to fence (%s) '%s' with device '%s'", request->peer, action, target, (device == NULL)? "(any)" : device); } if (alternate_host != NULL) { const char *client_id = NULL; remote_fencing_op_t *op = NULL; pcmk__node_status_t *node = pcmk__get_node(0, alternate_host, NULL, pcmk__node_search_cluster_member); if (request->ipc_client->id == 0) { client_id = crm_element_value(request->xml, PCMK__XA_ST_CLIENTID); } else { client_id = request->ipc_client->id; } /* Create a duplicate fencing operation to relay with the client ID. * When a query response is received, this operation should be * deleted to avoid keeping the duplicate around. 
*/ op = create_remote_stonith_op(client_id, request->xml, FALSE); crm_xml_add(request->xml, PCMK__XA_ST_OP, STONITH_OP_RELAY); crm_xml_add(request->xml, PCMK__XA_ST_CLIENTID, request->ipc_client->id); crm_xml_add(request->xml, PCMK__XA_ST_REMOTE_OP, op->id); pcmk__cluster_send_message(node, pcmk_ipc_fenced, request->xml); pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); } else if (initiate_remote_stonith_op(request->ipc_client, request->xml, FALSE) == NULL) { fenced_set_protocol_error(&request->result); } else { pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); } } if (request->result.execution_status == PCMK_EXEC_PENDING) { return NULL; } return fenced_construct_reply(request->xml, NULL, &request->result); } // STONITH_OP_FENCE_HISTORY static xmlNode * handle_history_request(pcmk__request_t *request) { xmlNode *reply = NULL; xmlNode *data = NULL; stonith_fence_history(request->xml, &data, request->peer, request->call_options); pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); if (!pcmk_is_set(request->call_options, st_opt_discard_reply)) { /* When the local node broadcasts its history, it sets * st_opt_discard_reply and doesn't need a reply. */ reply = fenced_construct_reply(request->xml, data, &request->result); } pcmk__xml_free(data); return reply; } // STONITH_OP_DEVICE_ADD static xmlNode * handle_device_add_request(pcmk__request_t *request) { const char *op = crm_element_value(request->xml, PCMK__XA_ST_OP); xmlNode *dev = get_xpath_object("//" PCMK__XE_ST_DEVICE_ID, request->xml, LOG_ERR); if (is_privileged(request->ipc_client, op)) { int rc = stonith_device_register(dev, FALSE); pcmk__set_result(&request->result, ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), stonith__legacy2status(rc), ((rc == pcmk_ok)? 
NULL : pcmk_strerror(rc))); } else { pcmk__set_result(&request->result, CRM_EX_INSUFFICIENT_PRIV, PCMK_EXEC_INVALID, "Unprivileged users must register device via CIB"); } fenced_send_config_notification(op, &request->result, (dev == NULL)? NULL : pcmk__xe_id(dev)); return fenced_construct_reply(request->xml, NULL, &request->result); } // STONITH_OP_DEVICE_DEL static xmlNode * handle_device_delete_request(pcmk__request_t *request) { xmlNode *dev = get_xpath_object("//" PCMK__XE_ST_DEVICE_ID, request->xml, LOG_ERR); const char *device_id = crm_element_value(dev, PCMK_XA_ID); const char *op = crm_element_value(request->xml, PCMK__XA_ST_OP); if (is_privileged(request->ipc_client, op)) { stonith_device_remove(device_id, false); pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); } else { pcmk__set_result(&request->result, CRM_EX_INSUFFICIENT_PRIV, PCMK_EXEC_INVALID, "Unprivileged users must delete device via CIB"); } fenced_send_config_notification(op, &request->result, device_id); return fenced_construct_reply(request->xml, NULL, &request->result); } // STONITH_OP_LEVEL_ADD static xmlNode * handle_level_add_request(pcmk__request_t *request) { char *desc = NULL; const char *op = crm_element_value(request->xml, PCMK__XA_ST_OP); if (is_privileged(request->ipc_client, op)) { fenced_register_level(request->xml, &desc, &request->result); } else { unpack_level_request(request->xml, NULL, NULL, NULL, &desc); pcmk__set_result(&request->result, CRM_EX_INSUFFICIENT_PRIV, PCMK_EXEC_INVALID, "Unprivileged users must add level via CIB"); } fenced_send_config_notification(op, &request->result, desc); free(desc); return fenced_construct_reply(request->xml, NULL, &request->result); } // STONITH_OP_LEVEL_DEL static xmlNode * handle_level_delete_request(pcmk__request_t *request) { char *desc = NULL; const char *op = crm_element_value(request->xml, PCMK__XA_ST_OP); if (is_privileged(request->ipc_client, op)) { fenced_unregister_level(request->xml, &desc, 
&request->result); } else { unpack_level_request(request->xml, NULL, NULL, NULL, &desc); pcmk__set_result(&request->result, CRM_EX_INSUFFICIENT_PRIV, PCMK_EXEC_INVALID, "Unprivileged users must delete level via CIB"); } fenced_send_config_notification(op, &request->result, desc); free(desc); return fenced_construct_reply(request->xml, NULL, &request->result); } // CRM_OP_RM_NODE_CACHE static xmlNode * handle_cache_request(pcmk__request_t *request) { int node_id = 0; const char *name = NULL; crm_element_value_int(request->xml, PCMK_XA_ID, &node_id); name = crm_element_value(request->xml, PCMK_XA_UNAME); pcmk__cluster_forget_cluster_node(node_id, name); pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); return NULL; } static xmlNode * handle_unknown_request(pcmk__request_t *request) { crm_err("Unknown IPC request %s from %s %s", request->op, pcmk__request_origin_type(request), pcmk__request_origin(request)); pcmk__format_result(&request->result, CRM_EX_PROTOCOL, PCMK_EXEC_INVALID, "Unknown IPC request type '%s' (bug?)", request->op); return fenced_construct_reply(request->xml, NULL, &request->result); } static void fenced_register_handlers(void) { pcmk__server_command_t handlers[] = { { CRM_OP_REGISTER, handle_register_request }, { STONITH_OP_EXEC, handle_agent_request }, { STONITH_OP_TIMEOUT_UPDATE, handle_update_timeout_request }, { STONITH_OP_QUERY, handle_query_request }, { STONITH_OP_NOTIFY, handle_notify_request }, { STONITH_OP_RELAY, handle_relay_request }, { STONITH_OP_FENCE, handle_fence_request }, { STONITH_OP_FENCE_HISTORY, handle_history_request }, { STONITH_OP_DEVICE_ADD, handle_device_add_request }, { STONITH_OP_DEVICE_DEL, handle_device_delete_request }, { STONITH_OP_LEVEL_ADD, handle_level_add_request }, { STONITH_OP_LEVEL_DEL, handle_level_delete_request }, { CRM_OP_RM_NODE_CACHE, handle_cache_request }, { NULL, handle_unknown_request }, }; fenced_handlers = pcmk__register_handlers(handlers); } void fenced_unregister_handlers(void) 
{ if (fenced_handlers != NULL) { g_hash_table_destroy(fenced_handlers); fenced_handlers = NULL; } } static void handle_request(pcmk__request_t *request) { xmlNode *reply = NULL; const char *reason = NULL; if (fenced_handlers == NULL) { fenced_register_handlers(); } reply = pcmk__process_request(request, fenced_handlers); if (reply != NULL) { if (pcmk_is_set(request->flags, pcmk__request_reuse_options) && (request->ipc_client != NULL)) { /* Certain IPC-only commands must reuse the call options from the * original request rather than the ones set by stonith_send_reply() * -> do_local_reply(). */ pcmk__ipc_send_xml(request->ipc_client, request->ipc_id, reply, request->ipc_flags); request->ipc_client->request_id = 0; } else { stonith_send_reply(reply, request->call_options, request->peer, request->ipc_client); } pcmk__xml_free(reply); } reason = request->result.exit_reason; crm_debug("Processed %s request from %s %s: %s%s%s%s", request->op, pcmk__request_origin_type(request), pcmk__request_origin(request), pcmk_exec_status_str(request->result.execution_status), (reason == NULL)? "" : " (", (reason == NULL)? "" : reason, (reason == NULL)? "" : ")"); } static void handle_reply(pcmk__client_t *client, xmlNode *request, const char *remote_peer) { // Copy, because request might be freed before we want to log this char *op = crm_element_value_copy(request, PCMK__XA_ST_OP); if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { process_remote_stonith_query(request); } else if (pcmk__str_any_of(op, STONITH_OP_NOTIFY, STONITH_OP_FENCE, NULL)) { fenced_process_fencing_reply(request); } else { crm_err("Ignoring unknown %s reply from %s %s", pcmk__s(op, "untyped"), ((client == NULL)? "peer" : "client"), ((client == NULL)? remote_peer : pcmk__client_name(client))); crm_log_xml_warn(request, "UnknownOp"); free(op); return; } crm_debug("Processed %s reply from %s %s", op, ((client == NULL)? "peer" : "client"), ((client == NULL)? 
remote_peer : pcmk__client_name(client))); free(op); } /*! * \internal * \brief Handle a message from an IPC client or CPG peer * * \param[in,out] client If not NULL, IPC client that sent message * \param[in] id If from IPC client, IPC message ID * \param[in] flags Message flags * \param[in,out] message Message XML * \param[in] remote_peer If not NULL, CPG peer that sent message */ void stonith_command(pcmk__client_t *client, uint32_t id, uint32_t flags, xmlNode *message, const char *remote_peer) { int call_options = st_opt_none; bool is_reply = false; CRM_CHECK(message != NULL, return); if (get_xpath_object("//" PCMK__XE_ST_REPLY, message, LOG_NEVER) != NULL) { is_reply = true; } crm_element_value_int(message, PCMK__XA_ST_CALLOPT, &call_options); crm_debug("Processing %ssynchronous %s %s %u from %s %s", pcmk_is_set(call_options, st_opt_sync_call)? "" : "a", crm_element_value(message, PCMK__XA_ST_OP), (is_reply? "reply" : "request"), id, ((client == NULL)? "peer" : "client"), ((client == NULL)? 
remote_peer : pcmk__client_name(client))); if (pcmk_is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(client == NULL || client->request_id == id); } if (is_reply) { handle_reply(client, message, remote_peer); } else { pcmk__request_t request = { .ipc_client = client, .ipc_id = id, .ipc_flags = flags, .peer = remote_peer, .xml = message, .call_options = call_options, .result = PCMK__UNKNOWN_RESULT, }; request.op = crm_element_value_copy(request.xml, PCMK__XA_ST_OP); CRM_CHECK(request.op != NULL, return); if (pcmk_is_set(request.call_options, st_opt_sync_call)) { pcmk__set_request_flags(&request, pcmk__request_sync); } handle_request(&request); pcmk__reset_request(&request); } } diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c index 9dc36d62ce..cbd8f2a6a4 100644 --- a/daemons/fenced/pacemaker-fenced.c +++ b/daemons/fenced/pacemaker-fenced.c @@ -1,677 +1,665 @@ /* * Copyright 2009-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. 
*/ #include #include #include #include #include #include #include #include #include #include #include // PRIu32, PRIx32 #include #include #include #include #include #include #include #include #include #include #include #include #define SUMMARY "daemon for executing fencing devices in a Pacemaker cluster" long long stonith_watchdog_timeout_ms = 0; GList *stonith_watchdog_targets = NULL; static GMainLoop *mainloop = NULL; -gboolean stand_alone = FALSE; gboolean stonith_shutdown_flag = FALSE; static qb_ipcs_service_t *ipcs = NULL; static pcmk__output_t *out = NULL; pcmk__supported_format_t formats[] = { PCMK__SUPPORTED_FORMAT_NONE, PCMK__SUPPORTED_FORMAT_TEXT, PCMK__SUPPORTED_FORMAT_XML, { NULL, NULL, NULL } }; static struct { bool no_cib_connect; gchar **log_files; } options; crm_exit_t exit_code = CRM_EX_OK; static void stonith_cleanup(void); static int32_t st_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) { if (stonith_shutdown_flag) { crm_info("Ignoring new client [%d] during shutdown", pcmk__client_pid(c)); return -ECONNREFUSED; } if (pcmk__new_client(c, uid, gid) == NULL) { return -ENOMEM; } return 0; } /* Exit code means? 
*/ static int32_t st_ipc_dispatch(qb_ipcs_connection_t * qbc, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; int call_options = 0; xmlNode *request = NULL; pcmk__client_t *c = pcmk__find_client(qbc); const char *op = NULL; if (c == NULL) { crm_info("Invalid client: %p", qbc); return 0; } request = pcmk__client_data2xml(c, data, &id, &flags); if (request == NULL) { pcmk__ipc_send_ack(c, id, flags, PCMK__XE_NACK, NULL, CRM_EX_PROTOCOL); return 0; } op = crm_element_value(request, PCMK__XA_CRM_TASK); if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { crm_xml_add(request, PCMK__XA_T, PCMK__VALUE_STONITH_NG); crm_xml_add(request, PCMK__XA_ST_OP, op); crm_xml_add(request, PCMK__XA_ST_CLIENTID, c->id); crm_xml_add(request, PCMK__XA_ST_CLIENTNAME, pcmk__client_name(c)); crm_xml_add(request, PCMK__XA_ST_CLIENTNODE, fenced_get_local_node()); pcmk__cluster_send_message(NULL, pcmk_ipc_fenced, request); pcmk__xml_free(request); return 0; } if (c->name == NULL) { const char *value = crm_element_value(request, PCMK__XA_ST_CLIENTNAME); c->name = crm_strdup_printf("%s.%u", pcmk__s(value, "unknown"), c->pid); } crm_element_value_int(request, PCMK__XA_ST_CALLOPT, &call_options); crm_trace("Flags %#08" PRIx32 "/%#08x for command %" PRIu32 " from client %s", flags, call_options, id, pcmk__client_name(c)); if (pcmk_is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(flags & crm_ipc_client_response); CRM_LOG_ASSERT(c->request_id == 0); /* This means the client has two synchronous events in-flight */ c->request_id = id; /* Reply only to the last one */ } crm_xml_add(request, PCMK__XA_ST_CLIENTID, c->id); crm_xml_add(request, PCMK__XA_ST_CLIENTNAME, pcmk__client_name(c)); crm_xml_add(request, PCMK__XA_ST_CLIENTNODE, fenced_get_local_node()); crm_log_xml_trace(request, "ipc-received"); stonith_command(c, id, flags, request, NULL); pcmk__xml_free(request); return 0; } /* Error code means? 
*/ static int32_t st_ipc_closed(qb_ipcs_connection_t * c) { pcmk__client_t *client = pcmk__find_client(c); if (client == NULL) { return 0; } crm_trace("Connection %p closed", c); pcmk__free_client(client); /* 0 means: yes, go ahead and destroy the connection */ return 0; } static void st_ipc_destroy(qb_ipcs_connection_t * c) { crm_trace("Connection %p destroyed", c); st_ipc_closed(c); } static void stonith_peer_callback(xmlNode * msg, void *private_data) { const char *remote_peer = crm_element_value(msg, PCMK__XA_SRC); const char *op = crm_element_value(msg, PCMK__XA_ST_OP); if (pcmk__str_eq(op, STONITH_OP_POKE, pcmk__str_none)) { return; } crm_log_xml_trace(msg, "Peer[inbound]"); stonith_command(NULL, 0, 0, msg, remote_peer); } #if SUPPORT_COROSYNC static void stonith_peer_ais_callback(cpg_handle_t handle, const struct cpg_name *groupName, uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) { xmlNode *xml = NULL; const char *from = NULL; char *data = pcmk__cpg_message_data(handle, nodeid, pid, msg, &from); if(data == NULL) { return; } xml = pcmk__xml_parse(data); if (xml == NULL) { crm_err("Invalid XML: '%.120s'", data); free(data); return; } crm_xml_add(xml, PCMK__XA_SRC, from); stonith_peer_callback(xml, NULL); pcmk__xml_free(xml); free(data); } static void stonith_peer_cs_destroy(gpointer user_data) { crm_crit("Lost connection to cluster layer, shutting down"); stonith_shutdown(0); } #endif void do_local_reply(const xmlNode *notify_src, pcmk__client_t *client, int call_options) { /* send callback to originating child */ int local_rc = pcmk_rc_ok; int rid = 0; uint32_t ipc_flags = crm_ipc_server_event; if (pcmk_is_set(call_options, st_opt_sync_call)) { CRM_LOG_ASSERT(client->request_id); rid = client->request_id; client->request_id = 0; ipc_flags = crm_ipc_flags_none; } local_rc = pcmk__ipc_send_xml(client, rid, notify_src, ipc_flags); if (local_rc == pcmk_rc_ok) { crm_trace("Sent response %d to client %s", rid, pcmk__client_name(client)); } else { 
crm_warn("%synchronous reply to client %s failed: %s", (pcmk_is_set(call_options, st_opt_sync_call)? "S" : "As"), pcmk__client_name(client), pcmk_rc_str(local_rc)); } } uint64_t get_stonith_flag(const char *name) { if (pcmk__str_eq(name, PCMK__VALUE_ST_NOTIFY_FENCE, pcmk__str_none)) { return st_callback_notify_fence; } else if (pcmk__str_eq(name, STONITH_OP_DEVICE_ADD, pcmk__str_casei)) { return st_callback_device_add; } else if (pcmk__str_eq(name, STONITH_OP_DEVICE_DEL, pcmk__str_casei)) { return st_callback_device_del; } else if (pcmk__str_eq(name, PCMK__VALUE_ST_NOTIFY_HISTORY, pcmk__str_none)) { return st_callback_notify_history; } else if (pcmk__str_eq(name, PCMK__VALUE_ST_NOTIFY_HISTORY_SYNCED, pcmk__str_none)) { return st_callback_notify_history_synced; } return st_callback_unknown; } static void stonith_notify_client(gpointer key, gpointer value, gpointer user_data) { const xmlNode *update_msg = user_data; pcmk__client_t *client = value; const char *type = NULL; CRM_CHECK(client != NULL, return); CRM_CHECK(update_msg != NULL, return); type = crm_element_value(update_msg, PCMK__XA_SUBT); CRM_CHECK(type != NULL, crm_log_xml_err(update_msg, "notify"); return); if (client->ipcs == NULL) { crm_trace("Skipping client with NULL channel"); return; } if (pcmk_is_set(client->flags, get_stonith_flag(type))) { int rc = pcmk__ipc_send_xml(client, 0, update_msg, crm_ipc_server_event); if (rc != pcmk_rc_ok) { crm_warn("%s notification of client %s failed: %s " QB_XS " id=%.8s rc=%d", type, pcmk__client_name(client), pcmk_rc_str(rc), client->id, rc); } else { crm_trace("Sent %s notification to client %s", type, pcmk__client_name(client)); } } } void do_stonith_async_timeout_update(const char *client_id, const char *call_id, int timeout) { pcmk__client_t *client = NULL; xmlNode *notify_data = NULL; if (!timeout || !call_id || !client_id) { return; } client = pcmk__find_client_by_id(client_id); if (!client) { return; } notify_data = pcmk__xe_create(NULL, 
PCMK__XE_ST_ASYNC_TIMEOUT_VALUE); crm_xml_add(notify_data, PCMK__XA_T, PCMK__VALUE_ST_ASYNC_TIMEOUT_VALUE); crm_xml_add(notify_data, PCMK__XA_ST_CALLID, call_id); crm_xml_add_int(notify_data, PCMK__XA_ST_TIMEOUT, timeout); crm_trace("timeout update is %d for client %s and call id %s", timeout, client_id, call_id); if (client) { pcmk__ipc_send_xml(client, 0, notify_data, crm_ipc_server_event); } pcmk__xml_free(notify_data); } /*! * \internal * \brief Notify relevant IPC clients of a fencing operation result * * \param[in] type Notification type * \param[in] result Result of fencing operation (assume success if NULL) * \param[in] data If not NULL, add to notification as call data */ void fenced_send_notification(const char *type, const pcmk__action_result_t *result, xmlNode *data) { /* TODO: Standardize the contents of data */ xmlNode *update_msg = pcmk__xe_create(NULL, PCMK__XE_NOTIFY); CRM_LOG_ASSERT(type != NULL); crm_xml_add(update_msg, PCMK__XA_T, PCMK__VALUE_ST_NOTIFY); crm_xml_add(update_msg, PCMK__XA_SUBT, type); crm_xml_add(update_msg, PCMK__XA_ST_OP, type); stonith__xe_set_result(update_msg, result); if (data != NULL) { xmlNode *wrapper = pcmk__xe_create(update_msg, PCMK__XE_ST_CALLDATA); pcmk__xml_copy(wrapper, data); } crm_trace("Notifying clients"); pcmk__foreach_ipc_client(stonith_notify_client, update_msg); pcmk__xml_free(update_msg); crm_trace("Notify complete"); } /*! 
* \internal
 * \brief Send notifications for a configuration change to subscribed clients
 *
 * Builds a temporary element named after \p op, stores \p desc in it as the
 * device ID attribute, passes it to fenced_send_notification() as call data,
 * and frees it afterwards.
 *
 * \param[in] op      Notification type (\c STONITH_OP_DEVICE_ADD,
 *                    \c STONITH_OP_DEVICE_DEL, \c STONITH_OP_LEVEL_ADD, or
 *                    \c STONITH_OP_LEVEL_DEL)
 * \param[in] result  Operation result
 * \param[in] desc    Description of what changed (either device ID or string
 *                    representation of level
 *                    ([]))
 */
void
fenced_send_config_notification(const char *op,
                                const pcmk__action_result_t *result,
                                const char *desc)
{
    xmlNode *notify_data = pcmk__xe_create(NULL, op);

    crm_xml_add(notify_data, PCMK__XA_ST_DEVICE_ID, desc);

    fenced_send_notification(op, result, notify_data);
    pcmk__xml_free(notify_data);
}

/*!
 * \internal
 * \brief Check whether a node does watchdog-fencing
 *
 * \param[in] node    Name of node to check
 *
 * \return TRUE if node found in stonith_watchdog_targets
 *         or stonith_watchdog_targets is empty indicating
 *         all nodes are doing watchdog-fencing
 *
 * \note The name comparison is case-insensitive (\c pcmk__str_casei).
 */
gboolean
node_does_watchdog_fencing(const char *node)
{
    return ((stonith_watchdog_targets == NULL) ||
            pcmk__str_in_list(node, stonith_watchdog_targets, pcmk__str_casei));
}

/*!
 * \internal
 * \brief Begin shutting down the fencer
 *
 * Sets \c stonith_shutdown_flag and, if the main loop exists and is running,
 * asks it to quit. Registered as the SIGTERM handler in main().
 *
 * \param[in] nsig  Not used by this function
 */
void
stonith_shutdown(int nsig)
{
    crm_info("Terminating with %d clients", pcmk__ipc_client_count());
    stonith_shutdown_flag = TRUE;
    if (mainloop != NULL && g_main_loop_is_running(mainloop)) {
        g_main_loop_quit(mainloop);
    }
}

/*!
 * \internal
 * \brief Release the fencer's global resources before exit
 *
 * Called from the \c done label of main(), including on early-exit paths.
 */
static void
stonith_cleanup(void)
{
    fenced_cib_cleanup();

    // The IPC server exists only if start-up got as far as serving IPC
    if (ipcs) {
        qb_ipcs_destroy(ipcs);
    }

    pcmk__cluster_destroy_node_caches();
    pcmk__client_cleanup();
    free_stonith_remote_op_list();
    free_topology_list();
    free_device_list();
    free_metadata_cache();
    fenced_unregister_handlers();
}

/*!
 * \internal
 * \brief Option callback for \c --stand-alone-w-cpg (\c -c)
 *
 * Sets \c options.no_cib_connect, which makes main() skip setup_cib().
 *
 * \param[in]  option_name  Not used by this function
 * \param[in]  optarg       Not used by this function
 * \param[in]  data         Not used by this function
 * \param[out] error        Not used by this function
 *
 * \return TRUE in every case
 */
/* NOTE(review): this text comes from a unified diff; the line starting with
 * '-' in the body below is a line the patch removes, not a C expression.
 */
static gboolean
stand_alone_cpg_cb(const gchar *option_name, const gchar *optarg, gpointer data,
                   GError **error)
{
-    stand_alone = FALSE;
    options.no_cib_connect = true;
    return TRUE;
}

// libqb IPC server callbacks; no handler is set for connection creation
struct qb_ipcs_service_handlers ipc_callbacks = {
    .connection_accept = st_ipc_accept,
    .connection_created = NULL,
    .msg_process = st_ipc_dispatch,
    .connection_closed = st_ipc_closed,
    .connection_destroyed = st_ipc_destroy
};

/*!
 * \internal
 * \brief Callback for peer status changes
 *
 * For any change other than a process update, on a node not flagged as
 * remote, broadcasts a \c STONITH_OP_POKE message to the cluster.
 *
 * \param[in] type  What changed
 * \param[in] node  What peer had the change
 * \param[in] data  Previous value of what changed
 */
static void
st_peer_update_callback(enum pcmk__node_update type, pcmk__node_status_t *node,
                        const void *data)
{
    if ((type != pcmk__node_update_processes)
        && !pcmk_is_set(node->flags, pcmk__node_status_remote)) {
        /*
         * This is a hack until we can send to a nodeid and/or we fix node name
         * lookups. These messages are ignored in stonith_peer_callback().
         */
        xmlNode *query = pcmk__xe_create(NULL, PCMK__XE_STONITH_COMMAND);

        crm_xml_add(query, PCMK__XA_T, PCMK__VALUE_STONITH_NG);
        crm_xml_add(query, PCMK__XA_ST_OP, STONITH_OP_POKE);

        crm_debug("Broadcasting our uname because of node %" PRIu32,
                  node->cluster_layer_id);
        pcmk__cluster_send_message(NULL, pcmk_ipc_fenced, query);

        pcmk__xml_free(query);
    }
}

/* @COMPAT Deprecated since 2.1.8. Use pcmk_list_fence_attrs() or
 * crm_resource --list-options=fencing instead of querying daemon metadata.
*
 * NOTE: pcs (as of at least 0.11.8) uses this
 */
/*!
 * \internal
 * \brief Output the fencer's daemon metadata
 *
 * \return Return value of pcmk__daemon_metadata()
 */
static int
fencer_metadata(void)
{
    const char *name = PCMK__SERVER_FENCED;
    const char *desc_short = N_("Instance attributes available for all "
                                "\"stonith\"-class resources");
    const char *desc_long = N_("Instance attributes available for all "
                               "\"stonith\"-class resources and used by "
                               "Pacemaker's fence daemon");

    return pcmk__daemon_metadata(out, name, desc_short, desc_long,
                                 pcmk__opt_fencing);
}

/* Daemon-specific command-line options.
 *
 * NOTE(review): this text comes from a unified diff. Lines starting with '-'
 * are removed by the patch and lines starting with '+' are added by it; these
 * markers are not C operators.
 */
static GOptionEntry entries[] = {
-    { "stand-alone", 's', G_OPTION_FLAG_NONE, G_OPTION_ARG_NONE, &stand_alone,
-      N_("Deprecated (will be removed in a future release)"), NULL },
-
    { "stand-alone-w-cpg", 'c', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK,
      stand_alone_cpg_cb, N_("Intended for use in regression testing only"),
      NULL },

    { "logfile", 'l', G_OPTION_FLAG_NONE, G_OPTION_ARG_FILENAME_ARRAY,
      &options.log_files, N_("Send logs to the additional named logfile"),
      NULL },

    { NULL }
};

/*!
 * \internal
 * \brief Create the command-line option context for the fencer
 *
 * \param[in,out] args   Common command-line arguments
 * \param[out]    group  Where to store the output option group
 *
 * \return Newly created option context with \c entries added as main options
 */
static GOptionContext *
build_arg_context(pcmk__common_args_t *args, GOptionGroup **group)
{
    GOptionContext *context = NULL;

    context = pcmk__build_arg_context(args, "text (default), xml", group, NULL);
    pcmk__add_main_args(context, entries);
    return context;
}

/*!
 * \brief Fencer daemon entry point
 *
 * Parses options, handles the version and "metadata" requests, refuses to
 * start if another fencer already answers on the "stonith-ng" IPC endpoint,
 * connects to the cluster, and runs the main loop. Every exit path goes
 * through the \c done label, which frees resources and calls crm_exit() with
 * \c exit_code.
 *
 * \param[in] argc  Argument count
 * \param[in] argv  Argument vector
 */
int
main(int argc, char **argv)
{
    int rc = pcmk_rc_ok;
    pcmk_cluster_t *cluster = NULL;
    crm_ipc_t *old_instance = NULL;

    GError *error = NULL;

    GOptionGroup *output_group = NULL;
    pcmk__common_args_t *args = pcmk__new_common_args(SUMMARY);
    gchar **processed_args = pcmk__cmdline_preproc(argv, "l");
    GOptionContext *context = build_arg_context(args, &output_group);

    crm_log_preinit(NULL, argc, argv);

    pcmk__register_formats(output_group, formats);
    if (!g_option_context_parse_strv(context, &processed_args, &error)) {
        exit_code = CRM_EX_USAGE;
        goto done;
    }

    rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv);
    if (rc != pcmk_rc_ok) {
        exit_code = CRM_EX_ERROR;
        g_set_error(&error, PCMK__EXITC_ERROR, exit_code,
                    "Error creating output format %s: %s",
                    args->output_ty, pcmk_rc_str(rc));
        goto done;
    }

    if (args->version) {
        out->version(out, false);
        goto done;
    }

    // A first positional argument of "metadata" prints metadata and exits
    if ((g_strv_length(processed_args) >= 2)
        && pcmk__str_eq(processed_args[1], "metadata", pcmk__str_none)) {

        rc = fencer_metadata();
        if (rc != pcmk_rc_ok) {
            exit_code = CRM_EX_FATAL;
            g_set_error(&error, PCMK__EXITC_ERROR, exit_code,
                        "Unable to display metadata: %s", pcmk_rc_str(rc));
        }
        goto done;
    }

    // Open additional log files
    pcmk__add_logfiles(options.log_files, out);

    crm_log_init(NULL, LOG_INFO + args->verbosity, TRUE,
                 (args->verbosity > 0), argc, argv, FALSE);

    crm_notice("Starting Pacemaker fencer");

    old_instance = crm_ipc_new("stonith-ng", 0);
    if (old_instance == NULL) {
        /* crm_ipc_new() will have already logged an error message with
         * crm_err()
         */
        exit_code = CRM_EX_FATAL;
        goto done;
    }

    if (pcmk__connect_generic_ipc(old_instance) == pcmk_rc_ok) {
        // IPC endpoint already up
        crm_ipc_close(old_instance);
        crm_ipc_destroy(old_instance);
        crm_crit("Aborting start-up because another fencer instance is "
                 "already active");
        // NOTE(review): exit_code is not changed on this path
        goto done;
    } else {
        // Not up or not authentic, we'll proceed either way
        crm_ipc_destroy(old_instance);
        old_instance = NULL;
    }

    mainloop_add_signal(SIGTERM, stonith_shutdown);

    pcmk__cluster_init_node_caches();

    rc = fenced_scheduler_init();
    if (rc != pcmk_rc_ok) {
        exit_code = CRM_EX_FATAL;
        g_set_error(&error, PCMK__EXITC_ERROR, exit_code,
                    "Error initializing scheduler data: %s", pcmk_rc_str(rc));
        goto done;
    }

    cluster = pcmk_cluster_new();

-    if (!stand_alone) {
#if SUPPORT_COROSYNC
-        if (pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync) {
-            pcmk_cluster_set_destroy_fn(cluster, stonith_peer_cs_destroy);
-            pcmk_cpg_set_deliver_fn(cluster, stonith_peer_ais_callback);
-            pcmk_cpg_set_confchg_fn(cluster, pcmk__cpg_confchg_cb);
-        }
+    if (pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync) {
+        pcmk_cluster_set_destroy_fn(cluster, stonith_peer_cs_destroy);
+        pcmk_cpg_set_deliver_fn(cluster, stonith_peer_ais_callback);
+        pcmk_cpg_set_confchg_fn(cluster, pcmk__cpg_confchg_cb);
+    }
#endif // SUPPORT_COROSYNC

-        pcmk__cluster_set_status_callback(&st_peer_update_callback);
-
-        if (pcmk_cluster_connect(cluster) != pcmk_rc_ok) {
-            exit_code = CRM_EX_FATAL;
-            crm_crit("Cannot sign in to the cluster... terminating");
-            goto done;
-        }
-        fenced_set_local_node(cluster->priv->node_name);
+    pcmk__cluster_set_status_callback(&st_peer_update_callback);

-        if (!options.no_cib_connect) {
-            setup_cib();
-        }
+    if (pcmk_cluster_connect(cluster) != pcmk_rc_ok) {
+        exit_code = CRM_EX_FATAL;
+        crm_crit("Cannot sign in to the cluster... terminating");
+        goto done;
+    }
+    fenced_set_local_node(cluster->priv->node_name);

-    } else {
-        fenced_set_local_node("localhost");
-        crm_warn("Stand-alone mode is deprecated and will be removed "
-                 "in a future release");
+    if (!options.no_cib_connect) {
+        setup_cib();
    }

    init_device_list();
    init_topology_list();

    pcmk__serve_fenced_ipc(&ipcs, &ipc_callbacks);

    // Create the mainloop and run it...
    mainloop = g_main_loop_new(NULL, FALSE);
    crm_notice("Pacemaker fencer successfully started and accepting connections");
    g_main_loop_run(mainloop);

done:
    // Reached from every exit path, so cluster and out may still be NULL here
    g_strfreev(processed_args);
    pcmk__free_arg_context(context);

    g_strfreev(options.log_files);

    stonith_cleanup();
    pcmk_cluster_free(cluster);
    fenced_scheduler_cleanup();

    pcmk__output_and_clear_error(&error, out);

    if (out != NULL) {
        out->finish(out, exit_code, true, NULL);
        pcmk__output_free(out);
    }

    pcmk__unregister_formats();
    crm_exit(exit_code);
}
diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h
index 9c61c200b8..c4ea58cd74 100644
--- a/daemons/fenced/pacemaker-fenced.h
+++ b/daemons/fenced/pacemaker-fenced.h
@@ -1,331 +1,330 @@
/*
 * Copyright 2009-2024 the Pacemaker project contributors
 *
 * This source code is licensed under the GNU General Public License version 2
 * or later (GPLv2+) WITHOUT ANY WARRANTY.
 */

#include // uint32_t, uint64_t
#include // xmlNode
#include
#include
#include
#include

/*!
* \internal
 * \brief Check whether target has already been fenced recently
 *
 * \param[in] tolerance  Number of seconds to look back in time
 * \param[in] target     Name of node to search for
 * \param[in] action     Action we want to match
 *
 * \return TRUE if an equivalent fencing operation took place in the last
 *         \p tolerance seconds, FALSE otherwise
 */
gboolean stonith_check_fence_tolerance(int tolerance, const char *target,
                                       const char *action);

// Fencer's record of a fencing device
typedef struct stonith_device_s {
    char *id;
    char *agent;
    char *namespace;

    /*! list of actions that must execute on the target node. Used for unfencing */
    GString *on_target_actions;
    GList *targets;
    time_t targets_age;
    gboolean has_attr_map;

    // Whether target's nodeid should be passed as a parameter to the agent
    gboolean include_nodeid;

    /* whether the cluster should automatically unfence nodes with the device */
    gboolean automatic_unfencing;
    guint priority;

    uint32_t flags; // Group of enum st_device_flags

    GHashTable *params;
    GHashTable *aliases;
    GList *pending_ops;
    mainloop_timer_t *timer;
    crm_trigger_t *work;
    xmlNode *agent_metadata;

    /*! A verified device is one that has contacted the
     * agent successfully to perform a monitor operation */
    gboolean verified;

    gboolean cib_registered;
    gboolean api_registered;
    gboolean dirty;
} stonith_device_t;

/* These values are used to index certain arrays by "phase". Usually an
 * operation has only one "phase", so phase is always zero. However, some
 * reboots are remapped to "off" then "on", in which case "reboot" will be
 * phase 0, "off" will be phase 1 and "on" will be phase 2.
 */
enum st_remap_phase {
    st_phase_requested = 0,
    st_phase_off = 1,
    st_phase_on = 2,
    st_phase_max = 3    // Count of phases, usable as an array size
};

// State of one fencing operation
typedef struct remote_fencing_op_s {
    /* The unique id associated with this operation */
    char *id;
    /*! The node this operation will fence */
    char *target;
    /*! The fencing action to perform on the target. (reboot, on, off) */
    char *action;

    /*! When was the fencing action recorded (seconds since epoch) */
    time_t created;

    /*! Marks if the final notifications have been sent to local stonith clients. */
    gboolean notify_sent;
    /*! The number of query replies received */
    guint replies;
    /*! The number of query replies expected */
    guint replies_expected;
    /*! Does this node own control of this operation */
    gboolean owner;
    /*! After the query is complete, this is the high-level timer that expires
     * the entire operation */
    guint op_timer_total;
    /*! This timer expires the current fencing request. Many fencing
     * requests may exist in a single operation */
    guint op_timer_one;
    /*! This timer expires the query request sent out to determine
     * which nodes contain which devices, and who those devices can fence */
    guint query_timer;
    /*! This is the default timeout to use for each fencing device if no
     * custom timeout is received in the query. */
    gint base_timeout;
    /*! This is the calculated total timeout an operation can take before
     * expiring. This is calculated by adding together all the timeout
     * values associated with the devices this fencing operation may call */
    gint total_timeout;

    /*!
     * Fencing delay (in seconds) requested by API client (used by controller to
     * implement \c PCMK_OPT_PRIORITY_FENCING_DELAY). A value of -1 means
     * disable all configured delays.
     */
    int client_delay;

    /*! Delegate is the node being asked to perform a fencing action
     * on behalf of the node that owns the remote operation. Some operations
     * will involve multiple delegates. This value represents the final delegate
     * that is used. */
    char *delegate;
    /*! The point at which the remote operation completed */
    time_t completed;
    //! Group of enum stonith_call_options associated with this operation
    uint32_t call_options;

    /*! The current state of the remote operation. This indicates
     * what stage the op is in, query, exec, done, duplicate, failed. */
    enum op_state state;
    /*! The node that owns the remote operation */
    char *originator;
    /*! The local client id that initiated the fencing request */
    char *client_id;
    /*! The client's call_id that initiated the fencing request */
    int client_callid;
    /*! The name of client that initiated the fencing request */
    char *client_name;
    /*! List of the received query results for all the nodes in the cpg group */
    GList *query_results;
    /*! The original request that initiated the remote stonith operation */
    xmlNode *request;

    /*! The current topology level being executed */
    guint level;
    /*! The current operation phase being executed */
    enum st_remap_phase phase;

    /*! Devices with automatic unfencing (always run if "on" requested, never if remapped) */
    GList *automatic_list;
    /*! List of all devices at the currently executing topology level */
    GList *devices_list;
    /*! Current entry in the topology device list */
    GList *devices;

    /*! List of duplicate operations attached to this operation. Once this operation
     * completes, the duplicate operations will be closed out as well. */
    GList *duplicates;

    /*! The point at which the remote operation completed (nsec) */
    long long completed_nsec;

    /*! The (potentially intermediate) result of the operation */
    pcmk__action_result_t result;
} remote_fencing_op_t;

void fenced_broadcast_op_result(const remote_fencing_op_t *op, bool op_merged);

// Fencer-specific client flags
// (bit positions 1 and 3 are not assigned to any flag here)
enum st_client_flags {
    st_callback_unknown               =  UINT64_C(0),
    st_callback_notify_fence          = (UINT64_C(1) << 0),
    st_callback_device_add            = (UINT64_C(1) << 2),
    st_callback_device_del            = (UINT64_C(1) << 4),
    st_callback_notify_history        = (UINT64_C(1) << 5),
    st_callback_notify_history_synced = (UINT64_C(1) << 6)
};

// How the user specified the target of a topology level
enum fenced_target_by {
    fenced_target_by_unknown = -1,  // Invalid or not yet parsed
    fenced_target_by_name,          // By target name
    fenced_target_by_pattern,       // By a pattern matching target names
    fenced_target_by_attribute,     // By a node attribute/value on target
};

/*
 * Complex fencing requirements are specified via fencing topologies.
 * A topology consists of levels; each level is a list of fencing devices.
 * Topologies are stored in a hash table by node name. When a node needs to be
 * fenced, if it has an entry in the topology table, the levels are tried
 * sequentially, and the devices in each level are tried sequentially.
 * Fencing is considered successful as soon as any level succeeds;
 * a level is considered successful if all its devices succeed.
 * Essentially, all devices at a given level are "and-ed" and the
 * levels are "or-ed".
 *
 * This structure is used for the topology table entries.
 * Topology levels start from 1, so levels[0] is unused and always NULL.
 */
typedef struct stonith_topology_s {
    enum fenced_target_by kind; // How target was specified

    /*! Node name regex or attribute name=value for which topology applies */
    char *target;
    char *target_value;
    char *target_pattern;
    char *target_attribute;

    /*! Names of fencing devices at each topology level */
    GList *levels[ST__LEVEL_COUNT];

} stonith_topology_t;

void stonith_shutdown(int nsig);

void init_device_list(void);
void free_device_list(void);
void init_topology_list(void);
void free_topology_list(void);
void free_stonith_remote_op_list(void);
void init_stonith_remote_op_hash_table(GHashTable **table);
void free_metadata_cache(void);
void fenced_unregister_handlers(void);

uint64_t get_stonith_flag(const char *name);

void stonith_command(pcmk__client_t *client, uint32_t id, uint32_t flags,
                     xmlNode *op_request, const char *remote_peer);

int stonith_device_register(xmlNode *msg, gboolean from_cib);

void stonith_device_remove(const char *id, bool from_cib);

char *stonith_level_key(const xmlNode *msg, enum fenced_target_by);
void fenced_register_level(xmlNode *msg, char **desc,
                           pcmk__action_result_t *result);
void fenced_unregister_level(xmlNode *msg, char **desc,
                             pcmk__action_result_t *result);

stonith_topology_t *find_topology_for_host(const char *host);

void do_local_reply(const xmlNode *notify_src, pcmk__client_t *client,
                    int call_options);

xmlNode *fenced_construct_reply(const xmlNode *request, xmlNode *data,
                                const pcmk__action_result_t *result);

void
 do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout);

void fenced_send_notification(const char *type,
                              const pcmk__action_result_t *result,
                              xmlNode *data);
void fenced_send_config_notification(const char *op,
                                     const pcmk__action_result_t *result,
                                     const char *desc);

remote_fencing_op_t *initiate_remote_stonith_op(const pcmk__client_t *client,
                                                xmlNode *request,
                                                gboolean manual_ack);

void fenced_process_fencing_reply(xmlNode *msg);

int process_remote_stonith_query(xmlNode * msg);

void *create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer);

void stonith_fence_history(xmlNode *msg, xmlNode **output,
                           const char *remote_peer, int options);

void stonith_fence_history_trim(void);

bool fencing_peer_active(pcmk__node_status_t *peer);

void set_fencing_completed(remote_fencing_op_t * op);

int fenced_handle_manual_confirmation(const pcmk__client_t *client,
                                      xmlNode *msg);

const char *fenced_device_reboot_action(const char *device_id);
bool fenced_device_supports_on(const char *device_id);

gboolean node_has_attr(const char *node, const char *name, const char *value);

gboolean node_does_watchdog_fencing(const char *node);

void fencing_topology_init(void);
void setup_cib(void);
void fenced_cib_cleanup(void);

int fenced_scheduler_init(void);
void fenced_set_local_node(const char *node_name);
const char *fenced_get_local_node(void);
void fenced_scheduler_cleanup(void);
void fenced_scheduler_run(xmlNode *cib);

/*!
 * \internal
 * \brief Set a result to the standard "protocol error" outcome
 *
 * \param[out] result  Result to set (exit status \c CRM_EX_PROTOCOL,
 *                     execution status \c PCMK_EXEC_INVALID)
 */
static inline void
fenced_set_protocol_error(pcmk__action_result_t *result)
{
    pcmk__set_result(result, CRM_EX_PROTOCOL, PCMK_EXEC_INVALID,
                     "Fencer API request missing required information (bug?)");
}

/*!
 * \internal
 * \brief Get the device flag to use with a given action when searching devices
 *
 * \param[in] action  Action to check
 *
 * \return st_device_supports_on if \p action is "on", otherwise
 *         st_device_supports_none
 */
static inline uint32_t
fenced_support_flag(const char *action)
{
    if (pcmk__str_eq(action, PCMK_ACTION_ON, pcmk__str_none)) {
        return st_device_supports_on;
    }
    return st_device_supports_none;
}

/* Global variables declared for use across the fencer.
 *
 * NOTE(review): this text comes from a unified diff; the line starting with
 * '-' below is a declaration the patch removes.
 */
-extern gboolean stand_alone;
extern GHashTable *device_list;
extern GHashTable *topology;
extern long long stonith_watchdog_timeout_ms;
extern GList *stonith_watchdog_targets;
extern GHashTable *stonith_remote_op_list;
extern crm_exit_t exit_code;
extern gboolean stonith_shutdown_flag;