diff --git a/cts/cts-fencing.in b/cts/cts-fencing.in index 8dc5ec8783..470abd41e4 100644 --- a/cts/cts-fencing.in +++ b/cts/cts-fencing.in @@ -1,1590 +1,1590 @@ #!@PYTHON@ """ Regression tests for Pacemaker's fencer """ __copyright__ = "Copyright 2012-2021 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import io import os import re import sys import subprocess import shlex import time import tempfile import signal # Where to find test binaries # Prefer the source tree if available BUILD_DIR = "@abs_top_builddir@" SCHEMA_DIR = "@CRM_SCHEMA_DIRECTORY@" TEST_DIR = sys.path[0] AUTOGEN_COROSYNC_TEMPLATE = """ totem { version: 2 cluster_name: cts-fencing crypto_cipher: none crypto_hash: none transport: udp } nodelist { node { nodeid: 1 name: %s ring0_addr: 127.0.0.1 } } logging { debug: off to_syslog: no to_stderr: no to_logfile: yes logfile: %s } """ # These values must be kept in sync with include/crm/crm.h class CrmExit(object): OK = 0 ERROR = 1 INVALID_PARAM = 2 UNIMPLEMENT_FEATURE = 3 INSUFFICIENT_PRIV = 4 NOT_INSTALLED = 5 NOT_CONFIGURED = 6 NOT_RUNNING = 7 USAGE = 64 DATAERR = 65 NOINPUT = 66 NOUSER = 67 NOHOST = 68 UNAVAILABLE = 69 SOFTWARE = 70 OSERR = 71 OSFILE = 72 CANTCREAT = 73 IOERR = 74 TEMPFAIL = 75 PROTOCOL = 76 NOPERM = 77 CONFIG = 78 FATAL = 100 PANIC = 101 DISCONNECT = 102 SOLO = 103 DIGEST = 104 NOSUCH = 105 QUORUM = 106 UNSAFE = 107 EXISTS = 108 MULTIPLE = 109 OLD = 110 TIMEOUT = 124 MAX = 255 def update_path(): """ Set the PATH environment variable appropriately for the tests """ new_path = os.environ['PATH'] if os.path.exists("%s/cts-fencing.in" % TEST_DIR): print("Running tests from the source tree: %s (%s)" % (BUILD_DIR, TEST_DIR)) # For pacemaker-fenced and cts-fence-helper new_path = "%s/daemons/fenced:%s" % (BUILD_DIR, new_path) new_path = "%s/tools:%s" % (BUILD_DIR, new_path) # For stonith_admin new_path = "%s/cts/support:%s" % (BUILD_DIR, new_path) # For cts-support else: print("Running tests from the install tree: @CRM_DAEMON_DIR@ (not %s)" % TEST_DIR) # For pacemaker-fenced, cts-fence-helper, and cts-support new_path = "@CRM_DAEMON_DIR@:%s" % (new_path) print('Using PATH="{}"'.format(new_path)) os.environ['PATH'] = new_path def find_validator(rng_file): if os.access("/usr/bin/xmllint", os.X_OK): return ["xmllint", "--relaxng", rng_file, "-"] else: return None def rng_directory(): if "PCMK_schema_directory" in os.environ: return os.environ["PCMK_schema_directory"] elif os.path.exists("%s/cts-fencing.in" % TEST_DIR): return "xml" else: return SCHEMA_DIR def pipe_communicate(pipes, stderr=False, stdin=None): """ Get text output from pipes """ if stdin is not None: pipe_outputs = pipes.communicate(input=stdin.encode()) else: pipe_outputs = pipes.communicate() output = pipe_outputs[0].decode(sys.stdout.encoding) if stderr: output = output + pipe_outputs[1].decode(sys.stderr.encoding) return output def output_from_command(command): """ Execute command and return its standard output """ test = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE) test.wait() return pipe_communicate(test).split("\n") def localname(): """ Return the uname of the local host """ our_uname = output_from_command("uname -n") if our_uname: our_uname = our_uname[0] else: our_uname = "localhost" return our_uname def killall(process): """ Kill all instances of a process """ cmd = shlex.split("killall -9 -q %s" % process) test = subprocess.Popen(cmd, stdout=subprocess.PIPE) test.wait() class TestError(Exception): """ Base class for exceptions in this module """ pass class ExitCodeError(TestError): """ Exception raised when command exit status is unexpected """ def __init__(self, exit_code): self.exit_code = exit_code def __str__(self): return repr(self.exit_code) class OutputNotFoundError(TestError): """ Exception raised when command output does not contain wanted string """ def __init__(self, output): self.output = output def __str__(self): return repr(self.output) class OutputFoundError(TestError): """ Exception raised when command output contains unwanted string """ def __init__(self, output): self.output = output def __str__(self): return repr(self.output) class XmlValidationError(TestError): """ Exception raised when xmllint fails """ def __init__(self, output): self.output = output def __str__(self): return repr(self.output) class Test(object): """ Executor for a single test """ def __init__(self, name, description, verbose=0, with_cpg=0, timeout=2, force_wait=0, logdir="/tmp"): self.name = name self.description = description self.cmds = [] self.verbose = verbose self.timeout = timeout self.force_wait = force_wait self.logpath = os.path.join(logdir, "pacemaker-fenced.log") self.result_txt = "" self.cmd_tool_output = "" self.result_exitcode = CrmExit.OK if with_cpg: self.stonith_options = "-c" self.enable_corosync = 1 else: self.stonith_options = "-s" self.enable_corosync = 0 self.stonith_process = None self.stonith_output = "" self.stonith_patterns = [] self.negative_stonith_patterns = [] self.executed = 0 def __new_cmd(self, cmd, args, exitcode, stdout_match="", no_wait=0, stdout_negative_match="", kill=None, validate=True): """ Add a command to be executed as part of this test """ self.cmds.append( { "cmd" : cmd, "kill" : kill, "args" : args, "expected_exitcode" : exitcode, "stdout_match" : stdout_match, "stdout_negative_match" : stdout_negative_match, "no_wait" : no_wait, "validate" : validate, } ) def start_environment(self): """ Prepare the host for executing a test """ # Make sure we are in full control killall("pacemakerd") killall("pacemaker-fenced") if self.verbose: self.stonith_options = self.stonith_options + " -V" print("Starting pacemaker-fenced with %s" % self.stonith_options) if os.path.exists(self.logpath): os.remove(self.logpath) cmd = "pacemaker-fenced %s -l %s" % (self.stonith_options, self.logpath) self.stonith_process = subprocess.Popen(shlex.split(cmd)) logfile = None init_time = time.time() update_time = init_time while True: time.sleep(0.1) if self.force_wait == 0 and logfile == None \ and os.path.exists(self.logpath): logfile = io.open(self.logpath, 'rt', encoding = "ISO-8859-1") if self.force_wait == 0 and logfile != None: for line in logfile.readlines(): if "successfully started" in line: return now = time.time() if self.timeout > 0 and (now - init_time) >= self.timeout: if self.force_wait == 0: print("\tDaemon pacemaker-fenced doesn't seem to have been initialized within %fs." "\n\tConsider specifying a longer '--timeout' value." %(self.timeout)) return if self.verbose and (now - update_time) >= 5: print("Waiting for pacemaker-fenced to be initialized: %fs ..." %(now - init_time)) update_time = now def clean_environment(self): """ Clean up the host after executing a test """ if self.stonith_process: if self.stonith_process.poll() == None: self.stonith_process.terminate() self.stonith_process.wait() else: return_code = { getattr(signal, _signame): _signame for _signame in dir(signal) if _signame.startswith('SIG') and not _signame.startswith("SIG_") }.get(-self.stonith_process.returncode, "RET=%d" % (self.stonith_process.returncode)) msg = "FAILURE - '%s' failed. pacemaker-fenced abnormally exited during test (%s)." self.result_txt = msg % (self.name, return_code) self.result_exitcode = CrmExit.ERROR self.stonith_output = "" self.stonith_process = None # the default for utf-8 encoding would error out if e.g. memory corruption # makes fenced output any kind of 8 bit value - while still interesting # for debugging and we'd still like the regression-test to go over the # full set of test-cases logfile = io.open(self.logpath, 'rt', encoding = "ISO-8859-1") for line in logfile.readlines(): self.stonith_output = self.stonith_output + line if self.verbose: print("Daemon Output Start") print(self.stonith_output) print("Daemon Output End") def add_stonith_log_pattern(self, pattern): """ Add a log pattern to expect from this test """ self.stonith_patterns.append(pattern) def add_stonith_neg_log_pattern(self, pattern): """ Add a log pattern that should not occur with this test """ self.negative_stonith_patterns.append(pattern) def add_cmd(self, cmd, args, validate=True): """ Add a simple command to be executed as part of this test """ self.__new_cmd(cmd, args, CrmExit.OK, "", validate=validate) def add_cmd_no_wait(self, cmd, args): """ Add a simple command to be executed (without waiting) as part of this test """ self.__new_cmd(cmd, args, CrmExit.OK, "", 1) def add_cmd_check_stdout(self, cmd, args, match, no_match=""): """ Add a simple command with expected output to be executed as part of this test """ self.__new_cmd(cmd, args, CrmExit.OK, match, 0, no_match) def add_expected_fail_cmd(self, cmd, args, exitcode=CrmExit.ERROR): """ Add a command to be executed as part of this test and expected to fail """ self.__new_cmd(cmd, args, exitcode, "") def get_exitcode(self): """ Return the exit status of the last test execution """ return self.result_exitcode def print_result(self, filler): """ Print the result of the last test execution """ print("%s%s" % (filler, self.result_txt)) def run_cmd(self, args): """ Execute a command as part of this test """ cmd = shlex.split(args['args']) cmd.insert(0, args['cmd']) if self.verbose: print("\n\nRunning: "+" ".join(cmd)) test = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if args['kill']: if self.verbose: print("Also running: "+args['kill']) subprocess.Popen(shlex.split(args['kill'])) if args['no_wait'] == 0: test.wait() else: return CrmExit.OK output = pipe_communicate(test, stderr=True) if self.verbose: print(output) if test.returncode != args['expected_exitcode']: raise ExitCodeError(test.returncode) if (args['stdout_match'] != "" and re.search(args['stdout_match'], output) is None): raise OutputNotFoundError(output) if (args['stdout_negative_match'] != "" and re.search(args['stdout_negative_match'], output) is not None): raise OutputFoundError(output) if args['validate']: rng_file = rng_directory() + "/api/api-result.rng" cmd = find_validator(rng_file) if not cmd: return if self.verbose: print("\nRunning: "+" ".join(cmd)) validator = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output = pipe_communicate(validator, stderr=True, stdin=output) if self.verbose: print(output) if validator.returncode != 0: raise XmlValidationError(output) def count_negative_matches(self, outline): """ Return 1 if a line matches patterns that shouldn't have occurred """ count = 0 for line in self.negative_stonith_patterns: if outline.count(line): count = 1 if self.verbose: print("This pattern should not have matched = '%s" % (line)) return count def match_stonith_patterns(self): """ Check test output for expected patterns """ negative_matches = 0 cur = 0 pats = self.stonith_patterns total_patterns = len(self.stonith_patterns) if len(self.stonith_patterns) == 0 and len(self.negative_stonith_patterns) == 0: return for line in self.stonith_output.split("\n"): negative_matches = negative_matches + self.count_negative_matches(line) if len(pats) == 0: continue cur = -1 for pat in pats: cur = cur + 1 if line.count(pats[cur]): del pats[cur] break if len(pats) > 0 or negative_matches: if self.verbose: for pat in pats: print("Pattern Not Matched = '%s'" % pat) msg = "FAILURE - '%s' failed. %d patterns out of %d not matched. %d negative matches." self.result_txt = msg % (self.name, len(pats), total_patterns, negative_matches) self.result_exitcode = CrmExit.ERROR def set_error(self, step, cmd): """ Record failure of this test """ msg = "FAILURE - '%s' failed at step %d. Command: %s %s" self.result_txt = msg % (self.name, step, cmd['cmd'], cmd['args']) self.result_exitcode = CrmExit.ERROR def run(self): """ Execute this test. """ res = 0 i = 1 self.start_environment() if self.verbose: print("\n--- START TEST - %s" % self.name) self.result_txt = "SUCCESS - '%s'" % (self.name) self.result_exitcode = CrmExit.OK for cmd in self.cmds: try: self.run_cmd(cmd) except ExitCodeError as e: print("Step %d FAILED - command returned %s, expected %d" % (i, e, cmd['expected_exitcode'])) self.set_error(i, cmd); break except OutputNotFoundError as e: print("Step %d FAILED - '%s' was not found in command output: %s" % (i, cmd['stdout_match'], e)) self.set_error(i, cmd); break except OutputFoundError as e: print("Step %d FAILED - '%s' was found in command output: %s" % (i, cmd['stdout_negative_match'], e)) self.set_error(i, cmd); break if self.verbose: print("Step %d SUCCESS" % (i)) i = i + 1 self.clean_environment() if self.result_exitcode == CrmExit.OK: self.match_stonith_patterns() print(self.result_txt) if self.verbose: print("--- END TEST - %s\n" % self.name) self.executed = 1 return res class Tests(object): """ Collection of all fencing regression tests """ def __init__(self, verbose=0, timeout=2, force_wait=0, logdir="/tmp"): self.tests = [] self.verbose = verbose self.timeout = timeout self.force_wait = force_wait self.logdir = logdir self.autogen_corosync_cfg = not os.path.exists("/etc/corosync/corosync.conf") def new_test(self, name, description, with_cpg=0): """ Create a named test """ test = Test(name, description, self.verbose, with_cpg, self.timeout, self.force_wait, self.logdir) self.tests.append(test) return test def print_list(self): """ List all registered tests """ print("\n==== %d TESTS FOUND ====" % (len(self.tests))) print("%35s - %s" % ("TEST NAME", "TEST DESCRIPTION")) print("%35s - %s" % ("--------------------", "--------------------")) for test in self.tests: print("%35s - %s" % (test.name, test.description)) print("==== END OF LIST ====\n") def start_corosync(self): """ Start the corosync process """ if self.verbose: print("Starting corosync") test = subprocess.Popen("corosync", stdout=subprocess.PIPE) test.wait() time.sleep(10) def run_single(self, name): """ Run a single named test """ for test in self.tests: if test.name == name: test.run() break def run_tests_matching(self, pattern): """ Run all tests whose name matches a pattern """ for test in self.tests: if test.name.count(pattern) != 0: test.run() def run_cpg_only(self): """ Run all corosync-enabled tests """ for test in self.tests: if test.enable_corosync: test.run() def run_no_cpg(self): """ Run all standalone tests """ for test in self.tests: if not test.enable_corosync: test.run() def run_tests(self): """ Run all tests """ for test in self.tests: test.run() def exit(self): """ Exit (with error status code if any test failed) """ for test in self.tests: if test.executed == 0: continue if test.get_exitcode() != CrmExit.OK: sys.exit(CrmExit.ERROR) sys.exit(CrmExit.OK) def print_results(self): """ Print summary of results of executed tests """ failures = 0 success = 0 print("\n\n======= FINAL RESULTS ==========") print("\n--- FAILURE RESULTS:") for test in self.tests: if test.executed == 0: continue if test.get_exitcode() != CrmExit.OK: failures = failures + 1 test.print_result(" ") else: success = success + 1 if failures == 0: print(" None") print("\n--- TOTALS\n Pass:%d\n Fail:%d\n" % (success, failures)) def build_api_sanity_tests(self): """ Register tests to verify basic API usage """ verbose_arg = "" if self.verbose: verbose_arg = "-V" test = self.new_test("standalone_low_level_api_test", "Sanity test client api in standalone mode.") test.add_cmd("cts-fence-helper", "-t %s" % (verbose_arg), validate=False) test = self.new_test("cpg_low_level_api_test", "Sanity test client api using mainloop and cpg.", 1) test.add_cmd("cts-fence-helper", "-m %s" % (verbose_arg), validate=False) def build_custom_timeout_tests(self): """ Register tests to verify custom timeout usage """ # custom timeout without topology test = self.new_test("cpg_custom_timeout_1", "Verify per device timeouts work as expected without using topology.", 1) test.add_cmd('stonith_admin', '--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node1 node2 node3"') test.add_cmd('stonith_admin', '--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node3" -o "pcmk_off_timeout=1"') test.add_cmd('stonith_admin', '--output-as=xml -R false2 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node3" -o "pcmk_off_timeout=4"') test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5") # timeout is 5+1+4 = 10 test.add_stonith_log_pattern("Total timeout set to 10") # custom timeout _WITH_ topology test = self.new_test("cpg_custom_timeout_2", "Verify per device timeouts work as expected _WITH_ topology.", 1) test.add_cmd('stonith_admin', '--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node1 node2 node3"') test.add_cmd('stonith_admin', '--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node3" -o "pcmk_off_timeout=1"') test.add_cmd('stonith_admin', '--output-as=xml -R false2 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node3" -o "pcmk_off_timeout=4000"') test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true1") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v false2") test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5") # timeout is 5+1+4000 = 4006 test.add_stonith_log_pattern("Total timeout set to 4006") def build_fence_merge_tests(self): """ Register tests to verify when fence operations should be merged """ ### Simple test that overlapping fencing operations get merged test = self.new_test("cpg_custom_merge_single", "Verify overlapping identical fencing operations are merged, no fencing levels used.", 1) test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ") test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10") test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 10") ### one merger will happen test.add_stonith_log_pattern("Merging fencing action 'off' targeting node3 originating from client") ### the pattern below signifies that both the original and duplicate operation completed test.add_stonith_log_pattern("Operation 'off' targeting node3 by ") test.add_stonith_log_pattern("Operation 'off' targeting node3 by ") ### Test that multiple mergers occur test = self.new_test("cpg_custom_merge_multiple", "Verify multiple overlapping identical fencing operations are merged", 1) test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"delay=2\" -o \"pcmk_host_list=node3\" ") test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10") test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10") test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10") test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10") test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 10") ### 4 mergers should occur test.add_stonith_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_stonith_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_stonith_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_stonith_log_pattern("Merging fencing action 'off' targeting node3 originating from client") ### the pattern below signifies that both the original and duplicate operation completed test.add_stonith_log_pattern("Operation 'off' targeting node3 by ") test.add_stonith_log_pattern("Operation 'off' targeting node3 by ") test.add_stonith_log_pattern("Operation 'off' targeting node3 by ") test.add_stonith_log_pattern("Operation 'off' targeting node3 by ") test.add_stonith_log_pattern("Operation 'off' targeting node3 by ") ### Test that multiple mergers occur with topologies used test = self.new_test("cpg_custom_merge_with_topology", "Verify multiple overlapping identical fencing operations are merged with fencing levels.", 1) test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ") test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false2") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true1") test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10") test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10") test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10") test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10") test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 10") ### 4 mergers should occur test.add_stonith_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_stonith_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_stonith_log_pattern("Merging fencing action 'off' targeting node3 originating from client") test.add_stonith_log_pattern("Merging fencing action 'off' targeting node3 originating from client") ### the pattern below signifies that both the original and duplicate operation completed test.add_stonith_log_pattern("Operation 'off' targeting node3 by ") test.add_stonith_log_pattern("Operation 'off' targeting node3 by ") test.add_stonith_log_pattern("Operation 'off' targeting node3 by ") test.add_stonith_log_pattern("Operation 'off' targeting node3 by ") test.add_stonith_log_pattern("Operation 'off' targeting node3 by ") def build_fence_no_merge_tests(self): """ Register tests to verify when fence operations should not be merged """ test = self.new_test("cpg_custom_no_merge", "Verify differing fencing operations are not merged", 1) test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3 node2\"") test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3 node2\" ") test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3 node2\"") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false2") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true1") test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node2 -t 10") test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 10") test.add_stonith_neg_log_pattern("Merging fencing action 'off' targeting node3 originating from client") def build_standalone_tests(self): """ Register a grab bag of tests that can be executed in standalone or corosync mode """ test_types = [ { "prefix" : "standalone", "use_cpg" : 0, }, { "prefix" : "cpg", "use_cpg" : 1, }, ] # test what happens when all devices timeout for test_type in test_types: test = self.new_test("%s_fence_multi_device_failure" % test_type["prefix"], "Verify that all devices timeout, a fencing failure is returned.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R false3 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") if test_type["use_cpg"] == 1: test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -F node3 -t 2", CrmExit.TIMEOUT) test.add_stonith_log_pattern("Total timeout set to 6") else: test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -F node3 -t 2", CrmExit.ERROR) test.add_stonith_log_pattern("targeting node3 using false1 returned ") test.add_stonith_log_pattern("targeting node3 using false2 returned ") test.add_stonith_log_pattern("targeting node3 using false3 returned ") # test what happens when multiple devices can fence a node, but the first device fails. for test_type in test_types: test = self.new_test("%s_fence_device_failure_rollover" % test_type["prefix"], "Verify that when one fence device fails for a node, the others are tried.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5") if test_type["use_cpg"] == 1: test.add_stonith_log_pattern("Total timeout set to 15") # test what happens when we try to use a missing fence-agent. for test_type in test_types: test = self.new_test("%s_fence_missing_agent" % test_type["prefix"], "Verify proper error-handling when using a non-existent fence-agent.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_missing -o \"mode=pass\" -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node2\"") test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -F node3 -t 5", CrmExit.ERROR) test.add_cmd("stonith_admin", "--output-as=xml -F node2 -t 5") # simple topology test for one device for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_topology_simple" % test_type["prefix"], "Verify all fencing devices at a level are used.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true") test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5") test.add_stonith_log_pattern("Total timeout set to 5") test.add_stonith_log_pattern("targeting node3 using true returned 0") # add topology, delete topology, verify fencing still works for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_topology_add_remove" % test_type["prefix"], "Verify fencing occurrs after all topology levels are removed", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true") test.add_cmd("stonith_admin", "--output-as=xml -d node3 -i 1") test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5") test.add_stonith_log_pattern("Total timeout set to 5") test.add_stonith_log_pattern("targeting node3 using true returned 0") # test what happens when the first fencing level has multiple devices. for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_topology_device_fails" % test_type["prefix"], "Verify if one device in a level fails, the other is tried.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R false -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true") test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 20") test.add_stonith_log_pattern("Total timeout set to 40") test.add_stonith_log_pattern("targeting node3 using false returned -201") test.add_stonith_log_pattern("targeting node3 using true returned 0") # test what happens when the first fencing level fails. for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_topology_multi_level_fails" % test_type["prefix"], "Verify if one level fails, the next leve is tried.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true1") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true2") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v false2") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true3") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true4") test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 3") test.add_stonith_log_pattern("Total timeout set to 18") test.add_stonith_log_pattern("targeting node3 using false1 returned -201") test.add_stonith_log_pattern("targeting node3 using false2 returned -201") test.add_stonith_log_pattern("targeting node3 using true3 returned 0") test.add_stonith_log_pattern("targeting node3 using true4 returned 0") # test what happens when the first fencing level had devices that no one has registered for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_topology_missing_devices" % test_type["prefix"], "Verify topology can continue with missing devices.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true1") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true2") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v false2") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true3") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true4") test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5") # Test what happens if multiple fencing levels are defined, and then the first one is removed. for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_topology_level_removal" % test_type["prefix"], "Verify level removal works.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true1") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true2") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v false2") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true3") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true4") # Now remove level 2, verify none of the devices in level two are hit. test.add_cmd("stonith_admin", "--output-as=xml -d node3 -i 2") test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 20") test.add_stonith_log_pattern("Total timeout set to 8") test.add_stonith_log_pattern("targeting node3 using false1 returned -201") test.add_stonith_neg_log_pattern("targeting node3 using false2 returned ") test.add_stonith_log_pattern("targeting node3 using true3 returned 0") test.add_stonith_log_pattern("targeting node3 using true4 returned 0") # Test targeting a topology level by node name pattern. for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_topology_level_pattern" % test_type["prefix"], "Verify targeting topology by node name pattern works.", test_type["use_cpg"]) test.add_cmd("stonith_admin", """--output-as=xml -R true -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node1 node2 node3" """) test.add_cmd("stonith_admin", """--output-as=xml -r '@node.*' -i 1 -v true""") test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5") test.add_stonith_log_pattern("targeting node3 using true returned 0") # test allowing commas and semicolons as delimiters in pcmk_host_list for test_type in test_types: test = self.new_test("%s_host_list_delimiters" % test_type["prefix"], "Verify commas and semicolons can be used as pcmk_host_list delimiters", test_type["use_cpg"]) test.add_cmd("stonith_admin", """--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node1,node2,node3" """) test.add_cmd("stonith_admin", """--output-as=xml -R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=pcmk1;pcmk2;pcmk3" """) test.add_cmd("stonith_admin", "stonith_admin --output-as=xml -F node2 -t 5") test.add_cmd("stonith_admin", "stonith_admin --output-as=xml -F pcmk3 -t 5") test.add_stonith_log_pattern("targeting node2 using true1 returned 0") test.add_stonith_log_pattern("targeting pcmk3 using true2 returned 0") # test the stonith builds the correct list of devices that can fence a node. for test_type in test_types: test = self.new_test("%s_list_devices" % test_type["prefix"], "Verify list of devices that can fence a node is correct", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -l node1 -V", "true2", "true1") test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -l node1 -V", "true3", "true1") # simple test of device monitor for test_type in test_types: test = self.new_test("%s_monitor" % test_type["prefix"], "Verify device is reachable", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "--output-as=xml -Q true1") test.add_cmd("stonith_admin", "--output-as=xml -Q false1") test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -Q true2", CrmExit.ERROR) # Verify monitor occurs for duration of timeout period on failure for test_type in test_types: test = self.new_test("%s_monitor_timeout" % test_type["prefix"], "Verify monitor uses duration of timeout period given.", test_type["use_cpg"]) test.add_cmd("stonith_admin", '--output-as=xml -R true1 -a fence_dummy -o "mode=fail" -o "monitor_mode=fail" -o "pcmk_host_list=node3"') test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -Q true1 -t 5", CrmExit.ERROR) test.add_stonith_log_pattern("Attempt 2 to execute") # Verify monitor occurs for duration of timeout period on failure, but stops at max retries for test_type in test_types: test = self.new_test("%s_monitor_timeout_max_retries" % test_type["prefix"], "Verify monitor retries until max retry value or timeout is hit.", test_type["use_cpg"]) test.add_cmd("stonith_admin", '--output-as=xml -R true1 -a fence_dummy -o "mode=fail" -o "monitor_mode=fail" -o "pcmk_host_list=node3"') test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -Q true1 -t 15", CrmExit.ERROR) test.add_stonith_log_pattern("Attempted to execute agent fence_dummy (list) the maximum number of times") # simple register test for test_type in test_types: test = self.new_test("%s_register" % test_type["prefix"], "Verify devices can be registered and un-registered", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "--output-as=xml -Q true1") test.add_cmd("stonith_admin", "--output-as=xml -D true1") test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -Q true1", CrmExit.ERROR) # simple reboot test for test_type in test_types: test = self.new_test("%s_reboot" % test_type["prefix"], "Verify devices can be rebooted", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "--output-as=xml -B node3 -t 5") test.add_cmd("stonith_admin", "--output-as=xml -D true1") test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -Q true1", CrmExit.ERROR) # test fencing history. for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_fence_history" % test_type["prefix"], "Verify last fencing operation is returned.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5 -V") test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -H node3", 'action="off" target="node3" .* status="success"') # simple test of dynamic list query for test_type in test_types: test = self.new_test("%s_dynamic_list_query" % test_type["prefix"], "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") test.add_cmd("stonith_admin", "--output-as=xml -R true3 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -l fake_port_1", 'count="3"') # fence using dynamic list query for test_type in test_types: test = self.new_test("%s_fence_dynamic_list_query" % test_type["prefix"], "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") test.add_cmd("stonith_admin", "--output-as=xml -R true3 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1") test.add_cmd("stonith_admin", "--output-as=xml -F fake_port_1 -t 5 -V") # simple test of query using status action for test_type in test_types: test = self.new_test("%s_status_query" % test_type["prefix"], "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"") test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"") test.add_cmd("stonith_admin", "--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"") test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -l fake_port_1", 'count="3"') # test what happens when no reboot action is advertised for test_type in test_types: test = self.new_test("%s_no_reboot_support" % test_type["prefix"], "Verify reboot action defaults to off when no reboot action is advertised by agent.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy_no_reboot -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -B node1 -t 5 -V") - test.add_stonith_log_pattern("does not advertise support for 'reboot', performing 'off'") + test.add_stonith_log_pattern("does not support reboot") test.add_stonith_log_pattern("using true1 returned 0 (OK)") # make sure reboot is used when reboot action is advertised for test_type in test_types: test = self.new_test("%s_with_reboot_support" % test_type["prefix"], "Verify reboot action can be used when metadata advertises it.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -B node1 -t 5 -V") test.add_stonith_neg_log_pattern("does not advertise support for 'reboot', performing 'off'") test.add_stonith_log_pattern("using true1 returned 0 (OK)") # make sure requested fencing delay is applied only for the first device in the first level # make sure static delay from pcmk_delay_base is added for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_topology_delay" % test_type["prefix"], "Verify requested fencing delay is applied only for the first device in the first level and pcmk_delay_base is added.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\" -o \"pcmk_delay_base=1\"") test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\" -o \"pcmk_delay_base=1\"") test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true1") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true2") test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true3") test.add_cmd("stonith_admin", "--output-as=xml -F node3 --delay 1") test.add_stonith_log_pattern("Delaying 'off' action targeting node3 using true1 for 2s | timeout=120s requested_delay=1s base=1s max=1s") test.add_stonith_log_pattern("Delaying 'off' action targeting node3 using false1 for 1s | timeout=120s requested_delay=0s base=1s max=1s") test.add_stonith_neg_log_pattern("Delaying 'off' action targeting node3 using true2") test.add_stonith_neg_log_pattern("Delaying 'off' action targeting node3 using true3") def build_nodeid_tests(self): """ Register tests that use a corosync node id """ our_uname = localname() ### verify nodeid is supplied when nodeid is in the metadata parameters test = self.new_test("cpg_supply_nodeid", "Verify nodeid is given when fence agent has nodeid as parameter", 1) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -F %s -t 3" % (our_uname)) test.add_stonith_log_pattern("as nodeid with fence action 'off' targeting %s" % (our_uname)) ### verify nodeid is _NOT_ supplied when nodeid is not in the metadata parameters test = self.new_test("cpg_do_not_supply_nodeid", "Verify nodeid is _NOT_ given when fence agent does not have nodeid as parameter", 1) # use a host name that won't be in corosync.conf test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=regr-test\"") test.add_cmd("stonith_admin", "--output-as=xml -F regr-test -t 3") test.add_stonith_neg_log_pattern("as nodeid with fence action 'off' targeting regr-test") ### verify nodeid use doesn't explode standalone mode test = self.new_test("standalone_do_not_supply_nodeid", "Verify nodeid in metadata parameter list doesn't kill standalone mode", 0) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -F %s -t 3" % (our_uname)) test.add_stonith_neg_log_pattern("as nodeid with fence action 'off' targeting %s" % (our_uname)) def build_unfence_tests(self): """ Register tests that verify unfencing """ our_uname = localname() ### verify unfencing using automatic unfencing test = self.new_test("cpg_unfence_required_1", "Verify require unfencing on all devices when automatic=true in agent's metadata", 1) test.add_cmd('stonith_admin', '--output-as=xml -R true1 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s"' % (our_uname)) test.add_cmd('stonith_admin', '--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s"' % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname)) # both devices should be executed test.add_stonith_log_pattern("using true1 returned 0 (OK)") test.add_stonith_log_pattern("using true2 returned 0 (OK)") ### verify unfencing using automatic unfencing fails if any of the required agents fail test = self.new_test("cpg_unfence_required_2", "Verify require unfencing on all devices when automatic=true in agent's metadata", 1) test.add_cmd('stonith_admin', '--output-as=xml -R true1 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s"' % (our_uname)) test.add_cmd('stonith_admin', '--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=fail" -o "pcmk_host_list=%s"' % (our_uname)) test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -U %s -t 6" % (our_uname), CrmExit.ERROR) ### verify unfencing using automatic devices with topology test = self.new_test("cpg_unfence_required_3", "Verify require unfencing on all devices even when at different topology levels", 1) test.add_cmd('stonith_admin', '--output-as=xml -R true1 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname)) test.add_cmd('stonith_admin', '--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v true1" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true2" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname)) test.add_stonith_log_pattern("using true1 returned 0 (OK)") test.add_stonith_log_pattern("using true2 returned 0 (OK)") ### verify unfencing using automatic devices with topology test = self.new_test("cpg_unfence_required_4", "Verify all required devices are executed even with topology levels fail.", 1) test.add_cmd('stonith_admin', '--output-as=xml -R true1 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname)) test.add_cmd('stonith_admin', '--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname)) test.add_cmd('stonith_admin', '--output-as=xml -R true3 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname)) test.add_cmd('stonith_admin', '--output-as=xml -R true4 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname)) test.add_cmd('stonith_admin', '--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=%s node3"' % (our_uname)) test.add_cmd('stonith_admin', '--output-as=xml -R false2 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=%s node3"' % (our_uname)) test.add_cmd('stonith_admin', '--output-as=xml -R false3 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=%s node3"' % (our_uname)) test.add_cmd('stonith_admin', '--output-as=xml -R false4 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=%s node3"' % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v true1" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v false1" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v false2" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true2" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v false3" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true3" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 3 -v false4" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 4 -v true4" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname)) test.add_stonith_log_pattern("using true1 returned 0 (OK)") test.add_stonith_log_pattern("using true2 returned 0 (OK)") test.add_stonith_log_pattern("using true3 returned 0 (OK)") test.add_stonith_log_pattern("using true4 returned 0 (OK)") def build_unfence_on_target_tests(self): """ Register tests that verify unfencing that runs on the target """ our_uname = localname() ### verify unfencing using on_target device test = self.new_test("cpg_unfence_on_target_1", "Verify unfencing with on_target = true", 1) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname)) test.add_stonith_log_pattern("(on) to be executed on target") ### verify failure of unfencing using on_target device test = self.new_test("cpg_unfence_on_target_2", "Verify failure unfencing with on_target = true", 1) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake_1234\"" % (our_uname)) test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -U node_fake_1234 -t 3", CrmExit.ERROR) test.add_stonith_log_pattern("(on) to be executed on target") ### verify unfencing using on_target device with topology test = self.new_test("cpg_unfence_on_target_3", "Verify unfencing with on_target = true using topology", 1) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v true1" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true2" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname)) test.add_stonith_log_pattern("(on) to be executed on target") ### verify unfencing using on_target device with topology fails when victim node doesn't exist test = self.new_test("cpg_unfence_on_target_4", "Verify unfencing failure with on_target = true using topology", 1) test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake\"" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake\"" % (our_uname)) test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v true1") test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 2 -v true2") test.add_expected_fail_cmd("stonith_admin", "--output-as=xml -U node_fake -t 3", CrmExit.ERROR) test.add_stonith_log_pattern("(on) to be executed on target") def build_remap_tests(self): """ Register tests that verify remapping of reboots to off-on """ test = self.new_test("cpg_remap_simple", "Verify sequential topology reboot is remapped to all-off-then-all-on", 1) test.add_cmd("stonith_admin", """--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """ """-o "pcmk_off_timeout=1" -o "pcmk_reboot_timeout=10" """) test.add_cmd("stonith_admin", """--output-as=xml -R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """ """-o "pcmk_off_timeout=2" -o "pcmk_reboot_timeout=20" """) test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v true1 -v true2") test.add_cmd("stonith_admin", "--output-as=xml -B node_fake -t 5") test.add_stonith_log_pattern("Remapping multiple-device reboot targeting node_fake") # timeout should be sum of off timeouts (1+2=3), not reboot timeouts (10+20=30) test.add_stonith_log_pattern("Total timeout set to 3 for peer's fencing targeting node_fake") test.add_stonith_log_pattern("perform 'off' action targeting node_fake using true1") test.add_stonith_log_pattern("perform 'off' action targeting node_fake using true2") test.add_stonith_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'") # fence_dummy sets "on" as an on_target action test.add_stonith_log_pattern("Ignoring true1 'on' failure (no capable peers) targeting node_fake") test.add_stonith_log_pattern("Ignoring true2 'on' failure (no capable peers) targeting node_fake") test.add_stonith_log_pattern("Undoing remap of reboot targeting node_fake") test = self.new_test("cpg_remap_automatic", "Verify remapped topology reboot skips automatic 'on'", 1) test.add_cmd("stonith_admin", """--output-as=xml -R true1 -a fence_dummy_auto_unfence """ """-o "mode=pass" -o "pcmk_host_list=node_fake" """) test.add_cmd("stonith_admin", """--output-as=xml -R true2 -a fence_dummy_auto_unfence """ """-o "mode=pass" -o "pcmk_host_list=node_fake" """) test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v true1 -v true2") test.add_cmd("stonith_admin", "--output-as=xml -B node_fake -t 5") test.add_stonith_log_pattern("Remapping multiple-device reboot targeting node_fake") test.add_stonith_log_pattern("perform 'off' action targeting node_fake using true1") test.add_stonith_log_pattern("perform 'off' action targeting node_fake using true2") test.add_stonith_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'") test.add_stonith_log_pattern("Undoing remap of reboot targeting node_fake") test.add_stonith_neg_log_pattern("perform 'on' action targeting node_fake using") test.add_stonith_neg_log_pattern("'on' failure") test = self.new_test("cpg_remap_complex_1", "Verify remapped topology reboot in second level works if non-remapped first level fails", 1) test.add_cmd("stonith_admin", """--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """) test.add_cmd("stonith_admin", """--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """) test.add_cmd("stonith_admin", """--output-as=xml -R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """) test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v false1") test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 2 -v true1 -v true2") test.add_cmd("stonith_admin", "--output-as=xml -B node_fake -t 5") test.add_stonith_log_pattern("perform 'reboot' action targeting node_fake using false1") test.add_stonith_log_pattern("Remapping multiple-device reboot targeting node_fake") test.add_stonith_log_pattern("perform 'off' action targeting node_fake using true1") test.add_stonith_log_pattern("perform 'off' action targeting node_fake using true2") test.add_stonith_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'") test.add_stonith_log_pattern("Ignoring true1 'on' failure (no capable peers) targeting node_fake") test.add_stonith_log_pattern("Ignoring true2 'on' failure (no capable peers) targeting node_fake") test.add_stonith_log_pattern("Undoing remap of reboot targeting node_fake") test = self.new_test("cpg_remap_complex_2", "Verify remapped topology reboot failure in second level proceeds to third level", 1) test.add_cmd("stonith_admin", """--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """) test.add_cmd("stonith_admin", """--output-as=xml -R false2 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """) test.add_cmd("stonith_admin", """--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """) test.add_cmd("stonith_admin", """--output-as=xml -R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """) test.add_cmd("stonith_admin", """--output-as=xml -R true3 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """) test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v false1") test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 2 -v true1 -v false2 -v true3") test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 3 -v true2") test.add_cmd("stonith_admin", "--output-as=xml -B node_fake -t 5") test.add_stonith_log_pattern("perform 'reboot' action targeting node_fake using false1") test.add_stonith_log_pattern("Remapping multiple-device reboot targeting node_fake") test.add_stonith_log_pattern("perform 'off' action targeting node_fake using true1") test.add_stonith_log_pattern("perform 'off' action targeting node_fake using false2") test.add_stonith_log_pattern("Attempted to execute agent fence_dummy (off) the maximum number of times") test.add_stonith_log_pattern("Undoing remap of reboot targeting node_fake") test.add_stonith_log_pattern("perform 'reboot' action targeting node_fake using true2") test.add_stonith_neg_log_pattern("node_fake with true3") def setup_environment(self, use_corosync): """ Prepare the host before executing any tests """ if use_corosync: if self.autogen_corosync_cfg: logname = os.path.join(self.logdir, "corosync.log") corosync_cfg = io.open("/etc/corosync/corosync.conf", "w") corosync_cfg.write(AUTOGEN_COROSYNC_TEMPLATE % (localname(), logname)) corosync_cfg.close() ### make sure we are in control ### killall("corosync") self.start_corosync() subprocess.call(["cts-support", "install"]) def cleanup_environment(self, use_corosync): """ Clean up the host after executing desired tests """ if use_corosync: killall("corosync") if self.autogen_corosync_cfg: if self.verbose: print("Corosync output") logfile = io.open(os.path.join(self.logdir, "corosync.log"), 'rt') for line in logfile.readlines(): print(line.strip()) logfile.close() os.remove("/etc/corosync/corosync.conf") subprocess.call(["cts-support", "uninstall"]) class TestOptions(object): """ Option handler """ def __init__(self): self.options = {} self.options['list-tests'] = 0 self.options['run-all'] = 1 self.options['run-only'] = "" self.options['run-only-pattern'] = "" self.options['verbose'] = 0 self.options['timeout'] = 2 self.options['force-wait'] = 0 self.options['invalid-arg'] = "" self.options['cpg-only'] = 0 self.options['no-cpg'] = 0 self.options['show-usage'] = 0 def build_options(self, argv): """ Set options based on command-line arguments """ args = argv[1:] skip = 0 for i in range(0, len(args)): if skip: skip = 0 continue elif args[i] == "-h" or args[i] == "--help": self.options['show-usage'] = 1 elif args[i] == "-l" or args[i] == "--list-tests": self.options['list-tests'] = 1 elif args[i] == "-V" or args[i] == "--verbose": self.options['verbose'] = 1 elif args[i] == "-t" or args[i] == "--timeout": self.options['timeout'] = float(args[i+1]) elif args[i] == "-w" or args[i] == "--force-wait": self.options['force-wait'] = 1 elif args[i] == "-n" or args[i] == "--no-cpg": self.options['no-cpg'] = 1 elif args[i] == "-c" or args[i] == "--cpg-only": self.options['cpg-only'] = 1 elif args[i] == "-r" or args[i] == "--run-only": self.options['run-only'] = args[i+1] skip = 1 elif args[i] == "-p" or args[i] == "--run-only-pattern": self.options['run-only-pattern'] = args[i+1] skip = 1 def show_usage(self): """ Show command usage """ print("usage: " + sys.argv[0] + " [options]") print("If no options are provided, all tests will run") print("Options:") print("\t [--help | -h] Show usage") print("\t [--list-tests | -l] Print out all registered tests.") print("\t [--cpg-only | -c] Only run tests that require corosync.") print("\t [--no-cpg | -n] Only run tests that do not require corosync") print("\t [--run-only | -r 'testname'] Run a specific test") print("\t [--verbose | -V] Verbose output") print("\t [--timeout | -t 'floating point number']" "\n\t\tUp to how many seconds each test case waits for the daemon to be initialized." "\n\t\tDefaults to 2. The value 0 means no limit.") print("\t [--force-wait | -w]" "\n\t\tEach test case waits the default/specified --timeout for the daemon without tracking the log.") print("\t [--run-only-pattern | -p 'string'] Run only tests containing the string value") print("\n\tExample: Run only the test 'start_stop'") print("\t\t " + sys.argv[0] + " --run-only start_stop") print("\n\tExample: Run only the tests with the string 'systemd' present in them") print("\t\t " + sys.argv[0] + " --run-only-pattern systemd") def main(argv): """ Run fencing regression tests as specified by arguments """ update_path() opts = TestOptions() opts.build_options(argv) use_corosync = 1 # Create a temporary directory for log files (the directory and its # contents will automatically be erased when done) with tempfile.TemporaryDirectory(prefix="cts-fencing-") as logdir: tests = Tests(opts.options['verbose'], opts.options['timeout'], opts.options['force-wait'], logdir) tests.build_standalone_tests() tests.build_custom_timeout_tests() tests.build_api_sanity_tests() tests.build_fence_merge_tests() tests.build_fence_no_merge_tests() tests.build_unfence_tests() tests.build_unfence_on_target_tests() tests.build_nodeid_tests() tests.build_remap_tests() if opts.options['list-tests']: tests.print_list() sys.exit(CrmExit.OK) elif opts.options['show-usage']: opts.show_usage() sys.exit(CrmExit.OK) print("Starting ...") if opts.options['no-cpg']: use_corosync = 0 tests.setup_environment(use_corosync) if opts.options['run-only-pattern'] != "": tests.run_tests_matching(opts.options['run-only-pattern']) tests.print_results() elif opts.options['run-only'] != "": tests.run_single(opts.options['run-only']) tests.print_results() elif opts.options['no-cpg']: tests.run_no_cpg() tests.print_results() elif opts.options['cpg-only']: tests.run_cpg_only() tests.print_results() else: tests.run_tests() tests.print_results() tests.cleanup_environment(use_corosync) tests.exit() if __name__ == "__main__": main(sys.argv) diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c index a4c2c158ba..66199d4fbc 100644 --- a/daemons/fenced/fenced_commands.c +++ b/daemons/fenced/fenced_commands.c @@ -1,3047 +1,3051 @@ /* * Copyright 2009-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include GHashTable *device_list = NULL; GHashTable *topology = NULL; GList *cmd_list = NULL; struct device_search_s { /* target of fence action */ char *host; /* requested fence action */ char *action; /* timeout to use if a device is queried dynamically for possible targets */ int per_device_timeout; /* number of registered fencing devices at time of request */ int replies_needed; /* number of device replies received so far */ int replies_received; /* whether the target is eligible to perform requested action (or off) */ bool allow_suicide; /* private data to pass to search callback function */ void *user_data; /* function to call when all replies have been received */ void (*callback) (GList * devices, void *user_data); /* devices capable of performing requested action (or off if remapping) */ GList *capable; }; static gboolean stonith_device_dispatch(gpointer user_data); static void st_child_done(GPid pid, int rc, const char *output, gpointer user_data); static void stonith_send_reply(xmlNode * reply, int call_options, const char *remote_peer, const char *client_id); static void search_devices_record_result(struct device_search_s *search, const char *device, gboolean can_fence); static int get_agent_metadata(const char *agent, xmlNode **metadata); static void read_action_metadata(stonith_device_t *device); typedef struct async_command_s { int id; int pid; int fd_stdout; int options; int default_timeout; /* seconds */ int timeout; /* seconds */ int start_delay; /* seconds */ int delay_id; char *op; char *origin; char *client; char *client_name; char *remote_op_id; char *victim; uint32_t victim_nodeid; char *action; char *device; GList *device_list; GList *device_next; void *internal_user_data; void (*done_cb) (GPid pid, int rc, const char *output, gpointer user_data); guint timer_sigterm; guint timer_sigkill; /*! If the operation timed out, this is the last signal * we sent to the process to get it to terminate */ int last_timeout_signo; stonith_device_t *active_on; stonith_device_t *activating_on; } async_command_t; static xmlNode *stonith_construct_async_reply(async_command_t * cmd, const char *output, xmlNode * data, int rc); static gboolean is_action_required(const char *action, stonith_device_t *device) { return device && device->automatic_unfencing && pcmk__str_eq(action, "on", pcmk__str_casei); } static int get_action_delay_max(stonith_device_t * device, const char * action) { const char *value = NULL; int delay_max = 0; if (!pcmk__strcase_any_of(action, "off", "reboot", NULL)) { return 0; } value = g_hash_table_lookup(device->params, PCMK_STONITH_DELAY_MAX); if (value) { delay_max = crm_parse_interval_spec(value) / 1000; } return delay_max; } static int get_action_delay_base(stonith_device_t * device, const char * action) { const char *value = NULL; int delay_base = 0; if (!pcmk__strcase_any_of(action, "off", "reboot", NULL)) { return 0; } value = g_hash_table_lookup(device->params, PCMK_STONITH_DELAY_BASE); if (value) { delay_base = crm_parse_interval_spec(value) / 1000; } return delay_base; } /*! * \internal * \brief Override STONITH timeout with pcmk_*_timeout if available * * \param[in] device STONITH device to use * \param[in] action STONITH action name * \param[in] default_timeout Timeout to use if device does not have * a pcmk_*_timeout parameter for action * * \return Value of pcmk_(action)_timeout if available, otherwise default_timeout * \note For consistency, it would be nice if reboot/off/on timeouts could be * set the same way as start/stop/monitor timeouts, i.e. with an * entry in the fencing resource configuration. However that * is insufficient because fencing devices may be registered directly via * the fencer's register_device() API instead of going through the CIB * (e.g. stonith_admin uses it for its -R option, and the executor uses it * to ensure a device is registered when a command is issued). As device * properties, pcmk_*_timeout parameters can be grabbed by the fencer when * the device is registered, whether by CIB change or API call. */ static int get_action_timeout(stonith_device_t * device, const char *action, int default_timeout) { if (action && device && device->params) { char buffer[64] = { 0, }; const char *value = NULL; /* If "reboot" was requested but the device does not support it, * we will remap to "off", so check timeout for "off" instead */ if (pcmk__str_eq(action, "reboot", pcmk__str_casei) && !pcmk_is_set(device->flags, st_device_supports_reboot)) { crm_trace("%s doesn't support reboot, using timeout for off instead", device->id); action = "off"; } /* If the device config specified an action-specific timeout, use it */ snprintf(buffer, sizeof(buffer), "pcmk_%s_timeout", action); value = g_hash_table_lookup(device->params, buffer); if (value) { return atoi(value); } } return default_timeout; } static void free_async_command(async_command_t * cmd) { if (!cmd) { return; } if (cmd->delay_id) { g_source_remove(cmd->delay_id); } cmd_list = g_list_remove(cmd_list, cmd); g_list_free_full(cmd->device_list, free); free(cmd->device); free(cmd->action); free(cmd->victim); free(cmd->remote_op_id); free(cmd->client); free(cmd->client_name); free(cmd->origin); free(cmd->op); free(cmd); } static async_command_t * create_async_command(xmlNode * msg) { async_command_t *cmd = NULL; xmlNode *op = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_ERR); const char *action = crm_element_value(op, F_STONITH_ACTION); CRM_CHECK(action != NULL, crm_log_xml_warn(msg, "NoAction"); return NULL); crm_log_xml_trace(msg, "Command"); cmd = calloc(1, sizeof(async_command_t)); crm_element_value_int(msg, F_STONITH_CALLID, &(cmd->id)); crm_element_value_int(msg, F_STONITH_CALLOPTS, &(cmd->options)); crm_element_value_int(msg, F_STONITH_TIMEOUT, &(cmd->default_timeout)); cmd->timeout = cmd->default_timeout; // Value -1 means disable any static/random fencing delays crm_element_value_int(msg, F_STONITH_DELAY, &(cmd->start_delay)); cmd->origin = crm_element_value_copy(msg, F_ORIG); cmd->remote_op_id = crm_element_value_copy(msg, F_STONITH_REMOTE_OP_ID); cmd->client = crm_element_value_copy(msg, F_STONITH_CLIENTID); cmd->client_name = crm_element_value_copy(msg, F_STONITH_CLIENTNAME); cmd->op = crm_element_value_copy(msg, F_STONITH_OPERATION); cmd->action = strdup(action); cmd->victim = crm_element_value_copy(op, F_STONITH_TARGET); cmd->device = crm_element_value_copy(op, F_STONITH_DEVICE); CRM_CHECK(cmd->op != NULL, crm_log_xml_warn(msg, "NoOp"); free_async_command(cmd); return NULL); CRM_CHECK(cmd->client != NULL, crm_log_xml_warn(msg, "NoClient")); cmd->done_cb = st_child_done; cmd_list = g_list_append(cmd_list, cmd); return cmd; } static int get_action_limit(stonith_device_t * device) { const char *value = NULL; int action_limit = 1; value = g_hash_table_lookup(device->params, PCMK_STONITH_ACTION_LIMIT); if ((value == NULL) || (pcmk__scan_min_int(value, &action_limit, INT_MIN) != pcmk_rc_ok) || (action_limit == 0)) { action_limit = 1; } return action_limit; } static int get_active_cmds(stonith_device_t * device) { int counter = 0; GList *gIter = NULL; GList *gIterNext = NULL; CRM_CHECK(device != NULL, return 0); for (gIter = cmd_list; gIter != NULL; gIter = gIterNext) { async_command_t *cmd = gIter->data; gIterNext = gIter->next; if (cmd->active_on == device) { counter++; } } return counter; } static void fork_cb(GPid pid, gpointer user_data) { async_command_t *cmd = (async_command_t *) user_data; stonith_device_t * device = /* in case of a retry we've done the move from activating_on to active_on already */ cmd->activating_on?cmd->activating_on:cmd->active_on; CRM_ASSERT(device); crm_debug("Operation '%s' [%d]%s%s using %s now running with %ds timeout", cmd->action, pid, ((cmd->victim == NULL)? "" : " targeting "), ((cmd->victim == NULL)? "" : cmd->victim), device->id, cmd->timeout); cmd->active_on = device; cmd->activating_on = NULL; } static int get_agent_metadata_cb(gpointer data) { stonith_device_t *device = data; guint period_ms; switch (get_agent_metadata(device->agent, &device->agent_metadata)) { case pcmk_rc_ok: if (device->agent_metadata) { read_action_metadata(device); stonith__device_parameter_flags(&(device->flags), device->id, device->agent_metadata); } return G_SOURCE_REMOVE; case EAGAIN: period_ms = pcmk__mainloop_timer_get_period(device->timer); if (period_ms < 160 * 1000) { mainloop_timer_set_period(device->timer, 2 * period_ms); } return G_SOURCE_CONTINUE; default: return G_SOURCE_REMOVE; } } static gboolean stonith_device_execute(stonith_device_t * device) { int exec_rc = 0; const char *action_str = NULL; const char *host_arg = NULL; async_command_t *cmd = NULL; stonith_action_t *action = NULL; int active_cmds = 0; int action_limit = 0; GList *gIter = NULL; GList *gIterNext = NULL; CRM_CHECK(device != NULL, return FALSE); active_cmds = get_active_cmds(device); action_limit = get_action_limit(device); if (action_limit > -1 && active_cmds >= action_limit) { crm_trace("%s is over its action limit of %d (%u active action%s)", device->id, action_limit, active_cmds, pcmk__plural_s(active_cmds)); return TRUE; } for (gIter = device->pending_ops; gIter != NULL; gIter = gIterNext) { async_command_t *pending_op = gIter->data; gIterNext = gIter->next; if (pending_op && pending_op->delay_id) { crm_trace("Operation '%s'%s%s using %s was asked to run too early, " "waiting for start delay of %ds", pending_op->action, ((pending_op->victim == NULL)? "" : " targeting "), ((pending_op->victim == NULL)? "" : pending_op->victim), device->id, pending_op->start_delay); continue; } device->pending_ops = g_list_remove_link(device->pending_ops, gIter); g_list_free_1(gIter); cmd = pending_op; break; } if (cmd == NULL) { crm_trace("No actions using %s are needed", device->id); return TRUE; } if (pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT, STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) { if (pcmk__strcase_any_of(cmd->action, "reboot", "off", NULL)) { if (node_does_watchdog_fencing(stonith_our_uname)) { pcmk__panic(__func__); goto done; } } else { crm_info("Faking success for %s watchdog operation", cmd->action); cmd->done_cb(0, 0, NULL, cmd); goto done; } } #if SUPPORT_CIBSECRETS if (pcmk__substitute_secrets(device->id, device->params) != pcmk_rc_ok) { /* replacing secrets failed! */ if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { /* don't fail on stop! */ crm_info("Proceeding with stop operation for %s", device->id); } else { crm_err("Considering %s unconfigured: Failed to get secrets", device->id); exec_rc = PCMK_OCF_NOT_CONFIGURED; cmd->done_cb(0, exec_rc, NULL, cmd); goto done; } } #endif action_str = cmd->action; if (pcmk__str_eq(cmd->action, "reboot", pcmk__str_casei) && !pcmk_is_set(device->flags, st_device_supports_reboot)) { - crm_warn("Agent '%s' does not advertise support for 'reboot', performing 'off' action instead", device->agent); + crm_notice("Remapping 'reboot' action%s%s using %s to 'off' " + "because agent '%s' does not support reboot", + ((cmd->victim == NULL)? "" : " targeting "), + ((cmd->victim == NULL)? "" : cmd->victim), + device->id, device->agent); action_str = "off"; } if (pcmk_is_set(device->flags, st_device_supports_parameter_port)) { host_arg = "port"; } else if (pcmk_is_set(device->flags, st_device_supports_parameter_plug)) { host_arg = "plug"; } action = stonith_action_create(device->agent, action_str, cmd->victim, cmd->victim_nodeid, cmd->timeout, device->params, device->aliases, host_arg); /* for async exec, exec_rc is negative for early error exit otherwise handling of success/errors is done via callbacks */ cmd->activating_on = device; exec_rc = stonith_action_execute_async(action, (void *)cmd, cmd->done_cb, fork_cb); if (exec_rc < 0) { crm_warn("Operation '%s'%s%s using %s failed: %s " CRM_XS " rc=%d", cmd->action, cmd->victim ? " targeting " : "", cmd->victim ? cmd->victim : "", device->id, pcmk_strerror(exec_rc), exec_rc); cmd->activating_on = NULL; cmd->done_cb(0, exec_rc, NULL, cmd); } done: /* Device might get triggered to work by multiple fencing commands * simultaneously. Trigger the device again to make sure any * remaining concurrent commands get executed. */ if (device->pending_ops) { mainloop_set_trigger(device->work); } return TRUE; } static gboolean stonith_device_dispatch(gpointer user_data) { return stonith_device_execute(user_data); } static gboolean start_delay_helper(gpointer data) { async_command_t *cmd = data; stonith_device_t *device = NULL; cmd->delay_id = 0; device = cmd->device ? g_hash_table_lookup(device_list, cmd->device) : NULL; if (device) { mainloop_set_trigger(device->work); } return FALSE; } static void schedule_stonith_command(async_command_t * cmd, stonith_device_t * device) { int delay_max = 0; int delay_base = 0; int requested_delay = cmd->start_delay; CRM_CHECK(cmd != NULL, return); CRM_CHECK(device != NULL, return); if (cmd->device) { free(cmd->device); } if (device->include_nodeid && cmd->victim) { crm_node_t *node = crm_get_peer(0, cmd->victim); cmd->victim_nodeid = node->id; } cmd->device = strdup(device->id); cmd->timeout = get_action_timeout(device, cmd->action, cmd->default_timeout); if (cmd->remote_op_id) { crm_debug("Scheduling '%s' action%s%s using %s for remote peer %s " "with op id %.8s and timeout %ds", cmd->action, cmd->victim ? " targeting " : "", cmd->victim ? cmd->victim : "", device->id, cmd->origin, cmd->remote_op_id, cmd->timeout); } else { crm_debug("Scheduling '%s' action%s%s using %s for %s with timeout %ds", cmd->action, cmd->victim ? " targeting " : "", cmd->victim ? cmd->victim : "", device->id, cmd->client, cmd->timeout); } device->pending_ops = g_list_append(device->pending_ops, cmd); mainloop_set_trigger(device->work); // Value -1 means disable any static/random fencing delays if (requested_delay < 0) { return; } delay_max = get_action_delay_max(device, cmd->action); delay_base = get_action_delay_base(device, cmd->action); if (delay_max == 0) { delay_max = delay_base; } if (delay_max < delay_base) { crm_warn(PCMK_STONITH_DELAY_BASE " (%ds) is larger than " PCMK_STONITH_DELAY_MAX " (%ds) for %s using %s " "(limiting to maximum delay)", delay_base, delay_max, cmd->action, device->id); delay_base = delay_max; } if (delay_max > 0) { // coverity[dont_call] We're not using rand() for security cmd->start_delay += ((delay_max != delay_base)?(rand() % (delay_max - delay_base)):0) + delay_base; } if (cmd->start_delay > 0) { crm_notice("Delaying '%s' action%s%s using %s for %ds " CRM_XS " timeout=%ds requested_delay=%ds base=%ds max=%ds", cmd->action, cmd->victim ? " targeting " : "", cmd->victim ? cmd->victim : "", device->id, cmd->start_delay, cmd->timeout, requested_delay, delay_base, delay_max); cmd->delay_id = g_timeout_add_seconds(cmd->start_delay, start_delay_helper, cmd); } } static void free_device(gpointer data) { GList *gIter = NULL; stonith_device_t *device = data; g_hash_table_destroy(device->params); g_hash_table_destroy(device->aliases); for (gIter = device->pending_ops; gIter != NULL; gIter = gIter->next) { async_command_t *cmd = gIter->data; crm_warn("Removal of device '%s' purged operation '%s'", device->id, cmd->action); cmd->done_cb(0, -ENODEV, NULL, cmd); } g_list_free(device->pending_ops); g_list_free_full(device->targets, free); if (device->timer) { mainloop_timer_stop(device->timer); mainloop_timer_del(device->timer); } mainloop_destroy_trigger(device->work); free_xml(device->agent_metadata); free(device->namespace); free(device->on_target_actions); free(device->agent); free(device->id); free(device); } void free_device_list(void) { if (device_list != NULL) { g_hash_table_destroy(device_list); device_list = NULL; } } void init_device_list(void) { if (device_list == NULL) { device_list = pcmk__strkey_table(NULL, free_device); } } static GHashTable * build_port_aliases(const char *hostmap, GList ** targets) { char *name = NULL; int last = 0, lpc = 0, max = 0, added = 0; GHashTable *aliases = pcmk__strikey_table(free, free); if (hostmap == NULL) { return aliases; } max = strlen(hostmap); for (; lpc <= max; lpc++) { switch (hostmap[lpc]) { /* Assignment chars */ case '=': case ':': if (lpc > last) { free(name); name = calloc(1, 1 + lpc - last); memcpy(name, hostmap + last, lpc - last); } last = lpc + 1; break; /* Delimeter chars */ /* case ',': Potentially used to specify multiple ports */ case 0: case ';': case ' ': case '\t': if (name) { char *value = NULL; value = calloc(1, 1 + lpc - last); memcpy(value, hostmap + last, lpc - last); crm_debug("Adding alias '%s'='%s'", name, value); g_hash_table_replace(aliases, name, value); if (targets) { *targets = g_list_append(*targets, strdup(value)); } value = NULL; name = NULL; added++; } else if (lpc > last) { crm_debug("Parse error at offset %d near '%s'", lpc - last, hostmap + last); } last = lpc + 1; break; } if (hostmap[lpc] == 0) { break; } } if (added == 0) { crm_info("No host mappings detected in '%s'", hostmap); } free(name); return aliases; } GHashTable *metadata_cache = NULL; void free_metadata_cache(void) { if (metadata_cache != NULL) { g_hash_table_destroy(metadata_cache); metadata_cache = NULL; } } static void init_metadata_cache(void) { if (metadata_cache == NULL) { metadata_cache = pcmk__strkey_table(free, free); } } int get_agent_metadata(const char *agent, xmlNode ** metadata) { char *buffer = NULL; if (metadata == NULL) { return EINVAL; } *metadata = NULL; if (pcmk__str_eq(agent, STONITH_WATCHDOG_AGENT_INTERNAL, pcmk__str_none)) { return pcmk_rc_ok; } init_metadata_cache(); buffer = g_hash_table_lookup(metadata_cache, agent); if (buffer == NULL) { stonith_t *st = stonith_api_new(); int rc; if (st == NULL) { crm_warn("Could not get agent meta-data: " "API memory allocation failed"); return EAGAIN; } rc = st->cmds->metadata(st, st_opt_sync_call, agent, NULL, &buffer, 10); stonith_api_delete(st); if (rc || !buffer) { crm_err("Could not retrieve metadata for fencing agent %s", agent); return EAGAIN; } g_hash_table_replace(metadata_cache, strdup(agent), buffer); } *metadata = string2xml(buffer); return pcmk_rc_ok; } static gboolean is_nodeid_required(xmlNode * xml) { xmlXPathObjectPtr xpath = NULL; if (stand_alone) { return FALSE; } if (!xml) { return FALSE; } xpath = xpath_search(xml, "//parameter[@name='nodeid']"); if (numXpathResults(xpath) <= 0) { freeXpathObject(xpath); return FALSE; } freeXpathObject(xpath); return TRUE; } #define MAX_ACTION_LEN 256 static char * add_action(char *actions, const char *action) { int offset = 0; if (actions == NULL) { actions = calloc(1, MAX_ACTION_LEN); } else { offset = strlen(actions); } if (offset > 0) { offset += snprintf(actions+offset, MAX_ACTION_LEN - offset, " "); } offset += snprintf(actions+offset, MAX_ACTION_LEN - offset, "%s", action); return actions; } static void read_action_metadata(stonith_device_t *device) { xmlXPathObjectPtr xpath = NULL; int max = 0; int lpc = 0; if (device->agent_metadata == NULL) { return; } xpath = xpath_search(device->agent_metadata, "//action"); max = numXpathResults(xpath); if (max <= 0) { freeXpathObject(xpath); return; } for (lpc = 0; lpc < max; lpc++) { const char *on_target = NULL; const char *action = NULL; xmlNode *match = getXpathResult(xpath, lpc); CRM_LOG_ASSERT(match != NULL); if(match == NULL) { continue; }; on_target = crm_element_value(match, "on_target"); action = crm_element_value(match, "name"); if(pcmk__str_eq(action, "list", pcmk__str_casei)) { stonith__set_device_flags(device->flags, device->id, st_device_supports_list); } else if(pcmk__str_eq(action, "status", pcmk__str_casei)) { stonith__set_device_flags(device->flags, device->id, st_device_supports_status); } else if(pcmk__str_eq(action, "reboot", pcmk__str_casei)) { stonith__set_device_flags(device->flags, device->id, st_device_supports_reboot); } else if (pcmk__str_eq(action, "on", pcmk__str_casei)) { /* "automatic" means the cluster will unfence node when it joins */ const char *automatic = crm_element_value(match, "automatic"); /* "required" is a deprecated synonym for "automatic" */ const char *required = crm_element_value(match, "required"); if (crm_is_true(automatic) || crm_is_true(required)) { device->automatic_unfencing = TRUE; } } if (action && crm_is_true(on_target)) { device->on_target_actions = add_action(device->on_target_actions, action); } } freeXpathObject(xpath); } /*! * \internal * \brief Set a pcmk_*_action parameter if not already set * * \param[in,out] params Device parameters * \param[in] action Name of action * \param[in] value Value to use if action is not already set */ static void map_action(GHashTable *params, const char *action, const char *value) { char *key = crm_strdup_printf("pcmk_%s_action", action); if (g_hash_table_lookup(params, key)) { crm_warn("Ignoring %s='%s', see %s instead", STONITH_ATTR_ACTION_OP, value, key); free(key); } else { crm_warn("Mapping %s='%s' to %s='%s'", STONITH_ATTR_ACTION_OP, value, key, value); g_hash_table_insert(params, key, strdup(value)); } } /*! * \internal * \brief Create device parameter table from XML * * \param[in] name Device name (used for logging only) * \param[in,out] params Device parameters */ static GHashTable * xml2device_params(const char *name, xmlNode *dev) { GHashTable *params = xml2list(dev); const char *value; /* Action should never be specified in the device configuration, * but we support it for users who are familiar with other software * that worked that way. */ value = g_hash_table_lookup(params, STONITH_ATTR_ACTION_OP); if (value != NULL) { crm_warn("%s has '%s' parameter, which should never be specified in configuration", name, STONITH_ATTR_ACTION_OP); if (*value == '\0') { crm_warn("Ignoring empty '%s' parameter", STONITH_ATTR_ACTION_OP); } else if (strcmp(value, "reboot") == 0) { crm_warn("Ignoring %s='reboot' (see stonith-action cluster property instead)", STONITH_ATTR_ACTION_OP); } else if (strcmp(value, "off") == 0) { map_action(params, "reboot", value); } else { map_action(params, "off", value); map_action(params, "reboot", value); } g_hash_table_remove(params, STONITH_ATTR_ACTION_OP); } return params; } static const char * target_list_type(stonith_device_t * dev) { const char *check_type = NULL; check_type = g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_CHECK); if (check_type == NULL) { if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_LIST)) { check_type = "static-list"; } else if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_MAP)) { check_type = "static-list"; } else if (pcmk_is_set(dev->flags, st_device_supports_list)) { check_type = "dynamic-list"; } else if (pcmk_is_set(dev->flags, st_device_supports_status)) { check_type = "status"; } else { check_type = "none"; } } return check_type; } static stonith_device_t * build_device_from_xml(xmlNode * msg) { const char *value; xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, msg, LOG_ERR); stonith_device_t *device = NULL; char *agent = crm_element_value_copy(dev, "agent"); CRM_CHECK(agent != NULL, return device); device = calloc(1, sizeof(stonith_device_t)); CRM_CHECK(device != NULL, {free(agent); return device;}); device->id = crm_element_value_copy(dev, XML_ATTR_ID); device->agent = agent; device->namespace = crm_element_value_copy(dev, "namespace"); device->params = xml2device_params(device->id, dev); value = g_hash_table_lookup(device->params, PCMK_STONITH_HOST_LIST); if (value) { device->targets = stonith__parse_targets(value); } value = g_hash_table_lookup(device->params, PCMK_STONITH_HOST_MAP); device->aliases = build_port_aliases(value, &(device->targets)); value = target_list_type(device); if (!pcmk__str_eq(value, "static-list", pcmk__str_casei) && device->targets) { /* Other than "static-list", dev-> targets is unnecessary. */ g_list_free_full(device->targets, free); device->targets = NULL; } switch (get_agent_metadata(device->agent, &device->agent_metadata)) { case pcmk_rc_ok: if (device->agent_metadata) { read_action_metadata(device); stonith__device_parameter_flags(&(device->flags), device->id, device->agent_metadata); } break; case EAGAIN: if (device->timer == NULL) { device->timer = mainloop_timer_add("get_agent_metadata", 10 * 1000, TRUE, get_agent_metadata_cb, device); } if (!mainloop_timer_running(device->timer)) { mainloop_timer_start(device->timer); } break; default: break; } value = g_hash_table_lookup(device->params, "nodeid"); if (!value) { device->include_nodeid = is_nodeid_required(device->agent_metadata); } value = crm_element_value(dev, "rsc_provides"); if (pcmk__str_eq(value, "unfencing", pcmk__str_casei)) { device->automatic_unfencing = TRUE; } if (is_action_required("on", device)) { crm_info("Fencing device '%s' requires unfencing", device->id); } if (device->on_target_actions) { crm_info("Fencing device '%s' requires actions (%s) to be executed " "on target", device->id, device->on_target_actions); } device->work = mainloop_add_trigger(G_PRIORITY_HIGH, stonith_device_dispatch, device); /* TODO: Hook up priority */ return device; } static void schedule_internal_command(const char *origin, stonith_device_t * device, const char *action, const char *victim, int timeout, void *internal_user_data, void (*done_cb) (GPid pid, int rc, const char *output, gpointer user_data)) { async_command_t *cmd = NULL; cmd = calloc(1, sizeof(async_command_t)); cmd->id = -1; cmd->default_timeout = timeout ? timeout : 60; cmd->timeout = cmd->default_timeout; cmd->action = strdup(action); cmd->victim = victim ? strdup(victim) : NULL; cmd->device = strdup(device->id); cmd->origin = strdup(origin); cmd->client = strdup(crm_system_name); cmd->client_name = strdup(crm_system_name); cmd->internal_user_data = internal_user_data; cmd->done_cb = done_cb; /* cmd, not internal_user_data, is passed to 'done_cb' as the userdata */ schedule_stonith_command(cmd, device); } static void status_search_cb(GPid pid, int rc, const char *output, gpointer user_data) { async_command_t *cmd = user_data; struct device_search_s *search = cmd->internal_user_data; stonith_device_t *dev = cmd->device ? g_hash_table_lookup(device_list, cmd->device) : NULL; gboolean can = FALSE; free_async_command(cmd); if (!dev) { search_devices_record_result(search, NULL, FALSE); return; } mainloop_set_trigger(dev->work); if (rc == 1 /* unknown */ ) { crm_trace("Host %s is not known by %s", search->host, dev->id); } else if (rc == 0 /* active */ || rc == 2 /* inactive */ ) { crm_trace("Host %s is known by %s", search->host, dev->id); can = TRUE; } else { crm_notice("Unknown result when testing if %s can fence %s: rc=%d", dev->id, search->host, rc); } search_devices_record_result(search, dev->id, can); } static void dynamic_list_search_cb(GPid pid, int rc, const char *output, gpointer user_data) { async_command_t *cmd = user_data; struct device_search_s *search = cmd->internal_user_data; stonith_device_t *dev = cmd->device ? g_hash_table_lookup(device_list, cmd->device) : NULL; gboolean can_fence = FALSE; free_async_command(cmd); /* Host/alias must be in the list output to be eligible to be fenced * * Will cause problems if down'd nodes aren't listed or (for virtual nodes) * if the guest is still listed despite being moved to another machine */ if (!dev) { search_devices_record_result(search, NULL, FALSE); return; } mainloop_set_trigger(dev->work); /* If we successfully got the targets earlier, don't disable. */ if (rc != 0 && !dev->targets) { if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_CHECK) == NULL) { /* If the operation fails if the user does not explicitly specify "dynamic-list", it will fall back to "status". */ crm_notice("Disabling port list queries for %s (%d): %s", dev->id, rc, output); g_hash_table_replace(dev->params, strdup(PCMK_STONITH_HOST_CHECK), strdup("status")); } } else if (!rc) { crm_info("Refreshing port list for %s", dev->id); g_list_free_full(dev->targets, free); dev->targets = stonith__parse_targets(output); dev->targets_age = time(NULL); } if (dev->targets) { const char *alias = g_hash_table_lookup(dev->aliases, search->host); if (!alias) { alias = search->host; } if (pcmk__str_in_list(alias, dev->targets, pcmk__str_casei)) { can_fence = TRUE; } } search_devices_record_result(search, dev->id, can_fence); } /*! * \internal * \brief Returns true if any key in first is not in second or second has a different value for key */ static int device_params_diff(GHashTable *first, GHashTable *second) { char *key = NULL; char *value = NULL; GHashTableIter gIter; g_hash_table_iter_init(&gIter, first); while (g_hash_table_iter_next(&gIter, (void **)&key, (void **)&value)) { if(strstr(key, "CRM_meta") == key) { continue; } else if(strcmp(key, "crm_feature_set") == 0) { continue; } else { char *other_value = g_hash_table_lookup(second, key); if (!other_value || !pcmk__str_eq(other_value, value, pcmk__str_casei)) { crm_trace("Different value for %s: %s != %s", key, other_value, value); return 1; } } } return 0; } /*! * \internal * \brief Checks to see if an identical device already exists in the device_list */ static stonith_device_t * device_has_duplicate(stonith_device_t * device) { stonith_device_t *dup = g_hash_table_lookup(device_list, device->id); if (!dup) { crm_trace("No match for %s", device->id); return NULL; } else if (!pcmk__str_eq(dup->agent, device->agent, pcmk__str_casei)) { crm_trace("Different agent: %s != %s", dup->agent, device->agent); return NULL; } /* Use calculate_operation_digest() here? */ if (device_params_diff(device->params, dup->params) || device_params_diff(dup->params, device->params)) { return NULL; } crm_trace("Match"); return dup; } int stonith_device_register(xmlNode * msg, const char **desc, gboolean from_cib) { stonith_device_t *dup = NULL; stonith_device_t *device = build_device_from_xml(msg); guint ndevices = 0; int rv = pcmk_ok; CRM_CHECK(device != NULL, return -ENOMEM); /* do we have a watchdog-device? */ if (pcmk__str_eq(device->id, STONITH_WATCHDOG_ID, pcmk__str_none) || pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT, STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) do { if (stonith_watchdog_timeout_ms <= 0) { crm_err("Ignoring watchdog fence device without " "stonith-watchdog-timeout set."); rv = -ENODEV; /* fall through to cleanup & return */ } else if (!pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT, STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) { crm_err("Ignoring watchdog fence device with unknown " "agent '%s' unequal '" STONITH_WATCHDOG_AGENT "'.", device->agent?device->agent:""); rv = -ENODEV; /* fall through to cleanup & return */ } else if (!pcmk__str_eq(device->id, STONITH_WATCHDOG_ID, pcmk__str_none)) { crm_err("Ignoring watchdog fence device " "named %s !='"STONITH_WATCHDOG_ID"'.", device->id?device->id:""); rv = -ENODEV; /* fall through to cleanup & return */ } else { if (pcmk__str_eq(device->agent, STONITH_WATCHDOG_AGENT, pcmk__str_none)) { /* this either has an empty list or the targets configured for watchdog-fencing */ g_list_free_full(stonith_watchdog_targets, free); stonith_watchdog_targets = device->targets; device->targets = NULL; } if (node_does_watchdog_fencing(stonith_our_uname)) { g_list_free_full(device->targets, free); device->targets = stonith__parse_targets(stonith_our_uname); g_hash_table_replace(device->params, strdup(PCMK_STONITH_HOST_LIST), strdup(stonith_our_uname)); /* proceed as with any other stonith-device */ break; } crm_debug("Skip registration of watchdog fence device on node not in host-list."); /* cleanup and fall through to more cleanup and return */ device->targets = NULL; stonith_device_remove(device->id, from_cib); } free_device(device); return rv; } while (0); dup = device_has_duplicate(device); if (dup) { ndevices = g_hash_table_size(device_list); crm_debug("Device '%s' already in device list (%d active device%s)", device->id, ndevices, pcmk__plural_s(ndevices)); free_device(device); device = dup; dup = g_hash_table_lookup(device_list, device->id); dup->dirty = FALSE; } else { stonith_device_t *old = g_hash_table_lookup(device_list, device->id); if (from_cib && old && old->api_registered) { /* If the cib is writing over an entry that is shared with a stonith client, * copy any pending ops that currently exist on the old entry to the new one. * Otherwise the pending ops will be reported as failures */ crm_info("Overwriting existing entry for %s from CIB", device->id); device->pending_ops = old->pending_ops; device->api_registered = TRUE; old->pending_ops = NULL; if (device->pending_ops) { mainloop_set_trigger(device->work); } } g_hash_table_replace(device_list, device->id, device); ndevices = g_hash_table_size(device_list); crm_notice("Added '%s' to device list (%d active device%s)", device->id, ndevices, pcmk__plural_s(ndevices)); } if (desc) { *desc = device->id; } if (from_cib) { device->cib_registered = TRUE; } else { device->api_registered = TRUE; } return pcmk_ok; } int stonith_device_remove(const char *id, gboolean from_cib) { stonith_device_t *device = g_hash_table_lookup(device_list, id); guint ndevices = 0; if (!device) { ndevices = g_hash_table_size(device_list); crm_info("Device '%s' not found (%d active device%s)", id, ndevices, pcmk__plural_s(ndevices)); return pcmk_ok; } if (from_cib) { device->cib_registered = FALSE; } else { device->verified = FALSE; device->api_registered = FALSE; } if (!device->cib_registered && !device->api_registered) { g_hash_table_remove(device_list, id); ndevices = g_hash_table_size(device_list); crm_info("Removed '%s' from device list (%d active device%s)", id, ndevices, pcmk__plural_s(ndevices)); } else { crm_trace("Not removing '%s' from device list (%d active) because " "still registered via:%s%s", id, g_hash_table_size(device_list), (device->cib_registered? " cib" : ""), (device->api_registered? " api" : "")); } return pcmk_ok; } /*! * \internal * \brief Return the number of stonith levels registered for a node * * \param[in] tp Node's topology table entry * * \return Number of non-NULL levels in topology entry * \note This function is used only for log messages. */ static int count_active_levels(stonith_topology_t * tp) { int lpc = 0; int count = 0; for (lpc = 0; lpc < ST_LEVEL_MAX; lpc++) { if (tp->levels[lpc] != NULL) { count++; } } return count; } static void free_topology_entry(gpointer data) { stonith_topology_t *tp = data; int lpc = 0; for (lpc = 0; lpc < ST_LEVEL_MAX; lpc++) { if (tp->levels[lpc] != NULL) { g_list_free_full(tp->levels[lpc], free); } } free(tp->target); free(tp->target_value); free(tp->target_pattern); free(tp->target_attribute); free(tp); } void free_topology_list(void) { if (topology != NULL) { g_hash_table_destroy(topology); topology = NULL; } } void init_topology_list(void) { if (topology == NULL) { topology = pcmk__strkey_table(NULL, free_topology_entry); } } char *stonith_level_key(xmlNode *level, int mode) { if(mode == -1) { mode = stonith_level_kind(level); } switch(mode) { case 0: return crm_element_value_copy(level, XML_ATTR_STONITH_TARGET); case 1: return crm_element_value_copy(level, XML_ATTR_STONITH_TARGET_PATTERN); case 2: { const char *name = crm_element_value(level, XML_ATTR_STONITH_TARGET_ATTRIBUTE); const char *value = crm_element_value(level, XML_ATTR_STONITH_TARGET_VALUE); if(name && value) { return crm_strdup_printf("%s=%s", name, value); } } default: return crm_strdup_printf("Unknown-%d-%s", mode, ID(level)); } } int stonith_level_kind(xmlNode * level) { int mode = 0; const char *target = crm_element_value(level, XML_ATTR_STONITH_TARGET); if(target == NULL) { mode++; target = crm_element_value(level, XML_ATTR_STONITH_TARGET_PATTERN); } if(stand_alone == FALSE && target == NULL) { mode++; if(crm_element_value(level, XML_ATTR_STONITH_TARGET_ATTRIBUTE) == NULL) { mode++; } else if(crm_element_value(level, XML_ATTR_STONITH_TARGET_VALUE) == NULL) { mode++; } } return mode; } static stonith_key_value_t * parse_device_list(const char *devices) { int lpc = 0; int max = 0; int last = 0; stonith_key_value_t *output = NULL; if (devices == NULL) { return output; } max = strlen(devices); for (lpc = 0; lpc <= max; lpc++) { if (devices[lpc] == ',' || devices[lpc] == 0) { char *line = strndup(devices + last, lpc - last); output = stonith_key_value_add(output, NULL, line); free(line); last = lpc + 1; } } return output; } /*! * \internal * \brief Register a STONITH level for a target * * Given an XML request specifying the target name, level index, and device IDs * for the level, this will create an entry for the target in the global topology * table if one does not already exist, then append the specified device IDs to * the entry's device list for the specified level. * * \param[in] msg XML request for STONITH level registration * \param[out] desc If not NULL, will be set to string representation ("TARGET[LEVEL]") * * \return pcmk_ok on success, -EINVAL if XML does not specify valid level index */ int stonith_level_register(xmlNode *msg, char **desc) { int id = 0; xmlNode *level; int mode; char *target; stonith_topology_t *tp; stonith_key_value_t *dIter = NULL; stonith_key_value_t *devices = NULL; /* Allow the XML here to point to the level tag directly, or wrapped in * another tag. If directly, don't search by xpath, because it might give * multiple hits (e.g. if the XML is the CIB). */ if (pcmk__str_eq(TYPE(msg), XML_TAG_FENCING_LEVEL, pcmk__str_casei)) { level = msg; } else { level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_ERR); } CRM_CHECK(level != NULL, return -EINVAL); mode = stonith_level_kind(level); target = stonith_level_key(level, mode); crm_element_value_int(level, XML_ATTR_STONITH_INDEX, &id); if (desc) { *desc = crm_strdup_printf("%s[%d]", target, id); } /* Sanity-check arguments */ if (mode >= 3 || (id <= 0) || (id >= ST_LEVEL_MAX)) { crm_trace("Could not add %s[%d] (%d) to the topology (%d active entries)", target, id, mode, g_hash_table_size(topology)); free(target); crm_log_xml_err(level, "Bad topology"); return -EINVAL; } /* Find or create topology table entry */ tp = g_hash_table_lookup(topology, target); if (tp == NULL) { tp = calloc(1, sizeof(stonith_topology_t)); tp->kind = mode; tp->target = target; tp->target_value = crm_element_value_copy(level, XML_ATTR_STONITH_TARGET_VALUE); tp->target_pattern = crm_element_value_copy(level, XML_ATTR_STONITH_TARGET_PATTERN); tp->target_attribute = crm_element_value_copy(level, XML_ATTR_STONITH_TARGET_ATTRIBUTE); g_hash_table_replace(topology, tp->target, tp); crm_trace("Added %s (%d) to the topology (%d active entries)", target, mode, g_hash_table_size(topology)); } else { free(target); } if (tp->levels[id] != NULL) { crm_info("Adding to the existing %s[%d] topology entry", tp->target, id); } devices = parse_device_list(crm_element_value(level, XML_ATTR_STONITH_DEVICES)); for (dIter = devices; dIter; dIter = dIter->next) { const char *device = dIter->value; crm_trace("Adding device '%s' for %s[%d]", device, tp->target, id); tp->levels[id] = g_list_append(tp->levels[id], strdup(device)); } stonith_key_value_freeall(devices, 1, 1); { int nlevels = count_active_levels(tp); crm_info("Target %s has %d active fencing level%s", tp->target, nlevels, pcmk__plural_s(nlevels)); } return pcmk_ok; } int stonith_level_remove(xmlNode *msg, char **desc) { int id = 0; stonith_topology_t *tp; char *target; /* Unlike additions, removal requests should always have one level tag */ xmlNode *level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_ERR); CRM_CHECK(level != NULL, return -EINVAL); target = stonith_level_key(level, -1); crm_element_value_int(level, XML_ATTR_STONITH_INDEX, &id); if (desc) { *desc = crm_strdup_printf("%s[%d]", target, id); } /* Sanity-check arguments */ if (id >= ST_LEVEL_MAX) { free(target); return -EINVAL; } tp = g_hash_table_lookup(topology, target); if (tp == NULL) { guint nentries = g_hash_table_size(topology); crm_info("No fencing topology found for %s (%d active %s)", target, nentries, pcmk__plural_alt(nentries, "entry", "entries")); } else if (id == 0 && g_hash_table_remove(topology, target)) { guint nentries = g_hash_table_size(topology); crm_info("Removed all fencing topology entries related to %s " "(%d active %s remaining)", target, nentries, pcmk__plural_alt(nentries, "entry", "entries")); } else if (id > 0 && tp->levels[id] != NULL) { guint nlevels; g_list_free_full(tp->levels[id], free); tp->levels[id] = NULL; nlevels = count_active_levels(tp); crm_info("Removed level %d from fencing topology for %s " "(%d active level%s remaining)", id, target, nlevels, pcmk__plural_s(nlevels)); } free(target); return pcmk_ok; } /*! * \internal * \brief Schedule an (asynchronous) action directly on a stonith device * * Handle a STONITH_OP_EXEC API message by scheduling a requested agent action * directly on a specified device. Only list, monitor, and status actions are * expected to use this call, though it should work with any agent command. * * \param[in] msg API message XML with desired action * \param[out] output Unused * * \return -EINPROGRESS on success, -errno otherwise * \note If the action is monitor, the device must be registered via the API * (CIB registration is not sufficient), because monitor should not be * possible unless the device is "started" (API registered). */ static char * list_to_string(GList *list, const char *delim, gboolean terminate_with_delim) { int max = g_list_length(list); size_t delim_len = delim?strlen(delim):0; size_t alloc_size = 1 + (max?((max-1+(terminate_with_delim?1:0))*delim_len):0); char *rv; GList *gIter; for (gIter = list; gIter != NULL; gIter = gIter->next) { const char *value = (const char *) gIter->data; alloc_size += strlen(value); } rv = calloc(alloc_size, sizeof(char)); if (rv) { char *pos = rv; const char *lead_delim = ""; for (gIter = list; gIter != NULL; gIter = gIter->next) { const char *value = (const char *) gIter->data; pos = &pos[sprintf(pos, "%s%s", lead_delim, value)]; lead_delim = delim; } if (max && terminate_with_delim) { sprintf(pos, "%s", delim); } } return rv; } static int stonith_device_action(xmlNode * msg, char **output) { xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, msg, LOG_ERR); xmlNode *op = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_ERR); const char *id = crm_element_value(dev, F_STONITH_DEVICE); const char *action = crm_element_value(op, F_STONITH_ACTION); async_command_t *cmd = NULL; stonith_device_t *device = NULL; if ((id == NULL) || (action == NULL)) { crm_info("Malformed API action request: device %s, action %s", (id? id : "not specified"), (action? action : "not specified")); return -EPROTO; } if (pcmk__str_eq(id, STONITH_WATCHDOG_ID, pcmk__str_none)) { if (stonith_watchdog_timeout_ms <= 0) { return -ENODEV; } else { if (pcmk__str_eq(action, "list", pcmk__str_casei)) { *output = list_to_string(stonith_watchdog_targets, "\n", TRUE); return pcmk_ok; } else if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { return pcmk_ok; } } } device = g_hash_table_lookup(device_list, id); if ((device == NULL) || (!device->api_registered && !strcmp(action, "monitor"))) { // Monitors may run only on "started" (API-registered) devices crm_info("Ignoring API '%s' action request because device %s not found", action, id); return -ENODEV; } cmd = create_async_command(msg); if (cmd == NULL) { return -EPROTO; } schedule_stonith_command(cmd, device); return -EINPROGRESS; } static void search_devices_record_result(struct device_search_s *search, const char *device, gboolean can_fence) { search->replies_received++; if (can_fence && device) { search->capable = g_list_append(search->capable, strdup(device)); } if (search->replies_needed == search->replies_received) { guint ndevices = g_list_length(search->capable); crm_debug("Search found %d device%s that can perform '%s' targeting %s", ndevices, pcmk__plural_s(ndevices), (search->action? search->action : "unknown action"), (search->host? search->host : "any node")); search->callback(search->capable, search->user_data); free(search->host); free(search->action); free(search); } } /*! * \internal * \brief Check whether the local host is allowed to execute a fencing action * * \param[in] device Fence device to check * \param[in] action Fence action to check * \param[in] target Hostname of fence target * \param[in] allow_suicide Whether self-fencing is allowed for this operation * * \return TRUE if local host is allowed to execute action, FALSE otherwise */ static gboolean localhost_is_eligible(const stonith_device_t *device, const char *action, const char *target, gboolean allow_suicide) { gboolean localhost_is_target = pcmk__str_eq(target, stonith_our_uname, pcmk__str_casei); if (device && action && device->on_target_actions && strstr(device->on_target_actions, action)) { if (!localhost_is_target) { crm_trace("Operation '%s' using %s can only be executed for " "local host, not %s", action, device->id, target); return FALSE; } } else if (localhost_is_target && !allow_suicide) { crm_trace("'%s' operation does not support self-fencing", action); return FALSE; } return TRUE; } static void can_fence_host_with_device(stonith_device_t * dev, struct device_search_s *search) { gboolean can = FALSE; const char *check_type = NULL; const char *host = search->host; const char *alias = NULL; CRM_LOG_ASSERT(dev != NULL); if (dev == NULL) { goto search_report_results; } else if (host == NULL) { can = TRUE; goto search_report_results; } /* Short-circuit query if this host is not allowed to perform the action */ if (pcmk__str_eq(search->action, "reboot", pcmk__str_casei)) { /* A "reboot" *might* get remapped to "off" then "on", so short-circuit * only if all three are disallowed. If only one or two are disallowed, * we'll report that with the results. We never allow suicide for * remapped "on" operations because the host is off at that point. */ if (!localhost_is_eligible(dev, "reboot", host, search->allow_suicide) && !localhost_is_eligible(dev, "off", host, search->allow_suicide) && !localhost_is_eligible(dev, "on", host, FALSE)) { goto search_report_results; } } else if (!localhost_is_eligible(dev, search->action, host, search->allow_suicide)) { goto search_report_results; } alias = g_hash_table_lookup(dev->aliases, host); if (alias == NULL) { alias = host; } check_type = target_list_type(dev); if (pcmk__str_eq(check_type, "none", pcmk__str_casei)) { can = TRUE; } else if (pcmk__str_eq(check_type, "static-list", pcmk__str_casei)) { /* Presence in the hostmap is sufficient * Only use if all hosts on which the device can be active can always fence all listed hosts */ if (pcmk__str_in_list(host, dev->targets, pcmk__str_casei)) { can = TRUE; } else if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_MAP) && g_hash_table_lookup(dev->aliases, host)) { can = TRUE; } } else if (pcmk__str_eq(check_type, "dynamic-list", pcmk__str_casei)) { time_t now = time(NULL); if (dev->targets == NULL || dev->targets_age + 60 < now) { crm_trace("Running '%s' to check whether %s is eligible to fence %s (%s)", check_type, dev->id, search->host, search->action); schedule_internal_command(__func__, dev, "list", NULL, search->per_device_timeout, search, dynamic_list_search_cb); /* we'll respond to this search request async in the cb */ return; } if (pcmk__str_in_list(alias, dev->targets, pcmk__str_casei)) { can = TRUE; } } else if (pcmk__str_eq(check_type, "status", pcmk__str_casei)) { crm_trace("Running '%s' to check whether %s is eligible to fence %s (%s)", check_type, dev->id, search->host, search->action); schedule_internal_command(__func__, dev, "status", search->host, search->per_device_timeout, search, status_search_cb); /* we'll respond to this search request async in the cb */ return; } else { crm_err("Invalid value for " PCMK_STONITH_HOST_CHECK ": %s", check_type); check_type = "Invalid " PCMK_STONITH_HOST_CHECK; } if (pcmk__str_eq(host, alias, pcmk__str_casei)) { crm_notice("%s is%s eligible to fence (%s) %s: %s", dev->id, (can? "" : " not"), search->action, host, check_type); } else { crm_notice("%s is%s eligible to fence (%s) %s (aka. '%s'): %s", dev->id, (can? "" : " not"), search->action, host, alias, check_type); } search_report_results: search_devices_record_result(search, dev ? dev->id : NULL, can); } static void search_devices(gpointer key, gpointer value, gpointer user_data) { stonith_device_t *dev = value; struct device_search_s *search = user_data; can_fence_host_with_device(dev, search); } #define DEFAULT_QUERY_TIMEOUT 20 static void get_capable_devices(const char *host, const char *action, int timeout, bool suicide, void *user_data, void (*callback) (GList * devices, void *user_data)) { struct device_search_s *search; int per_device_timeout = DEFAULT_QUERY_TIMEOUT; int devices_needing_async_query = 0; char *key = NULL; const char *check_type = NULL; GHashTableIter gIter; stonith_device_t *device = NULL; guint ndevices = g_hash_table_size(device_list); if (ndevices == 0) { callback(NULL, user_data); return; } search = calloc(1, sizeof(struct device_search_s)); if (!search) { callback(NULL, user_data); return; } g_hash_table_iter_init(&gIter, device_list); while (g_hash_table_iter_next(&gIter, (void **)&key, (void **)&device)) { check_type = target_list_type(device); if (pcmk__strcase_any_of(check_type, "status", "dynamic-list", NULL)) { devices_needing_async_query++; } } /* If we have devices that require an async event in order to know what * nodes they can fence, we have to give the events a timeout. The total * query timeout is divided among those events. */ if (devices_needing_async_query) { per_device_timeout = timeout / devices_needing_async_query; if (!per_device_timeout) { crm_err("Fencing timeout %ds is too low; using %ds, " "but consider raising to at least %ds", timeout, DEFAULT_QUERY_TIMEOUT, DEFAULT_QUERY_TIMEOUT * devices_needing_async_query); per_device_timeout = DEFAULT_QUERY_TIMEOUT; } else if (per_device_timeout < DEFAULT_QUERY_TIMEOUT) { crm_notice("Fencing timeout %ds is low for the current " "configuration; consider raising to at least %ds", timeout, DEFAULT_QUERY_TIMEOUT * devices_needing_async_query); } } search->host = host ? strdup(host) : NULL; search->action = action ? strdup(action) : NULL; search->per_device_timeout = per_device_timeout; /* We are guaranteed this many replies. Even if a device gets * unregistered some how during the async search, we will get * the correct number of replies. */ search->replies_needed = ndevices; search->allow_suicide = suicide; search->callback = callback; search->user_data = user_data; /* kick off the search */ crm_debug("Searching %d device%s to see which can execute '%s' targeting %s", ndevices, pcmk__plural_s(ndevices), (search->action? search->action : "unknown action"), (search->host? search->host : "any node")); g_hash_table_foreach(device_list, search_devices, search); } struct st_query_data { xmlNode *reply; char *remote_peer; char *client_id; char *target; char *action; int call_options; }; /*! * \internal * \brief Add action-specific attributes to query reply XML * * \param[in,out] xml XML to add attributes to * \param[in] action Fence action * \param[in] device Fence device */ static void add_action_specific_attributes(xmlNode *xml, const char *action, stonith_device_t *device) { int action_specific_timeout; int delay_max; int delay_base; CRM_CHECK(xml && action && device, return); if (is_action_required(action, device)) { crm_trace("Action '%s' is required using %s", action, device->id); crm_xml_add_int(xml, F_STONITH_DEVICE_REQUIRED, 1); } action_specific_timeout = get_action_timeout(device, action, 0); if (action_specific_timeout) { crm_trace("Action '%s' has timeout %dms using %s", action, action_specific_timeout, device->id); crm_xml_add_int(xml, F_STONITH_ACTION_TIMEOUT, action_specific_timeout); } delay_max = get_action_delay_max(device, action); if (delay_max > 0) { crm_trace("Action '%s' has maximum random delay %dms using %s", action, delay_max, device->id); crm_xml_add_int(xml, F_STONITH_DELAY_MAX, delay_max / 1000); } delay_base = get_action_delay_base(device, action); if (delay_base > 0) { crm_xml_add_int(xml, F_STONITH_DELAY_BASE, delay_base / 1000); } if ((delay_max > 0) && (delay_base == 0)) { crm_trace("Action '%s' has maximum random delay %dms using %s", action, delay_max, device->id); } else if ((delay_max == 0) && (delay_base > 0)) { crm_trace("Action '%s' has a static delay of %dms using %s", action, delay_base, device->id); } else if ((delay_max > 0) && (delay_base > 0)) { crm_trace("Action '%s' has a minimum delay of %dms and a randomly chosen " "maximum delay of %dms using %s", action, delay_base, delay_max, device->id); } } /*! * \internal * \brief Add "disallowed" attribute to query reply XML if appropriate * * \param[in,out] xml XML to add attribute to * \param[in] action Fence action * \param[in] device Fence device * \param[in] target Fence target * \param[in] allow_suicide Whether self-fencing is allowed */ static void add_disallowed(xmlNode *xml, const char *action, stonith_device_t *device, const char *target, gboolean allow_suicide) { if (!localhost_is_eligible(device, action, target, allow_suicide)) { crm_trace("Action '%s' using %s is disallowed for local host", action, device->id); crm_xml_add(xml, F_STONITH_ACTION_DISALLOWED, XML_BOOLEAN_TRUE); } } /*! * \internal * \brief Add child element with action-specific values to query reply XML * * \param[in,out] xml XML to add attribute to * \param[in] action Fence action * \param[in] device Fence device * \param[in] target Fence target * \param[in] allow_suicide Whether self-fencing is allowed */ static void add_action_reply(xmlNode *xml, const char *action, stonith_device_t *device, const char *target, gboolean allow_suicide) { xmlNode *child = create_xml_node(xml, F_STONITH_ACTION); crm_xml_add(child, XML_ATTR_ID, action); add_action_specific_attributes(child, action, device); add_disallowed(child, action, device, target, allow_suicide); } static void stonith_query_capable_device_cb(GList * devices, void *user_data) { struct st_query_data *query = user_data; int available_devices = 0; xmlNode *dev = NULL; xmlNode *list = NULL; GList *lpc = NULL; /* Pack the results into XML */ list = create_xml_node(NULL, __func__); crm_xml_add(list, F_STONITH_TARGET, query->target); for (lpc = devices; lpc != NULL; lpc = lpc->next) { stonith_device_t *device = g_hash_table_lookup(device_list, lpc->data); const char *action = query->action; if (!device) { /* It is possible the device got unregistered while * determining who can fence the target */ continue; } available_devices++; dev = create_xml_node(list, F_STONITH_DEVICE); crm_xml_add(dev, XML_ATTR_ID, device->id); crm_xml_add(dev, "namespace", device->namespace); crm_xml_add(dev, "agent", device->agent); crm_xml_add_int(dev, F_STONITH_DEVICE_VERIFIED, device->verified); /* If the originating fencer wants to reboot the node, and we have a * capable device that doesn't support "reboot", remap to "off" instead. */ if (!pcmk_is_set(device->flags, st_device_supports_reboot) && pcmk__str_eq(query->action, "reboot", pcmk__str_casei)) { crm_trace("%s doesn't support reboot, using values for off instead", device->id); action = "off"; } /* Add action-specific values if available */ add_action_specific_attributes(dev, action, device); if (pcmk__str_eq(query->action, "reboot", pcmk__str_casei)) { /* A "reboot" *might* get remapped to "off" then "on", so after * sending the "reboot"-specific values in the main element, we add * sub-elements for "off" and "on" values. * * We short-circuited earlier if "reboot", "off" and "on" are all * disallowed for the local host. However if only one or two are * disallowed, we send back the results and mark which ones are * disallowed. If "reboot" is disallowed, this might cause problems * with older fencer versions, which won't check for it. Older * versions will ignore "off" and "on", so they are not a problem. */ add_disallowed(dev, action, device, query->target, pcmk_is_set(query->call_options, st_opt_allow_suicide)); add_action_reply(dev, "off", device, query->target, pcmk_is_set(query->call_options, st_opt_allow_suicide)); add_action_reply(dev, "on", device, query->target, FALSE); } /* A query without a target wants device parameters */ if (query->target == NULL) { xmlNode *attrs = create_xml_node(dev, XML_TAG_ATTRS); g_hash_table_foreach(device->params, hash2field, attrs); } } crm_xml_add_int(list, F_STONITH_AVAILABLE_DEVICES, available_devices); if (query->target) { crm_debug("Found %d matching device%s for target '%s'", available_devices, pcmk__plural_s(available_devices), query->target); } else { crm_debug("%d device%s installed", available_devices, pcmk__plural_s(available_devices)); } if (list != NULL) { crm_log_xml_trace(list, "Add query results"); add_message_xml(query->reply, F_STONITH_CALLDATA, list); } stonith_send_reply(query->reply, query->call_options, query->remote_peer, query->client_id); free_xml(query->reply); free(query->remote_peer); free(query->client_id); free(query->target); free(query->action); free(query); free_xml(list); g_list_free_full(devices, free); } static void stonith_query(xmlNode * msg, const char *remote_peer, const char *client_id, int call_options) { struct st_query_data *query = NULL; const char *action = NULL; const char *target = NULL; int timeout = 0; xmlNode *dev = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_NEVER); crm_element_value_int(msg, F_STONITH_TIMEOUT, &timeout); if (dev) { const char *device = crm_element_value(dev, F_STONITH_DEVICE); target = crm_element_value(dev, F_STONITH_TARGET); action = crm_element_value(dev, F_STONITH_ACTION); if (device && pcmk__str_eq(device, "manual_ack", pcmk__str_casei)) { /* No query or reply necessary */ return; } } crm_log_xml_debug(msg, "Query"); query = calloc(1, sizeof(struct st_query_data)); query->reply = stonith_construct_reply(msg, NULL, NULL, pcmk_ok); query->remote_peer = remote_peer ? strdup(remote_peer) : NULL; query->client_id = client_id ? strdup(client_id) : NULL; query->target = target ? strdup(target) : NULL; query->action = action ? strdup(action) : NULL; query->call_options = call_options; get_capable_devices(target, action, timeout, pcmk_is_set(call_options, st_opt_allow_suicide), query, stonith_query_capable_device_cb); } #define ST_LOG_OUTPUT_MAX 512 static void log_operation(async_command_t * cmd, int rc, int pid, const char *next, const char *output, gboolean op_merged) { if (rc == 0) { next = NULL; } if (cmd->victim != NULL) { do_crm_log(((rc == 0)? LOG_NOTICE : LOG_ERR), "Operation '%s' [%d] (%scall %d from %s) targeting %s " "using %s returned %d (%s)%s%s", cmd->action, pid, (op_merged? "merged " : ""), cmd->id, cmd->client_name, cmd->victim, cmd->device, rc, pcmk_strerror(rc), (next? ", retrying with " : ""), (next ? next : "")); } else { do_crm_log_unlikely(((rc == 0)? LOG_DEBUG : LOG_NOTICE), "Operation '%s' [%d]%s using %s returned %d (%s)%s%s", cmd->action, pid, (op_merged? " (merged)" : ""), cmd->device, rc, pcmk_strerror(rc), (next? ", retrying with " : ""), (next ? next : "")); } if (output) { // Output may have multiple lines char *prefix = crm_strdup_printf("%s[%d]", cmd->device, pid); crm_log_output(rc == 0 ? LOG_DEBUG : LOG_WARNING, prefix, output); free(prefix); } } static void stonith_send_async_reply(async_command_t * cmd, const char *output, int rc, GPid pid, int options) { xmlNode *reply = NULL; gboolean bcast = FALSE; reply = stonith_construct_async_reply(cmd, output, NULL, rc); if (pcmk__str_eq(cmd->action, "metadata", pcmk__str_casei)) { /* Too verbose to log */ crm_trace("Metadata query for %s", cmd->device); output = NULL; } else if (pcmk__str_any_of(cmd->action, "monitor", "list", "status", NULL)) { crm_trace("Never broadcast '%s' replies", cmd->action); } else if (!stand_alone && pcmk__str_eq(cmd->origin, cmd->victim, pcmk__str_casei) && !pcmk__str_eq(cmd->action, "on", pcmk__str_casei)) { crm_trace("Broadcast '%s' reply for %s", cmd->action, cmd->victim); crm_xml_add(reply, F_SUBTYPE, "broadcast"); bcast = TRUE; } log_operation(cmd, rc, pid, NULL, output, (options & st_reply_opt_merged ? TRUE : FALSE)); crm_log_xml_trace(reply, "Reply"); if (options & st_reply_opt_merged) { crm_xml_add(reply, F_STONITH_MERGED, "true"); } if (bcast) { crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); } else if (cmd->origin) { crm_trace("Directed reply to %s", cmd->origin); send_cluster_message(crm_get_peer(0, cmd->origin), crm_msg_stonith_ng, reply, FALSE); } else { crm_trace("Directed local %ssync reply to %s", (cmd->options & st_opt_sync_call) ? "" : "a-", cmd->client_name); do_local_reply(reply, cmd->client, cmd->options & st_opt_sync_call, FALSE); } if (stand_alone) { /* Do notification with a clean data object */ xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); crm_xml_add_int(notify_data, F_STONITH_RC, rc); crm_xml_add(notify_data, F_STONITH_TARGET, cmd->victim); crm_xml_add(notify_data, F_STONITH_OPERATION, cmd->op); crm_xml_add(notify_data, F_STONITH_DELEGATE, "localhost"); crm_xml_add(notify_data, F_STONITH_DEVICE, cmd->device); crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data); do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); } free_xml(reply); } static void cancel_stonith_command(async_command_t * cmd) { stonith_device_t *device; CRM_CHECK(cmd != NULL, return); if (!cmd->device) { return; } device = g_hash_table_lookup(device_list, cmd->device); if (device) { crm_trace("Cancel scheduled '%s' action using %s", cmd->action, device->id); device->pending_ops = g_list_remove(device->pending_ops, cmd); } } static void st_child_done(GPid pid, int rc, const char *output, gpointer user_data) { stonith_device_t *device = NULL; stonith_device_t *next_device = NULL; async_command_t *cmd = user_data; GList *gIter = NULL; GList *gIterNext = NULL; CRM_CHECK(cmd != NULL, return); cmd->active_on = NULL; /* The device is ready to do something else now */ device = g_hash_table_lookup(device_list, cmd->device); if (device) { if (!device->verified && (rc == pcmk_ok) && (pcmk__strcase_any_of(cmd->action, "list", "monitor", "status", NULL))) { device->verified = TRUE; } mainloop_set_trigger(device->work); } crm_debug("Operation '%s' using %s returned %d (%d devices remaining)", cmd->action, cmd->device, rc, g_list_length(cmd->device_next)); if (rc == 0) { GList *iter; /* see if there are any required devices left to execute for this op */ for (iter = cmd->device_next; iter != NULL; iter = iter->next) { next_device = g_hash_table_lookup(device_list, iter->data); if (next_device != NULL && is_action_required(cmd->action, next_device)) { cmd->device_next = iter->next; break; } next_device = NULL; } } else if (rc != 0 && cmd->device_next && (is_action_required(cmd->action, device) == FALSE)) { /* if this device didn't work out, see if there are any others we can try. * if the failed device was 'required', we can't pick another device. */ next_device = g_hash_table_lookup(device_list, cmd->device_next->data); cmd->device_next = cmd->device_next->next; } /* this operation requires more fencing, hooray! */ if (next_device) { log_operation(cmd, rc, pid, next_device->id, output, FALSE); schedule_stonith_command(cmd, next_device); /* Prevent cmd from being freed */ cmd = NULL; goto done; } stonith_send_async_reply(cmd, output, rc, pid, st_reply_opt_none); if (rc != 0) { goto done; } /* Check to see if any operations are scheduled to do the exact * same thing that just completed. If so, rather than * performing the same fencing operation twice, return the result * of this operation for all pending commands it matches. */ for (gIter = cmd_list; gIter != NULL; gIter = gIterNext) { async_command_t *cmd_other = gIter->data; gIterNext = gIter->next; if (cmd == cmd_other) { continue; } /* A pending scheduled command matches the command that just finished if. * 1. The client connections are different. * 2. The node victim is the same. * 3. The fencing action is the same. * 4. The device scheduled to execute the action is the same. */ if (pcmk__str_eq(cmd->client, cmd_other->client, pcmk__str_casei) || !pcmk__str_eq(cmd->victim, cmd_other->victim, pcmk__str_casei) || !pcmk__str_eq(cmd->action, cmd_other->action, pcmk__str_casei) || !pcmk__str_eq(cmd->device, cmd_other->device, pcmk__str_casei)) { continue; } /* Duplicate merging will do the right thing for either type of remapped * reboot. If the executing fencer remapped an unsupported reboot to * off, then cmd->action will be reboot and will be merged with any * other reboot requests. If the originating fencer remapped a * topology reboot to off then on, we will get here once with * cmd->action "off" and once with "on", and they will be merged * separately with similar requests. */ crm_notice("Merging fencing action '%s' targeting %s originating from " "client %s with identical fencing request from client %s", cmd_other->action, cmd_other->victim, cmd_other->client_name, cmd->client_name); cmd_list = g_list_remove_link(cmd_list, gIter); stonith_send_async_reply(cmd_other, output, rc, pid, st_reply_opt_merged); cancel_stonith_command(cmd_other); free_async_command(cmd_other); g_list_free_1(gIter); } done: free_async_command(cmd); } static gint sort_device_priority(gconstpointer a, gconstpointer b) { const stonith_device_t *dev_a = a; const stonith_device_t *dev_b = b; if (dev_a->priority > dev_b->priority) { return -1; } else if (dev_a->priority < dev_b->priority) { return 1; } return 0; } static void stonith_fence_get_devices_cb(GList * devices, void *user_data) { async_command_t *cmd = user_data; stonith_device_t *device = NULL; guint ndevices = g_list_length(devices); crm_info("Found %d matching device%s for target '%s'", ndevices, pcmk__plural_s(ndevices), cmd->victim); if (devices != NULL) { /* Order based on priority */ devices = g_list_sort(devices, sort_device_priority); device = g_hash_table_lookup(device_list, devices->data); if (device) { cmd->device_list = devices; cmd->device_next = devices->next; devices = NULL; /* list owned by cmd now */ } } /* we have a device, schedule it for fencing. */ if (device) { schedule_stonith_command(cmd, device); /* in progress */ return; } /* no device found! */ stonith_send_async_reply(cmd, NULL, -ENODEV, 0, st_reply_opt_none); free_async_command(cmd); g_list_free_full(devices, free); } static int stonith_fence(xmlNode * msg) { const char *device_id = NULL; stonith_device_t *device = NULL; async_command_t *cmd = create_async_command(msg); xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR); if (cmd == NULL) { return -EPROTO; } device_id = crm_element_value(dev, F_STONITH_DEVICE); if (device_id) { device = g_hash_table_lookup(device_list, device_id); if (device == NULL) { crm_err("Requested device '%s' is not available", device_id); return -ENODEV; } schedule_stonith_command(cmd, device); } else { const char *host = crm_element_value(dev, F_STONITH_TARGET); if (cmd->options & st_opt_cs_nodeid) { int nodeid; crm_node_t *node; pcmk__scan_min_int(host, &nodeid, 0); node = pcmk__search_known_node_cache(nodeid, NULL, CRM_GET_PEER_ANY); if (node) { host = node->uname; } } /* If we get to here, then self-fencing is implicitly allowed */ get_capable_devices(host, cmd->action, cmd->default_timeout, TRUE, cmd, stonith_fence_get_devices_cb); } return -EINPROGRESS; } xmlNode * stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, int rc) { xmlNode *reply = NULL; reply = create_xml_node(NULL, T_STONITH_REPLY); crm_xml_add(reply, "st_origin", __func__); crm_xml_add(reply, F_TYPE, T_STONITH_NG); crm_xml_add(reply, "st_output", output); crm_xml_add_int(reply, F_STONITH_RC, rc); if (request == NULL) { /* Most likely, this is the result of a stonith operation that was * initiated before we came up. Unfortunately that means we lack enough * information to provide clients with a full result. * * @TODO Maybe synchronize this information at start-up? */ crm_warn("Missing request information for client notifications for " "operation with result %d (initiated before we came up?)", rc); } else { const char *name = NULL; const char *value = NULL; const char *names[] = { F_STONITH_OPERATION, F_STONITH_CALLID, F_STONITH_CLIENTID, F_STONITH_CLIENTNAME, F_STONITH_REMOTE_OP_ID, F_STONITH_CALLOPTS }; crm_trace("Creating a result reply with%s reply output (rc=%d)", (data? "" : "out"), rc); for (int lpc = 0; lpc < PCMK__NELEM(names); lpc++) { name = names[lpc]; value = crm_element_value(request, name); crm_xml_add(reply, name, value); } if (data != NULL) { add_message_xml(reply, F_STONITH_CALLDATA, data); } } return reply; } static xmlNode * stonith_construct_async_reply(async_command_t * cmd, const char *output, xmlNode * data, int rc) { xmlNode *reply = NULL; crm_trace("Creating a basic reply"); reply = create_xml_node(NULL, T_STONITH_REPLY); crm_xml_add(reply, "st_origin", __func__); crm_xml_add(reply, F_TYPE, T_STONITH_NG); crm_xml_add(reply, F_STONITH_OPERATION, cmd->op); crm_xml_add(reply, F_STONITH_DEVICE, cmd->device); crm_xml_add(reply, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); crm_xml_add(reply, F_STONITH_CLIENTID, cmd->client); crm_xml_add(reply, F_STONITH_CLIENTNAME, cmd->client_name); crm_xml_add(reply, F_STONITH_TARGET, cmd->victim); crm_xml_add(reply, F_STONITH_ACTION, cmd->op); crm_xml_add(reply, F_STONITH_ORIGIN, cmd->origin); crm_xml_add_int(reply, F_STONITH_CALLID, cmd->id); crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options); crm_xml_add_int(reply, F_STONITH_RC, rc); crm_xml_add(reply, "st_output", output); if (data != NULL) { crm_info("Attaching reply output"); add_message_xml(reply, F_STONITH_CALLDATA, data); } return reply; } bool fencing_peer_active(crm_node_t *peer) { if (peer == NULL) { return FALSE; } else if (peer->uname == NULL) { return FALSE; } else if (pcmk_is_set(peer->processes, crm_get_cluster_proc())) { return TRUE; } return FALSE; } /*! * \internal * \brief Determine if we need to use an alternate node to * fence the target. If so return that node's uname * * \retval NULL, no alternate host * \retval uname, uname of alternate host to use */ static const char * check_alternate_host(const char *target) { const char *alternate_host = NULL; crm_trace("Checking if we (%s) can fence %s", stonith_our_uname, target); if (find_topology_for_host(target) && pcmk__str_eq(target, stonith_our_uname, pcmk__str_casei)) { GHashTableIter gIter; crm_node_t *entry = NULL; g_hash_table_iter_init(&gIter, crm_peer_cache); while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { crm_trace("Checking for %s.%d != %s", entry->uname, entry->id, target); if (fencing_peer_active(entry) && !pcmk__str_eq(entry->uname, target, pcmk__str_casei)) { alternate_host = entry->uname; break; } } if (alternate_host == NULL) { crm_err("No alternate host available to handle request " "for self-fencing with topology"); g_hash_table_iter_init(&gIter, crm_peer_cache); while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { crm_notice("Peer[%d] %s", entry->id, entry->uname); } } } return alternate_host; } static void stonith_send_reply(xmlNode * reply, int call_options, const char *remote_peer, const char *client_id) { if (remote_peer) { send_cluster_message(crm_get_peer(0, remote_peer), crm_msg_stonith_ng, reply, FALSE); } else { do_local_reply(reply, client_id, pcmk_is_set(call_options, st_opt_sync_call), (remote_peer != NULL)); } } static void remove_relay_op(xmlNode * request) { xmlNode *dev = get_xpath_object("//@" F_STONITH_ACTION, request, LOG_TRACE); const char *relay_op_id = NULL; const char *op_id = NULL; const char *client_name = NULL; const char *target = NULL; remote_fencing_op_t *relay_op = NULL; if (dev) { target = crm_element_value(dev, F_STONITH_TARGET); } relay_op_id = crm_element_value(request, F_STONITH_REMOTE_OP_ID_RELAY); op_id = crm_element_value(request, F_STONITH_REMOTE_OP_ID); client_name = crm_element_value(request, F_STONITH_CLIENTNAME); /* Delete RELAY operation. */ if (relay_op_id && target && pcmk__str_eq(target, stonith_our_uname, pcmk__str_casei)) { relay_op = g_hash_table_lookup(stonith_remote_op_list, relay_op_id); if (relay_op) { GHashTableIter iter; remote_fencing_op_t *list_op = NULL; g_hash_table_iter_init(&iter, stonith_remote_op_list); /* If the operation to be deleted is registered as a duplicate, delete the registration. */ while (g_hash_table_iter_next(&iter, NULL, (void **)&list_op)) { GList *dup_iter = NULL; if (list_op != relay_op) { for (dup_iter = list_op->duplicates; dup_iter != NULL; dup_iter = dup_iter->next) { remote_fencing_op_t *other = dup_iter->data; if (other == relay_op) { other->duplicates = g_list_remove(other->duplicates, relay_op); break; } } } } crm_debug("Deleting relay op %s ('%s' targeting %s for %s), " "replaced by op %s ('%s' targeting %s for %s)", relay_op->id, relay_op->action, relay_op->target, relay_op->client_name, op_id, relay_op->action, target, client_name); g_hash_table_remove(stonith_remote_op_list, relay_op_id); } } } static int handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, xmlNode *request, const char *remote_peer) { int call_options = 0; int rc = -EOPNOTSUPP; xmlNode *data = NULL; xmlNode *reply = NULL; char *output = NULL; const char *op = crm_element_value(request, F_STONITH_OPERATION); const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); /* IPC commands related to fencing configuration may be done only by * privileged users (i.e. root or hacluster), because all other users should * go through the CIB to have ACLs applied. * * If no client was given, this is a peer request, which is always allowed. */ bool allowed = (client == NULL) || pcmk_is_set(client->flags, pcmk__client_privileged); crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); if (pcmk_is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(client == NULL || client->request_id == id); } if (pcmk__str_eq(op, CRM_OP_REGISTER, pcmk__str_none)) { xmlNode *reply = create_xml_node(NULL, "reply"); CRM_ASSERT(client); crm_xml_add(reply, F_STONITH_OPERATION, CRM_OP_REGISTER); crm_xml_add(reply, F_STONITH_CLIENTID, client->id); pcmk__ipc_send_xml(client, id, reply, flags); client->request_id = 0; free_xml(reply); return 0; } else if (pcmk__str_eq(op, STONITH_OP_EXEC, pcmk__str_none)) { rc = stonith_device_action(request, &output); } else if (pcmk__str_eq(op, STONITH_OP_TIMEOUT_UPDATE, pcmk__str_none)) { const char *call_id = crm_element_value(request, F_STONITH_CALLID); const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); int op_timeout = 0; crm_element_value_int(request, F_STONITH_TIMEOUT, &op_timeout); do_stonith_async_timeout_update(client_id, call_id, op_timeout); return 0; } else if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { if (remote_peer) { create_remote_stonith_op(client_id, request, TRUE); /* Record it for the future notification */ } /* Delete the DC node RELAY operation. */ remove_relay_op(request); stonith_query(request, remote_peer, client_id, call_options); return 0; } else if (pcmk__str_eq(op, T_STONITH_NOTIFY, pcmk__str_none)) { const char *flag_name = NULL; CRM_ASSERT(client); flag_name = crm_element_value(request, F_STONITH_NOTIFY_ACTIVATE); if (flag_name) { crm_debug("Enabling %s callbacks for client %s", flag_name, pcmk__client_name(client)); pcmk__set_client_flags(client, get_stonith_flag(flag_name)); } flag_name = crm_element_value(request, F_STONITH_NOTIFY_DEACTIVATE); if (flag_name) { crm_debug("Disabling %s callbacks for client %s", flag_name, pcmk__client_name(client)); pcmk__clear_client_flags(client, get_stonith_flag(flag_name)); } pcmk__ipc_send_ack(client, id, flags, "ack", CRM_EX_OK); return 0; } else if (pcmk__str_eq(op, STONITH_OP_RELAY, pcmk__str_none)) { xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE); crm_notice("Received forwarded fencing request from " "%s %s to fence (%s) peer %s", ((client == NULL)? "peer" : "client"), ((client == NULL)? remote_peer : pcmk__client_name(client)), crm_element_value(dev, F_STONITH_ACTION), crm_element_value(dev, F_STONITH_TARGET)); if (initiate_remote_stonith_op(NULL, request, FALSE) != NULL) { rc = -EINPROGRESS; } } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { if (remote_peer || stand_alone) { rc = stonith_fence(request); } else if (call_options & st_opt_manual_ack) { remote_fencing_op_t *rop = NULL; xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE); const char *target = crm_element_value(dev, F_STONITH_TARGET); crm_notice("Received manual confirmation that %s is fenced", target); rop = initiate_remote_stonith_op(client, request, TRUE); rc = stonith_manual_ack(request, rop); } else { const char *alternate_host = NULL; xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE); const char *target = crm_element_value(dev, F_STONITH_TARGET); const char *action = crm_element_value(dev, F_STONITH_ACTION); const char *device = crm_element_value(dev, F_STONITH_DEVICE); if (client) { int tolerance = 0; crm_notice("Client %s wants to fence (%s) %s using %s", pcmk__client_name(client), action, target, (device? device : "any device")); crm_element_value_int(dev, F_STONITH_TOLERANCE, &tolerance); if (stonith_check_fence_tolerance(tolerance, target, action)) { rc = 0; goto done; } } else { crm_notice("Peer %s wants to fence (%s) '%s' with device '%s'", remote_peer, action, target, device ? device : "(any)"); } alternate_host = check_alternate_host(target); if (alternate_host && client) { const char *client_id = NULL; remote_fencing_op_t *op = NULL; crm_notice("Forwarding self-fencing request to peer %s" "due to topology", alternate_host); if (client->id) { client_id = client->id; } else { client_id = crm_element_value(request, F_STONITH_CLIENTID); } /* Create an operation for RELAY and send the ID in the RELAY message. */ /* When a QUERY response is received, delete the RELAY operation to avoid the existence of duplicate operations. */ op = create_remote_stonith_op(client_id, request, FALSE); crm_xml_add(request, F_STONITH_OPERATION, STONITH_OP_RELAY); crm_xml_add(request, F_STONITH_CLIENTID, client->id); crm_xml_add(request, F_STONITH_REMOTE_OP_ID, op->id); send_cluster_message(crm_get_peer(0, alternate_host), crm_msg_stonith_ng, request, FALSE); rc = -EINPROGRESS; } else if (initiate_remote_stonith_op(client, request, FALSE) != NULL) { rc = -EINPROGRESS; } } } else if (pcmk__str_eq(op, STONITH_OP_FENCE_HISTORY, pcmk__str_none)) { rc = stonith_fence_history(request, &data, remote_peer, call_options); if (call_options & st_opt_discard_reply) { /* we don't expect answers to the broadcast * we might have sent out */ free_xml(data); return pcmk_ok; } } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_ADD, pcmk__str_none)) { const char *device_id = NULL; if (allowed) { rc = stonith_device_register(request, &device_id, FALSE); } else { rc = -EACCES; } do_stonith_notify_device(call_options, op, rc, device_id); } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_DEL, pcmk__str_none)) { xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); const char *device_id = crm_element_value(dev, XML_ATTR_ID); if (allowed) { rc = stonith_device_remove(device_id, FALSE); } else { rc = -EACCES; } do_stonith_notify_device(call_options, op, rc, device_id); } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { char *device_id = NULL; if (allowed) { rc = stonith_level_register(request, &device_id); } else { rc = -EACCES; } do_stonith_notify_level(call_options, op, rc, device_id); free(device_id); } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { char *device_id = NULL; if (allowed) { rc = stonith_level_remove(request, &device_id); } else { rc = -EACCES; } do_stonith_notify_level(call_options, op, rc, device_id); } else if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { int node_id = 0; const char *name = NULL; crm_element_value_int(request, XML_ATTR_ID, &node_id); name = crm_element_value(request, XML_ATTR_UNAME); reap_crm_member(node_id, name); return pcmk_ok; } else { crm_err("Unknown IPC request %s from %s %s", op, ((client == NULL)? "peer" : "client"), ((client == NULL)? remote_peer : pcmk__client_name(client))); } done: if (rc == -EACCES) { crm_warn("Rejecting IPC request '%s' from unprivileged client %s", crm_str(op), pcmk__client_name(client)); } /* Always reply unless the request is in process still. * If in progress, a reply will happen async after the request * processing is finished */ if (rc != -EINPROGRESS) { crm_trace("Reply handling: %p %u %u %d %d %s", client, client?client->request_id:0, id, pcmk_is_set(call_options, st_opt_sync_call), call_options, crm_element_value(request, F_STONITH_CALLOPTS)); if (pcmk_is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(client == NULL || client->request_id == id); } reply = stonith_construct_reply(request, output, data, rc); stonith_send_reply(reply, call_options, remote_peer, client_id); } free(output); free_xml(data); free_xml(reply); return rc; } static void handle_reply(pcmk__client_t *client, xmlNode *request, const char *remote_peer) { const char *op = crm_element_value(request, F_STONITH_OPERATION); if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { process_remote_stonith_query(request); } else if (pcmk__str_eq(op, T_STONITH_NOTIFY, pcmk__str_none)) { process_remote_stonith_exec(request); } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { /* Reply to a complex fencing op */ process_remote_stonith_exec(request); } else { crm_err("Unknown %s reply from %s %s", op, ((client == NULL)? "peer" : "client"), ((client == NULL)? remote_peer : pcmk__client_name(client))); crm_log_xml_warn(request, "UnknownOp"); } } void stonith_command(pcmk__client_t *client, uint32_t id, uint32_t flags, xmlNode *request, const char *remote_peer) { int call_options = 0; int rc = 0; gboolean is_reply = FALSE; /* Copy op for reporting. The original might get freed by handle_reply() * before we use it in crm_debug(): * handle_reply() * |- process_remote_stonith_exec() * |-- remote_op_done() * |--- handle_local_reply_and_notify() * |---- crm_xml_add(...F_STONITH_OPERATION...) * |--- free_xml(op->request) */ char *op = crm_element_value_copy(request, F_STONITH_OPERATION); if (get_xpath_object("//" T_STONITH_REPLY, request, LOG_NEVER)) { is_reply = TRUE; } crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); crm_debug("Processing %s%s %u from %s %s with call options 0x%08x", op, (is_reply? " reply" : ""), id, ((client == NULL)? "peer" : "client"), ((client == NULL)? remote_peer : pcmk__client_name(client)), call_options); if (pcmk_is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(client == NULL || client->request_id == id); } if (is_reply) { handle_reply(client, request, remote_peer); } else { rc = handle_request(client, id, flags, request, remote_peer); } crm_debug("Processed %s%s from %s %s: %s (rc=%d)", op, (is_reply? " reply" : ""), ((client == NULL)? "peer" : "client"), ((client == NULL)? remote_peer : pcmk__client_name(client)), ((rc > 0)? "" : pcmk_strerror(rc)), rc); free(op); }