Page Menu
Home
ClusterLabs Projects
Search
Configure Global Search
Log In
Files
F1842283
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
155 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/cts/cts-fencing.in b/cts/cts-fencing.in
index 18415f1d9c..83ba0fa3be 100644
--- a/cts/cts-fencing.in
+++ b/cts/cts-fencing.in
@@ -1,1114 +1,1114 @@
#!@PYTHON@
""" Regression tests for Pacemaker's fencer
"""
__copyright__ = "Copyright 2012-2023 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
import argparse
import os
import sys
import subprocess
import tempfile
# These imports allow running from a source checkout after running `make`.
# Note that while this doesn't necessarily mean it will successfully run tests,
# but being able to see --help output can be useful.
if os.path.exists("@abs_top_srcdir@/python"):
sys.path.insert(0, "@abs_top_srcdir@/python")
if os.path.exists("@abs_top_builddir@/python") and "@abs_top_builddir@" != "@abs_top_srcdir@":
sys.path.insert(0, "@abs_top_builddir@/python")
from pacemaker.buildoptions import BuildOptions
from pacemaker.exitstatus import ExitStatus
from pacemaker._cts.corosync import Corosync, localname
from pacemaker._cts.errors import ExitCodeError, OutputFoundError, OutputNotFoundError, XmlValidationError
from pacemaker._cts.process import killall, exit_if_proc_running
from pacemaker._cts.test import Test, Tests
TEST_DIR = sys.path[0]
def update_path():
""" Set the PATH environment variable appropriately for the tests """
new_path = os.environ['PATH']
if os.path.exists("%s/cts-fencing.in" % TEST_DIR):
print("Running tests from the source tree: %s (%s)" % (BuildOptions._BUILD_DIR, TEST_DIR))
# For pacemaker-fenced and cts-fence-helper
new_path = "%s/daemons/fenced:%s" % (BuildOptions._BUILD_DIR, new_path)
new_path = "%s/tools:%s" % (BuildOptions._BUILD_DIR, new_path) # For stonith_admin
new_path = "%s/cts/support:%s" % (BuildOptions._BUILD_DIR, new_path) # For cts-support
else:
print("Running tests from the install tree: %s (not %s)" % (BuildOptions.DAEMON_DIR, TEST_DIR))
# For pacemaker-fenced, cts-fence-helper, and cts-support
new_path = "%s:%s" % (BuildOptions.DAEMON_DIR, new_path)
print('Using PATH="%s"' % new_path)
os.environ['PATH'] = new_path
class FenceTest(Test):
""" Executor for a single test """
def __init__(self, name, description, **kwargs):
Test.__init__(self, name, description, **kwargs)
if kwargs.get("with_cpg", False):
self._enable_corosync = True
self._daemon_options = ["-c"]
else:
self._enable_corosync = False
self._daemon_options = ["-s"]
self._daemon_location = "pacemaker-fenced"
def _kill_daemons(self):
killall(["pacemakerd", "pacemaker-fenced"])
def _start_daemons(self):
if self.verbose:
self._daemon_options += ["-V"]
print("Starting %s with %s" % (self._daemon_location, self._daemon_options))
cmd = ["pacemaker-fenced", "-l", self.logpath] + self._daemon_options
self._daemon_process = subprocess.Popen(cmd)
class FenceTests(Tests):
""" Collection of all fencing regression tests """
def __init__(self, **kwargs):
Tests.__init__(self, **kwargs)
self._corosync = Corosync(self.verbose, self.logdir, "cts-fencing")
def new_test(self, name, description, with_cpg=False):
""" Create a named test """
test = FenceTest(name, description, verbose=self.verbose, with_cpg=with_cpg,
timeout=self.timeout, force_wait=self.force_wait,
logdir=self.logdir)
self._tests.append(test)
return test
def run_cpg_only(self):
""" Run all corosync-enabled tests """
for test in self._tests:
if test._enable_corosync:
test.run()
def run_no_cpg(self):
""" Run all standalone tests """
for test in self._tests:
if not test._enable_corosync:
test.run()
def build_api_sanity_tests(self):
""" Register tests to verify basic API usage """
verbose_arg = ""
if self.verbose:
verbose_arg = "-V"
test = self.new_test("standalone_low_level_api_test", "Sanity test client api in standalone mode.")
test.add_cmd("cts-fence-helper", "-t %s" % (verbose_arg), validate=False)
test = self.new_test("cpg_low_level_api_test", "Sanity test client api using mainloop and cpg.", True)
test.add_cmd("cts-fence-helper", "-m %s" % (verbose_arg), validate=False)
def build_custom_timeout_tests(self):
""" Register tests to verify custom timeout usage """
# custom timeout without topology
test = self.new_test("cpg_custom_timeout_1",
"Verify per device timeouts work as expected without using topology.", True)
test.add_cmd('stonith_admin',
'--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node1 node2 node3"')
test.add_cmd('stonith_admin',
'--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node3" -o "pcmk_off_timeout=1"')
test.add_cmd('stonith_admin',
'--output-as=xml -R false2 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node3" -o "pcmk_off_timeout=4"')
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5")
# timeout is 5+1+4 = 10
test.add_log_pattern("Total timeout set to 12")
# custom timeout _WITH_ topology
test = self.new_test("cpg_custom_timeout_2",
"Verify per device timeouts work as expected _WITH_ topology.", True)
test.add_cmd('stonith_admin',
'--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node1 node2 node3"')
test.add_cmd('stonith_admin',
'--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node3" -o "pcmk_off_timeout=1"')
test.add_cmd('stonith_admin',
'--output-as=xml -R false2 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node3" -o "pcmk_off_timeout=4000"')
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v false2")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5")
# timeout is 5+1+4000 = 4006
test.add_log_pattern("Total timeout set to 4807")
def build_fence_merge_tests(self):
""" Register tests to verify when fence operations should be merged """
### Simple test that overlapping fencing operations get merged
test = self.new_test("cpg_custom_merge_single",
"Verify overlapping identical fencing operations are merged, no fencing levels used.", True)
test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ")
test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 10")
### one merger will happen
test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client")
### the pattern below signifies that both the original and duplicate operation completed
test.add_log_pattern("Operation 'off' targeting node3 by ")
test.add_log_pattern("Operation 'off' targeting node3 by ")
### Test that multiple mergers occur
test = self.new_test("cpg_custom_merge_multiple",
"Verify multiple overlapping identical fencing operations are merged", True)
test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"delay=2\" -o \"pcmk_host_list=node3\" ")
test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 10")
### 4 mergers should occur
test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client")
test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client")
test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client")
test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client")
### the pattern below signifies that both the original and duplicate operation completed
test.add_log_pattern("Operation 'off' targeting node3 by ")
test.add_log_pattern("Operation 'off' targeting node3 by ")
test.add_log_pattern("Operation 'off' targeting node3 by ")
test.add_log_pattern("Operation 'off' targeting node3 by ")
test.add_log_pattern("Operation 'off' targeting node3 by ")
### Test that multiple mergers occur with topologies used
test = self.new_test("cpg_custom_merge_with_topology",
"Verify multiple overlapping identical fencing operations are merged with fencing levels.",
True)
test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ")
test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true1")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 10")
### 4 mergers should occur
test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client")
test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client")
test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client")
test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client")
### the pattern below signifies that both the original and duplicate operation completed
test.add_log_pattern("Operation 'off' targeting node3 by ")
test.add_log_pattern("Operation 'off' targeting node3 by ")
test.add_log_pattern("Operation 'off' targeting node3 by ")
test.add_log_pattern("Operation 'off' targeting node3 by ")
test.add_log_pattern("Operation 'off' targeting node3 by ")
def build_fence_no_merge_tests(self):
""" Register tests to verify when fence operations should not be merged """
test = self.new_test("cpg_custom_no_merge",
"Verify differing fencing operations are not merged", True)
test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3 node2\"")
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3 node2\" ")
test.add_cmd("stonith_admin", "--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3 node2\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true1")
test.add_cmd_no_wait("stonith_admin", "--output-as=xml -F node2 -t 10")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 10")
test.add_log_pattern("Merging fencing action 'off' targeting node3 originating from client",
negative=True)
def build_standalone_tests(self):
""" Register a grab bag of tests that can be executed in standalone or corosync mode """
test_types = [
{
"prefix" : "standalone",
"use_cpg" : False,
},
{
"prefix" : "cpg",
"use_cpg" : True,
},
]
# test what happens when all devices timeout
for test_type in test_types:
test = self.new_test("%s_fence_multi_device_failure" % test_type["prefix"],
"Verify that all devices timeout, a fencing failure is returned.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false3 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
if test_type["use_cpg"]:
test.add_cmd_expected_fail("stonith_admin", "--output-as=xml -F node3 -t 2", ExitStatus.TIMEOUT)
test.add_log_pattern("Total timeout set to 7")
else:
test.add_cmd_expected_fail("stonith_admin", "--output-as=xml -F node3 -t 2", ExitStatus.ERROR)
test.add_log_pattern("targeting node3 using false1 returned ")
test.add_log_pattern("targeting node3 using false2 returned ")
test.add_log_pattern("targeting node3 using false3 returned ")
# test what happens when multiple devices can fence a node, but the first device fails.
for test_type in test_types:
test = self.new_test("%s_fence_device_failure_rollover" % test_type["prefix"],
"Verify that when one fence device fails for a node, the others are tried.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5")
if test_type["use_cpg"]:
test.add_log_pattern("Total timeout set to 18")
# test what happens when we try to use a missing fence-agent.
for test_type in test_types:
test = self.new_test("%s_fence_missing_agent" % test_type["prefix"],
"Verify proper error-handling when using a non-existent fence-agent.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_missing -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node2\"")
test.add_cmd_expected_fail("stonith_admin",
"--output-as=xml -F node3 -t 5",
ExitStatus.NOSUCH)
test.add_cmd("stonith_admin", "--output-as=xml -F node2 -t 5")
# simple topology test for one device
for test_type in test_types:
if not test_type["use_cpg"]:
continue
test = self.new_test("%s_topology_simple" % test_type["prefix"],
"Verify all fencing devices at a level are used.", test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5")
test.add_log_pattern("Total timeout set to 6")
test.add_log_pattern("targeting node3 using true returned 0")
# add topology, delete topology, verify fencing still works
for test_type in test_types:
if not test_type["use_cpg"]:
continue
test = self.new_test("%s_topology_add_remove" % test_type["prefix"],
"Verify fencing occurrs after all topology levels are removed",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true")
test.add_cmd("stonith_admin", "--output-as=xml -d node3 -i 1")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5")
test.add_log_pattern("Total timeout set to 6")
test.add_log_pattern("targeting node3 using true returned 0")
# test what happens when the first fencing level has multiple devices.
for test_type in test_types:
if not test_type["use_cpg"]:
continue
test = self.new_test("%s_topology_device_fails" % test_type["prefix"],
"Verify if one device in a level fails, the other is tried.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R false -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 20")
test.add_log_pattern("Total timeout set to 48")
test.add_log_pattern("targeting node3 using false returned 1")
test.add_log_pattern("targeting node3 using true returned 0")
# test what happens when the first fencing level fails.
for test_type in test_types:
if not test_type["use_cpg"]:
continue
test = self.new_test("%s_topology_multi_level_fails" % test_type["prefix"],
"Verify if one level fails, the next leve is tried.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v false2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true3")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true4")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 3")
test.add_log_pattern("Total timeout set to 21")
test.add_log_pattern("targeting node3 using false1 returned 1")
test.add_log_pattern("targeting node3 using false2 returned 1")
test.add_log_pattern("targeting node3 using true3 returned 0")
test.add_log_pattern("targeting node3 using true4 returned 0")
# test what happens when the first fencing level had devices that no one has registered
for test_type in test_types:
if not test_type["use_cpg"]:
continue
test = self.new_test("%s_topology_missing_devices" % test_type["prefix"],
"Verify topology can continue with missing devices.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v false2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true3")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true4")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5")
# Test what happens if multiple fencing levels are defined, and then the first one is removed.
for test_type in test_types:
if not test_type["use_cpg"]:
continue
test = self.new_test("%s_topology_level_removal" % test_type["prefix"],
"Verify level removal works.", test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v false2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true3")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 3 -v true4")
# Now remove level 2, verify none of the devices in level two are hit.
test.add_cmd("stonith_admin", "--output-as=xml -d node3 -i 2")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 20")
test.add_log_pattern("Total timeout set to 96")
test.add_log_pattern("targeting node3 using false1 returned 1")
test.add_log_pattern("targeting node3 using false2 returned ",
negative=True)
test.add_log_pattern("targeting node3 using true3 returned 0")
test.add_log_pattern("targeting node3 using true4 returned 0")
# Test targeting a topology level by node name pattern.
for test_type in test_types:
if not test_type["use_cpg"]:
continue
test = self.new_test("%s_topology_level_pattern" % test_type["prefix"],
"Verify targeting topology by node name pattern works.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"""--output-as=xml -R true -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node1 node2 node3" """)
test.add_cmd("stonith_admin", """--output-as=xml -r '@node.*' -i 1 -v true""")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5")
test.add_log_pattern("targeting node3 using true returned 0")
# test allowing commas and semicolons as delimiters in pcmk_host_list
for test_type in test_types:
test = self.new_test("%s_host_list_delimiters" % test_type["prefix"],
"Verify commas and semicolons can be used as pcmk_host_list delimiters",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"""--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node1,node2,node3" """)
test.add_cmd("stonith_admin",
"""--output-as=xml -R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=pcmk1;pcmk2;pcmk3" """)
test.add_cmd("stonith_admin", "stonith_admin --output-as=xml -F node2 -t 5")
test.add_cmd("stonith_admin", "stonith_admin --output-as=xml -F pcmk3 -t 5")
test.add_log_pattern("targeting node2 using true1 returned 0")
test.add_log_pattern("targeting pcmk3 using true2 returned 0")
# test the stonith builds the correct list of devices that can fence a node.
for test_type in test_types:
test = self.new_test("%s_list_devices" % test_type["prefix"],
"Verify list of devices that can fence a node is correct",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -l node1 -V", "true2", "true1")
test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -l node1 -V", "true3", "true1")
# simple test of device monitor
for test_type in test_types:
test = self.new_test("%s_monitor" % test_type["prefix"],
"Verify device is reachable", test_type["use_cpg"])
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -Q true1")
test.add_cmd("stonith_admin", "--output-as=xml -Q false1")
test.add_cmd_expected_fail("stonith_admin",
"--output-as=xml -Q true2",
ExitStatus.NOSUCH)
# Verify monitor occurs for duration of timeout period on failure
for test_type in test_types:
test = self.new_test("%s_monitor_timeout" % test_type["prefix"],
"Verify monitor uses duration of timeout period given.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
'--output-as=xml -R true1 -a fence_dummy -o "mode=fail" -o "monitor_mode=fail" -o "pcmk_host_list=node3"')
test.add_cmd_expected_fail("stonith_admin", "--output-as=xml -Q true1 -t 5", ExitStatus.ERROR)
test.add_log_pattern("Attempt 2 to execute")
# Verify monitor occurs for duration of timeout period on failure, but stops at max retries
for test_type in test_types:
test = self.new_test("%s_monitor_timeout_max_retries" % test_type["prefix"],
"Verify monitor retries until max retry value or timeout is hit.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
'--output-as=xml -R true1 -a fence_dummy -o "mode=fail" -o "monitor_mode=fail" -o "pcmk_host_list=node3"')
test.add_cmd_expected_fail("stonith_admin", "--output-as=xml -Q true1 -t 15", ExitStatus.ERROR)
test.add_log_pattern("Attempted to execute agent fence_dummy (list) the maximum number of times")
# simple register test
for test_type in test_types:
test = self.new_test("%s_register" % test_type["prefix"],
"Verify devices can be registered and un-registered",
test_type["use_cpg"])
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -Q true1")
test.add_cmd("stonith_admin", "--output-as=xml -D true1")
test.add_cmd_expected_fail("stonith_admin",
"--output-as=xml -Q true1",
ExitStatus.NOSUCH)
# simple reboot test
for test_type in test_types:
test = self.new_test("%s_reboot" % test_type["prefix"],
"Verify devices can be rebooted",
test_type["use_cpg"])
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -B node3 -t 5")
test.add_cmd("stonith_admin", "--output-as=xml -D true1")
test.add_cmd_expected_fail("stonith_admin",
"--output-as=xml -Q true1",
ExitStatus.NOSUCH)
# test fencing history.
for test_type in test_types:
if not test_type["use_cpg"]:
continue
test = self.new_test("%s_fence_history" % test_type["prefix"],
"Verify last fencing operation is returned.",
test_type["use_cpg"])
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 5 -V")
test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -H node3", 'action="off" target="node3" .* status="success"')
# simple test of dynamic list query
for test_type in test_types:
test = self.new_test("%s_dynamic_list_query" % test_type["prefix"],
"Verify dynamic list of fencing devices can be retrieved.",
test_type["use_cpg"])
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1")
test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1")
test.add_cmd("stonith_admin", "--output-as=xml -R true3 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1")
test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -l fake_port_1", 'count="3"')
# fence using dynamic list query
for test_type in test_types:
test = self.new_test("%s_fence_dynamic_list_query" % test_type["prefix"],
"Verify dynamic list of fencing devices can be retrieved.",
test_type["use_cpg"])
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1")
test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1")
test.add_cmd("stonith_admin", "--output-as=xml -R true3 -a fence_dummy -o mode=pass -o mock_dynamic_hosts=fake_port_1")
test.add_cmd("stonith_admin", "--output-as=xml -F fake_port_1 -t 5 -V")
# simple test of query using status action
for test_type in test_types:
test = self.new_test("%s_status_query" % test_type["prefix"],
"Verify dynamic list of fencing devices can be retrieved.",
test_type["use_cpg"])
test.add_cmd("stonith_admin", "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"")
test.add_cmd("stonith_admin", "--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"")
test.add_cmd("stonith_admin", "--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"")
test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -l fake_port_1", 'count="3"')
# test what happens when no reboot action is advertised
for test_type in test_types:
test = self.new_test("%s_no_reboot_support" % test_type["prefix"],
"Verify reboot action defaults to off when no reboot action is advertised by agent.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy_no_reboot -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -B node1 -t 5 -V")
test.add_log_pattern("does not support reboot")
test.add_log_pattern("using true1 returned 0")
# make sure reboot is used when reboot action is advertised
for test_type in test_types:
test = self.new_test("%s_with_reboot_support" % test_type["prefix"],
"Verify reboot action can be used when metadata advertises it.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -B node1 -t 5 -V")
test.add_log_pattern("does not advertise support for 'reboot', performing 'off'",
negative=True)
test.add_log_pattern("using true1 returned 0")
# make sure all fencing delays are applied correctly and taken into account by fencing timeouts with topology
for test_type in test_types:
if not test_type["use_cpg"]:
continue
test = self.new_test("%s_topology_delays" % test_type["prefix"],
"Verify all fencing delays are applied correctly and taken into account by fencing timeouts with topology.",
test_type["use_cpg"])
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\" -o \"pcmk_delay_base=1\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\" -o \"pcmk_delay_base=1\"")
# Resulting "random" delay will always be 1 since (rand() % (delay_max - delay_base)) is always 0 here.
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\" -o \"pcmk_delay_base=1\" -o \"pcmk_delay_max=2\"")
test.add_cmd("stonith_admin",
"--output-as=xml -R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v true1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -r node3 -i 2 -v true3")
test.add_cmd("stonith_admin", "--output-as=xml -F node3 --delay 1")
# Total fencing timeout takes all fencing delays into account.
test.add_log_pattern("Total timeout set to 577")
# Fencing timeout for the first device takes the requested fencing delay into account.
# Fencing timeout also takes pcmk_delay_base into account.
- test.add_log_pattern(r"Requesting that .* perform 'off' action targeting node3 using true1 .*144s.*",
+ test.add_log_pattern(r"Requesting that .* perform 'off' action targeting node3 using true1 .*145s.*",
regex=True)
# Requested fencing delay is applied only for the first device in the first level.
# Static delay from pcmk_delay_base is added.
test.add_log_pattern("Delaying 'off' action targeting node3 using true1 for 2s | timeout=120s requested_delay=1s base=1s max=1s")
# Fencing timeout no longer takes the requested fencing delay into account for further devices.
test.add_log_pattern(r"Requesting that .* perform 'off' action targeting node3 using false1 .*144s.*",
regex=True)
# Requested fencing delay is no longer applied for further devices.
test.add_log_pattern("Delaying 'off' action targeting node3 using false1 for 1s | timeout=120s requested_delay=0s base=1s max=1s")
# Fencing timeout takes pcmk_delay_max into account.
test.add_log_pattern(r"Requesting that .* perform 'off' action targeting node3 using true2 .*144s.*",
regex=True)
test.add_log_pattern("Delaying 'off' action targeting node3 using true2 for 1s | timeout=120s requested_delay=0s base=1s max=2s")
test.add_log_pattern("Delaying 'off' action targeting node3 using true3",
negative=True)
def build_nodeid_tests(self):
""" Register tests that use a corosync node id """
our_uname = localname()
### verify nodeid is supplied when nodeid is in the metadata parameters
test = self.new_test("cpg_supply_nodeid",
"Verify nodeid is given when fence agent has nodeid as parameter", True)
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -F %s -t 3" % (our_uname))
test.add_log_pattern("as nodeid with fence action 'off' targeting %s" % (our_uname))
### verify nodeid is _NOT_ supplied when nodeid is not in the metadata parameters
test = self.new_test("cpg_do_not_supply_nodeid",
"Verify nodeid is _NOT_ given when fence agent does not have nodeid as parameter",
True)
# use a host name that won't be in corosync.conf
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=regr-test\"")
test.add_cmd("stonith_admin", "--output-as=xml -F regr-test -t 3")
test.add_log_pattern("as nodeid with fence action 'off' targeting regr-test",
negative=True)
### verify nodeid use doesn't explode standalone mode
test = self.new_test("standalone_do_not_supply_nodeid",
"Verify nodeid in metadata parameter list doesn't kill standalone mode",
False)
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -F %s -t 3" % (our_uname))
test.add_log_pattern("as nodeid with fence action 'off' targeting %s" % our_uname,
negative=True)
def build_unfence_tests(self):
""" Register tests that verify unfencing """
our_uname = localname()
### verify unfencing using automatic unfencing
test = self.new_test("cpg_unfence_required_1",
"Verify require unfencing on all devices when automatic=true in agent's metadata",
True)
test.add_cmd('stonith_admin',
'--output-as=xml -R true1 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s"' % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname))
# both devices should be executed
test.add_log_pattern("using true1 returned 0")
test.add_log_pattern("using true2 returned 0")
### verify unfencing using automatic unfencing fails if any of the required agents fail
test = self.new_test("cpg_unfence_required_2",
"Verify require unfencing on all devices when automatic=true in agent's metadata",
True)
test.add_cmd('stonith_admin',
'--output-as=xml -R true1 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=fail" -o "pcmk_host_list=%s"' % (our_uname))
test.add_cmd_expected_fail("stonith_admin", "--output-as=xml -U %s -t 6" % (our_uname), ExitStatus.ERROR)
### verify unfencing using automatic devices with topology
test = self.new_test("cpg_unfence_required_3",
"Verify require unfencing on all devices even when at different topology levels",
True)
test.add_cmd('stonith_admin',
'--output-as=xml -R true1 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v true1" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true2" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname))
test.add_log_pattern("using true1 returned 0")
test.add_log_pattern("using true2 returned 0")
### verify unfencing using automatic devices with topology
test = self.new_test("cpg_unfence_required_4",
"Verify all required devices are executed even with topology levels fail.",
True)
test.add_cmd('stonith_admin',
'--output-as=xml -R true1 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R true3 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R true4 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R false2 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R false3 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd('stonith_admin',
'--output-as=xml -R false4 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=%s node3"' % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v true1" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v false1" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v false2" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true2" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v false3" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true3" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 3 -v false4" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 4 -v true4" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname))
test.add_log_pattern("using true1 returned 0")
test.add_log_pattern("using true2 returned 0")
test.add_log_pattern("using true3 returned 0")
test.add_log_pattern("using true4 returned 0")
def build_unfence_on_target_tests(self):
""" Register tests that verify unfencing that runs on the target """
our_uname = localname()
### verify unfencing using on_target device
test = self.new_test("cpg_unfence_on_target_1",
"Verify unfencing with on_target = true", True)
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname))
test.add_log_pattern("(on) to be executed on target")
### verify failure of unfencing using on_target device
test = self.new_test("cpg_unfence_on_target_2",
"Verify failure unfencing with on_target = true",
True)
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake_1234\"" % (our_uname))
test.add_cmd_expected_fail("stonith_admin",
"--output-as=xml -U node_fake_1234 -t 3",
ExitStatus.NOSUCH)
test.add_log_pattern("(on) to be executed on target")
### verify unfencing using on_target device with topology
test = self.new_test("cpg_unfence_on_target_3",
"Verify unfencing with on_target = true using topology",
True)
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v true1" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true2" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname))
test.add_log_pattern("(on) to be executed on target")
### verify unfencing using on_target device with topology fails when target node doesn't exist
test = self.new_test("cpg_unfence_on_target_4",
"Verify unfencing failure with on_target = true using topology",
True)
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake\"" % (our_uname))
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake\"" % (our_uname))
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v true1")
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 2 -v true2")
test.add_cmd_expected_fail("stonith_admin",
"--output-as=xml -U node_fake -t 3",
ExitStatus.NOSUCH)
test.add_log_pattern("(on) to be executed on target")
def build_remap_tests(self):
""" Register tests that verify remapping of reboots to off-on """
test = self.new_test("cpg_remap_simple",
"Verify sequential topology reboot is remapped to all-off-then-all-on", True)
test.add_cmd("stonith_admin",
"""--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """
"""-o "pcmk_off_timeout=1" -o "pcmk_reboot_timeout=10" """)
test.add_cmd("stonith_admin",
"""--output-as=xml -R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """
"""-o "pcmk_off_timeout=2" -o "pcmk_reboot_timeout=20" """)
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v true1 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -B node_fake -t 5")
test.add_log_pattern("Remapping multiple-device reboot targeting node_fake")
# timeout should be sum of off timeouts (1+2=3), not reboot timeouts (10+20=30)
test.add_log_pattern("Total timeout set to 3 for peer's fencing targeting node_fake")
test.add_log_pattern("perform 'off' action targeting node_fake using true1")
test.add_log_pattern("perform 'off' action targeting node_fake using true2")
test.add_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'")
# fence_dummy sets "on" as an on_target action
test.add_log_pattern("Ignoring true1 'on' failure (no capable peers) targeting node_fake")
test.add_log_pattern("Ignoring true2 'on' failure (no capable peers) targeting node_fake")
test.add_log_pattern("Undoing remap of reboot targeting node_fake")
test = self.new_test("cpg_remap_simple_off",
"Verify sequential topology reboot skips 'on' if "
"pcmk_reboot_action=off or agent doesn't support "
"'on'", True)
test.add_cmd("stonith_admin",
"--output-as=xml -R true1 -a fence_dummy -o mode=pass "
"-o pcmk_host_list=node_fake -o pcmk_off_timeout=1 "
"-o pcmk_reboot_timeout=10 -o pcmk_reboot_action=off")
test.add_cmd("stonith_admin",
"--output-as=xml -R true2 -a fence_dummy_no_on "
"-o mode=pass -o pcmk_host_list=node_fake "
"-o pcmk_off_timeout=2 -o pcmk_reboot_timeout=20")
test.add_cmd("stonith_admin",
"--output-as=xml -r node_fake -i 1 -v true1 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -B node_fake -t 5")
test.add_log_pattern("Remapping multiple-device reboot targeting node_fake")
# timeout should be sum of off timeouts (1+2=3), not reboot timeouts (10+20=30)
test.add_log_pattern("Total timeout set to 3 for peer's fencing targeting node_fake")
test.add_log_pattern("perform 'off' action targeting node_fake using true1")
test.add_log_pattern("perform 'off' action targeting node_fake using true2")
test.add_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'")
# "on" should be skipped
test.add_log_pattern("Not turning node_fake back on using "
"true1 because the device is configured "
"to stay off")
test.add_log_pattern("Not turning node_fake back on using true2"
" because the agent doesn't support 'on'")
test.add_log_pattern("Undoing remap of reboot targeting node_fake")
test = self.new_test("cpg_remap_automatic",
"Verify remapped topology reboot skips automatic 'on'", True)
test.add_cmd("stonith_admin",
"""--output-as=xml -R true1 -a fence_dummy_auto_unfence """
"""-o "mode=pass" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin",
"""--output-as=xml -R true2 -a fence_dummy_auto_unfence """
"""-o "mode=pass" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v true1 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -B node_fake -t 5")
test.add_log_pattern("Remapping multiple-device reboot targeting node_fake")
test.add_log_pattern("perform 'off' action targeting node_fake using true1")
test.add_log_pattern("perform 'off' action targeting node_fake using true2")
test.add_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'")
test.add_log_pattern("Undoing remap of reboot targeting node_fake")
test.add_log_pattern("perform 'on' action targeting node_fake using",
negative=True)
test.add_log_pattern("'on' failure",
negative=True)
test = self.new_test("cpg_remap_complex_1",
"Verify remapped topology reboot in second level works if non-remapped first level fails",
True)
test.add_cmd("stonith_admin", """--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", """--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", """--output-as=xml -R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 2 -v true1 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -B node_fake -t 5")
test.add_log_pattern("perform 'reboot' action targeting node_fake using false1")
test.add_log_pattern("Remapping multiple-device reboot targeting node_fake")
test.add_log_pattern("perform 'off' action targeting node_fake using true1")
test.add_log_pattern("perform 'off' action targeting node_fake using true2")
test.add_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'")
test.add_log_pattern("Ignoring true1 'on' failure (no capable peers) targeting node_fake")
test.add_log_pattern("Ignoring true2 'on' failure (no capable peers) targeting node_fake")
test.add_log_pattern("Undoing remap of reboot targeting node_fake")
test = self.new_test("cpg_remap_complex_2",
"Verify remapped topology reboot failure in second level proceeds to third level",
True)
test.add_cmd("stonith_admin", """--output-as=xml -R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", """--output-as=xml -R false2 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", """--output-as=xml -R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", """--output-as=xml -R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", """--output-as=xml -R true3 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 1 -v false1")
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 2 -v true1 -v false2 -v true3")
test.add_cmd("stonith_admin", "--output-as=xml -r node_fake -i 3 -v true2")
test.add_cmd("stonith_admin", "--output-as=xml -B node_fake -t 5")
test.add_log_pattern("perform 'reboot' action targeting node_fake using false1")
test.add_log_pattern("Remapping multiple-device reboot targeting node_fake")
test.add_log_pattern("perform 'off' action targeting node_fake using true1")
test.add_log_pattern("perform 'off' action targeting node_fake using false2")
test.add_log_pattern("Attempted to execute agent fence_dummy (off) the maximum number of times")
test.add_log_pattern("Undoing remap of reboot targeting node_fake")
test.add_log_pattern("perform 'reboot' action targeting node_fake using true2")
test.add_log_pattern("node_fake with true3",
negative=True)
def build_query_tests(self):
""" run stonith_admin --metadata for the fence_dummy agent and check command output """
test = self.new_test("get_metadata",
"Run stonith_admin --metadata for the fence_dummy agent", True)
test.add_cmd_check_stdout("stonith_admin", "--output-as=xml -a fence_dummy --metadata", '<shortdesc lang')
def build_metadata_tests(self):
""" run fence-agents coming with pacemaker with -o metadata and check for valid xml """
test = self.new_test("check_metadata_dummy",
"Run fence_dummy -o metadata and check for valid xml", False)
test.add_cmd("fence_dummy", "-o metadata", check_rng=False, check_stderr=False)
# fence_dummy prints on stderr to check that tools just listen on stdout
test = self.new_test("check_metadata_watchdog",
"Run fence_watchdog -o metadata and check for valid xml", False)
test.add_cmd("fence_watchdog", "-o metadata", check_rng=False)
def build_validate_tests(self):
""" run stonith_admin --validate for the fence_dummy agent and check command output """
test = self.new_test("validate_dummy",
"Run stonith_admin --validate-all and check output", False)
test.add_cmd_expected_fail("stonith_admin", "-a fence_dummy --validate --output-as=xml")
test.add_cmd("stonith_admin", """-a fence_dummy --validate -o "delay=5" --output-as=xml""", check_rng=False)
test.add_cmd_expected_fail("stonith_admin", """-a fence_dummy --validate -o "delay=15" --output-as=xml""")
def setup_environment(self, use_corosync):
""" Prepare the host before executing any tests """
if use_corosync:
self._corosync.start(kill_first=True)
subprocess.call(["cts-support", "install"])
def cleanup_environment(self, use_corosync):
""" Clean up the host after executing desired tests """
if use_corosync:
self._corosync.stop()
subprocess.call(["cts-support", "uninstall"])
def build_options():
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
description="Run pacemaker-fenced regression tests",
epilog="Example: Run only the test 'start_stop'\n"
"\t " + sys.argv[0] + " --run-only start_stop\n\n"
"Example: Run only the tests with the string 'systemd' present in them\n"
"\t " + sys.argv[0] + " --run-only-pattern systemd")
parser.add_argument("-c", "--cpg-only", action="store_true",
help="Only run tests that require corosync")
parser.add_argument("-l", "--list-tests", action="store_true",
help="Print out all registered tests")
parser.add_argument("-n", "--no-cpg", action="store_true",
help="Only run tests that do not require corosync")
parser.add_argument("-p", "--run-only-pattern", metavar='PATTERN',
help="Run only tests matching the given pattern")
parser.add_argument("-r", "--run-only", metavar='TEST',
help="Run a specific test")
parser.add_argument("-t", "--timeout", type=float, default=2,
help="Up to how many seconds each test case waits for the daemon to "
"be initialized. Defaults to 2. The value 0 means no limit.")
parser.add_argument("-w", "--force-wait", action="store_true",
help="Each test case waits the default/specified --timeout for the "
"daemon without tracking the log")
parser.add_argument("-V", "--verbose", action="store_true",
help="Verbose output")
args = parser.parse_args()
return args
def main():
""" Run fencing regression tests as specified by arguments """
update_path()
# Ensure all command output is in portable locale for comparison
os.environ['LC_ALL'] = "C"
opts = build_options()
exit_if_proc_running("pacemaker-fenced")
use_corosync = not opts.no_cpg
if use_corosync:
exit_if_proc_running("corosync")
# Create a temporary directory for log files (the directory and its
# contents will automatically be erased when done)
with tempfile.TemporaryDirectory(prefix="cts-fencing-") as logdir:
tests = FenceTests(verbose=opts.verbose, timeout=opts.timeout,
force_wait=opts.force_wait, logdir=logdir)
tests.build_standalone_tests()
tests.build_custom_timeout_tests()
tests.build_api_sanity_tests()
tests.build_fence_merge_tests()
tests.build_fence_no_merge_tests()
tests.build_unfence_tests()
tests.build_unfence_on_target_tests()
tests.build_nodeid_tests()
tests.build_remap_tests()
tests.build_query_tests()
tests.build_metadata_tests()
tests.build_validate_tests()
if opts.list_tests:
tests.print_list()
sys.exit(ExitStatus.OK)
print("Starting ...")
try:
tests.setup_environment(use_corosync)
except TimeoutError:
print("corosync did not start in time, exiting")
sys.exit(ExitStatus.TIMEOUT)
if opts.run_only_pattern:
tests.run_tests_matching(opts.run_only_pattern)
tests.print_results()
elif opts.run_only:
tests.run_single(opts.run_only)
tests.print_results()
elif opts.no_cpg:
tests.run_no_cpg()
tests.print_results()
elif opts.cpg_only:
tests.run_cpg_only()
tests.print_results()
else:
tests.run_tests()
tests.print_results()
tests.cleanup_environment(use_corosync)
tests.exit()
if __name__ == "__main__":
main()
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
index 411c528b98..b5e65567f3 100644
--- a/daemons/fenced/fenced_remote.c
+++ b/daemons/fenced/fenced_remote.c
@@ -1,2482 +1,2486 @@
/*
* Copyright 2009-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <sys/param.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/utsname.h>
#include <stdlib.h>
#include <errno.h>
#include <fcntl.h>
#include <ctype.h>
#include <regex.h>
#include <crm/crm.h>
#include <crm/msg_xml.h>
#include <crm/common/ipc.h>
#include <crm/common/ipc_internal.h>
#include <crm/cluster/internal.h>
#include <crm/stonith-ng.h>
#include <crm/fencing/internal.h>
#include <crm/common/xml.h>
#include <crm/common/xml_internal.h>
#include <crm/common/util.h>
#include <pacemaker-fenced.h>
#define TIMEOUT_MULTIPLY_FACTOR 1.2
/* When one fencer queries its peers for devices able to handle a fencing
* request, each peer will reply with a list of such devices available to it.
* Each reply will be parsed into a peer_device_info_t, with each device's
* information kept in a device_properties_t.
*/
typedef struct device_properties_s {
/* Whether access to this device has been verified */
gboolean verified;
/* The remaining members are indexed by the operation's "phase" */
/* Whether this device has been executed in each phase */
gboolean executed[st_phase_max];
/* Whether this device is disallowed from executing in each phase */
gboolean disallowed[st_phase_max];
/* Action-specific timeout for each phase */
int custom_action_timeout[st_phase_max];
/* Action-specific maximum random delay for each phase */
int delay_max[st_phase_max];
/* Action-specific base delay for each phase */
int delay_base[st_phase_max];
/* Group of enum st_device_flags */
uint32_t device_support_flags;
} device_properties_t;
typedef struct {
/* Name of peer that sent this result */
char *host;
/* Only try peers for non-topology based operations once */
gboolean tried;
/* Number of entries in the devices table */
int ndevices;
/* Devices available to this host that are capable of fencing the target */
GHashTable *devices;
} peer_device_info_t;
GHashTable *stonith_remote_op_list = NULL;
extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data,
int call_options);
static void request_peer_fencing(remote_fencing_op_t *op,
peer_device_info_t *peer);
static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup);
static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
static int get_op_total_timeout(const remote_fencing_op_t *op,
const peer_device_info_t *chosen_peer);
static gint
sort_strings(gconstpointer a, gconstpointer b)
{
return strcmp(a, b);
}
static void
free_remote_query(gpointer data)
{
if (data != NULL) {
peer_device_info_t *peer = data;
g_hash_table_destroy(peer->devices);
free(peer->host);
free(peer);
}
}
void
free_stonith_remote_op_list(void)
{
if (stonith_remote_op_list != NULL) {
g_hash_table_destroy(stonith_remote_op_list);
stonith_remote_op_list = NULL;
}
}
struct peer_count_data {
const remote_fencing_op_t *op;
gboolean verified_only;
uint32_t support_action_only;
int count;
};
/*!
* \internal
* \brief Increment a counter if a device has not been executed yet
*
* \param[in] key Device ID (ignored)
* \param[in] value Device properties
* \param[in,out] user_data Peer count data
*/
static void
count_peer_device(gpointer key, gpointer value, gpointer user_data)
{
device_properties_t *props = (device_properties_t*)value;
struct peer_count_data *data = user_data;
if (!props->executed[data->op->phase]
&& (!data->verified_only || props->verified)
&& ((data->support_action_only == st_device_supports_none) || pcmk_is_set(props->device_support_flags, data->support_action_only))) {
++(data->count);
}
}
/*!
* \internal
* \brief Check the number of available devices in a peer's query results
*
* \param[in] op Operation that results are for
* \param[in] peer Peer to count
* \param[in] verified_only Whether to count only verified devices
* \param[in] support_action_only Whether to count only devices that support action
*
* \return Number of devices available to peer that were not already executed
*/
static int
count_peer_devices(const remote_fencing_op_t *op,
const peer_device_info_t *peer, gboolean verified_only, uint32_t support_on_action_only)
{
struct peer_count_data data;
data.op = op;
data.verified_only = verified_only;
data.support_action_only = support_on_action_only;
data.count = 0;
if (peer) {
g_hash_table_foreach(peer->devices, count_peer_device, &data);
}
return data.count;
}
/*!
* \internal
* \brief Search for a device in a query result
*
* \param[in] op Operation that result is for
* \param[in] peer Query result for a peer
* \param[in] device Device ID to search for
*
* \return Device properties if found, NULL otherwise
*/
static device_properties_t *
find_peer_device(const remote_fencing_op_t *op, const peer_device_info_t *peer,
const char *device, uint32_t support_action_only)
{
device_properties_t *props = g_hash_table_lookup(peer->devices, device);
if (props && support_action_only != st_device_supports_none && !pcmk_is_set(props->device_support_flags, support_action_only)) {
return NULL;
}
return (props && !props->executed[op->phase]
&& !props->disallowed[op->phase])? props : NULL;
}
/*!
* \internal
* \brief Find a device in a peer's device list and mark it as executed
*
* \param[in] op Operation that peer result is for
* \param[in,out] peer Peer with results to search
* \param[in] device ID of device to mark as done
* \param[in] verified_devices_only Only consider verified devices
*
* \return TRUE if device was found and marked, FALSE otherwise
*/
static gboolean
grab_peer_device(const remote_fencing_op_t *op, peer_device_info_t *peer,
const char *device, gboolean verified_devices_only)
{
device_properties_t *props = find_peer_device(op, peer, device,
fenced_support_flag(op->action));
if ((props == NULL) || (verified_devices_only && !props->verified)) {
return FALSE;
}
crm_trace("Removing %s from %s (%d remaining)",
device, peer->host, count_peer_devices(op, peer, FALSE, st_device_supports_none));
props->executed[op->phase] = TRUE;
return TRUE;
}
static void
clear_remote_op_timers(remote_fencing_op_t * op)
{
if (op->query_timer) {
g_source_remove(op->query_timer);
op->query_timer = 0;
}
if (op->op_timer_total) {
g_source_remove(op->op_timer_total);
op->op_timer_total = 0;
}
if (op->op_timer_one) {
g_source_remove(op->op_timer_one);
op->op_timer_one = 0;
}
}
static void
free_remote_op(gpointer data)
{
remote_fencing_op_t *op = data;
crm_log_xml_debug(op->request, "Destroying");
clear_remote_op_timers(op);
free(op->id);
free(op->action);
free(op->delegate);
free(op->target);
free(op->client_id);
free(op->client_name);
free(op->originator);
if (op->query_results) {
g_list_free_full(op->query_results, free_remote_query);
}
if (op->request) {
free_xml(op->request);
op->request = NULL;
}
if (op->devices_list) {
g_list_free_full(op->devices_list, free);
op->devices_list = NULL;
}
g_list_free_full(op->automatic_list, free);
g_list_free(op->duplicates);
pcmk__reset_result(&op->result);
free(op);
}
void
init_stonith_remote_op_hash_table(GHashTable **table)
{
if (*table == NULL) {
*table = pcmk__strkey_table(NULL, free_remote_op);
}
}
/*!
* \internal
* \brief Return an operation's originally requested action (before any remap)
*
* \param[in] op Operation to check
*
* \return Operation's original action
*/
static const char *
op_requested_action(const remote_fencing_op_t *op)
{
return ((op->phase > st_phase_requested)? "reboot" : op->action);
}
/*!
* \internal
* \brief Remap a "reboot" operation to the "off" phase
*
* \param[in,out] op Operation to remap
*/
static void
op_phase_off(remote_fencing_op_t *op)
{
crm_info("Remapping multiple-device reboot targeting %s to 'off' "
CRM_XS " id=%.8s", op->target, op->id);
op->phase = st_phase_off;
/* Happily, "off" and "on" are shorter than "reboot", so we can reuse the
* memory allocation at each phase.
*/
strcpy(op->action, "off");
}
/*!
* \internal
* \brief Advance a remapped reboot operation to the "on" phase
*
* \param[in,out] op Operation to remap
*/
static void
op_phase_on(remote_fencing_op_t *op)
{
GList *iter = NULL;
crm_info("Remapped 'off' targeting %s complete, "
"remapping to 'on' for %s " CRM_XS " id=%.8s",
op->target, op->client_name, op->id);
op->phase = st_phase_on;
strcpy(op->action, "on");
/* Skip devices with automatic unfencing, because the cluster will handle it
* when the node rejoins.
*/
for (iter = op->automatic_list; iter != NULL; iter = iter->next) {
GList *match = g_list_find_custom(op->devices_list, iter->data,
sort_strings);
if (match) {
op->devices_list = g_list_remove(op->devices_list, match->data);
}
}
g_list_free_full(op->automatic_list, free);
op->automatic_list = NULL;
/* Rewind device list pointer */
op->devices = op->devices_list;
}
/*!
* \internal
* \brief Reset a remapped reboot operation
*
* \param[in,out] op Operation to reset
*/
static void
undo_op_remap(remote_fencing_op_t *op)
{
if (op->phase > 0) {
crm_info("Undoing remap of reboot targeting %s for %s "
CRM_XS " id=%.8s", op->target, op->client_name, op->id);
op->phase = st_phase_requested;
strcpy(op->action, "reboot");
}
}
/*!
* \internal
* \brief Create notification data XML for a fencing operation result
*
* \param[in] op Fencer operation that completed
*
* \return Newly created XML to add as notification data
* \note The caller is responsible for freeing the result.
*/
static xmlNode *
fencing_result2xml(const remote_fencing_op_t *op)
{
xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE);
crm_xml_add_int(notify_data, "state", op->state);
crm_xml_add(notify_data, F_STONITH_TARGET, op->target);
crm_xml_add(notify_data, F_STONITH_ACTION, op->action);
crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate);
crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, op->id);
crm_xml_add(notify_data, F_STONITH_ORIGIN, op->originator);
crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id);
crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name);
return notify_data;
}
/*!
* \internal
* \brief Broadcast a fence result notification to all CPG peers
*
* \param[in] op Fencer operation that completed
* \param[in] op_merged Whether this operation is a duplicate of another
*/
void
fenced_broadcast_op_result(const remote_fencing_op_t *op, bool op_merged)
{
static int count = 0;
xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY);
xmlNode *notify_data = fencing_result2xml(op);
count++;
crm_trace("Broadcasting result to peers");
crm_xml_add(bcast, F_TYPE, T_STONITH_NOTIFY);
crm_xml_add(bcast, F_SUBTYPE, "broadcast");
crm_xml_add(bcast, F_STONITH_OPERATION, T_STONITH_NOTIFY);
crm_xml_add_int(bcast, "count", count);
if (op_merged) {
pcmk__xe_set_bool_attr(bcast, F_STONITH_MERGED, true);
}
stonith__xe_set_result(notify_data, &op->result);
add_message_xml(bcast, F_STONITH_CALLDATA, notify_data);
send_cluster_message(NULL, crm_msg_stonith_ng, bcast, FALSE);
free_xml(notify_data);
free_xml(bcast);
return;
}
/*!
* \internal
* \brief Reply to a local request originator and notify all subscribed clients
*
* \param[in,out] op Fencer operation that completed
* \param[in,out] data Top-level XML to add notification to
*/
static void
handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data)
{
xmlNode *notify_data = NULL;
xmlNode *reply = NULL;
pcmk__client_t *client = NULL;
if (op->notify_sent == TRUE) {
/* nothing to do */
return;
}
/* Do notification with a clean data object */
crm_xml_add_int(data, "state", op->state);
crm_xml_add(data, F_STONITH_TARGET, op->target);
crm_xml_add(data, F_STONITH_OPERATION, op->action);
reply = fenced_construct_reply(op->request, data, &op->result);
crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate);
/* Send fencing OP reply to local client that initiated fencing */
client = pcmk__find_client_by_id(op->client_id);
if (client == NULL) {
crm_trace("Skipping reply to %s: no longer a client", op->client_id);
} else {
do_local_reply(reply, client, op->call_options);
}
/* bcast to all local clients that the fencing operation happend */
notify_data = fencing_result2xml(op);
fenced_send_notification(T_STONITH_NOTIFY_FENCE, &op->result, notify_data);
free_xml(notify_data);
fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL);
/* mark this op as having notify's already sent */
op->notify_sent = TRUE;
free_xml(reply);
}
/*!
* \internal
* \brief Finalize all duplicates of a given fencer operation
*
* \param[in,out] op Fencer operation that completed
* \param[in,out] data Top-level XML to add notification to
*/
static void
finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data)
{
for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) {
remote_fencing_op_t *other = iter->data;
if (other->state == st_duplicate) {
other->state = op->state;
crm_debug("Performing duplicate notification for %s@%s: %s "
CRM_XS " id=%.8s",
other->client_name, other->originator,
pcmk_exec_status_str(op->result.execution_status),
other->id);
pcmk__copy_result(&op->result, &other->result);
finalize_op(other, data, true);
} else {
// Possible if (for example) it timed out already
crm_err("Skipping duplicate notification for %s@%s "
CRM_XS " state=%s id=%.8s",
other->client_name, other->originator,
stonith_op_state_str(other->state), other->id);
}
}
}
static char *
delegate_from_xml(xmlNode *xml)
{
xmlNode *match = get_xpath_object("//@" F_STONITH_DELEGATE, xml, LOG_NEVER);
if (match == NULL) {
return crm_element_value_copy(xml, F_ORIG);
} else {
return crm_element_value_copy(match, F_STONITH_DELEGATE);
}
}
/*!
* \internal
* \brief Finalize a peer fencing operation
*
* Clean up after a fencing operation completes. This function has two code
* paths: the executioner uses it to broadcast the result to CPG peers, and then
* each peer (including the executioner) uses it to process that broadcast and
* notify its IPC clients of the result.
*
* \param[in,out] op Fencer operation that completed
* \param[in,out] data If not NULL, XML reply of last delegated operation
* \param[in] dup Whether this operation is a duplicate of another
* (in which case, do not broadcast the result)
*
* \note The operation result should be set before calling this function.
*/
static void
finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup)
{
int level = LOG_ERR;
const char *subt = NULL;
xmlNode *local_data = NULL;
gboolean op_merged = FALSE;
CRM_CHECK((op != NULL), return);
// This is a no-op if timers have already been cleared
clear_remote_op_timers(op);
if (op->notify_sent) {
// Most likely, this is a timed-out action that eventually completed
crm_notice("Operation '%s'%s%s by %s for %s@%s%s: "
"Result arrived too late " CRM_XS " id=%.8s",
op->action, (op->target? " targeting " : ""),
(op->target? op->target : ""),
(op->delegate? op->delegate : "unknown node"),
op->client_name, op->originator,
(op_merged? " (merged)" : ""),
op->id);
return;
}
set_fencing_completed(op);
undo_op_remap(op);
if (data == NULL) {
data = create_xml_node(NULL, "remote-op");
local_data = data;
} else if (op->delegate == NULL) {
switch (op->result.execution_status) {
case PCMK_EXEC_NO_FENCE_DEVICE:
break;
case PCMK_EXEC_INVALID:
if (op->result.exit_status != CRM_EX_EXPIRED) {
op->delegate = delegate_from_xml(data);
}
break;
default:
op->delegate = delegate_from_xml(data);
break;
}
}
if (dup || (crm_element_value(data, F_STONITH_MERGED) != NULL)) {
op_merged = true;
}
/* Tell everyone the operation is done, we will continue
* with doing the local notifications once we receive
* the broadcast back. */
subt = crm_element_value(data, F_SUBTYPE);
if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) {
/* Defer notification until the bcast message arrives */
fenced_broadcast_op_result(op, op_merged);
free_xml(local_data);
return;
}
if (pcmk__result_ok(&op->result) || dup
|| !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
level = LOG_NOTICE;
}
do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s (%s%s%s) "
CRM_XS " id=%.8s", op->action, (op->target? " targeting " : ""),
(op->target? op->target : ""),
(op->delegate? op->delegate : "unknown node"),
op->client_name, op->originator,
(op_merged? " (merged)" : ""),
crm_exit_str(op->result.exit_status),
pcmk_exec_status_str(op->result.execution_status),
((op->result.exit_reason == NULL)? "" : ": "),
((op->result.exit_reason == NULL)? "" : op->result.exit_reason),
op->id);
handle_local_reply_and_notify(op, data);
if (!dup) {
finalize_op_duplicates(op, data);
}
/* Free non-essential parts of the record
* Keep the record around so we can query the history
*/
if (op->query_results) {
g_list_free_full(op->query_results, free_remote_query);
op->query_results = NULL;
}
if (op->request) {
free_xml(op->request);
op->request = NULL;
}
free_xml(local_data);
}
/*!
* \internal
* \brief Finalize a watchdog fencer op after the waiting time expires
*
* \param[in,out] userdata Fencer operation that completed
*
* \return G_SOURCE_REMOVE (which tells glib not to restart timer)
*/
static gboolean
remote_op_watchdog_done(gpointer userdata)
{
remote_fencing_op_t *op = userdata;
op->op_timer_one = 0;
crm_notice("Self-fencing (%s) by %s for %s assumed complete "
CRM_XS " id=%.8s",
op->action, op->target, op->client_name, op->id);
op->state = st_done;
pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
finalize_op(op, NULL, false);
return G_SOURCE_REMOVE;
}
static gboolean
remote_op_timeout_one(gpointer userdata)
{
remote_fencing_op_t *op = userdata;
op->op_timer_one = 0;
crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS
" id=%.8s", op->action, op->target, op->client_name, op->id);
pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT,
"Peer did not return fence result within timeout");
// Try another device, if appropriate
request_peer_fencing(op, NULL);
return G_SOURCE_REMOVE;
}
/*!
* \internal
* \brief Finalize a remote fencer operation that timed out
*
* \param[in,out] op Fencer operation that timed out
* \param[in] reason Readable description of what step timed out
*/
static void
finalize_timed_out_op(remote_fencing_op_t *op, const char *reason)
{
crm_debug("Action '%s' targeting %s for client %s timed out "
CRM_XS " id=%.8s",
op->action, op->target, op->client_name, op->id);
if (op->phase == st_phase_on) {
/* A remapped reboot operation timed out in the "on" phase, but the
* "off" phase completed successfully, so quit trying any further
* devices, and return success.
*/
op->state = st_done;
pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
} else {
op->state = st_failed;
pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason);
}
finalize_op(op, NULL, false);
}
/*!
* \internal
* \brief Finalize a remote fencer operation that timed out
*
* \param[in,out] userdata Fencer operation that timed out
*
* \return G_SOURCE_REMOVE (which tells glib not to restart timer)
*/
static gboolean
remote_op_timeout(gpointer userdata)
{
remote_fencing_op_t *op = userdata;
op->op_timer_total = 0;
if (op->state == st_done) {
crm_debug("Action '%s' targeting %s for client %s already completed "
CRM_XS " id=%.8s",
op->action, op->target, op->client_name, op->id);
} else {
finalize_timed_out_op(userdata, "Fencing did not complete within a "
"total timeout based on the "
"configured timeout and retries for "
"any devices attempted");
}
return G_SOURCE_REMOVE;
}
static gboolean
remote_op_query_timeout(gpointer data)
{
remote_fencing_op_t *op = data;
op->query_timer = 0;
if (op->state == st_done) {
crm_debug("Operation %.8s targeting %s already completed",
op->id, op->target);
} else if (op->state == st_exec) {
crm_debug("Operation %.8s targeting %s already in progress",
op->id, op->target);
} else if (op->query_results) {
// Query succeeded, so attempt the actual fencing
crm_debug("Query %.8s targeting %s complete (state=%s)",
op->id, op->target, stonith_op_state_str(op->state));
request_peer_fencing(op, NULL);
} else {
crm_debug("Query %.8s targeting %s timed out (state=%s)",
op->id, op->target, stonith_op_state_str(op->state));
finalize_timed_out_op(op, "No capable peers replied to device query "
"within timeout");
}
return G_SOURCE_REMOVE;
}
static gboolean
topology_is_empty(stonith_topology_t *tp)
{
int i;
if (tp == NULL) {
return TRUE;
}
for (i = 0; i < ST_LEVEL_MAX; i++) {
if (tp->levels[i] != NULL) {
return FALSE;
}
}
return TRUE;
}
/*!
* \internal
* \brief Add a device to an operation's automatic unfencing list
*
* \param[in,out] op Operation to modify
* \param[in] device Device ID to add
*/
static void
add_required_device(remote_fencing_op_t *op, const char *device)
{
GList *match = g_list_find_custom(op->automatic_list, device,
sort_strings);
if (!match) {
op->automatic_list = g_list_prepend(op->automatic_list, strdup(device));
}
}
/*!
* \internal
* \brief Remove a device from the automatic unfencing list
*
* \param[in,out] op Operation to modify
* \param[in] device Device ID to remove
*/
static void
remove_required_device(remote_fencing_op_t *op, const char *device)
{
GList *match = g_list_find_custom(op->automatic_list, device,
sort_strings);
if (match) {
op->automatic_list = g_list_remove(op->automatic_list, match->data);
}
}
/* deep copy the device list */
static void
set_op_device_list(remote_fencing_op_t * op, GList *devices)
{
GList *lpc = NULL;
if (op->devices_list) {
g_list_free_full(op->devices_list, free);
op->devices_list = NULL;
}
for (lpc = devices; lpc != NULL; lpc = lpc->next) {
op->devices_list = g_list_append(op->devices_list, strdup(lpc->data));
}
op->devices = op->devices_list;
}
/*!
* \internal
* \brief Check whether a node matches a topology target
*
* \param[in] tp Topology table entry to check
* \param[in] node Name of node to check
*
* \return TRUE if node matches topology target
*/
static gboolean
topology_matches(const stonith_topology_t *tp, const char *node)
{
regex_t r_patt;
CRM_CHECK(node && tp && tp->target, return FALSE);
switch (tp->kind) {
case fenced_target_by_attribute:
/* This level targets by attribute, so tp->target is a NAME=VALUE pair
* of a permanent attribute applied to targeted nodes. The test below
* relies on the locally cached copy of the CIB, so if fencing needs to
* be done before the initial CIB is received or after a malformed CIB
* is received, then the topology will be unable to be used.
*/
if (node_has_attr(node, tp->target_attribute, tp->target_value)) {
crm_notice("Matched %s with %s by attribute", node, tp->target);
return TRUE;
}
break;
case fenced_target_by_pattern:
/* This level targets node names matching a pattern, so tp->target
* (and tp->target_pattern) is a regular expression.
*/
if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) {
crm_info("Bad regex '%s' for fencing level", tp->target);
} else {
int status = regexec(&r_patt, node, 0, NULL, 0);
regfree(&r_patt);
if (status == 0) {
crm_notice("Matched %s with %s by name", node, tp->target);
return TRUE;
}
}
break;
case fenced_target_by_name:
crm_trace("Testing %s against %s", node, tp->target);
return pcmk__str_eq(tp->target, node, pcmk__str_casei);
default:
break;
}
crm_trace("No match for %s with %s", node, tp->target);
return FALSE;
}
stonith_topology_t *
find_topology_for_host(const char *host)
{
GHashTableIter tIter;
stonith_topology_t *tp = g_hash_table_lookup(topology, host);
if(tp != NULL) {
crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
return tp;
}
g_hash_table_iter_init(&tIter, topology);
while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) {
if (topology_matches(tp, host)) {
crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
return tp;
}
}
crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology));
return NULL;
}
/*!
* \internal
* \brief Set fencing operation's device list to target's next topology level
*
* \param[in,out] op Remote fencing operation to modify
* \param[in] empty_ok If true, an operation without a target (i.e.
* queries) or a target without a topology will get a
* pcmk_rc_ok return value instead of ENODEV
*
* \return Standard Pacemaker return value
*/
static int
advance_topology_level(remote_fencing_op_t *op, bool empty_ok)
{
stonith_topology_t *tp = NULL;
if (op->target) {
tp = find_topology_for_host(op->target);
}
if (topology_is_empty(tp)) {
return empty_ok? pcmk_rc_ok : ENODEV;
}
CRM_ASSERT(tp->levels != NULL);
stonith__set_call_options(op->call_options, op->id, st_opt_topology);
/* This is a new level, so undo any remapping left over from previous */
undo_op_remap(op);
do {
op->level++;
} while (op->level < ST_LEVEL_MAX && tp->levels[op->level] == NULL);
if (op->level < ST_LEVEL_MAX) {
crm_trace("Attempting fencing level %d targeting %s (%d devices) "
"for client %s@%s (id=%.8s)",
op->level, op->target, g_list_length(tp->levels[op->level]),
op->client_name, op->originator, op->id);
set_op_device_list(op, tp->levels[op->level]);
// The requested delay has been applied for the first fencing level
if (op->level > 1 && op->delay > 0) {
op->delay = 0;
}
if ((g_list_next(op->devices_list) != NULL)
&& pcmk__str_eq(op->action, "reboot", pcmk__str_none)) {
/* A reboot has been requested for a topology level with multiple
* devices. Instead of rebooting the devices sequentially, we will
* turn them all off, then turn them all on again. (Think about
* switched power outlets for redundant power supplies.)
*/
op_phase_off(op);
}
return pcmk_rc_ok;
}
crm_info("All %sfencing options targeting %s for client %s@%s failed "
CRM_XS " id=%.8s",
(stonith_watchdog_timeout_ms > 0)?"non-watchdog ":"",
op->target, op->client_name, op->originator, op->id);
return ENODEV;
}
/*!
* \internal
* \brief If fencing operation is a duplicate, merge it into the other one
*
* \param[in,out] op Fencing operation to check
*/
static void
merge_duplicates(remote_fencing_op_t *op)
{
GHashTableIter iter;
remote_fencing_op_t *other = NULL;
time_t now = time(NULL);
g_hash_table_iter_init(&iter, stonith_remote_op_list);
while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
const char *other_action = op_requested_action(other);
if (!strcmp(op->id, other->id)) {
continue; // Don't compare against self
}
if (other->state > st_exec) {
crm_trace("%.8s not duplicate of %.8s: not in progress",
op->id, other->id);
continue;
}
if (!pcmk__str_eq(op->target, other->target, pcmk__str_casei)) {
crm_trace("%.8s not duplicate of %.8s: node %s vs. %s",
op->id, other->id, op->target, other->target);
continue;
}
if (!pcmk__str_eq(op->action, other_action, pcmk__str_none)) {
crm_trace("%.8s not duplicate of %.8s: action %s vs. %s",
op->id, other->id, op->action, other_action);
continue;
}
if (pcmk__str_eq(op->client_name, other->client_name, pcmk__str_casei)) {
crm_trace("%.8s not duplicate of %.8s: same client %s",
op->id, other->id, op->client_name);
continue;
}
if (pcmk__str_eq(other->target, other->originator, pcmk__str_casei)) {
crm_trace("%.8s not duplicate of %.8s: suicide for %s",
op->id, other->id, other->target);
continue;
}
if (!fencing_peer_active(crm_get_peer(0, other->originator))) {
crm_notice("Failing action '%s' targeting %s originating from "
"client %s@%s: Originator is dead " CRM_XS " id=%.8s",
other->action, other->target, other->client_name,
other->originator, other->id);
crm_trace("%.8s not duplicate of %.8s: originator dead",
op->id, other->id);
other->state = st_failed;
continue;
}
if ((other->total_timeout > 0)
&& (now > (other->total_timeout + other->created))) {
crm_trace("%.8s not duplicate of %.8s: old (%ld vs. %ld + %d)",
op->id, other->id, now, other->created,
other->total_timeout);
continue;
}
/* There is another in-flight request to fence the same host
* Piggyback on that instead. If it fails, so do we.
*/
other->duplicates = g_list_append(other->duplicates, op);
if (other->total_timeout == 0) {
other->total_timeout = op->total_timeout =
TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
crm_trace("Best guess as to timeout used for %.8s: %d",
other->id, other->total_timeout);
}
crm_notice("Merging fencing action '%s' targeting %s originating from "
"client %s with identical request from %s@%s "
CRM_XS " original=%.8s duplicate=%.8s total_timeout=%ds",
op->action, op->target, op->client_name,
other->client_name, other->originator,
op->id, other->id, other->total_timeout);
report_timeout_period(op, other->total_timeout);
op->state = st_duplicate;
}
}
static uint32_t fencing_active_peers(void)
{
uint32_t count = 0;
crm_node_t *entry;
GHashTableIter gIter;
g_hash_table_iter_init(&gIter, crm_peer_cache);
while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
if(fencing_peer_active(entry)) {
count++;
}
}
return count;
}
/*!
* \internal
* \brief Process a manual confirmation of a pending fence action
*
* \param[in] client IPC client that sent confirmation
* \param[in,out] msg Request XML with manual confirmation
*
* \return Standard Pacemaker return code
*/
int
fenced_handle_manual_confirmation(const pcmk__client_t *client, xmlNode *msg)
{
remote_fencing_op_t *op = NULL;
xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR);
CRM_CHECK(dev != NULL, return EPROTO);
crm_notice("Received manual confirmation that %s has been fenced",
pcmk__s(crm_element_value(dev, F_STONITH_TARGET),
"unknown target"));
op = initiate_remote_stonith_op(client, msg, TRUE);
if (op == NULL) {
return EPROTO;
}
op->state = st_done;
set_fencing_completed(op);
op->delegate = strdup("a human");
// For the fencer's purposes, the fencing operation is done
pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
finalize_op(op, msg, false);
/* For the requester's purposes, the operation is still pending. The
* actual result will be sent asynchronously via the operation's done_cb().
*/
return EINPROGRESS;
}
/*!
* \internal
* \brief Create a new remote stonith operation
*
* \param[in] client ID of local stonith client that initiated the operation
* \param[in] request The request from the client that started the operation
* \param[in] peer TRUE if this operation is owned by another stonith peer
* (an operation owned by one peer is stored on all peers,
* but only the owner executes it; all nodes get the results
* once the owner finishes execution)
*/
void *
create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer)
{
remote_fencing_op_t *op = NULL;
xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_NEVER);
int call_options = 0;
const char *operation = NULL;
init_stonith_remote_op_hash_table(&stonith_remote_op_list);
/* If this operation is owned by another node, check to make
* sure we haven't already created this operation. */
if (peer && dev) {
const char *op_id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
CRM_CHECK(op_id != NULL, return NULL);
op = g_hash_table_lookup(stonith_remote_op_list, op_id);
if (op) {
crm_debug("Reusing existing remote fencing op %.8s for %s",
op_id, ((client == NULL)? "unknown client" : client));
return op;
}
}
op = calloc(1, sizeof(remote_fencing_op_t));
CRM_ASSERT(op != NULL);
crm_element_value_int(request, F_STONITH_TIMEOUT, &(op->base_timeout));
// Value -1 means disable any static/random fencing delays
crm_element_value_int(request, F_STONITH_DELAY, &(op->delay));
if (peer && dev) {
op->id = crm_element_value_copy(dev, F_STONITH_REMOTE_OP_ID);
} else {
op->id = crm_generate_uuid();
}
g_hash_table_replace(stonith_remote_op_list, op->id, op);
op->state = st_query;
op->replies_expected = fencing_active_peers();
op->action = crm_element_value_copy(dev, F_STONITH_ACTION);
op->originator = crm_element_value_copy(dev, F_STONITH_ORIGIN);
op->delegate = crm_element_value_copy(dev, F_STONITH_DELEGATE); /* May not be set */
op->created = time(NULL);
if (op->originator == NULL) {
/* Local or relayed request */
op->originator = strdup(stonith_our_uname);
}
CRM_LOG_ASSERT(client != NULL);
if (client) {
op->client_id = strdup(client);
}
/* For a RELAY operation, set fenced on the client. */
operation = crm_element_value(request, F_STONITH_OPERATION);
if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
op->client_name = crm_strdup_printf("%s.%lu", crm_system_name,
(unsigned long) getpid());
} else {
op->client_name = crm_element_value_copy(request, F_STONITH_CLIENTNAME);
}
op->target = crm_element_value_copy(dev, F_STONITH_TARGET);
op->request = copy_xml(request); /* TODO: Figure out how to avoid this */
crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options);
op->call_options = call_options;
crm_element_value_int(request, F_STONITH_CALLID, &(op->client_callid));
crm_trace("%s new fencing op %s ('%s' targeting %s for client %s, "
"base timeout %d, %u %s expected)",
(peer && dev)? "Recorded" : "Generated", op->id, op->action,
op->target, op->client_name, op->base_timeout,
op->replies_expected,
pcmk__plural_alt(op->replies_expected, "reply", "replies"));
if (op->call_options & st_opt_cs_nodeid) {
int nodeid;
crm_node_t *node;
pcmk__scan_min_int(op->target, &nodeid, 0);
node = pcmk__search_known_node_cache(nodeid, NULL, CRM_GET_PEER_ANY);
/* Ensure the conversion only happens once */
stonith__clear_call_options(op->call_options, op->id, st_opt_cs_nodeid);
if (node && node->uname) {
free(op->target);
op->target = strdup(node->uname);
} else {
crm_warn("Could not expand nodeid '%s' into a host name", op->target);
}
}
/* check to see if this is a duplicate operation of another in-flight operation */
merge_duplicates(op);
if (op->state != st_duplicate) {
/* kick history readers */
fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL);
}
/* safe to trim as long as that doesn't touch pending ops */
stonith_fence_history_trim();
return op;
}
/*!
* \internal
* \brief Create a peer fencing operation from a request, and initiate it
*
* \param[in] client IPC client that made request (NULL to get from request)
* \param[in] request Request XML
* \param[in] manual_ack Whether this is a manual action confirmation
*
* \return Newly created operation on success, otherwise NULL
*/
remote_fencing_op_t *
initiate_remote_stonith_op(const pcmk__client_t *client, xmlNode *request,
gboolean manual_ack)
{
int query_timeout = 0;
xmlNode *query = NULL;
const char *client_id = NULL;
remote_fencing_op_t *op = NULL;
const char *relay_op_id = NULL;
const char *operation = NULL;
if (client) {
client_id = client->id;
} else {
client_id = crm_element_value(request, F_STONITH_CLIENTID);
}
CRM_LOG_ASSERT(client_id != NULL);
op = create_remote_stonith_op(client_id, request, FALSE);
op->owner = TRUE;
if (manual_ack) {
return op;
}
CRM_CHECK(op->action, return NULL);
if (advance_topology_level(op, true) != pcmk_rc_ok) {
op->state = st_failed;
}
switch (op->state) {
case st_failed:
// advance_topology_level() exhausted levels
pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_ERROR,
"All topology levels failed");
crm_warn("Could not request peer fencing (%s) targeting %s "
CRM_XS " id=%.8s", op->action, op->target, op->id);
finalize_op(op, NULL, false);
return op;
case st_duplicate:
crm_info("Requesting peer fencing (%s) targeting %s (duplicate) "
CRM_XS " id=%.8s", op->action, op->target, op->id);
return op;
default:
crm_notice("Requesting peer fencing (%s) targeting %s "
CRM_XS " id=%.8s state=%s base_timeout=%d",
op->action, op->target, op->id,
stonith_op_state_str(op->state), op->base_timeout);
}
query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
NULL, op->call_options);
crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id);
crm_xml_add(query, F_STONITH_TARGET, op->target);
crm_xml_add(query, F_STONITH_ACTION, op_requested_action(op));
crm_xml_add(query, F_STONITH_ORIGIN, op->originator);
crm_xml_add(query, F_STONITH_CLIENTID, op->client_id);
crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name);
crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout);
/* In case of RELAY operation, RELAY information is added to the query to delete the original operation of RELAY. */
operation = crm_element_value(request, F_STONITH_OPERATION);
if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
relay_op_id = crm_element_value(request, F_STONITH_REMOTE_OP_ID);
if (relay_op_id) {
crm_xml_add(query, F_STONITH_REMOTE_OP_ID_RELAY, relay_op_id);
}
}
send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE);
free_xml(query);
query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op);
return op;
}
enum find_best_peer_options {
/*! Skip checking the target peer for capable fencing devices */
FIND_PEER_SKIP_TARGET = 0x0001,
/*! Only check the target peer for capable fencing devices */
FIND_PEER_TARGET_ONLY = 0x0002,
/*! Skip peers and devices that are not verified */
FIND_PEER_VERIFIED_ONLY = 0x0004,
};
static peer_device_info_t *
find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
{
GList *iter = NULL;
gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE;
if (!device && pcmk_is_set(op->call_options, st_opt_topology)) {
return NULL;
}
for (iter = op->query_results; iter != NULL; iter = iter->next) {
peer_device_info_t *peer = iter->data;
crm_trace("Testing result from %s targeting %s with %d device%s: %d %x",
peer->host, op->target, peer->ndevices,
pcmk__plural_s(peer->ndevices), peer->tried, options);
if ((options & FIND_PEER_SKIP_TARGET) && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
continue;
}
if ((options & FIND_PEER_TARGET_ONLY) && !pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
continue;
}
if (pcmk_is_set(op->call_options, st_opt_topology)) {
if (grab_peer_device(op, peer, device, verified_devices_only)) {
return peer;
}
} else if (!peer->tried
&& count_peer_devices(op, peer, verified_devices_only,
fenced_support_flag(op->action))) {
/* No topology: Use the current best peer */
crm_trace("Simple fencing");
return peer;
}
}
return NULL;
}
static peer_device_info_t *
stonith_choose_peer(remote_fencing_op_t * op)
{
const char *device = NULL;
peer_device_info_t *peer = NULL;
uint32_t active = fencing_active_peers();
do {
if (op->devices) {
device = op->devices->data;
crm_trace("Checking for someone to fence (%s) %s using %s",
op->action, op->target, device);
} else {
crm_trace("Checking for someone to fence (%s) %s",
op->action, op->target);
}
/* Best choice is a peer other than the target with verified access */
peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
if (peer) {
crm_trace("Found verified peer %s for %s", peer->host, device?device:"<any>");
return peer;
}
if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) {
crm_trace("Waiting before looking for unverified devices to fence %s", op->target);
return NULL;
}
/* If no other peer has verified access, next best is unverified access */
peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
if (peer) {
crm_trace("Found best unverified peer %s", peer->host);
return peer;
}
/* If no other peer can do it, last option is self-fencing
* (which is never allowed for the "on" phase of a remapped reboot)
*/
if (op->phase != st_phase_on) {
peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
if (peer) {
crm_trace("%s will fence itself", peer->host);
return peer;
}
}
/* Try the next fencing level if there is one (unless we're in the "on"
* phase of a remapped "reboot", because we ignore errors in that case)
*/
} while ((op->phase != st_phase_on)
&& pcmk_is_set(op->call_options, st_opt_topology)
&& (advance_topology_level(op, false) == pcmk_rc_ok));
if ((stonith_watchdog_timeout_ms > 0)
&& pcmk__is_fencing_action(op->action)
&& pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none)
&& node_does_watchdog_fencing(op->target)) {
crm_info("Couldn't contact watchdog-fencing target-node (%s)",
op->target);
/* check_watchdog_fencing_and_wait will log additional info */
} else {
crm_notice("Couldn't find anyone to fence (%s) %s using %s",
op->action, op->target, (device? device : "any device"));
}
return NULL;
}
static int
get_device_timeout(const remote_fencing_op_t *op,
const peer_device_info_t *peer, const char *device)
{
device_properties_t *props;
if (!peer || !device) {
return op->base_timeout;
}
props = g_hash_table_lookup(peer->devices, device);
if (!props) {
return op->base_timeout;
}
return (props->custom_action_timeout[op->phase]?
props->custom_action_timeout[op->phase] : op->base_timeout)
+ props->delay_max[op->phase];
}
struct timeout_data {
const remote_fencing_op_t *op;
const peer_device_info_t *peer;
int total_timeout;
};
/*!
* \internal
* \brief Add timeout to a total if device has not been executed yet
*
* \param[in] key GHashTable key (device ID)
* \param[in] value GHashTable value (device properties)
* \param[in,out] user_data Timeout data
*/
static void
add_device_timeout(gpointer key, gpointer value, gpointer user_data)
{
const char *device_id = key;
device_properties_t *props = value;
struct timeout_data *timeout = user_data;
if (!props->executed[timeout->op->phase]
&& !props->disallowed[timeout->op->phase]) {
timeout->total_timeout += get_device_timeout(timeout->op,
timeout->peer, device_id);
}
}
static int
get_peer_timeout(const remote_fencing_op_t *op, const peer_device_info_t *peer)
{
struct timeout_data timeout;
timeout.op = op;
timeout.peer = peer;
timeout.total_timeout = 0;
g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
}
static int
get_op_total_timeout(const remote_fencing_op_t *op,
const peer_device_info_t *chosen_peer)
{
int total_timeout = 0;
stonith_topology_t *tp = find_topology_for_host(op->target);
if (pcmk_is_set(op->call_options, st_opt_topology) && tp) {
int i;
GList *device_list = NULL;
GList *iter = NULL;
GList *auto_list = NULL;
if (pcmk__str_eq(op->action, "on", pcmk__str_none)
&& (op->automatic_list != NULL)) {
auto_list = g_list_copy(op->automatic_list);
}
/* Yep, this looks scary, nested loops all over the place.
* Here is what is going on.
* Loop1: Iterate through fencing levels.
* Loop2: If a fencing level has devices, loop through each device
* Loop3: For each device in a fencing level, see what peer owns it
* and what that peer has reported the timeout is for the device.
*/
for (i = 0; i < ST_LEVEL_MAX; i++) {
if (!tp->levels[i]) {
continue;
}
for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
/* in case of watchdog-device we add the timeout to the budget
regardless of if we got a reply or not
*/
if ((stonith_watchdog_timeout_ms > 0)
&& pcmk__is_fencing_action(op->action)
&& pcmk__str_eq(device_list->data, STONITH_WATCHDOG_ID,
pcmk__str_none)
&& node_does_watchdog_fencing(op->target)) {
total_timeout += stonith_watchdog_timeout_ms / 1000;
continue;
}
for (iter = op->query_results; iter != NULL; iter = iter->next) {
const peer_device_info_t *peer = iter->data;
if (auto_list) {
GList *match = g_list_find_custom(auto_list, device_list->data,
sort_strings);
if (match) {
auto_list = g_list_remove(auto_list, match->data);
}
}
if (find_peer_device(op, peer, device_list->data,
fenced_support_flag(op->action))) {
total_timeout += get_device_timeout(op, peer,
device_list->data);
break;
}
} /* End Loop3: match device with peer that owns device, find device's timeout period */
} /* End Loop2: iterate through devices at a specific level */
} /*End Loop1: iterate through fencing levels */
//Add only exists automatic_list device timeout
if (auto_list) {
for (iter = auto_list; iter != NULL; iter = iter->next) {
GList *iter2 = NULL;
for (iter2 = op->query_results; iter2 != NULL; iter = iter2->next) {
peer_device_info_t *peer = iter2->data;
if (find_peer_device(op, peer, iter->data, st_device_supports_on)) {
total_timeout += get_device_timeout(op, peer, iter->data);
break;
}
}
}
}
g_list_free(auto_list);
} else if (chosen_peer) {
total_timeout = get_peer_timeout(op, chosen_peer);
} else {
total_timeout = op->base_timeout;
}
/* Take any requested fencing delay into account to prevent it from eating
* up the total timeout.
*/
return ((total_timeout ? total_timeout : op->base_timeout)
+ (op->delay > 0 ? op->delay : 0));
}
static void
report_timeout_period(remote_fencing_op_t * op, int op_timeout)
{
GList *iter = NULL;
xmlNode *update = NULL;
const char *client_node = NULL;
const char *client_id = NULL;
const char *call_id = NULL;
if (op->call_options & st_opt_sync_call) {
/* There is no reason to report the timeout for a synchronous call. It
* is impossible to use the reported timeout to do anything when the client
* is blocking for the response. This update is only important for
* async calls that require a callback to report the results in. */
return;
} else if (!op->request) {
return;
}
crm_trace("Reporting timeout for %s (id=%.8s)", op->client_name, op->id);
client_node = crm_element_value(op->request, F_STONITH_CLIENTNODE);
call_id = crm_element_value(op->request, F_STONITH_CALLID);
client_id = crm_element_value(op->request, F_STONITH_CLIENTID);
if (!client_node || !call_id || !client_id) {
return;
}
if (pcmk__str_eq(client_node, stonith_our_uname, pcmk__str_casei)) {
// Client is connected to this node, so send update directly to them
do_stonith_async_timeout_update(client_id, call_id, op_timeout);
return;
}
/* The client is connected to another node, relay this update to them */
update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
crm_xml_add(update, F_STONITH_REMOTE_OP_ID, op->id);
crm_xml_add(update, F_STONITH_CLIENTID, client_id);
crm_xml_add(update, F_STONITH_CALLID, call_id);
crm_xml_add_int(update, F_STONITH_TIMEOUT, op_timeout);
send_cluster_message(crm_get_peer(0, client_node), crm_msg_stonith_ng, update, FALSE);
free_xml(update);
for (iter = op->duplicates; iter != NULL; iter = iter->next) {
remote_fencing_op_t *dup = iter->data;
crm_trace("Reporting timeout for duplicate %.8s to client %s",
dup->id, dup->client_name);
report_timeout_period(iter->data, op_timeout);
}
}
/*!
* \internal
* \brief Advance an operation to the next device in its topology
*
* \param[in,out] op Fencer operation to advance
* \param[in] device ID of device that just completed
* \param[in,out] msg If not NULL, XML reply of last delegated operation
*/
static void
advance_topology_device_in_level(remote_fencing_op_t *op, const char *device,
xmlNode *msg)
{
/* Advance to the next device at this topology level, if any */
if (op->devices) {
op->devices = op->devices->next;
}
/* Handle automatic unfencing if an "on" action was requested */
if ((op->phase == st_phase_requested)
&& pcmk__str_eq(op->action, "on", pcmk__str_none)) {
/* If the device we just executed was required, it's not anymore */
remove_required_device(op, device);
/* If there are no more devices at this topology level, run through any
* remaining devices with automatic unfencing
*/
if (op->devices == NULL) {
op->devices = op->automatic_list;
}
}
if ((op->devices == NULL) && (op->phase == st_phase_off)) {
/* We're done with this level and with required devices, but we had
* remapped "reboot" to "off", so start over with "on". If any devices
* need to be turned back on, op->devices will be non-NULL after this.
*/
op_phase_on(op);
}
// This function is only called if the previous device succeeded
pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
if (op->devices) {
/* Necessary devices remain, so execute the next one */
crm_trace("Next targeting %s on behalf of %s@%s",
op->target, op->client_name, op->originator);
// The requested delay has been applied for the first device
if (op->delay > 0) {
op->delay = 0;
}
request_peer_fencing(op, NULL);
} else {
/* We're done with all devices and phases, so finalize operation */
crm_trace("Marking complex fencing op targeting %s as complete",
op->target);
op->state = st_done;
finalize_op(op, msg, false);
}
}
static gboolean
check_watchdog_fencing_and_wait(remote_fencing_op_t * op)
{
if (node_does_watchdog_fencing(op->target)) {
crm_notice("Waiting %lds for %s to self-fence (%s) for "
"client %s " CRM_XS " id=%.8s",
(stonith_watchdog_timeout_ms / 1000),
op->target, op->action, op->client_name, op->id);
if (op->op_timer_one) {
g_source_remove(op->op_timer_one);
}
op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms,
remote_op_watchdog_done, op);
return TRUE;
} else {
crm_debug("Skipping fallback to watchdog-fencing as %s is "
"not in host-list", op->target);
}
return FALSE;
}
/*!
* \internal
* \brief Ask a peer to execute a fencing operation
*
* \param[in,out] op Fencing operation to be executed
* \param[in,out] peer If NULL or topology is in use, choose best peer to
* execute the fencing, otherwise use this peer
*/
static void
request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
{
const char *device = NULL;
int timeout;
CRM_CHECK(op != NULL, return);
crm_trace("Action %.8s targeting %s for %s is %s",
op->id, op->target, op->client_name,
stonith_op_state_str(op->state));
if ((op->phase == st_phase_on) && (op->devices != NULL)) {
/* We are in the "on" phase of a remapped topology reboot. If this
* device has pcmk_reboot_action="off", or doesn't support the "on"
* action, skip it.
*
* We can't check device properties at this point because we haven't
* chosen a peer for this stage yet. Instead, we check the local node's
* knowledge about the device. If different versions of the fence agent
* are installed on different nodes, there's a chance this could be
* mistaken, but the worst that could happen is we don't try turning the
* node back on when we should.
*/
device = op->devices->data;
if (pcmk__str_eq(fenced_device_reboot_action(device), "off",
pcmk__str_none)) {
crm_info("Not turning %s back on using %s because the device is "
"configured to stay off (pcmk_reboot_action='off')",
op->target, device);
advance_topology_device_in_level(op, device, NULL);
return;
}
if (!fenced_device_supports_on(device)) {
crm_info("Not turning %s back on using %s because the agent "
"doesn't support 'on'", op->target, device);
advance_topology_device_in_level(op, device, NULL);
return;
}
}
timeout = op->base_timeout;
if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) {
peer = stonith_choose_peer(op);
}
if (!op->op_timer_total) {
op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, peer);
op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
report_timeout_period(op, op->total_timeout);
crm_info("Total timeout set to %d for peer's fencing targeting %s for %s"
CRM_XS "id=%.8s",
op->total_timeout, op->target, op->client_name, op->id);
}
if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) {
/* Ignore the caller's peer preference if topology is in use, because
* that peer might not have access to the required device. With
* topology, stonith_choose_peer() removes the device from further
* consideration, so the timeout must be calculated beforehand.
*
* @TODO Basing the total timeout on the caller's preferred peer (above)
* is less than ideal.
*/
peer = stonith_choose_peer(op);
device = op->devices->data;
timeout = get_device_timeout(op, peer, device);
}
if (peer) {
- int timeout_one = 0;
+ /* Take any requested fencing delay into account to prevent it from eating
+ * up the timeout.
+ */
+ int timeout_one = (op->delay > 0 ?
+ TIMEOUT_MULTIPLY_FACTOR * op->delay : 0);
xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);
crm_xml_add(remote_op, F_STONITH_REMOTE_OP_ID, op->id);
crm_xml_add(remote_op, F_STONITH_TARGET, op->target);
crm_xml_add(remote_op, F_STONITH_ACTION, op->action);
crm_xml_add(remote_op, F_STONITH_ORIGIN, op->originator);
crm_xml_add(remote_op, F_STONITH_CLIENTID, op->client_id);
crm_xml_add(remote_op, F_STONITH_CLIENTNAME, op->client_name);
crm_xml_add_int(remote_op, F_STONITH_TIMEOUT, timeout);
crm_xml_add_int(remote_op, F_STONITH_CALLOPTS, op->call_options);
crm_xml_add_int(remote_op, F_STONITH_DELAY, op->delay);
if (device) {
- timeout_one = TIMEOUT_MULTIPLY_FACTOR *
- get_device_timeout(op, peer, device);
+ timeout_one += TIMEOUT_MULTIPLY_FACTOR *
+ get_device_timeout(op, peer, device);
crm_notice("Requesting that %s perform '%s' action targeting %s "
"using %s " CRM_XS " for client %s (%ds)",
peer->host, op->action, op->target, device,
op->client_name, timeout_one);
crm_xml_add(remote_op, F_STONITH_DEVICE, device);
} else {
- timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
+ timeout_one += TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
crm_notice("Requesting that %s perform '%s' action targeting %s "
CRM_XS " for client %s (%ds, %lds)",
peer->host, op->action, op->target, op->client_name,
timeout_one, stonith_watchdog_timeout_ms);
}
op->state = st_exec;
if (op->op_timer_one) {
g_source_remove(op->op_timer_one);
op->op_timer_one = 0;
}
if (!((stonith_watchdog_timeout_ms > 0)
&& (pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none)
|| (pcmk__str_eq(peer->host, op->target, pcmk__str_casei)
&& pcmk__is_fencing_action(op->action)))
&& check_watchdog_fencing_and_wait(op))) {
/* Some thoughts about self-fencing cases reaching this point:
- Actually check in check_watchdog_fencing_and_wait
shouldn't fail if STONITH_WATCHDOG_ID is
chosen as fencing-device and it being present implies
watchdog-fencing is enabled anyway
- If watchdog-fencing is disabled either in general or for
a specific target - detected in check_watchdog_fencing_and_wait -
for some other kind of self-fencing we can't expect
a success answer but timeout is fine if the node doesn't
come back in between
- Delicate might be the case where we have watchdog-fencing
enabled for a node but the watchdog-fencing-device isn't
explicitly chosen for suicide. Local pe-execution in sbd
may detect the node as unclean and lead to timely suicide.
Otherwise the selection of stonith-watchdog-timeout at
least is questionable.
*/
/* coming here we're not waiting for watchdog timeout -
thus engage timer with timout evaluated before */
op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);
}
send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, remote_op, FALSE);
peer->tried = TRUE;
free_xml(remote_op);
return;
} else if (op->phase == st_phase_on) {
/* A remapped "on" cannot be executed, but the node was already
* turned off successfully, so ignore the error and continue.
*/
crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s "
"after successful 'off'", device, op->target);
advance_topology_device_in_level(op, device, NULL);
return;
} else if (op->owner == FALSE) {
crm_err("Fencing (%s) targeting %s for client %s is not ours to control",
op->action, op->target, op->client_name);
} else if (op->query_timer == 0) {
/* We've exhausted all available peers */
crm_info("No remaining peers capable of fencing (%s) %s for client %s "
CRM_XS " state=%s", op->action, op->target, op->client_name,
stonith_op_state_str(op->state));
CRM_CHECK(op->state < st_done, return);
finalize_timed_out_op(op, "All nodes failed, or are unable, to "
"fence target");
} else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
/* if the operation never left the query state,
* but we have all the expected replies, then no devices
* are available to execute the fencing operation. */
if(stonith_watchdog_timeout_ms > 0 && pcmk__str_eq(device,
STONITH_WATCHDOG_ID, pcmk__str_null_matches)) {
if (check_watchdog_fencing_and_wait(op)) {
return;
}
}
if (op->state == st_query) {
crm_info("No peers (out of %d) have devices capable of fencing "
"(%s) %s for client %s " CRM_XS " state=%s",
op->replies, op->action, op->target, op->client_name,
stonith_op_state_str(op->state));
pcmk__reset_result(&op->result);
pcmk__set_result(&op->result, CRM_EX_ERROR,
PCMK_EXEC_NO_FENCE_DEVICE, NULL);
} else {
if (pcmk_is_set(op->call_options, st_opt_topology)) {
pcmk__reset_result(&op->result);
pcmk__set_result(&op->result, CRM_EX_ERROR,
PCMK_EXEC_NO_FENCE_DEVICE, NULL);
}
/* ... else use existing result from previous failed attempt
* (topology is not in use, and no devices remain to be attempted).
* Overwriting the result with PCMK_EXEC_NO_FENCE_DEVICE would
* prevent finalize_op() from setting the correct delegate if
* needed.
*/
crm_info("No peers (out of %d) are capable of fencing (%s) %s "
"for client %s " CRM_XS " state=%s",
op->replies, op->action, op->target, op->client_name,
stonith_op_state_str(op->state));
}
op->state = st_failed;
finalize_op(op, NULL, false);
} else {
crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s "
"for client %s " CRM_XS " id=%.8s",
op->action, op->target, (device? " using " : ""),
(device? device : ""), op->client_name, op->id);
}
}
/*!
* \internal
* \brief Comparison function for sorting query results
*
* \param[in] a GList item to compare
* \param[in] b GList item to compare
*
* \return Per the glib documentation, "a negative integer if the first value
* comes before the second, 0 if they are equal, or a positive integer
* if the first value comes after the second."
*/
static gint
sort_peers(gconstpointer a, gconstpointer b)
{
const peer_device_info_t *peer_a = a;
const peer_device_info_t *peer_b = b;
return (peer_b->ndevices - peer_a->ndevices);
}
/*!
* \internal
* \brief Determine if all the devices in the topology are found or not
*
* \param[in] op Fencing operation with topology to check
*/
static gboolean
all_topology_devices_found(const remote_fencing_op_t *op)
{
GList *device = NULL;
GList *iter = NULL;
device_properties_t *match = NULL;
stonith_topology_t *tp = NULL;
gboolean skip_target = FALSE;
int i;
tp = find_topology_for_host(op->target);
if (!tp) {
return FALSE;
}
if (pcmk__is_fencing_action(op->action)) {
/* Don't count the devices on the target node if we are killing
* the target node. */
skip_target = TRUE;
}
for (i = 0; i < ST_LEVEL_MAX; i++) {
for (device = tp->levels[i]; device; device = device->next) {
match = NULL;
for (iter = op->query_results; iter && !match; iter = iter->next) {
peer_device_info_t *peer = iter->data;
if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
continue;
}
match = find_peer_device(op, peer, device->data, st_device_supports_none);
}
if (!match) {
return FALSE;
}
}
}
return TRUE;
}
/*!
* \internal
* \brief Parse action-specific device properties from XML
*
* \param[in] xml XML element containing the properties
* \param[in] peer Name of peer that sent XML (for logs)
* \param[in] device Device ID (for logs)
* \param[in] action Action the properties relate to (for logs)
* \param[in,out] op Fencing operation that properties are being parsed for
* \param[in] phase Phase the properties relate to
* \param[in,out] props Device properties to update
*/
static void
parse_action_specific(const xmlNode *xml, const char *peer, const char *device,
const char *action, remote_fencing_op_t *op,
enum st_remap_phase phase, device_properties_t *props)
{
props->custom_action_timeout[phase] = 0;
crm_element_value_int(xml, F_STONITH_ACTION_TIMEOUT,
&props->custom_action_timeout[phase]);
if (props->custom_action_timeout[phase]) {
crm_trace("Peer %s with device %s returned %s action timeout %d",
peer, device, action, props->custom_action_timeout[phase]);
}
props->delay_max[phase] = 0;
crm_element_value_int(xml, F_STONITH_DELAY_MAX, &props->delay_max[phase]);
if (props->delay_max[phase]) {
crm_trace("Peer %s with device %s returned maximum of random delay %d for %s",
peer, device, props->delay_max[phase], action);
}
props->delay_base[phase] = 0;
crm_element_value_int(xml, F_STONITH_DELAY_BASE, &props->delay_base[phase]);
if (props->delay_base[phase]) {
crm_trace("Peer %s with device %s returned base delay %d for %s",
peer, device, props->delay_base[phase], action);
}
/* Handle devices with automatic unfencing */
if (pcmk__str_eq(action, "on", pcmk__str_none)) {
int required = 0;
crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required);
if (required) {
crm_trace("Peer %s requires device %s to execute for action %s",
peer, device, action);
add_required_device(op, device);
}
}
/* If a reboot is remapped to off+on, it's possible that a node is allowed
* to perform one action but not another.
*/
if (pcmk__xe_attr_is_true(xml, F_STONITH_ACTION_DISALLOWED)) {
props->disallowed[phase] = TRUE;
crm_trace("Peer %s is disallowed from executing %s for device %s",
peer, action, device);
}
}
/*!
* \internal
* \brief Parse one device's properties from peer's XML query reply
*
* \param[in] xml XML node containing device properties
* \param[in,out] op Operation that query and reply relate to
* \param[in,out] peer Peer's device information
* \param[in] device ID of device being parsed
*/
static void
add_device_properties(const xmlNode *xml, remote_fencing_op_t *op,
peer_device_info_t *peer, const char *device)
{
xmlNode *child;
int verified = 0;
device_properties_t *props = calloc(1, sizeof(device_properties_t));
int flags = st_device_supports_on; /* Old nodes that don't set the flag assume they support the on action */
/* Add a new entry to this peer's devices list */
CRM_ASSERT(props != NULL);
g_hash_table_insert(peer->devices, strdup(device), props);
/* Peers with verified (monitored) access will be preferred */
crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified);
if (verified) {
crm_trace("Peer %s has confirmed a verified device %s",
peer->host, device);
props->verified = TRUE;
}
crm_element_value_int(xml, F_STONITH_DEVICE_SUPPORT_FLAGS, &flags);
props->device_support_flags = flags;
/* Parse action-specific device properties */
parse_action_specific(xml, peer->host, device, op_requested_action(op),
op, st_phase_requested, props);
for (child = pcmk__xml_first_child(xml); child != NULL;
child = pcmk__xml_next(child)) {
/* Replies for "reboot" operations will include the action-specific
* values for "off" and "on" in child elements, just in case the reboot
* winds up getting remapped.
*/
if (pcmk__str_eq(ID(child), "off", pcmk__str_none)) {
parse_action_specific(child, peer->host, device, "off",
op, st_phase_off, props);
} else if (pcmk__str_eq(ID(child), "on", pcmk__str_none)) {
parse_action_specific(child, peer->host, device, "on",
op, st_phase_on, props);
}
}
}
/*!
* \internal
* \brief Parse a peer's XML query reply and add it to operation's results
*
* \param[in,out] op Operation that query and reply relate to
* \param[in] host Name of peer that sent this reply
* \param[in] ndevices Number of devices expected in reply
* \param[in] xml XML node containing device list
*
* \return Newly allocated result structure with parsed reply
*/
static peer_device_info_t *
add_result(remote_fencing_op_t *op, const char *host, int ndevices,
const xmlNode *xml)
{
peer_device_info_t *peer = calloc(1, sizeof(peer_device_info_t));
xmlNode *child;
// cppcheck seems not to understand the abort logic in CRM_CHECK
// cppcheck-suppress memleak
CRM_CHECK(peer != NULL, return NULL);
peer->host = strdup(host);
peer->devices = pcmk__strkey_table(free, free);
/* Each child element describes one capable device available to the peer */
for (child = pcmk__xml_first_child(xml); child != NULL;
child = pcmk__xml_next(child)) {
const char *device = ID(child);
if (device) {
add_device_properties(child, op, peer, device);
}
}
peer->ndevices = g_hash_table_size(peer->devices);
CRM_CHECK(ndevices == peer->ndevices,
crm_err("Query claimed to have %d device%s but %d found",
ndevices, pcmk__plural_s(ndevices), peer->ndevices));
op->query_results = g_list_insert_sorted(op->query_results, peer, sort_peers);
return peer;
}
/*!
* \internal
* \brief Handle a peer's reply to our fencing query
*
* Parse a query result from XML and store it in the remote operation
* table, and when enough replies have been received, issue a fencing request.
*
* \param[in] msg XML reply received
*
* \return pcmk_ok on success, -errno on error
*
* \note See initiate_remote_stonith_op() for how the XML query was initially
* formed, and stonith_query() for how the peer formed its XML reply.
*/
int
process_remote_stonith_query(xmlNode *msg)
{
int ndevices = 0;
gboolean host_is_target = FALSE;
gboolean have_all_replies = FALSE;
const char *id = NULL;
const char *host = NULL;
remote_fencing_op_t *op = NULL;
peer_device_info_t *peer = NULL;
uint32_t replies_expected;
xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
CRM_CHECK(dev != NULL, return -EPROTO);
id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
CRM_CHECK(id != NULL, return -EPROTO);
dev = get_xpath_object("//@" F_STONITH_AVAILABLE_DEVICES, msg, LOG_ERR);
CRM_CHECK(dev != NULL, return -EPROTO);
crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &ndevices);
op = g_hash_table_lookup(stonith_remote_op_list, id);
if (op == NULL) {
crm_debug("Received query reply for unknown or expired operation %s",
id);
return -EOPNOTSUPP;
}
replies_expected = fencing_active_peers();
if (op->replies_expected < replies_expected) {
replies_expected = op->replies_expected;
}
if ((++op->replies >= replies_expected) && (op->state == st_query)) {
have_all_replies = TRUE;
}
host = crm_element_value(msg, F_ORIG);
host_is_target = pcmk__str_eq(host, op->target, pcmk__str_casei);
crm_info("Query result %d of %d from %s for %s/%s (%d device%s) %s",
op->replies, replies_expected, host,
op->target, op->action, ndevices, pcmk__plural_s(ndevices), id);
if (ndevices > 0) {
peer = add_result(op, host, ndevices, dev);
}
pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
if (pcmk_is_set(op->call_options, st_opt_topology)) {
/* If we start the fencing before all the topology results are in,
* it is possible fencing levels will be skipped because of the missing
* query results. */
if (op->state == st_query && all_topology_devices_found(op)) {
/* All the query results are in for the topology, start the fencing ops. */
crm_trace("All topology devices found");
request_peer_fencing(op, peer);
} else if (have_all_replies) {
crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ",
replies_expected, op->replies);
request_peer_fencing(op, NULL);
}
} else if (op->state == st_query) {
int nverified = count_peer_devices(op, peer, TRUE,
fenced_support_flag(op->action));
/* We have a result for a non-topology fencing op that looks promising,
* go ahead and start fencing before query timeout */
if ((peer != NULL) && !host_is_target && nverified) {
/* we have a verified device living on a peer that is not the target */
crm_trace("Found %d verified device%s",
nverified, pcmk__plural_s(nverified));
request_peer_fencing(op, peer);
} else if (have_all_replies) {
crm_info("All query replies have arrived, continuing (%d expected/%d received) ",
replies_expected, op->replies);
request_peer_fencing(op, NULL);
} else {
crm_trace("Waiting for more peer results before launching fencing operation");
}
} else if ((peer != NULL) && (op->state == st_done)) {
crm_info("Discarding query result from %s (%d device%s): "
"Operation is %s", peer->host,
peer->ndevices, pcmk__plural_s(peer->ndevices),
stonith_op_state_str(op->state));
}
return pcmk_ok;
}
/*!
* \internal
* \brief Handle a peer's reply to a fencing request
*
* Parse a fencing reply from XML, and either finalize the operation
* or attempt another device as appropriate.
*
* \param[in] msg XML reply received
*/
void
fenced_process_fencing_reply(xmlNode *msg)
{
const char *id = NULL;
const char *device = NULL;
remote_fencing_op_t *op = NULL;
xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
CRM_CHECK(dev != NULL, return);
id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
CRM_CHECK(id != NULL, return);
dev = stonith__find_xe_with_result(msg);
CRM_CHECK(dev != NULL, return);
stonith__xe_get_result(dev, &result);
device = crm_element_value(dev, F_STONITH_DEVICE);
if (stonith_remote_op_list) {
op = g_hash_table_lookup(stonith_remote_op_list, id);
}
if ((op == NULL) && pcmk__result_ok(&result)) {
/* Record successful fencing operations */
const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID);
op = create_remote_stonith_op(client_id, dev, TRUE);
}
if (op == NULL) {
/* Could be for an event that began before we started */
/* TODO: Record the op for later querying */
crm_info("Received peer result of unknown or expired operation %s", id);
pcmk__reset_result(&result);
return;
}
pcmk__reset_result(&op->result);
op->result = result; // The operation takes ownership of the result
if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) {
crm_err("Received outdated reply for device %s (instead of %s) to "
"fence (%s) %s. Operation already timed out at peer level.",
device, (const char *) op->devices->data, op->action, op->target);
return;
}
if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) {
if (pcmk__result_ok(&op->result)) {
op->state = st_done;
} else {
op->state = st_failed;
}
finalize_op(op, msg, false);
return;
} else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
/* If this isn't a remote level broadcast, and we are not the
* originator of the operation, we should not be receiving this msg. */
crm_err("Received non-broadcast fencing result for operation %.8s "
"we do not own (device %s targeting %s)",
op->id, device, op->target);
return;
}
if (pcmk_is_set(op->call_options, st_opt_topology)) {
const char *device = NULL;
const char *reason = op->result.exit_reason;
/* We own the op, and it is complete. broadcast the result to all nodes
* and notify our local clients. */
if (op->state == st_done) {
finalize_op(op, msg, false);
return;
}
device = crm_element_value(msg, F_STONITH_DEVICE);
if ((op->phase == 2) && !pcmk__result_ok(&op->result)) {
/* A remapped "on" failed, but the node was already turned off
* successfully, so ignore the error and continue.
*/
crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s "
"after successful 'off'",
device, pcmk_exec_status_str(op->result.execution_status),
(reason == NULL)? "" : ": ",
(reason == NULL)? "" : reason,
op->target);
pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
} else {
crm_notice("Action '%s' targeting %s%s%s on behalf of %s@%s: "
"%s%s%s%s",
op->action, op->target,
((device == NULL)? "" : " using "),
((device == NULL)? "" : device),
op->client_name,
op->originator,
pcmk_exec_status_str(op->result.execution_status),
(reason == NULL)? "" : " (",
(reason == NULL)? "" : reason,
(reason == NULL)? "" : ")");
}
if (pcmk__result_ok(&op->result)) {
/* An operation completed successfully. Try another device if
* necessary, otherwise mark the operation as done. */
advance_topology_device_in_level(op, device, msg);
return;
} else {
/* This device failed, time to try another topology level. If no other
* levels are available, mark this operation as failed and report results. */
if (advance_topology_level(op, false) != pcmk_rc_ok) {
op->state = st_failed;
finalize_op(op, msg, false);
return;
}
}
} else if (pcmk__result_ok(&op->result) && (op->devices == NULL)) {
op->state = st_done;
finalize_op(op, msg, false);
return;
} else if ((op->result.execution_status == PCMK_EXEC_TIMEOUT)
&& (op->devices == NULL)) {
/* If the operation timed out don't bother retrying other peers. */
op->state = st_failed;
finalize_op(op, msg, false);
return;
} else {
/* fall-through and attempt other fencing action using another peer */
}
/* Retry on failure */
crm_trace("Next for %s on behalf of %s@%s (result was: %s)",
op->target, op->originator, op->client_name,
pcmk_exec_status_str(op->result.execution_status));
request_peer_fencing(op, NULL);
}
gboolean
stonith_check_fence_tolerance(int tolerance, const char *target, const char *action)
{
GHashTableIter iter;
time_t now = time(NULL);
remote_fencing_op_t *rop = NULL;
if (tolerance <= 0 || !stonith_remote_op_list || target == NULL ||
action == NULL) {
return FALSE;
}
g_hash_table_iter_init(&iter, stonith_remote_op_list);
while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) {
if (strcmp(rop->target, target) != 0) {
continue;
} else if (rop->state != st_done) {
continue;
/* We don't have to worry about remapped reboots here
* because if state is done, any remapping has been undone
*/
} else if (strcmp(rop->action, action) != 0) {
continue;
} else if ((rop->completed + tolerance) < now) {
continue;
}
crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s",
target, action, tolerance, rop->delegate, rop->originator);
return TRUE;
}
return FALSE;
}
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Sat, Nov 23, 3:10 PM (15 h, 24 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1018829
Default Alt Text
(155 KB)
Attached To
Mode
rP Pacemaker
Attached
Detach File
Event Timeline
Log In to Comment