diff --git a/cts/CTSaudits.py b/cts/CTSaudits.py
index aa18d64813..f4c3b15011 100755
--- a/cts/CTSaudits.py
+++ b/cts/CTSaudits.py
@@ -1,866 +1,865 @@
""" Auditing classes for Pacemaker's Cluster Test Suite (CTS)
"""
# Pacemaker targets compatibility with Python 2.7 and 3.2+
from __future__ import print_function, unicode_literals, absolute_import, division
__copyright__ = "Copyright 2000-2018 Alan Robertson <alanr@unix.sh>"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
import time, re, uuid
from cts.watcher import LogWatcher
from cts.remote import input_wrapper
from cts.CTSvars import *
class ClusterAudit(object):
def __init__(self, cm):
self.CM = cm
def __call__(self):
raise ValueError("Abstract Class member (__call__)")
def is_applicable(self):
'''Return TRUE if we are applicable in the current test configuration'''
raise ValueError("Abstract Class member (is_applicable)")
def log(self, args):
self.CM.log("audit: %s" % args)
def debug(self, args):
self.CM.debug("audit: %s" % args)
def name(self):
raise ValueError("Abstract Class member (name)")
AllAuditClasses = [ ]
class LogAudit(ClusterAudit):
def name(self):
return "LogAudit"
def __init__(self, cm):
self.CM = cm
self.kinds = [ "combined syslog", "journal", "remote" ]
def RestartClusterLogging(self, nodes=None):
if not nodes:
nodes = self.CM.Env["nodes"]
self.CM.debug("Restarting logging on: %s" % repr(nodes))
for node in nodes:
if self.CM.Env["have_systemd"]:
if self.CM.rsh(node, "systemctl stop systemd-journald.socket") != 0:
self.CM.log ("ERROR: Cannot stop 'systemd-journald' on %s" % node)
if self.CM.rsh(node, "systemctl start systemd-journald.service") != 0:
self.CM.log ("ERROR: Cannot start 'systemd-journald' on %s" % node)
if self.CM.rsh(node, "service %s restart" % self.CM.Env["syslogd"]) != 0:
self.CM.log ("ERROR: Cannot restart '%s' on %s" % (self.CM.Env["syslogd"], node))
def TestLogging(self):
patterns = []
prefix = "Test message from"
suffix = str(uuid.uuid4())
watch = {}
for node in self.CM.Env["nodes"]:
# Look for the node name in two places to make sure
# that syslog is logging with the correct hostname
m = re.search("^([^.]+).*", node)
if m:
simple = m.group(1)
else:
simple = node
patterns.append("%s.*%s %s %s" % (simple, prefix, node, suffix))
watch_pref = self.CM.Env["LogWatcher"]
if watch_pref == "any":
for k in self.kinds:
watch[k] = LogWatcher(self.CM.Env["LogFileName"], patterns, "LogAudit", 5, silent=True, hosts=self.CM.Env["nodes"], kind=k)
watch[k].setwatch()
else:
k = watch_pref
watch[k] = LogWatcher(self.CM.Env["LogFileName"], patterns, "LogAudit", 5, silent=True, hosts=self.CM.Env["nodes"], kind=k)
watch[k].setwatch()
if watch_pref == "any": self.CM.log("Writing log with key: %s" % (suffix))
for node in self.CM.Env["nodes"]:
cmd = "logger -p %s.info %s %s %s" % (self.CM.Env["SyslogFacility"], prefix, node, suffix)
if self.CM.rsh(node, cmd, synchronous=0, silent=True) != 0:
self.CM.log ("ERROR: Cannot execute remote command [%s] on %s" % (cmd, node))
for k in self.kinds:
if k in watch:
w = watch[k]
if watch_pref == "any": self.CM.log("Testing for %s logs" % (k))
w.lookforall(silent=True)
if not w.unmatched:
if watch_pref == "any":
self.CM.log ("Continuing with %s-based log reader" % (w.kind))
self.CM.Env["LogWatcher"] = w.kind
return 1
for k in list(watch.keys()):
w = watch[k]
if w.unmatched:
for regex in w.unmatched:
self.CM.log ("Test message [%s] not found in %s logs." % (regex, w.kind))
return 0
def __call__(self):
        max_attempts = 3
attempt = 0
self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"])
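        # Retry the logging test a few times, restarting the logging daemons
        # and backing off (60s, then 120s, ...) between attempts.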
        while attempt <= max_attempts and self.TestLogging() == 0:
attempt = attempt + 1
self.RestartClusterLogging()
time.sleep(60*attempt)
        if attempt > max_attempts:
self.CM.log("ERROR: Cluster logging unrecoverable.")
return 0
return 1
def is_applicable(self):
if self.CM.Env["DoBSC"]:
return 0
if self.CM.Env["LogAuditDisabled"]:
return 0
return 1
class DiskAudit(ClusterAudit):
def name(self):
return "DiskspaceAudit"
def __init__(self, cm):
self.CM = cm
def __call__(self):
result = 1
# @TODO Use directory of PCMK_logfile if set on host
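        # df -BM reports sizes in megabytes; the awk/tr pipeline reduces the
        # last line of output to "<use%> <available MB>" (e.g. "95 42"),
        # which is parsed as (used, remain) below.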
dfcmd = "df -BM " + CTSvars.CRM_LOG_DIR + " | tail -1 | awk '{print $(NF-1)\" \"$(NF-2)}' | tr -d 'M%'"
self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"])
for node in self.CM.Env["nodes"]:
dfout = self.CM.rsh(node, dfcmd, 1)
if not dfout:
self.CM.log ("ERROR: Cannot execute remote df command [%s] on %s" % (dfcmd, node))
else:
try:
(used, remain) = dfout.split()
used_percent = int(used)
remaining_mb = int(remain)
                except (ValueError, TypeError):
                    self.CM.log("Warning: df output '%s' from %s was invalid"
                                % (dfout, node))
else:
if remaining_mb < 10 or used_percent > 95:
self.CM.log("CRIT: Out of log disk space on %s (%d%% / %dMB)"
% (node, used_percent, remaining_mb))
result = None
if self.CM.Env["continue"] == 1:
answer = "Y"
else:
try:
answer = input_wrapper('Continue? [nY]')
except EOFError as e:
answer = "n"
if answer and answer == "n":
raise ValueError("Disk full on %s" % (node))
- ret = 0
elif remaining_mb < 100 or used_percent > 90:
self.CM.log("WARN: Low on log disk space (%dMB) on %s" % (remaining_mb, node))
return result
def is_applicable(self):
if self.CM.Env["DoBSC"]:
return 0
return 1
class FileAudit(ClusterAudit):
def name(self):
return "FileAudit"
def __init__(self, cm):
self.CM = cm
self.known = []
def __call__(self):
result = 1
self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"])
for node in self.CM.Env["nodes"]:
(rc, lsout) = self.CM.rsh(node, "ls -al /var/lib/pacemaker/cores/* | grep core.[0-9]", None)
for line in lsout:
line = line.strip()
if line not in self.known:
result = 0
self.known.append(line)
self.CM.log("Warning: Pacemaker core file on %s: %s" % (node, line))
(rc, lsout) = self.CM.rsh(node, "ls -al /var/lib/corosync | grep core.[0-9]", None)
for line in lsout:
line = line.strip()
if line not in self.known:
result = 0
self.known.append(line)
self.CM.log("Warning: Corosync core file on %s: %s" % (node, line))
if node in self.CM.ShouldBeStatus and self.CM.ShouldBeStatus[node] == "down":
clean = 0
(rc, lsout) = self.CM.rsh(node, "ls -al /dev/shm | grep qb-", None)
for line in lsout:
result = 0
clean = 1
self.CM.log("Warning: Stale IPC file on %s: %s" % (node, line))
if clean:
(rc, lsout) = self.CM.rsh(node, "ps axf | grep -e pacemaker -e corosync", None)
for line in lsout:
self.CM.debug("ps[%s]: %s" % (node, line))
self.CM.rsh(node, "rm -f /dev/shm/qb-*")
else:
self.CM.debug("Skipping %s" % node)
return result
def is_applicable(self):
return 1
class AuditResource(object):
def __init__(self, cm, line):
fields = line.split()
self.CM = cm
self.line = line
self.type = fields[1]
self.id = fields[2]
self.clone_id = fields[3]
self.parent = fields[4]
self.rprovider = fields[5]
self.rclass = fields[6]
self.rtype = fields[7]
self.host = fields[8]
self.needs_quorum = fields[9]
self.flags = int(fields[10])
self.flags_s = fields[11]
if self.parent == "NA":
self.parent = None
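    # The flags field is a bitmask printed by crm_resource; the masks used
    # below (0x01 orphaned, 0x02 managed, 0x20 globally unique) are assumed
    # to mirror Pacemaker's internal resource flag values.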
def unique(self):
if self.flags & int("0x00000020", 16):
return 1
return 0
def orphan(self):
if self.flags & int("0x00000001", 16):
return 1
return 0
def managed(self):
if self.flags & int("0x00000002", 16):
return 1
return 0
class AuditConstraint(object):
def __init__(self, cm, line):
fields = line.split()
self.CM = cm
self.line = line
self.type = fields[1]
self.id = fields[2]
self.rsc = fields[3]
self.target = fields[4]
self.score = fields[5]
self.rsc_role = fields[6]
self.target_role = fields[7]
if self.rsc_role == "NA":
self.rsc_role = None
if self.target_role == "NA":
self.target_role = None
class PrimitiveAudit(ClusterAudit):
def name(self):
return "PrimitiveAudit"
def __init__(self, cm):
self.CM = cm
def doResourceAudit(self, resource, quorum):
rc = 1
active = self.CM.ResourceLocation(resource.id)
if len(active) == 1:
if quorum:
self.debug("Resource %s active on %s" % (resource.id, repr(active)))
elif resource.needs_quorum == 1:
self.CM.log("Resource %s active without quorum: %s"
% (resource.id, repr(active)))
rc = 0
elif not resource.managed():
self.CM.log("Resource %s not managed. Active on %s"
% (resource.id, repr(active)))
elif not resource.unique():
# TODO: Figure out a clever way to actually audit these resource types
if len(active) > 1:
self.debug("Non-unique resource %s is active on: %s"
% (resource.id, repr(active)))
else:
self.debug("Non-unique resource %s is not active" % resource.id)
elif len(active) > 1:
self.CM.log("Resource %s is active multiple times: %s"
% (resource.id, repr(active)))
rc = 0
elif resource.orphan():
self.debug("Resource %s is an inactive orphan" % resource.id)
elif len(self.inactive_nodes) == 0:
self.CM.log("WARN: Resource %s not served anywhere" % resource.id)
rc = 0
elif self.CM.Env["warn-inactive"] == 1:
if quorum or not resource.needs_quorum:
self.CM.log("WARN: Resource %s not served anywhere (Inactive nodes: %s)"
% (resource.id, repr(self.inactive_nodes)))
else:
self.debug("Resource %s not served anywhere (Inactive nodes: %s)"
% (resource.id, repr(self.inactive_nodes)))
elif quorum or not resource.needs_quorum:
self.debug("Resource %s not served anywhere (Inactive nodes: %s)"
% (resource.id, repr(self.inactive_nodes)))
return rc
def setup(self):
self.target = None
self.resources = []
self.constraints = []
self.active_nodes = []
self.inactive_nodes = []
for node in self.CM.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == "up":
self.active_nodes.append(node)
else:
self.inactive_nodes.append(node)
for node in self.CM.Env["nodes"]:
if self.target == None and self.CM.ShouldBeStatus[node] == "up":
self.target = node
if not self.target:
# TODO: In Pacemaker 1.0 clusters we'll be able to run crm_resource
# with CIB_file=/path/to/cib.xml even when the cluster isn't running
self.debug("No nodes active - skipping %s" % self.name())
return 0
(rc, lines) = self.CM.rsh(self.target, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
self.resources.append(AuditResource(self.CM, line))
elif re.search("^Constraint", line):
self.constraints.append(AuditConstraint(self.CM, line))
else:
self.CM.log("Unknown entry: %s" % line);
return 1
def __call__(self):
rc = 1
if not self.setup():
return 1
quorum = self.CM.HasQuorum(None)
for resource in self.resources:
if resource.type == "primitive":
if self.doResourceAudit(resource, quorum) == 0:
rc = 0
return rc
def is_applicable(self):
# @TODO Due to long-ago refactoring, this name test would never match,
# so this audit (and those derived from it) would never run.
# Uncommenting the next lines fixes the name test, but that then
# exposes pre-existing bugs that need to be fixed.
#if self.CM["Name"] == "crm-corosync":
# return 1
return 0
class GroupAudit(PrimitiveAudit):
def name(self):
return "GroupAudit"
def __call__(self):
rc = 1
if not self.setup():
return 1
for group in self.resources:
if group.type == "group":
first_match = 1
group_location = None
for child in self.resources:
if child.parent == group.id:
nodes = self.CM.ResourceLocation(child.id)
if first_match and len(nodes) > 0:
group_location = nodes[0]
first_match = 0
if len(nodes) > 1:
rc = 0
self.CM.log("Child %s of %s is active more than once: %s"
% (child.id, group.id, repr(nodes)))
elif len(nodes) == 0:
# Groups are allowed to be partially active
# However we do need to make sure later children aren't running
group_location = None
self.debug("Child %s of %s is stopped" % (child.id, group.id))
elif nodes[0] != group_location:
rc = 0
self.CM.log("Child %s of %s is active on the wrong node (%s) expected %s"
% (child.id, group.id, nodes[0], group_location))
else:
self.debug("Child %s of %s is active on %s" % (child.id, group.id, nodes[0]))
return rc
class CloneAudit(PrimitiveAudit):
def name(self):
return "CloneAudit"
def __call__(self):
rc = 1
if not self.setup():
return 1
for clone in self.resources:
if clone.type == "clone":
for child in self.resources:
if child.parent == clone.id and child.type == "primitive":
self.debug("Checking child %s of %s..." % (child.id, clone.id))
# Check max and node_max
# Obtain with:
# crm_resource -g clone_max --meta -r child.id
# crm_resource -g clone_node_max --meta -r child.id
return rc
class ColocationAudit(PrimitiveAudit):
def name(self):
return "ColocationAudit"
def crm_location(self, resource):
(rc, lines) = self.CM.rsh(self.target, "crm_resource -W -r %s -Q"%resource, None)
hosts = []
if rc == 0:
for line in lines:
fields = line.split()
hosts.append(fields[0])
return hosts
def __call__(self):
rc = 1
if not self.setup():
return 1
for coloc in self.constraints:
if coloc.type == "rsc_colocation":
source = self.crm_location(coloc.rsc)
target = self.crm_location(coloc.target)
if len(source) == 0:
self.debug("Colocation audit (%s): %s not running" % (coloc.id, coloc.rsc))
else:
for node in source:
if not node in target:
rc = 0
self.CM.log("Colocation audit (%s): %s running on %s (not in %s)"
% (coloc.id, coloc.rsc, node, repr(target)))
else:
self.debug("Colocation audit (%s): %s running on %s (in %s)"
% (coloc.id, coloc.rsc, node, repr(target)))
return rc
class ControllerStateAudit(ClusterAudit):
def __init__(self, cm):
self.CM = cm
self.Stats = {"calls":0
, "success":0
, "failure":0
, "skipped":0
, "auditfail":0}
def has_key(self, key):
return key in self.Stats
def __setitem__(self, key, value):
self.Stats[key] = value
def __getitem__(self, key):
return self.Stats[key]
def incr(self, name):
'''Increment (or initialize) the value associated with the given name'''
if not name in self.Stats:
self.Stats[name] = 0
self.Stats[name] = self.Stats[name]+1
def __call__(self):
passed = 1
up_are_down = 0
down_are_up = 0
unstable_list = []
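        # test_node_CM is assumed to return 0 for a down node, 1 for a node
        # that is up but not yet stable, and 2 for a node that is up and
        # stable.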
for node in self.CM.Env["nodes"]:
should_be = self.CM.ShouldBeStatus[node]
rc = self.CM.test_node_CM(node)
if rc > 0:
if should_be == "down":
down_are_up = down_are_up + 1
if rc == 1:
unstable_list.append(node)
elif should_be == "up":
up_are_down = up_are_down + 1
if len(unstable_list) > 0:
passed = 0
self.CM.log("Cluster is not stable: %d (of %d): %s"
% (len(unstable_list), self.CM.upcount(), repr(unstable_list)))
if up_are_down > 0:
passed = 0
self.CM.log("%d (of %d) nodes expected to be up were down."
% (up_are_down, len(self.CM.Env["nodes"])))
if down_are_up > 0:
passed = 0
self.CM.log("%d (of %d) nodes expected to be down were up."
% (down_are_up, len(self.CM.Env["nodes"])))
return passed
def name(self):
return "ControllerStateAudit"
def is_applicable(self):
# @TODO Due to long-ago refactoring, this name test would never match,
# so this audit (and those derived from it) would never run.
# Uncommenting the next lines fixes the name test, but that then
# exposes pre-existing bugs that need to be fixed.
#if self.CM["Name"] == "crm-corosync":
# return 1
return 0
class CIBAudit(ClusterAudit):
def __init__(self, cm):
self.CM = cm
self.Stats = {"calls":0
, "success":0
, "failure":0
, "skipped":0
, "auditfail":0}
def has_key(self, key):
return key in self.Stats
def __setitem__(self, key, value):
self.Stats[key] = value
def __getitem__(self, key):
return self.Stats[key]
def incr(self, name):
'''Increment (or initialize) the value associated with the given name'''
if not name in self.Stats:
self.Stats[name] = 0
self.Stats[name] = self.Stats[name]+1
def __call__(self):
passed = 1
ccm_partitions = self.CM.find_partitions()
if len(ccm_partitions) == 0:
self.debug("\tNo partitions to audit")
return 1
for partition in ccm_partitions:
self.debug("\tAuditing CIB consistency for: %s" % partition)
partition_passed = 0
if self.audit_cib_contents(partition) == 0:
passed = 0
return passed
def audit_cib_contents(self, hostlist):
passed = 1
node0 = None
node0_xml = None
partition_hosts = hostlist.split()
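        # The first node with a retrievable CIB becomes the baseline; every
        # other node's CIB is copied next to it and compared with crm_diff,
        # and any non-empty diff marks the partition as inconsistent.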
for node in partition_hosts:
node_xml = self.store_remote_cib(node, node0)
if node_xml == None:
self.CM.log("Could not perform audit: No configuration from %s" % node)
passed = 0
elif node0 == None:
node0 = node
node0_xml = node_xml
elif node0_xml == None:
self.CM.log("Could not perform audit: No configuration from %s" % node0)
passed = 0
else:
(rc, result) = self.CM.rsh(
node0, "crm_diff -VV -cf --new %s --original %s" % (node_xml, node0_xml), None)
if rc != 0:
self.CM.log("Diff between %s and %s failed: %d" % (node0_xml, node_xml, rc))
passed = 0
for line in result:
if not re.search("<diff/>", line):
passed = 0
self.debug("CibDiff[%s-%s]: %s" % (node0, node, line))
else:
self.debug("CibDiff[%s-%s] Ignoring: %s" % (node0, node, line))
# self.CM.rsh(node0, "rm -f %s" % node_xml)
# self.CM.rsh(node0, "rm -f %s" % node0_xml)
return passed
def store_remote_cib(self, node, target):
combined = ""
filename = "/tmp/ctsaudit.%s.xml" % node
if not target:
target = node
(rc, lines) = self.CM.rsh(node, self.CM["CibQuery"], None)
if rc != 0:
self.CM.log("Could not retrieve configuration")
return None
self.CM.rsh("localhost", "rm -f %s" % filename)
for line in lines:
self.CM.rsh("localhost", "echo \'%s\' >> %s" % (line[:-1], filename), silent=True)
if self.CM.rsh.cp(filename, "root@%s:%s" % (target, filename), silent=True) != 0:
self.CM.log("Could not store configuration")
return None
return filename
def name(self):
return "CibAudit"
def is_applicable(self):
# @TODO Due to long-ago refactoring, this name test would never match,
# so this audit (and those derived from it) would never run.
# Uncommenting the next lines fixes the name test, but that then
# exposes pre-existing bugs that need to be fixed.
#if self.CM["Name"] == "crm-corosync":
# return 1
return 0
class PartitionAudit(ClusterAudit):
def __init__(self, cm):
self.CM = cm
self.Stats = {"calls":0
, "success":0
, "failure":0
, "skipped":0
, "auditfail":0}
self.NodeEpoch = {}
self.NodeState = {}
self.NodeQuorum = {}
def has_key(self, key):
return key in self.Stats
def __setitem__(self, key, value):
self.Stats[key] = value
def __getitem__(self, key):
return self.Stats[key]
def incr(self, name):
'''Increment (or initialize) the value associated with the given name'''
if not name in self.Stats:
self.Stats[name] = 0
self.Stats[name] = self.Stats[name]+1
def __call__(self):
passed = 1
ccm_partitions = self.CM.find_partitions()
if ccm_partitions == None or len(ccm_partitions) == 0:
return 1
self.CM.cluster_stable(double_check=True)
if len(ccm_partitions) != self.CM.partitions_expected:
self.CM.log("ERROR: %d cluster partitions detected:" % len(ccm_partitions))
passed = 0
for partition in ccm_partitions:
self.CM.log("\t %s" % partition)
for partition in ccm_partitions:
partition_passed = 0
if self.audit_partition(partition) == 0:
passed = 0
return passed
def trim_string(self, avalue):
if not avalue:
return None
if len(avalue) > 1:
return avalue[:-1]
def trim2int(self, avalue):
if not avalue:
return None
if len(avalue) > 1:
return int(avalue[:-1])
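    # Both trim helpers strip the trailing newline that rsh leaves on
    # single-line command output, yielding None for empty results.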
def audit_partition(self, partition):
passed = 1
dc_found = []
dc_allowed_list = []
lowest_epoch = None
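        # Each node reports an "epoch" (roughly, how long it has been a
        # cluster member); the DC is expected to be the longest-standing
        # node, i.e. the one with the lowest epoch.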
node_list = partition.split()
self.debug("Auditing partition: %s" % (partition))
for node in node_list:
if self.CM.ShouldBeStatus[node] != "up":
self.CM.log("Warn: Node %s appeared out of nowhere" % (node))
self.CM.ShouldBeStatus[node] = "up"
# not in itself a reason to fail the audit (not what we're
# checking for in this audit)
self.NodeState[node] = self.CM.rsh(node, self.CM["StatusCmd"] % node, 1)
self.NodeEpoch[node] = self.CM.rsh(node, self.CM["EpochCmd"], 1)
self.NodeQuorum[node] = self.CM.rsh(node, self.CM["QuorumCmd"], 1)
self.debug("Node %s: %s - %s - %s." % (node, self.NodeState[node], self.NodeEpoch[node], self.NodeQuorum[node]))
self.NodeState[node] = self.trim_string(self.NodeState[node])
self.NodeEpoch[node] = self.trim2int(self.NodeEpoch[node])
self.NodeQuorum[node] = self.trim_string(self.NodeQuorum[node])
if not self.NodeEpoch[node]:
self.CM.log("Warn: Node %s dissappeared: cant determin epoch" % (node))
self.CM.ShouldBeStatus[node] = "down"
# not in itself a reason to fail the audit (not what we're
# checking for in this audit)
elif lowest_epoch == None or self.NodeEpoch[node] < lowest_epoch:
lowest_epoch = self.NodeEpoch[node]
if not lowest_epoch:
self.CM.log("Lowest epoch not determined in %s" % (partition))
passed = 0
for node in node_list:
if self.CM.ShouldBeStatus[node] == "up":
if self.CM.is_node_dc(node, self.NodeState[node]):
dc_found.append(node)
if self.NodeEpoch[node] == lowest_epoch:
self.debug("%s: OK" % node)
elif not self.NodeEpoch[node]:
self.debug("Check on %s ignored: no node epoch" % node)
elif not lowest_epoch:
self.debug("Check on %s ignored: no lowest epoch" % node)
else:
self.CM.log("DC %s is not the oldest node (%d vs. %d)"
% (node, self.NodeEpoch[node], lowest_epoch))
passed = 0
if len(dc_found) == 0:
self.CM.log("DC not found on any of the %d allowed nodes: %s (of %s)"
% (len(dc_allowed_list), str(dc_allowed_list), str(node_list)))
elif len(dc_found) > 1:
self.CM.log("%d DCs (%s) found in cluster partition: %s"
% (len(dc_found), str(dc_found), str(node_list)))
passed = 0
if passed == 0:
for node in node_list:
if self.CM.ShouldBeStatus[node] == "up":
self.CM.log("epoch %s : %s"
% (self.NodeEpoch[node], self.NodeState[node]))
return passed
def name(self):
return "PartitionAudit"
def is_applicable(self):
# @TODO Due to long-ago refactoring, this name test would never match,
# so this audit (and those derived from it) would never run.
# Uncommenting the next lines fixes the name test, but that then
# exposes pre-existing bugs that need to be fixed.
#if self.CM["Name"] == "crm-corosync":
# return 1
return 0
AllAuditClasses.append(DiskAudit)
AllAuditClasses.append(FileAudit)
AllAuditClasses.append(LogAudit)
AllAuditClasses.append(ControllerStateAudit)
AllAuditClasses.append(PartitionAudit)
AllAuditClasses.append(PrimitiveAudit)
AllAuditClasses.append(GroupAudit)
AllAuditClasses.append(CloneAudit)
AllAuditClasses.append(ColocationAudit)
AllAuditClasses.append(CIBAudit)
def AuditList(cm):
result = []
for auditclass in AllAuditClasses:
a = auditclass(cm)
if a.is_applicable():
result.append(a)
return result
diff --git a/cts/CTStests.py b/cts/CTStests.py
index 58f084fa10..42f6119294 100644
--- a/cts/CTStests.py
+++ b/cts/CTStests.py
@@ -1,3111 +1,3111 @@
""" Test-specific classes for Pacemaker's Cluster Test Suite (CTS)
"""
# Pacemaker targets compatibility with Python 2.7 and 3.2+
from __future__ import print_function, unicode_literals, absolute_import, division
__copyright__ = """Copyright 2000, 2001 Alan Robertson <alanr@unix.sh>
Add ResourceRecover testcase Zhao Kai <zhaokai@cn.ibm.com>
"""
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
#
# SPECIAL NOTE:
#
# Tests may NOT implement any cluster-manager-specific code in them.
# EXTEND the ClusterManager object to provide the base capabilities
# the test needs if you need to do something that the current CM classes
# do not. Otherwise you screw up the whole point of the object structure
# in CTS.
#
# Thank you.
#
import os
import re
import time
import subprocess
import tempfile
from stat import *
from cts import CTS
from cts.CTSaudits import *
from cts.CTSvars import *
from cts.patterns import PatternSelector
from cts.logging import LogFactory
from cts.remote import RemoteFactory, input_wrapper
from cts.watcher import LogWatcher
from cts.environment import EnvFactory
AllTestClasses = [ ]
class CTSTest(object):
'''
A Cluster test.
We implement the basic set of properties and behaviors for a generic
cluster test.
Cluster tests track their own statistics.
We keep each of the kinds of counts we track as separate {name,value}
pairs.
'''
def __init__(self, cm):
#self.name="the unnamed test"
self.Stats = {"calls":0
, "success":0
, "failure":0
, "skipped":0
, "auditfail":0}
# if not issubclass(cm.__class__, ClusterManager):
# raise ValueError("Must be a ClusterManager object")
self.CM = cm
self.Env = EnvFactory().getInstance()
self.rsh = RemoteFactory().getInstance()
self.logger = LogFactory()
self.templates = PatternSelector(cm["Name"])
self.Audits = []
self.timeout = 120
self.passed = 1
self.is_loop = 0
self.is_unsafe = 0
self.is_docker_unsafe = 0
self.is_experimental = 0
self.is_container = 0
self.is_valgrind = 0
self.benchmark = 0 # which tests to benchmark
self.timer = {} # timers
def log(self, args):
self.logger.log(args)
def debug(self, args):
self.logger.debug(args)
def has_key(self, key):
return key in self.Stats
def __setitem__(self, key, value):
self.Stats[key] = value
def __getitem__(self, key):
if str(key) == "0":
raise ValueError("Bad call to 'foo in X', should reference 'foo in X.Stats' instead")
if key in self.Stats:
return self.Stats[key]
return None
def log_mark(self, msg):
self.debug("MARK: test %s %s %d" % (self.name,msg,time.time()))
return
def get_timer(self,key = "test"):
try: return self.timer[key]
except: return 0
def set_timer(self,key = "test"):
self.timer[key] = time.time()
return self.timer[key]
def log_timer(self,key = "test"):
elapsed = 0
if key in self.timer:
elapsed = time.time() - self.timer[key]
        s = self.name if key == "test" else "%s:%s" % (self.name, key)
self.debug("%s runtime: %.2f" % (s, elapsed))
del self.timer[key]
return elapsed
def incr(self, name):
'''Increment (or initialize) the value associated with the given name'''
if not name in self.Stats:
self.Stats[name] = 0
self.Stats[name] = self.Stats[name]+1
# Reset the test passed boolean
if name == "calls":
self.passed = 1
def failure(self, reason="none"):
'''Increment the failure count'''
self.passed = 0
self.incr("failure")
self.logger.log(("Test %s" % self.name).ljust(35) + " FAILED: %s" % reason)
return None
def success(self):
'''Increment the success count'''
self.incr("success")
return 1
def skipped(self):
'''Increment the skipped count'''
self.incr("skipped")
return 1
def __call__(self, node):
'''Perform the given test'''
raise ValueError("Abstract Class member (__call__)")
self.incr("calls")
return self.failure()
def audit(self):
passed = 1
if len(self.Audits) > 0:
for audit in self.Audits:
if not audit():
self.logger.log("Internal %s Audit %s FAILED." % (self.name, audit.name()))
self.incr("auditfail")
passed = 0
return passed
def setup(self, node):
'''Setup the given test'''
return self.success()
def teardown(self, node):
'''Tear down the given test'''
return self.success()
def create_watch(self, patterns, timeout, name=None):
if not name:
name = self.name
return LogWatcher(self.Env["LogFileName"], patterns, name, timeout, kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
def local_badnews(self, prefix, watch, local_ignore=[]):
errcount = 0
if not prefix:
prefix = "LocalBadNews:"
ignorelist = []
ignorelist.append(" CTS: ")
ignorelist.append(prefix)
ignorelist.extend(local_ignore)
while errcount < 100:
match = watch.look(0)
if match:
add_err = 1
for ignore in ignorelist:
if add_err == 1 and re.search(ignore, match):
add_err = 0
if add_err == 1:
self.logger.log(prefix + " " + match)
errcount = errcount + 1
else:
break
else:
self.logger.log("Too many errors!")
watch.end()
return errcount
def is_applicable(self):
return self.is_applicable_common()
def is_applicable_common(self):
'''Return TRUE if we are applicable in the current test configuration'''
#raise ValueError("Abstract Class member (is_applicable)")
if self.is_loop and not self.Env["loop-tests"]:
return 0
elif self.is_unsafe and not self.Env["unsafe-tests"]:
return 0
elif self.is_valgrind and not self.Env["valgrind-tests"]:
return 0
elif self.is_experimental and not self.Env["experimental-tests"]:
return 0
elif self.is_docker_unsafe and self.Env["docker"]:
return 0
elif self.is_container and not self.Env["container-tests"]:
return 0
elif self.Env["benchmark"] and self.benchmark == 0:
return 0
return 1
def find_ocfs2_resources(self, node):
self.r_o2cb = None
self.r_ocfs2 = []
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rtype == "o2cb" and r.parent != "NA":
self.debug("Found o2cb: %s" % self.r_o2cb)
self.r_o2cb = r.parent
if re.search("^Constraint", line):
c = AuditConstraint(self.CM, line)
if c.type == "rsc_colocation" and c.target == self.r_o2cb:
self.r_ocfs2.append(c.rsc)
self.debug("Found ocfs2 filesystems: %s" % repr(self.r_ocfs2))
return len(self.r_ocfs2)
def canrunnow(self, node):
'''Return TRUE if we can meaningfully run right now'''
return 1
def errorstoignore(self):
'''Return list of errors which are 'normal' and should be ignored'''
return []
class StopTest(CTSTest):
'''Stop (deactivate) the cluster manager on a node'''
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "Stop"
def __call__(self, node):
'''Perform the 'stop' test. '''
self.incr("calls")
if self.CM.ShouldBeStatus[node] != "up":
return self.skipped()
patterns = []
# Technically we should always be able to notice ourselves stopping
patterns.append(self.templates["Pat:We_stopped"] % node)
# Any active node needs to notice this one left
# (note that this won't work if we have multiple partitions)
for other in self.Env["nodes"]:
if self.CM.ShouldBeStatus[other] == "up" and other != node:
patterns.append(self.templates["Pat:They_stopped"] %(other, self.CM.key_for_node(node)))
#self.debug("Checking %s will notice %s left"%(other, node))
watch = self.create_watch(patterns, self.Env["DeadTime"])
watch.setwatch()
if node == self.CM.OurNode:
self.incr("us")
else:
if self.CM.upcount() <= 1:
self.incr("all")
else:
self.incr("them")
self.CM.StopaCM(node)
watch_result = watch.lookforall()
failreason = None
UnmatchedList = "||"
if watch.unmatched:
(rc, output) = self.rsh(node, "/bin/ps axf", None)
for line in output:
self.debug(line)
(rc, output) = self.rsh(node, "/usr/sbin/dlm_tool dump", None)
for line in output:
self.debug(line)
for regex in watch.unmatched:
self.logger.log ("ERROR: Shutdown pattern not found: %s" % (regex))
                UnmatchedList += regex + "||"
failreason = "Missing shutdown pattern"
self.CM.cluster_stable(self.Env["DeadTime"])
if not watch.unmatched or self.CM.upcount() == 0:
return self.success()
if len(watch.unmatched) >= self.CM.upcount():
return self.failure("no match against (%s)" % UnmatchedList)
if failreason == None:
return self.success()
else:
return self.failure(failreason)
#
# We don't register StopTest because it's better when called by
# another test...
#
class StartTest(CTSTest):
'''Start (activate) the cluster manager on a node'''
def __init__(self, cm, debug=None):
CTSTest.__init__(self,cm)
self.name = "start"
self.debug = debug
def __call__(self, node):
'''Perform the 'start' test. '''
self.incr("calls")
if self.CM.upcount() == 0:
self.incr("us")
else:
self.incr("them")
if self.CM.ShouldBeStatus[node] != "down":
return self.skipped()
elif self.CM.StartaCM(node):
return self.success()
else:
return self.failure("Startup %s on node %s failed"
% (self.Env["Name"], node))
#
# We don't register StartTest because it's better when called by
# another test...
#
class FlipTest(CTSTest):
    '''If it's running, stop it. If it's stopped, start it.
Overthrow the status quo...
'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Flip"
self.start = StartTest(cm)
self.stop = StopTest(cm)
def __call__(self, node):
'''Perform the 'Flip' test. '''
self.incr("calls")
if self.CM.ShouldBeStatus[node] == "up":
self.incr("stopped")
ret = self.stop(node)
type = "up->down"
# Give the cluster time to recognize it's gone...
time.sleep(self.Env["StableTime"])
elif self.CM.ShouldBeStatus[node] == "down":
self.incr("started")
ret = self.start(node)
type = "down->up"
else:
return self.skipped()
self.incr(type)
if ret:
return self.success()
else:
return self.failure("%s failure" % type)
# Register FlipTest as a good test to run
AllTestClasses.append(FlipTest)
class RestartTest(CTSTest):
'''Stop and restart a node'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Restart"
self.start = StartTest(cm)
self.stop = StopTest(cm)
self.benchmark = 1
def __call__(self, node):
'''Perform the 'restart' test. '''
self.incr("calls")
self.incr("node:" + node)
if self.CM.StataCM(node):
self.incr("WasStopped")
if not self.start(node):
return self.failure("start (setup) failure: "+node)
self.set_timer()
if not self.stop(node):
return self.failure("stop failure: "+node)
if not self.start(node):
return self.failure("start failure: "+node)
return self.success()
# Register RestartTest as a good test to run
AllTestClasses.append(RestartTest)
class StonithdTest(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "Stonithd"
self.startall = SimulStartLite(cm)
self.benchmark = 1
def __call__(self, node):
self.incr("calls")
if len(self.Env["nodes"]) < 2:
return self.skipped()
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
is_dc = self.CM.is_node_dc(node)
watchpats = []
watchpats.append(self.templates["Pat:FenceOpOK"] % node)
watchpats.append(self.templates["Pat:NodeFenced"] % node)
if self.Env["at-boot"] == 0:
self.debug("Expecting %s to stay down" % node)
self.CM.ShouldBeStatus[node] = "down"
else:
self.debug("Expecting %s to come up again %d" % (node, self.Env["at-boot"]))
watchpats.append("%s.* S_STARTING -> S_PENDING" % node)
watchpats.append("%s.* S_PENDING -> S_NOT_DC" % node)
watch = self.create_watch(watchpats, 30 + self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"])
watch.setwatch()
origin = self.Env.RandomGen.choice(self.Env["nodes"])
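        # The fencing request may originate from any node, including the
        # victim itself; the rc checks below treat each case differently.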
rc = self.rsh(origin, "stonith_admin --reboot %s -VVVVVV" % node)
if rc == 194:
# 194 - 256 = -62 = Timer expired
#
# Look for the patterns, usually this means the required
# device was running on the node to be fenced - or that
# the required devices were in the process of being loaded
# and/or moved
#
# Effectively the node committed suicide so there will be
# no confirmation, but pacemaker should be watching and
# fence the node again
self.logger.log("Fencing command on %s to fence %s timed out" % (origin, node))
elif origin != node and rc != 0:
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
self.debug("Waiting for fenced node to come back up")
self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
self.logger.log("Fencing command on %s failed to fence %s (rc=%d)" % (origin, node, rc))
elif origin == node and rc != 255:
            # 255 == broken pipe, i.e. the node was fenced as expected
self.logger.log("Locally originated fencing returned %d" % rc)
self.set_timer("fence")
matched = watch.lookforall()
self.log_timer("fence")
self.set_timer("reform")
if watch.unmatched:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
self.debug("Waiting for fenced node to come back up")
self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
self.debug("Waiting for the cluster to re-stabilize with all nodes")
is_stable = self.CM.cluster_stable(self.Env["StartTime"])
if not matched:
return self.failure("Didn't find all expected patterns")
elif not is_stable:
return self.failure("Cluster did not become stable")
self.log_timer("reform")
return self.success()
def errorstoignore(self):
return [
self.templates["Pat:Fencing_start"] % ".*",
self.templates["Pat:Fencing_ok"] % ".*",
r"error.*: Resource .*stonith::.* is active on 2 nodes attempting recovery",
r"error.*: Operation reboot of .*by .* for stonith_admin.*: Timer expired",
]
def is_applicable(self):
if not self.is_applicable_common():
return 0
if "DoFencing" in list(self.Env.keys()):
return self.Env["DoFencing"]
return 1
AllTestClasses.append(StonithdTest)
class StartOnebyOne(CTSTest):
'''Start all the nodes ~ one by one'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "StartOnebyOne"
self.stopall = SimulStopLite(cm)
self.start = StartTest(cm)
self.ns = CTS.NodeStatus(cm.Env)
def __call__(self, dummy):
'''Perform the 'StartOnebyOne' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Shut down all the nodes...
ret = self.stopall(None)
if not ret:
return self.failure("Test setup failed")
failed = []
self.set_timer()
for node in self.Env["nodes"]:
if not self.start(node):
failed.append(node)
if len(failed) > 0:
return self.failure("Some node failed to start: " + repr(failed))
return self.success()
# Register StartOnebyOne as a good test to run
AllTestClasses.append(StartOnebyOne)
class SimulStart(CTSTest):
'''Start all the nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStart"
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
def __call__(self, dummy):
'''Perform the 'SimulStart' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Shut down all the nodes...
ret = self.stopall(None)
if not ret:
return self.failure("Setup failed")
if not self.startall(None):
return self.failure("Startall failed")
return self.success()
# Register SimulStart as a good test to run
AllTestClasses.append(SimulStart)
class SimulStop(CTSTest):
'''Stop all the nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStop"
self.startall = SimulStartLite(cm)
self.stopall = SimulStopLite(cm)
def __call__(self, dummy):
'''Perform the 'SimulStop' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Start up all the nodes...
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
if not self.stopall(None):
return self.failure("Stopall failed")
return self.success()
# Register SimulStop as a good test to run
AllTestClasses.append(SimulStop)
class StopOnebyOne(CTSTest):
'''Stop all the nodes in order'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "StopOnebyOne"
self.startall = SimulStartLite(cm)
self.stop = StopTest(cm)
def __call__(self, dummy):
'''Perform the 'StopOnebyOne' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Start up all the nodes...
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
failed = []
self.set_timer()
for node in self.Env["nodes"]:
if not self.stop(node):
failed.append(node)
if len(failed) > 0:
return self.failure("Some node failed to stop: " + repr(failed))
return self.success()
# Register StopOnebyOne as a good test to run
AllTestClasses.append(StopOnebyOne)
class RestartOnebyOne(CTSTest):
'''Restart all the nodes in order'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "RestartOnebyOne"
self.startall = SimulStartLite(cm)
def __call__(self, dummy):
'''Perform the 'RestartOnebyOne' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Start up all the nodes...
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
did_fail = []
self.set_timer()
self.restart = RestartTest(self.CM)
for node in self.Env["nodes"]:
if not self.restart(node):
did_fail.append(node)
if did_fail:
return self.failure("Could not restart %d nodes: %s"
% (len(did_fail), repr(did_fail)))
return self.success()
# Register StopOnebyOne as a good test to run
AllTestClasses.append(RestartOnebyOne)
class PartialStart(CTSTest):
'''Start a node - but tell it to stop before it finishes starting up'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "PartialStart"
self.startall = SimulStartLite(cm)
self.stopall = SimulStopLite(cm)
self.stop = StopTest(cm)
#self.is_unsafe = 1
def __call__(self, node):
'''Perform the 'PartialStart' test. '''
self.incr("calls")
ret = self.stopall(None)
if not ret:
return self.failure("Setup failed")
# FIXME! This should use the CM class to get the pattern
# then it would be applicable in general
watchpats = []
watchpats.append("pacemaker-controld.*Connecting to cluster infrastructure")
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
self.CM.StartaCMnoBlock(node)
ret = watch.lookforall()
if not ret:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
return self.failure("Setup of %s failed" % node)
ret = self.stop(node)
if not ret:
return self.failure("%s did not stop in time" % node)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
# We might do some fencing in the 2-node case if we make it up far enough
return [
r"Executing reboot fencing operation",
r"Requesting fencing \([^)]+\) of node ",
]
# Register StopOnebyOne as a good test to run
AllTestClasses.append(PartialStart)
class StandbyTest(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Standby"
self.benchmark = 1
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
# make sure the node is active
# set the node to standby mode
# check resources, none resource should be running on the node
# set the node to active mode
    # check resources, resources should have been migrated back (SHOULD THEY?)
def __call__(self, node):
self.incr("calls")
ret = self.startall(None)
if not ret:
return self.failure("Start all nodes failed")
self.debug("Make sure node %s is active" % node)
if self.CM.StandbyStatus(node) != "off":
if not self.CM.SetStandbyMode(node, "off"):
return self.failure("can't set node %s to active mode" % node)
self.CM.cluster_stable()
status = self.CM.StandbyStatus(node)
if status != "off":
return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status))
self.debug("Getting resources running on node %s" % node)
rsc_on_node = self.CM.active_resources(node)
watchpats = []
watchpats.append(r"State transition .* -> S_POLICY_ENGINE")
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
self.debug("Setting node %s to standby mode" % node)
if not self.CM.SetStandbyMode(node, "on"):
return self.failure("can't set node %s to standby mode" % node)
self.set_timer("on")
ret = watch.lookforall()
if not ret:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
self.CM.SetStandbyMode(node, "off")
return self.failure("cluster didn't react to standby change on %s" % node)
self.CM.cluster_stable()
status = self.CM.StandbyStatus(node)
if status != "on":
return self.failure("standby status of %s is [%s] but we expect [on]" % (node, status))
self.log_timer("on")
self.debug("Checking resources")
bad_run = self.CM.active_resources(node)
if len(bad_run) > 0:
rc = self.failure("%s set to standby, %s is still running on it" % (node, repr(bad_run)))
self.debug("Setting node %s to active mode" % node)
self.CM.SetStandbyMode(node, "off")
return rc
self.debug("Setting node %s to active mode" % node)
if not self.CM.SetStandbyMode(node, "off"):
return self.failure("can't set node %s to active mode" % node)
self.set_timer("off")
self.CM.cluster_stable()
status = self.CM.StandbyStatus(node)
if status != "off":
return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status))
self.log_timer("off")
return self.success()
AllTestClasses.append(StandbyTest)
class ValgrindTest(CTSTest):
'''Check for memory leaks'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Valgrind"
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
self.is_valgrind = 1
self.is_loop = 1
def setup(self, node):
self.incr("calls")
ret = self.stopall(None)
if not ret:
return self.failure("Stop all nodes failed")
# @TODO Edit /etc/sysconfig/pacemaker on all nodes to enable valgrind,
# and clear any valgrind logs from previous runs. For now, we rely on
# the user to do this manually.
ret = self.startall(None)
if not ret:
return self.failure("Start all nodes failed")
return self.success()
def teardown(self, node):
# Return all nodes to normal
# @TODO Edit /etc/sysconfig/pacemaker on all nodes to disable valgrind
ret = self.stopall(None)
if not ret:
return self.failure("Stop all nodes failed")
return self.success()
def find_leaks(self):
# Check for leaks
# (no longer used but kept in case feature is restored)
leaked = []
self.stop = StopTest(self.CM)
for node in self.Env["nodes"]:
rc = self.stop(node)
if not rc:
self.failure("Couldn't shut down %s" % node)
rc = self.rsh(node, "grep -e indirectly.*lost:.*[1-9] -e definitely.*lost:.*[1-9] -e (ERROR|error).*SUMMARY:.*[1-9].*errors %s" % self.logger.logPat, 0)
if rc != 1:
leaked.append(node)
self.failure("Valgrind errors detected on %s" % node)
(rc, output) = self.rsh(node, "grep -e lost: -e SUMMARY: %s" % self.logger.logPat, None)
for line in output:
self.logger.log(line)
(rc, output) = self.rsh(node, "cat %s" % self.logger.logPat, None)
for line in output:
self.debug(line)
self.rsh(node, "rm -f %s" % self.logger.logPat, None)
return leaked
def __call__(self, node):
#leaked = self.find_leaks()
#if len(leaked) > 0:
# return self.failure("Nodes %s leaked" % repr(leaked))
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"pacemaker-based.*: \*\*\*\*\*\*\*\*\*\*\*\*\*",
r"pacemaker-based.*: .* avoid confusing Valgrind",
r"HA_VALGRIND_ENABLED",
]
class StandbyLoopTest(ValgrindTest):
'''Check for memory leaks by putting a node in and out of standby for an hour'''
# @TODO This is not a useful test for memory leaks
def __init__(self, cm):
ValgrindTest.__init__(self,cm)
self.name = "StandbyLoop"
def __call__(self, node):
lpc = 0
delay = 2
failed = 0
done = time.time() + self.Env["loop-minutes"] * 60
while time.time() <= done and not failed:
lpc = lpc + 1
time.sleep(delay)
if not self.CM.SetStandbyMode(node, "on"):
self.failure("can't set node %s to standby mode" % node)
failed = lpc
time.sleep(delay)
if not self.CM.SetStandbyMode(node, "off"):
self.failure("can't set node %s to active mode" % node)
failed = lpc
leaked = self.find_leaks()
if failed:
return self.failure("Iteration %d failed" % failed)
elif len(leaked) > 0:
return self.failure("Nodes %s leaked" % repr(leaked))
return self.success()
#AllTestClasses.append(StandbyLoopTest)
class BandwidthTest(CTSTest):
# Tests should not be cluster-manager-specific
# If you need to find out cluster manager configuration to do this, then
# it should be added to the generic cluster manager API.
'''Test the bandwidth which the cluster uses'''
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "Bandwidth"
self.start = StartTest(cm)
self.__setitem__("min",0)
self.__setitem__("max",0)
self.__setitem__("totalbandwidth",0)
(handle, self.tempfile) = tempfile.mkstemp(".cts")
os.close(handle)
self.startall = SimulStartLite(cm)
def __call__(self, node):
'''Perform the Bandwidth test'''
self.incr("calls")
if self.CM.upcount() < 1:
return self.skipped()
Path = self.CM.InternalCommConfig()
if "ip" not in Path["mediatype"]:
return self.skipped()
port = Path["port"][0]
port = int(port)
ret = self.startall(None)
if not ret:
return self.failure("Test setup failed")
time.sleep(5) # We get extra messages right after startup.
fstmpfile = "/var/run/band_estimate"
dumpcmd = "tcpdump -p -n -c 102 -i any udp port %d > %s 2>&1" \
% (port, fstmpfile)
rc = self.rsh(node, dumpcmd)
if rc == 0:
farfile = "root@%s:%s" % (node, fstmpfile)
self.rsh.cp(farfile, self.tempfile)
Bandwidth = self.countbandwidth(self.tempfile)
if not Bandwidth:
self.logger.log("Could not compute bandwidth.")
return self.success()
intband = int(Bandwidth + 0.5)
self.logger.log("...bandwidth: %d bits/sec" % intband)
self.Stats["totalbandwidth"] = self.Stats["totalbandwidth"] + Bandwidth
if self.Stats["min"] == 0:
self.Stats["min"] = Bandwidth
if Bandwidth > self.Stats["max"]:
self.Stats["max"] = Bandwidth
if Bandwidth < self.Stats["min"]:
self.Stats["min"] = Bandwidth
self.rsh(node, "rm -f %s" % fstmpfile)
os.unlink(self.tempfile)
return self.success()
else:
return self.failure("no response from tcpdump command [%d]!" % rc)
def countbandwidth(self, file):
fp = open(file, "r")
fp.seek(0)
count = 0
sum = 0
while 1:
line = fp.readline()
if not line:
return None
if re.search("udp",line) or re.search("UDP,", line):
count = count + 1
linesplit = line.split(" ")
for j in range(len(linesplit)-1):
if linesplit[j] == "udp": break
if linesplit[j] == "length:": break
try:
sum = sum + int(linesplit[j+1])
except ValueError:
self.logger.log("Invalid tcpdump line: %s" % line)
return None
T1 = linesplit[0]
timesplit = T1.split(":")
time2split = timesplit[2].split(".")
time1 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001
break
while count < 100:
line = fp.readline()
if not line:
return None
if re.search("udp",line) or re.search("UDP,", line):
count = count+1
linessplit = line.split(" ")
for j in range(len(linessplit)-1):
if linessplit[j] == "udp": break
- if linesplit[j] == "length:": break
+ if linessplit[j] == "length:": break
try:
sum = int(linessplit[j+1]) + sum
except ValueError:
self.logger.log("Invalid tcpdump line: %s" % line)
return None
T2 = linessplit[0]
timesplit = T2.split(":")
time2split = timesplit[2].split(".")
time2 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001
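        # sum holds the packet bytes reported by tcpdump for the sampled UDP
        # traffic; multiply by 8 for bits and divide by the capture interval.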
        elapsed = time2 - time1
        if elapsed <= 0:
            return 0
        return int((sum * 8) / elapsed)
def is_applicable(self):
'''BandwidthTest never applicable'''
return 0
AllTestClasses.append(BandwidthTest)
###################################################################
class MaintenanceMode(CTSTest):
###################################################################
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "MaintenanceMode"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.max = 30
#self.is_unsafe = 1
self.benchmark = 1
self.action = "asyncmon"
self.interval = 0
self.rid = "maintenanceDummy"
def toggleMaintenanceMode(self, node, action):
pats = []
pats.append(self.templates["Pat:DC_IDLE"])
# fail the resource right after turning Maintenance mode on
# verify it is not recovered until maintenance mode is turned off
if action == "On":
pats.append(r"schedulerd.*:\s+warning:.*Processing failed %s of %s on" % (self.action, self.rid))
else:
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.debug("Turning maintenance mode %s" % action)
self.rsh(node, self.templates["MaintenanceMode%s" % (action)])
if (action == "On"):
self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))
self.set_timer("recover%s" % (action))
watch.lookforall()
self.log_timer("recover%s" % (action))
if watch.unmatched:
self.debug("Failed to find patterns when turning maintenance mode %s" % action)
return repr(watch.unmatched)
return ""
def insertMaintenanceDummy(self, node):
pats = []
pats.append(("%s.*" % node) + (self.templates["Pat:RscOpOK"] % ("start", self.rid)))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.CM.AddDummyRsc(node, self.rid)
self.set_timer("addDummy")
watch.lookforall()
self.log_timer("addDummy")
if watch.unmatched:
self.debug("Failed to find patterns when adding maintenance dummy resource")
return repr(watch.unmatched)
return ""
def removeMaintenanceDummy(self, node):
pats = []
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.CM.RemoveDummyRsc(node, self.rid)
self.set_timer("removeDummy")
watch.lookforall()
self.log_timer("removeDummy")
if watch.unmatched:
self.debug("Failed to find patterns when removing maintenance dummy resource")
return repr(watch.unmatched)
return ""
def managedRscList(self, node):
rscList = []
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
tmp = AuditResource(self.CM, line)
if tmp.managed():
rscList.append(tmp.id)
return rscList
def verifyResources(self, node, rscList, managed):
managedList = list(rscList)
managed_str = "managed"
if not managed:
managed_str = "unmanaged"
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
tmp = AuditResource(self.CM, line)
if managed and not tmp.managed():
continue
elif not managed and tmp.managed():
continue
elif managedList.count(tmp.id):
managedList.remove(tmp.id)
if len(managedList) == 0:
self.debug("Found all %s resources on %s" % (managed_str, node))
return True
self.logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, managedList))
return False
def __call__(self, node):
'''Perform the 'MaintenanceMode' test. '''
self.incr("calls")
verify_managed = False
verify_unmanaged = False
failPat = ""
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
# get a list of all the managed resources. We use this list
# after enabling maintenance mode to verify all managed resources
# become un-managed. After maintenance mode is turned off, we use
# this list to verify all the resources become managed again.
managedResources = self.managedRscList(node)
if len(managedResources) == 0:
self.logger.log("No managed resources on %s" % node)
return self.skipped()
# insert a fake resource we can fail during maintenance mode
# so we can verify recovery does not take place until after maintenance
# mode is disabled.
failPat = failPat + self.insertMaintenanceDummy(node)
# toggle maintenance mode ON, then fail dummy resource.
failPat = failPat + self.toggleMaintenanceMode(node, "On")
# verify all the resources are now unmanaged
if self.verifyResources(node, managedResources, False):
verify_unmanaged = True
# Toggle maintenance mode OFF, verify dummy is recovered.
failPat = failPat + self.toggleMaintenanceMode(node, "Off")
# verify all the resources are now managed again
if self.verifyResources(node, managedResources, True):
verify_managed = True
# Remove our maintenance dummy resource.
failPat = failPat + self.removeMaintenanceDummy(node)
self.CM.cluster_stable()
if failPat != "":
return self.failure("Unmatched patterns: %s" % (failPat))
elif verify_unmanaged is False:
return self.failure("Failed to verify resources became unmanaged during maintenance mode")
elif verify_managed is False:
return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode")
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"Updating failcount for %s" % self.rid,
r"schedulerd.*: Recover %s\s*\(.*\)" % self.rid,
r"Unknown operation: fail",
self.templates["Pat:RscOpOK"] % (self.action, self.rid),
r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
]
AllTestClasses.append(MaintenanceMode)
class ResourceRecover(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "ResourceRecover"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.max = 30
self.rid = None
self.rid_alt = None
#self.is_unsafe = 1
self.benchmark = 1
# these are the values used for the new LRM API call
self.action = "asyncmon"
self.interval = 0
def __call__(self, node):
'''Perform the 'ResourceRecover' test. '''
self.incr("calls")
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
resourcelist = self.CM.active_resources(node)
        # if the resource list is empty, return directly
if len(resourcelist) == 0:
self.logger.log("No active resources on %s" % node)
return self.skipped()
self.rid = self.Env.RandomGen.choice(resourcelist)
self.rid_alt = self.rid
rsc = None
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
tmp = AuditResource(self.CM, line)
if tmp.id == self.rid:
rsc = tmp
# Handle anonymous clones that get renamed
self.rid = rsc.clone_id
break
if not rsc:
return self.failure("Could not find %s in the resource list" % self.rid)
self.debug("Shooting %s aka. %s" % (rsc.clone_id, rsc.id))
pats = []
pats.append(r"schedulerd.*:\s+warning:.*Processing failed %s of (%s|%s) on" % (self.action,
rsc.id, rsc.clone_id))
if rsc.managed():
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
if rsc.unique():
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid))
else:
# Anonymous clones may get restarted with a different clone number
pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))
self.set_timer("recover")
watch.lookforall()
self.log_timer("recover")
self.CM.cluster_stable()
recovered = self.CM.ResourceLocation(self.rid)
if watch.unmatched:
return self.failure("Patterns not found: %s" % repr(watch.unmatched))
elif rsc.unique() and len(recovered) > 1:
return self.failure("%s is now active on more than one node: %s"%(self.rid, repr(recovered)))
elif len(recovered) > 0:
self.debug("%s is running on: %s" % (self.rid, repr(recovered)))
elif rsc.managed():
return self.failure("%s was not recovered and is inactive" % self.rid)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"Updating failcount for %s" % self.rid,
r"schedulerd.*: Recover (%s|%s)\s*\(.*\)" % (self.rid, self.rid_alt),
r"Unknown operation: fail",
self.templates["Pat:RscOpOK"] % (self.action, self.rid),
r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
]
AllTestClasses.append(ResourceRecover)
class ComponentFail(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "ComponentFail"
# TODO make this work correctly in docker.
self.is_docker_unsafe = 1
self.startall = SimulStartLite(cm)
self.complist = cm.Components()
self.patterns = []
self.okerrpatterns = []
self.is_unsafe = 1
def __call__(self, node):
'''Perform the 'ComponentFail' test. '''
self.incr("calls")
self.patterns = []
self.okerrpatterns = []
# start all nodes
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
if not self.CM.cluster_stable(self.Env["StableTime"]):
return self.failure("Setup failed - unstable")
node_is_dc = self.CM.is_node_dc(node, None)
# select a component to kill
chosen = self.Env.RandomGen.choice(self.complist)
while chosen.dc_only == 1 and node_is_dc == 0:
chosen = self.Env.RandomGen.choice(self.complist)
self.debug("...component %s (dc=%d,boot=%d)" % (chosen.name, node_is_dc,chosen.triggersreboot))
self.incr(chosen.name)
if chosen.name != "corosync":
self.patterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name))
self.patterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name))
self.patterns.extend(chosen.pats)
if node_is_dc:
self.patterns.extend(chosen.dc_pats)
if chosen.name == "pacemaker-fenced":
# Ignore actions for STONITH resources
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rclass == "stonith":
self.okerrpatterns.append(self.templates["Pat:Fencing_recover"] % r.id)
# supply a copy so self.patterns doesn't end up empty
tmpPats = []
tmpPats.extend(self.patterns)
self.patterns.extend(chosen.badnews_ignore)
        # Look for STONITH ops; depending on Env["at-boot"], we might need to change the node's status
stonithPats = []
stonithPats.append(self.templates["Pat:Fencing_ok"] % node)
stonith = self.create_watch(stonithPats, 0)
stonith.setwatch()
# set the watch for stable
watch = self.create_watch(
tmpPats, self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"])
watch.setwatch()
# kill the component
chosen.kill(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
self.debug("Waiting for any fenced node to come back up")
self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
self.debug("Waiting for the cluster to re-stabilize with all nodes")
self.CM.cluster_stable(self.Env["StartTime"])
self.debug("Checking if %s was shot" % node)
shot = stonith.look(60)
if shot:
self.debug("Found: " + repr(shot))
self.okerrpatterns.append(self.templates["Pat:Fencing_start"] % node)
if self.Env["at-boot"] == 0:
self.CM.ShouldBeStatus[node] = "down"
            # If fencing occurred, chances are many (if not all) of the
            # expected logs will not be sent - or will be lost when the
            # node reboots
return self.success()
# check for logs indicating a graceful recovery
matched = watch.lookforall(allow_multiple_matches=1)
if watch.unmatched:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
self.debug("Waiting for the cluster to re-stabilize with all nodes")
is_stable = self.CM.cluster_stable(self.Env["StartTime"])
if not matched:
return self.failure("Didn't find all expected %s patterns" % chosen.name)
elif not is_stable:
return self.failure("Cluster did not become stable after killing %s" % chosen.name)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
# Note that okerrpatterns refers to the last time we ran this test
# The good news is that this works fine for us...
self.okerrpatterns.extend(self.patterns)
return self.okerrpatterns
AllTestClasses.append(ComponentFail)
class SplitBrainTest(CTSTest):
    '''Test split-brain handling: break the communication path between the
    nodes, then check whether both sides try to take over the same resources'''
def __init__(self,cm):
CTSTest.__init__(self,cm)
self.name = "SplitBrain"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.is_experimental = 1
def isolate_partition(self, partition):
other_nodes = []
other_nodes.extend(self.Env["nodes"])
for node in partition:
try:
other_nodes.remove(node)
except ValueError:
self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]) + " from " +repr(partition))
if len(other_nodes) == 0:
return 1
self.debug("Creating partition: " + repr(partition))
self.debug("Everyone else: " + repr(other_nodes))
for node in partition:
if not self.CM.isolate_node(node, other_nodes):
self.logger.log("Could not isolate %s" % node)
return 0
return 1
def heal_partition(self, partition):
other_nodes = []
other_nodes.extend(self.Env["nodes"])
for node in partition:
try:
other_nodes.remove(node)
except ValueError:
self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]))
if len(other_nodes) == 0:
return 1
self.debug("Healing partition: " + repr(partition))
self.debug("Everyone else: " + repr(other_nodes))
for node in partition:
self.CM.unisolate_node(node, other_nodes)
def __call__(self, node):
'''Perform split-brain test'''
self.incr("calls")
self.passed = 1
partitions = {}
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
while 1:
# Retry until we get multiple partitions
partitions = {}
p_max = len(self.Env["nodes"])
for node in self.Env["nodes"]:
p = self.Env.RandomGen.randint(1, p_max)
if not p in partitions:
partitions[p] = []
partitions[p].append(node)
p_max = len(list(partitions.keys()))
if p_max > 1:
break
# else, try again
self.debug("Created %d partitions" % p_max)
for key in list(partitions.keys()):
self.debug("Partition["+str(key)+"]:\t"+repr(partitions[key]))
# Disabling STONITH to reduce test complexity for now
self.rsh(node, "crm_attribute -V -n stonith-enabled -v false")
for key in list(partitions.keys()):
self.isolate_partition(partitions[key])
count = 30
while count > 0:
            if len(self.CM.find_partitions()) != p_max:
                time.sleep(10)
                count -= 1
            else:
                break
else:
self.failure("Expected partitions were not created")
# Target number of partitions formed - wait for stability
if not self.CM.cluster_stable():
self.failure("Partitioned cluster not stable")
# Now audit the cluster state
self.CM.partitions_expected = p_max
if not self.audit():
self.failure("Audits failed")
self.CM.partitions_expected = 1
# And heal them again
for key in list(partitions.keys()):
self.heal_partition(partitions[key])
# Wait for a single partition to form
count = 30
while count > 0:
if len(self.CM.find_partitions()) != 1:
time.sleep(10)
count -= 1
else:
break
else:
self.failure("Cluster did not reform")
# Wait for it to have the right number of members
count = 30
while count > 0:
members = []
partitions = self.CM.find_partitions()
if len(partitions) > 0:
members = partitions[0].split()
if len(members) != len(self.Env["nodes"]):
time.sleep(10)
count -= 1
else:
break
else:
self.failure("Cluster did not completely reform")
        # Wait up to 20 minutes - the delay is preferable to trying to
        # continue in a messed-up state
if not self.CM.cluster_stable(1200):
self.failure("Reformed cluster not stable")
if self.Env["continue"] == 1:
answer = "Y"
else:
try:
answer = input_wrapper('Continue? [nY]')
except EOFError as e:
answer = "n"
if answer and answer == "n":
raise ValueError("Reformed cluster not stable")
# Turn fencing back on
if self.Env["DoFencing"]:
self.rsh(node, "crm_attribute -V -D -n stonith-enabled")
self.CM.cluster_stable()
if self.passed:
return self.success()
return self.failure("See previous errors")
def errorstoignore(self):
'''Return list of errors which are 'normal' and should be ignored'''
return [
r"Another DC detected:",
r"(ERROR|error).*: .*Application of an update diff failed",
r"pacemaker-controld.*:.*not in our membership list",
r"CRIT:.*node.*returning after partition",
]
def is_applicable(self):
if not self.is_applicable_common():
return 0
return len(self.Env["nodes"]) > 2
AllTestClasses.append(SplitBrainTest)
class Reattach(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Reattach"
self.startall = SimulStartLite(cm)
self.restart1 = RestartTest(cm)
self.stopall = SimulStopLite(cm)
self.is_unsafe = 0 # Handled by canrunnow()
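    # A note on the check below: crm_attribute -G queries a value, -q prints
    # only the value itself, and -d supplies the default ("true") returned
    # when the rsc_defaults is-managed attribute is unset.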
def _is_managed(self, node):
is_managed = self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", 1)
is_managed = is_managed[:-1] # Strip off the newline
return is_managed == "true"
def _set_unmanaged(self, node):
self.debug("Disable resource management")
self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false")
def _set_managed(self, node):
self.debug("Re-enable resource management")
self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D")
def setup(self, node):
attempt = 0
if not self.startall(None):
return None
# Make sure we are really _really_ stable and that all
# resources, including those that depend on transient node
# attributes, are started
while not self.CM.cluster_stable(double_check=True):
if attempt < 5:
attempt += 1
self.debug("Not stable yet, re-testing")
else:
self.logger.log("Cluster is not stable")
return None
return 1
def teardown(self, node):
# Make sure 'node' is up
start = StartTest(self.CM)
start(node)
if not self._is_managed(node):
self.logger.log("Attempting to re-enable resource management on %s" % node)
self._set_managed(node)
self.CM.cluster_stable()
if not self._is_managed(node):
self.logger.log("Could not re-enable resource management")
return 0
return 1
def canrunnow(self, node):
'''Return TRUE if we can meaningfully run right now'''
if self.find_ocfs2_resources(node):
self.logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present")
return 0
return 1
def __call__(self, node):
self.incr("calls")
pats = []
# Conveniently, the scheduler will display this message when disabling
# management, even if fencing is not enabled, so we can rely on it.
managed = self.create_watch(["Delaying fencing operations"], 60)
managed.setwatch()
self._set_unmanaged(node)
if not managed.lookforall():
self.logger.log("Patterns not found: " + repr(managed.unmatched))
return self.failure("Resource management not disabled")
pats = []
pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("stop", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("promote", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("demote", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("migrate", ".*"))
watch = self.create_watch(pats, 60, "ShutdownActivity")
watch.setwatch()
self.debug("Shutting down the cluster")
ret = self.stopall(None)
if not ret:
self._set_managed(node)
return self.failure("Couldn't shut down the cluster")
self.debug("Bringing the cluster back up")
ret = self.startall(None)
time.sleep(5) # allow ping to update the CIB
if not ret:
self._set_managed(node)
return self.failure("Couldn't restart the cluster")
if self.local_badnews("ResourceActivity:", watch):
self._set_managed(node)
return self.failure("Resources stopped or started during cluster restart")
watch = self.create_watch(pats, 60, "StartupActivity")
watch.setwatch()
# Re-enable resource management (and verify it happened).
self._set_managed(node)
self.CM.cluster_stable()
if not self._is_managed(node):
return self.failure("Could not re-enable resource management")
# Ignore actions for STONITH resources
ignore = []
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rclass == "stonith":
self.debug("Ignoring start actions for %s" % r.id)
ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id))
if self.local_badnews("ResourceActivity:", watch, ignore):
return self.failure("Resources stopped or started after resource management was re-enabled")
        return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"resource( was|s were) active at shutdown",
]
def is_applicable(self):
return 1
AllTestClasses.append(Reattach)
class SpecialTest1(CTSTest):
'''Set up a custom test to cause quorum failure issues for Andrew'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SpecialTest1"
self.startall = SimulStartLite(cm)
self.restart1 = RestartTest(cm)
self.stopall = SimulStopLite(cm)
def __call__(self, node):
'''Perform the 'SpecialTest1' test for Andrew. '''
self.incr("calls")
# Shut down all the nodes...
ret = self.stopall(None)
if not ret:
return self.failure("Could not stop all nodes")
# Test config recovery when the other nodes come up
self.rsh(node, "rm -f "+CTSvars.CRM_CONFIG_DIR+"/cib*")
# Start the selected node
ret = self.restart1(node)
if not ret:
return self.failure("Could not start "+node)
# Start all remaining nodes
ret = self.startall(None)
if not ret:
return self.failure("Could not start the remaining nodes")
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
# Errors that occur as a result of the CIB being wiped
return [
r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed",
r"error.*: Resource start-up disabled since no STONITH resources have been defined",
r"error.*: Either configure some or disable STONITH with the stonith-enabled option",
r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity",
]
AllTestClasses.append(SpecialTest1)
class HAETest(CTSTest):
    '''Base class for tests of the HA Extension (DLM/O2CB/OCFS2) stack'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "HAETest"
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
self.is_loop = 1
def setup(self, node):
# Start all remaining nodes
ret = self.startall(None)
if not ret:
return self.failure("Couldn't start all nodes")
return self.success()
def teardown(self, node):
# Stop everything
ret = self.stopall(None)
if not ret:
return self.failure("Couldn't stop all nodes")
return self.success()
def wait_on_state(self, node, resource, expected_clones, attempts=240):
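        # Poll "crm_resource -W -Q" about once a second until the resource
        # reports the expected number of active instances, or give up after
        # 'attempts' tries.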
while attempts > 0:
active = 0
(rc, lines) = self.rsh(node, "crm_resource -r %s -W -Q" % resource, stdout=None)
# Hack until crm_resource does the right thing
if rc == 0 and lines:
active = len(lines)
if len(lines) == expected_clones:
return 1
elif rc == 1:
self.debug("Resource %s is still inactive" % resource)
elif rc == 234:
self.logger.log("Unknown resource %s" % resource)
return 0
elif rc == 246:
self.logger.log("Cluster is inactive")
return 0
elif rc != 0:
self.logger.log("Call to crm_resource failed, rc=%d" % rc)
return 0
else:
self.debug("Resource %s is active on %d times instead of %d" % (resource, active, expected_clones))
attempts -= 1
time.sleep(1)
return 0
def find_dlm(self, node):
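        # The DLM resource uses the ocf:pacemaker:controld agent, so scan the
        # resource list for a "controld" resource that is part of a clone
        # (i.e. r.parent != "NA").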
self.r_dlm = None
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rtype == "controld" and r.parent != "NA":
self.debug("Found dlm: %s" % self.r_dlm)
self.r_dlm = r.parent
return 1
return 0
def find_hae_resources(self, node):
self.r_dlm = None
self.r_o2cb = None
self.r_ocfs2 = []
if self.find_dlm(node):
self.find_ocfs2_resources(node)
def is_applicable(self):
if not self.is_applicable_common():
return 0
if self.Env["Schema"] == "hae":
return 1
return None
class HAERoleTest(HAETest):
def __init__(self, cm):
'''Lars' mount/unmount test for the HA extension. '''
HAETest.__init__(self,cm)
self.name = "HAERoleTest"
def change_state(self, node, resource, target):
rc = self.rsh(node, "crm_resource -V -r %s -p target-role -v %s --meta" % (resource, target))
return rc
def __call__(self, node):
self.incr("calls")
lpc = 0
failed = 0
delay = 2
done = time.time() + self.Env["loop-minutes"]*60
self.find_hae_resources(node)
clone_max = len(self.Env["nodes"])
while time.time() <= done and not failed:
lpc = lpc + 1
self.change_state(node, self.r_dlm, "Stopped")
if not self.wait_on_state(node, self.r_dlm, 0):
self.failure("%s did not go down correctly" % self.r_dlm)
failed = lpc
self.change_state(node, self.r_dlm, "Started")
if not self.wait_on_state(node, self.r_dlm, clone_max):
self.failure("%s did not come up correctly" % self.r_dlm)
failed = lpc
if not self.wait_on_state(node, self.r_o2cb, clone_max):
self.failure("%s did not come up correctly" % self.r_o2cb)
failed = lpc
for fs in self.r_ocfs2:
if not self.wait_on_state(node, fs, clone_max):
self.failure("%s did not come up correctly" % fs)
failed = lpc
if failed:
return self.failure("iteration %d failed" % failed)
return self.success()
AllTestClasses.append(HAERoleTest)
class HAEStandbyTest(HAETest):
    '''Toggle node standby status and verify that the HAE resources move correctly'''
def __init__(self, cm):
HAETest.__init__(self,cm)
self.name = "HAEStandbyTest"
def change_state(self, node, resource, target):
rc = self.rsh(node, "crm_standby -V -l reboot -v %s" % (target))
return rc
def __call__(self, node):
self.incr("calls")
lpc = 0
failed = 0
done = time.time() + self.Env["loop-minutes"]*60
self.find_hae_resources(node)
clone_max = len(self.Env["nodes"])
while time.time() <= done and not failed:
lpc = lpc + 1
self.change_state(node, self.r_dlm, "true")
if not self.wait_on_state(node, self.r_dlm, clone_max-1):
self.failure("%s did not go down correctly" % self.r_dlm)
failed = lpc
self.change_state(node, self.r_dlm, "false")
if not self.wait_on_state(node, self.r_dlm, clone_max):
self.failure("%s did not come up correctly" % self.r_dlm)
failed = lpc
if not self.wait_on_state(node, self.r_o2cb, clone_max):
self.failure("%s did not come up correctly" % self.r_o2cb)
failed = lpc
for fs in self.r_ocfs2:
if not self.wait_on_state(node, fs, clone_max):
self.failure("%s did not come up correctly" % fs)
failed = lpc
if failed:
return self.failure("iteration %d failed" % failed)
return self.success()
AllTestClasses.append(HAEStandbyTest)
class NearQuorumPointTest(CTSTest):
'''
This test brings larger clusters near the quorum point (50%).
In addition, it will test doing starts and stops at the same time.
Here is how I think it should work:
- loop over the nodes and decide randomly which will be up and which
      will be down. Use a 50% probability for each of up/down.
- figure out what to do to get into that state from the current state
- in parallel, bring up those going up and bring those going down.
'''
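    # Illustrative sketch of the split performed in __call__ below: each
    # node independently draws "start" or "stop" (roughly 50% each), which
    # yields the up/down sets described above.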
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "NearQuorumPoint"
def __call__(self, dummy):
'''Perform the 'NearQuorumPoint' test. '''
self.incr("calls")
startset = []
stopset = []
stonith = self.CM.prepare_fencing_watcher("NearQuorumPoint")
#decide what to do with each node
for node in self.Env["nodes"]:
action = self.Env.RandomGen.choice(["start","stop"])
#action = self.Env.RandomGen.choice(["start","stop","no change"])
if action == "start" :
startset.append(node)
elif action == "stop" :
stopset.append(node)
self.debug("start nodes:" + repr(startset))
self.debug("stop nodes:" + repr(stopset))
#add search patterns
watchpats = [ ]
for node in stopset:
if self.CM.ShouldBeStatus[node] == "up":
watchpats.append(self.templates["Pat:We_stopped"] % node)
for node in startset:
if self.CM.ShouldBeStatus[node] == "down":
#watchpats.append(self.templates["Pat:NonDC_started"] % node)
watchpats.append(self.templates["Pat:Local_started"] % node)
else:
for stopping in stopset:
if self.CM.ShouldBeStatus[stopping] == "up":
watchpats.append(self.templates["Pat:They_stopped"] % (node, self.CM.key_for_node(stopping)))
if len(watchpats) == 0:
return self.skipped()
if len(startset) != 0:
watchpats.append(self.templates["Pat:DC_IDLE"])
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
#begin actions
for node in stopset:
if self.CM.ShouldBeStatus[node] == "up":
self.CM.StopaCMnoBlock(node)
for node in startset:
if self.CM.ShouldBeStatus[node] == "down":
self.CM.StartaCMnoBlock(node)
#get the result
if watch.lookforall():
self.CM.cluster_stable()
self.CM.fencing_cleanup("NearQuorumPoint", stonith)
return self.success()
self.logger.log("Warn: Patterns not found: " + repr(watch.unmatched))
#get the "bad" nodes
upnodes = []
for node in stopset:
if self.CM.StataCM(node) == 1:
upnodes.append(node)
downnodes = []
for node in startset:
if self.CM.StataCM(node) == 0:
downnodes.append(node)
self.CM.fencing_cleanup("NearQuorumPoint", stonith)
if upnodes == [] and downnodes == []:
self.CM.cluster_stable()
            # Make sure they're completely down with no residue
for node in stopset:
self.rsh(node, self.templates["StopCmd"])
return self.success()
if len(upnodes) > 0:
self.logger.log("Warn: Unstoppable nodes: " + repr(upnodes))
if len(downnodes) > 0:
self.logger.log("Warn: Unstartable nodes: " + repr(downnodes))
return self.failure()
def is_applicable(self):
return 1
AllTestClasses.append(NearQuorumPointTest)
class RollingUpgradeTest(CTSTest):
'''Perform a rolling upgrade of the cluster'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "RollingUpgrade"
self.start = StartTest(cm)
self.stop = StopTest(cm)
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
def setup(self, node):
# Start all remaining nodes
ret = self.stopall(None)
if not ret:
return self.failure("Couldn't stop all nodes")
for node in self.Env["nodes"]:
if not self.downgrade(node, None):
return self.failure("Couldn't downgrade %s" % node)
ret = self.startall(None)
if not ret:
return self.failure("Couldn't start all nodes")
return self.success()
def teardown(self, node):
# Stop everything
ret = self.stopall(None)
if not ret:
return self.failure("Couldn't stop all nodes")
for node in self.Env["nodes"]:
if not self.upgrade(node, None):
return self.failure("Couldn't upgrade %s" % node)
return self.success()
def install(self, node, version, start=1, flags="--force"):
target_dir = "/tmp/rpm-%s" % version
src_dir = "%s/%s" % (self.Env["rpm-dir"], version)
self.logger.log("Installing %s on %s with %s" % (version, node, flags))
if not self.stop(node):
return self.failure("stop failure: "+node)
rc = self.rsh(node, "mkdir -p %s" % target_dir)
rc = self.rsh(node, "rm -f %s/*.rpm" % target_dir)
(rc, lines) = self.rsh(node, "ls -1 %s/*.rpm" % src_dir, None)
for line in lines:
line = line[:-1]
rc = self.rsh.cp("%s" % (line), "%s:%s/" % (node, target_dir))
rc = self.rsh(node, "rpm -Uvh %s %s/*.rpm" % (flags, target_dir))
if start and not self.start(node):
return self.failure("start failure: "+node)
return self.success()
def upgrade(self, node, start=1):
return self.install(node, self.Env["current-version"], start)
def downgrade(self, node, start=1):
return self.install(node, self.Env["previous-version"], start, "--force --nodeps")
def __call__(self, node):
'''Perform the 'Rolling Upgrade' test. '''
self.incr("calls")
for node in self.Env["nodes"]:
            if not self.upgrade(node):
return self.failure("Couldn't upgrade %s" % node)
self.CM.cluster_stable()
return self.success()
def is_applicable(self):
if not self.is_applicable_common():
return None
if not "rpm-dir" in list(self.Env.keys()):
return None
if not "current-version" in list(self.Env.keys()):
return None
if not "previous-version" in list(self.Env.keys()):
return None
return 1
# Register RestartTest as a good test to run
AllTestClasses.append(RollingUpgradeTest)
class BSC_AddResource(CTSTest):
'''Add a resource to the cluster'''
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "AddResource"
self.resource_offset = 0
self.cib_cmd = """cibadmin -C -o %s -X '%s' """
def __call__(self, node):
self.incr("calls")
self.resource_offset = self.resource_offset + 1
r_id = "bsc-rsc-%s-%d" % (node, self.resource_offset)
start_pat = "pacemaker-controld.*%s_start_0.*confirmed.*ok"
patterns = []
patterns.append(start_pat % r_id)
watch = self.create_watch(patterns, self.Env["DeadTime"])
watch.setwatch()
ip = self.NextIP()
if not self.make_ip_resource(node, r_id, "ocf", "IPaddr", ip):
return self.failure("Make resource %s failed" % r_id)
failed = 0
watch_result = watch.lookforall()
if watch.unmatched:
for regex in watch.unmatched:
self.logger.log ("Warn: Pattern not found: %s" % (regex))
failed = 1
if failed:
return self.failure("Resource pattern(s) not found")
if not self.CM.cluster_stable(self.Env["DeadTime"]):
return self.failure("Unstable cluster")
return self.success()
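    # NextIP() bumps the last group of Env["IPBase"]; for example
    # (illustrative only), "10.0.0.10" becomes "10.0.0.11" and, for IPv6,
    # "fdc0::10" becomes "fdc0::11" via a hex increment.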
    def NextIP(self):
        ip = self.Env["IPBase"]
        if ":" in ip:
            # Bump the last hex group of an IPv6 address
            (prefix, sep, suffix) = ip.rpartition(":")
            suffix = format(int(suffix, 16) + 1, "x")
        else:
            # Bump the last octet of an IPv4 address
            (prefix, sep, suffix) = ip.rpartition(".")
            suffix = str(int(suffix) + 1)
        ip = prefix + sep + suffix
        self.Env["IPBase"] = ip
        return ip.strip()
def make_ip_resource(self, node, id, rclass, type, ip):
self.logger.log("Creating %s::%s:%s (%s) on %s" % (rclass,type,id,ip,node))
rsc_xml="""
<primitive id="%s" class="%s" type="%s" provider="heartbeat">
<instance_attributes id="%s"><attributes>
<nvpair id="%s" name="ip" value="%s"/>
</attributes></instance_attributes>
</primitive>""" % (id, rclass, type, id, id, ip)
node_constraint = """
<rsc_location id="run_%s" rsc="%s">
<rule id="pref_run_%s" score="100">
<expression id="%s_loc_expr" attribute="#uname" operation="eq" value="%s"/>
</rule>
</rsc_location>""" % (id, id, id, id, node)
rc = 0
(rc, lines) = self.rsh(node, self.cib_cmd % ("constraints", node_constraint), None)
if rc != 0:
self.logger.log("Constraint creation failed: %d" % rc)
return None
(rc, lines) = self.rsh(node, self.cib_cmd % ("resources", rsc_xml), None)
if rc != 0:
self.logger.log("Resource creation failed: %d" % rc)
return None
return 1
def is_applicable(self):
if self.Env["DoBSC"]:
return 1
return None
AllTestClasses.append(BSC_AddResource)
class SimulStopLite(CTSTest):
'''Stop any active nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStopLite"
def __call__(self, dummy):
'''Perform the 'SimulStopLite' setup work. '''
self.incr("calls")
self.debug("Setup: " + self.name)
# We ignore the "node" parameter...
watchpats = [ ]
for node in self.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == "up":
self.incr("WasStarted")
watchpats.append(self.templates["Pat:We_stopped"] % node)
if len(watchpats) == 0:
return self.success()
# Stop all the nodes - at about the same time...
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
self.set_timer()
for node in self.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == "up":
self.CM.StopaCMnoBlock(node)
if watch.lookforall():
            # Make sure they're completely down with no residue
for node in self.Env["nodes"]:
self.rsh(node, self.templates["StopCmd"])
return self.success()
did_fail = 0
up_nodes = []
for node in self.Env["nodes"]:
if self.CM.StataCM(node) == 1:
did_fail = 1
up_nodes.append(node)
if did_fail:
return self.failure("Active nodes exist: " + repr(up_nodes))
self.logger.log("Warn: All nodes stopped but CTS didnt detect: "
+ repr(watch.unmatched))
return self.failure("Missing log message: "+repr(watch.unmatched))
def is_applicable(self):
'''SimulStopLite is a setup test and never applicable'''
return 0
class SimulStartLite(CTSTest):
'''Start any stopped nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStartLite"
def __call__(self, dummy):
        '''Perform the 'SimulStartLite' setup work. '''
self.incr("calls")
self.debug("Setup: " + self.name)
# We ignore the "node" parameter...
node_list = []
for node in self.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == "down":
self.incr("WasStopped")
node_list.append(node)
self.set_timer()
while len(node_list) > 0:
# Repeat until all nodes come up
watchpats = [ ]
uppat = self.templates["Pat:NonDC_started"]
if self.CM.upcount() == 0:
uppat = self.templates["Pat:Local_started"]
watchpats.append(self.templates["Pat:DC_IDLE"])
for node in node_list:
watchpats.append(uppat % node)
watchpats.append(self.templates["Pat:InfraUp"] % node)
watchpats.append(self.templates["Pat:PacemakerUp"] % node)
# Start all the nodes - at about the same time...
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
stonith = self.CM.prepare_fencing_watcher(self.name)
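            # prepare_fencing_watcher() lets fencing_cleanup() below report
            # any nodes that were fenced while starting up, so they can be
            # retried on the next pass of this loop.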
for node in node_list:
self.CM.StartaCMnoBlock(node)
watch.lookforall()
node_list = self.CM.fencing_cleanup(self.name, stonith)
            if node_list is None:
return self.failure("Cluster did not stabilize")
# Remove node_list messages from watch.unmatched
for node in node_list:
self.logger.debug("Dealing with stonith operations for %s" % repr(node_list))
if watch.unmatched:
                    try:
                        watch.unmatched.remove(uppat % node)
                    except ValueError:
                        self.debug("Already matched: %s" % (uppat % node))
                    try:
                        watch.unmatched.remove(self.templates["Pat:InfraUp"] % node)
                    except ValueError:
                        self.debug("Already matched: %s" % (self.templates["Pat:InfraUp"] % node))
                    try:
                        watch.unmatched.remove(self.templates["Pat:PacemakerUp"] % node)
                    except ValueError:
                        self.debug("Already matched: %s" % (self.templates["Pat:PacemakerUp"] % node))
if watch.unmatched:
for regex in watch.unmatched:
self.logger.log ("Warn: Startup pattern not found: %s" %(regex))
if not self.CM.cluster_stable():
return self.failure("Cluster did not stabilize")
did_fail = 0
unstable = []
for node in self.Env["nodes"]:
if self.CM.StataCM(node) == 0:
did_fail = 1
unstable.append(node)
if did_fail:
return self.failure("Unstarted nodes exist: " + repr(unstable))
unstable = []
for node in self.Env["nodes"]:
if not self.CM.node_stable(node):
did_fail = 1
unstable.append(node)
if did_fail:
return self.failure("Unstable cluster nodes exist: " + repr(unstable))
return self.success()
def is_applicable(self):
'''SimulStartLite is a setup test and never applicable'''
return 0
def TestList(cm, audits):
result = []
for testclass in AllTestClasses:
bound_test = testclass(cm)
if bound_test.is_applicable():
bound_test.Audits = audits
result.append(bound_test)
return result
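# Hypothetical driver-side usage (names assumed from the CTS lab scripts):
#   audits = CTSaudits.AuditList(cm)
#   tests = TestList(cm, audits)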
class RemoteLXC(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "RemoteLXC"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.num_containers = 2
self.is_container = 1
self.is_docker_unsafe = 1
self.failed = 0
self.fail_string = ""
def start_lxc_simple(self, node):
        # restore any artifacts lying around from a previous test.
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
# generate the containers, put them in the config, add some resources to them
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc1"))
pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc2"))
pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc-ms"))
pats.append(self.templates["Pat:RscOpOK"] % ("promote", "lxc-ms"))
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -g -a -m -s -c %d &>/dev/null" % self.num_containers)
self.set_timer("remoteSimpleInit")
watch.lookforall()
self.log_timer("remoteSimpleInit")
if watch.unmatched:
self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
self.failed = 1
def cleanup_lxc_simple(self, node):
pats = [ ]
# if the test failed, attempt to clean up the cib and libvirt environment
# as best as possible
if self.failed == 1:
# restore libvirt and cib
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
return
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container1"))
pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container2"))
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -p &>/dev/null")
self.set_timer("remoteSimpleCleanup")
watch.lookforall()
self.log_timer("remoteSimpleCleanup")
if watch.unmatched:
self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
self.failed = 1
# cleanup libvirt
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
def __call__(self, node):
'''Perform the 'RemoteLXC' test. '''
self.incr("calls")
ret = self.startall(None)
if not ret:
return self.failure("Setup failed, start all nodes failed.")
rc = self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v &>/dev/null")
if rc == 1:
self.log("Environment test for lxc support failed.")
return self.skipped()
self.start_lxc_simple(node)
self.cleanup_lxc_simple(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed == 1:
return self.failure(self.fail_string)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"Updating failcount for ping",
r"schedulerd.*: Recover (ping|lxc-ms|container)\s*\(.*\)",
# The orphaned lxc-ms resource causes an expected transition error
# that is a result of the scheduler not having knowledge that the
# promotable resource used to be a clone. As a result, it looks like that
# resource is running in multiple locations when it shouldn't... But in
# this instance we know why this error is occurring and that it is expected.
r"Calculated [Tt]ransition .*pe-error",
r"Resource lxc-ms .* is active on 2 nodes attempting recovery",
r"Unknown operation: fail",
r"VirtualDomain.*ERROR: Unable to determine emulator",
]
AllTestClasses.append(RemoteLXC)
class RemoteDriver(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = self.__class__.__name__
self.is_docker_unsafe = 1
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.stop = StopTest(cm)
self.remote_rsc = "remote-rsc"
self.cib_cmd = """cibadmin -C -o %s -X '%s' """
self.reset()
def reset(self):
self.pcmk_started = 0
self.failed = False
self.fail_string = ""
self.remote_node_added = 0
self.remote_rsc_added = 0
self.remote_use_reconnect_interval = self.Env.RandomGen.choice([True,False])
def fail(self, msg):
""" Mark test as failed. """
self.failed = True
# Always log the failure.
self.logger.log(msg)
# Use first failure as test status, as it's likely to be most useful.
if not self.fail_string:
self.fail_string = msg
def get_othernode(self, node):
for othernode in self.Env["nodes"]:
if othernode == node:
                # we don't want to try to use the CIB on the node we just
                # shut down; find a cluster node that is not our soon-to-be
                # remote node.
continue
else:
return othernode
def del_rsc(self, node, rsc):
othernode = self.get_othernode(node)
rc = self.rsh(othernode, "crm_resource -D -r %s -t primitive" % (rsc))
if rc != 0:
self.fail("Removal of resource '%s' failed" % rsc)
def add_rsc(self, node, rsc_xml):
othernode = self.get_othernode(node)
rc = self.rsh(othernode, self.cib_cmd % ("resources", rsc_xml))
if rc != 0:
self.fail("resource creation failed")
def add_primitive_rsc(self, node):
rsc_xml = """
<primitive class="ocf" id="%s" provider="heartbeat" type="Dummy">
<operations>
<op id="remote-rsc-monitor-interval-10s" interval="10s" name="monitor"/>
</operations>
<meta_attributes id="remote-meta_attributes"/>
</primitive>""" % (self.remote_rsc)
self.add_rsc(node, rsc_xml)
if not self.failed:
self.remote_rsc_added = 1
def add_connection_rsc(self, node):
if self.remote_use_reconnect_interval:
# use reconnect interval and make sure to set cluster-recheck-interval as well.
rsc_xml = """
<primitive class="ocf" id="%s" provider="pacemaker" type="remote">
<instance_attributes id="remote-instance_attributes"/>
<instance_attributes id="remote-instance_attributes">
<nvpair id="remote-instance_attributes-server" name="server" value="%s"/>
<nvpair id="remote-instance_attributes-reconnect_interval" name="reconnect_interval" value="60s"/>
</instance_attributes>
<operations>
<op id="remote-monitor-interval-60s" interval="60s" name="monitor"/>
<op id="remote-name-start-interval-0-timeout-120" interval="0" name="start" timeout="60"/>
</operations>
</primitive>""" % (self.remote_node, node)
self.rsh(self.get_othernode(node), self.templates["SetCheckInterval"] % ("45s"))
else:
# not using reconnect interval
rsc_xml = """
<primitive class="ocf" id="%s" provider="pacemaker" type="remote">
<instance_attributes id="remote-instance_attributes"/>
<instance_attributes id="remote-instance_attributes">
<nvpair id="remote-instance_attributes-server" name="server" value="%s"/>
</instance_attributes>
<operations>
<op id="remote-monitor-interval-60s" interval="60s" name="monitor"/>
<op id="remote-name-start-interval-0-timeout-120" interval="0" name="start" timeout="120"/>
</operations>
</primitive>""" % (self.remote_node, node)
self.add_rsc(node, rsc_xml)
if not self.failed:
self.remote_node_added = 1
def stop_pcmk_remote(self, node):
# disable pcmk remote
for i in range(10):
rc = self.rsh(node, "service pacemaker_remote stop")
if rc != 0:
time.sleep(6)
else:
break
def start_pcmk_remote(self, node):
for i in range(10):
rc = self.rsh(node, "service pacemaker_remote start")
if rc != 0:
time.sleep(6)
else:
self.pcmk_started = 1
break
def kill_pcmk_remote(self, node):
""" Simulate a Pacemaker Remote daemon failure. """
# We kill the process to prevent a graceful stop,
# then stop it to prevent the OS from restarting it.
self.rsh(node, "killall -9 pacemaker-remoted")
self.stop_pcmk_remote(node)
def start_metal(self, node):
pcmk_started = 0
# make sure the resource doesn't already exist for some reason
self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_rsc))
self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_node))
if not self.stop(node):
self.fail("Failed to shutdown cluster node %s" % node)
return
self.start_pcmk_remote(node)
if self.pcmk_started == 0:
self.fail("Failed to start pacemaker_remote on node %s" % node)
return
        # Convert the node to bare metal now that it has shut down the cluster stack
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node))
pats.append(self.templates["Pat:DC_IDLE"])
self.add_connection_rsc(node)
self.set_timer("remoteMetalInit")
watch.lookforall()
self.log_timer("remoteMetalInit")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
def migrate_connection(self, node):
if self.failed:
return
pats = [ ]
pats.append(self.templates["Pat:RscOpOK"] % ("migrate_to", self.remote_node))
pats.append(self.templates["Pat:RscOpOK"] % ("migrate_from", self.remote_node))
pats.append(self.templates["Pat:DC_IDLE"])
watch = self.create_watch(pats, 120)
watch.setwatch()
(rc, lines) = self.rsh(node, "crm_resource -M -r %s" % (self.remote_node), None)
if rc != 0:
self.fail("failed to move remote node connection resource")
return
self.set_timer("remoteMetalMigrate")
watch.lookforall()
self.log_timer("remoteMetalMigrate")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
return
def fail_rsc(self, node):
if self.failed:
return
watchpats = [ ]
watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("stop", self.remote_rsc, self.remote_node))
watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
watchpats.append(self.templates["Pat:DC_IDLE"])
watch = self.create_watch(watchpats, 120)
watch.setwatch()
self.debug("causing dummy rsc to fail.")
rc = self.rsh(node, "rm -f /var/run/resource-agents/Dummy*")
self.set_timer("remoteRscFail")
watch.lookforall()
self.log_timer("remoteRscFail")
if watch.unmatched:
self.fail("Unmatched patterns during rsc fail: %s" % watch.unmatched)
def fail_connection(self, node):
if self.failed:
return
watchpats = [ ]
watchpats.append(self.templates["Pat:FenceOpOK"] % self.remote_node)
watchpats.append(self.templates["Pat:NodeFenced"] % self.remote_node)
watch = self.create_watch(watchpats, 120)
watch.setwatch()
# force stop the pcmk remote daemon. this will result in fencing
self.debug("Force stopped active remote node")
self.kill_pcmk_remote(node)
self.debug("Waiting for remote node to be fenced.")
self.set_timer("remoteMetalFence")
watch.lookforall()
self.log_timer("remoteMetalFence")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
return
self.debug("Waiting for the remote node to come back up")
        self.CM.ns.WaitForNodeToComeUp(node, 120)
pats = [ ]
watch = self.create_watch(pats, 240)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node))
if self.remote_rsc_added == 1:
pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
        # start the remote node again and watch it integrate back into the cluster.
self.start_pcmk_remote(node)
if self.pcmk_started == 0:
self.fail("Failed to start pacemaker_remote on node %s" % node)
return
self.debug("Waiting for remote node to rejoin cluster after being fenced.")
self.set_timer("remoteMetalRestart")
watch.lookforall()
self.log_timer("remoteMetalRestart")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
return
def add_dummy_rsc(self, node):
if self.failed:
return
# verify we can put a resource on the remote node
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
pats.append(self.templates["Pat:DC_IDLE"])
# Add a resource that must live on remote-node
self.add_primitive_rsc(node)
# force that rsc to prefer the remote node.
(rc, line) = self.CM.rsh(node, "crm_resource -M -r %s -N %s -f" % (self.remote_rsc, self.remote_node), None)
if rc != 0:
self.fail("Failed to place remote resource on remote node.")
return
self.set_timer("remoteMetalRsc")
watch.lookforall()
self.log_timer("remoteMetalRsc")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
def test_attributes(self, node):
if self.failed:
return
# This verifies permanent attributes can be set on a remote-node. It also
# verifies the remote-node can edit its own cib node section remotely.
(rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -v testval -N %s" % (self.remote_node), None)
if rc != 0:
self.fail("Failed to set remote-node attribute. rc:%s output:%s" % (rc, line))
return
(rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -q -N %s" % (self.remote_node), None)
if rc != 0:
self.fail("Failed to get remote-node attribute")
return
(rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -D -N %s" % (self.remote_node), None)
if rc != 0:
self.fail("Failed to delete remote-node attribute")
return
def cleanup_metal(self, node):
if self.pcmk_started == 0:
return
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
if self.remote_rsc_added == 1:
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_rsc))
if self.remote_node_added == 1:
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_node))
self.set_timer("remoteMetalCleanup")
if self.remote_use_reconnect_interval:
self.debug("Cleaning up re-check interval")
self.rsh(self.get_othernode(node), self.templates["ClearCheckInterval"])
if self.remote_rsc_added == 1:
# Remove dummy resource added for remote node tests
self.debug("Cleaning up dummy rsc put on remote node")
self.rsh(node, "crm_resource -U -r %s" % self.remote_rsc)
self.del_rsc(node, self.remote_rsc)
if self.remote_node_added == 1:
# Remove remote node's connection resource
self.debug("Cleaning up remote node connection resource")
self.rsh(node, "crm_resource -U -r %s" % (self.remote_node))
self.del_rsc(node, self.remote_node)
watch.lookforall()
self.log_timer("remoteMetalCleanup")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
self.stop_pcmk_remote(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.remote_node_added == 1:
# Remove remote node itself
self.debug("Cleaning up node entry for remote node")
self.rsh(self.get_othernode(node), "crm_node --force --remove %s" % self.remote_node)
def setup_env(self, node):
self.remote_node = "remote-%s" % (node)
# we are assuming if all nodes have a key, that it is
# the right key... If any node doesn't have a remote
# key, we regenerate it everywhere.
if self.rsh.exists_on_all("/etc/pacemaker/authkey", self.Env["nodes"]):
return
# create key locally
(handle, keyfile) = tempfile.mkstemp(".cts")
os.close(handle)
devnull = open(os.devnull, 'wb')
subprocess.check_call(["dd", "if=/dev/urandom", "of=%s" % keyfile, "bs=4096", "count=1"],
stdout=devnull, stderr=devnull)
devnull.close()
# sync key throughout the cluster
for node in self.Env["nodes"]:
self.rsh(node, "mkdir -p --mode=0750 /etc/pacemaker")
self.rsh.cp(keyfile, "root@%s:/etc/pacemaker/authkey" % node)
self.rsh(node, "chgrp haclient /etc/pacemaker /etc/pacemaker/authkey")
self.rsh(node, "chmod 0640 /etc/pacemaker/authkey")
os.unlink(keyfile)
def is_applicable(self):
if not self.is_applicable_common():
return False
for node in self.Env["nodes"]:
rc = self.rsh(node, "which pacemaker-remoted >/dev/null 2>&1")
if rc != 0:
return False
return True
def start_new_test(self, node):
self.incr("calls")
self.reset()
ret = self.startall(None)
if not ret:
return self.failure("setup failed: could not start all nodes")
self.setup_env(node)
self.start_metal(node)
self.add_dummy_rsc(node)
return True
def __call__(self, node):
return self.failure("This base class is not meant to be called directly.")
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [ """is running on remote.*which isn't allowed""",
"""Connection terminated""",
"""Failed to send remote""",
]
# RemoteDriver is just a base class for other tests, so it is not added to AllTestClasses
class RemoteBasic(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteBaremetal' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
self.test_attributes(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
AllTestClasses.append(RemoteBasic)
class RemoteStonithd(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteStonithd' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
self.fail_connection(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
def is_applicable(self):
if not RemoteDriver.is_applicable(self):
return False
if "DoFencing" in list(self.Env.keys()):
return self.Env["DoFencing"]
return True
def errorstoignore(self):
ignore_pats = [
r"Lost connection to Pacemaker Remote node",
r"Software caused connection abort",
r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor",
r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*",
r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)",
r"Calculated [Tt]ransition .*pe-error",
r"error.*: Resource .*ocf::.* is active on 2 nodes attempting recovery",
]
ignore_pats.extend(RemoteDriver.errorstoignore(self))
return ignore_pats
AllTestClasses.append(RemoteStonithd)
class RemoteMigrate(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteMigrate' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
self.migrate_connection(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
AllTestClasses.append(RemoteMigrate)
class RemoteRscFailure(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteRscFailure' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
# This is an important step. We are migrating the connection
# before failing the resource. This verifies that the migration
# has properly maintained control over the remote-node.
self.migrate_connection(node)
self.fail_rsc(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
def errorstoignore(self):
ignore_pats = [
r"schedulerd.*: Recover remote-rsc\s*\(.*\)",
r"Dummy.*: No process state file found",
]
ignore_pats.extend(RemoteDriver.errorstoignore(self))
return ignore_pats
AllTestClasses.append(RemoteRscFailure)
# vim:ts=4:sw=4:et:
diff --git a/cts/benchmark/clubench.in b/cts/benchmark/clubench.in
index 9194505cac..6adbe46cb0 100644
--- a/cts/benchmark/clubench.in
+++ b/cts/benchmark/clubench.in
@@ -1,195 +1,190 @@
#!/bin/sh
#
-PROG=`basename $0`
-DIR=`dirname $0`
SSHOPTS="-l root -o PasswordAuthentication=no -o ConnectTimeout=5"
msg() {
- echo $@ >&2
+ echo "$@" >&2
}
usage() {
echo "usage: $0 <dir>"
echo " dir: working directory (with the control file)"
exit 0
}
[ $# -eq 0 ] && usage
WORKDIR=$1
test -d "$WORKDIR" || usage
CTSCTRL=~/.cts
CTRL=$WORKDIR/control
CSV=$WORKDIR/bench.csv
STATS=$WORKDIR/bench.stats
test -f $CTRL && . $CTRL
@datadir@/@PACKAGE@/tests/cts/cluster_test 500 || {
msg "cluster_test failed"
exit 1
}
test -f $CTSCTRL || {
msg no CTS control file $CTSCTRL
exit 1
}
. $CTSCTRL
: ${CTS_logfacility:=local7}
: ${CTS_stack:=corosync}
: ${CTS_logfile:="@CRM_LOG_DIR@/ha-log-bench"}
: ${CTS_adv:="--schema pacemaker-1.2 --clobber-cib -r"}
: ${RUNS:=3}
: ${CTSTESTS:="--benchmark"}
: ${CTSDIR:="@datadir@/@PACKAGE@/tests/cts"}
[ "$CTS_node_list" ] || {
msg no node list specified
exit 1
}
case "$CTS_stack" in
corosync) CRM_REPORT_OPTS="--corosync";;
*) msg "$CTS_stack: cluster stack not recognized"; exit 1;;
esac
CTSOPTS="--stack $CTS_stack --at-boot $CTS_boot $CTS_adv"
CTSOPTS="$CTSOPTS --facility $CTS_logfacility --logfile $CTS_logfile"
if [ "x$CTS_stonith" != "x" ]; then
CTSOPTS="$CTSOPTS --stonith-type $CTS_stonith"
[ "x$CTS_stonith_args" != "x" ] &&
CTSOPTS="$CTSOPTS --stonith-params \"$CTS_stonith_args\""
else
CTSOPTS="$CTSOPTS --stonith 0"
fi
CTSOPTS="$CTSOPTS $CTSTESTS"
fibonacci() {
- local limit=$1
- local n=2 prev=1 tmp_n
- while [ $n -le $limit ]; do
- echo $n
- tmp_n=$n
- n=$((n+prev))
- prev=$tmp_n
+ F_LIMIT=$1
+ F_N=2
+ F_N_PREV=1
+ while [ $F_N -le $F_LIMIT ]; do
+ echo $F_N
+ F_N_TMP=$F_N
+ F_N=$((F_N+F_N_PREV))
+ F_N_PREV=$F_N_TMP
done
- [ $prev -ne $limit ] && echo $limit
+ [ $F_N_PREV -ne $F_LIMIT ] && echo $F_LIMIT
}
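# Example (illustrative): with 6 nodes in $CTS_node_list, "fibonacci 6"
# prints "2 3 5 6" -- the cluster sizes the benchmark iterates over.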
[ "$SERIES" ] ||
- SERIES=$(fibonacci `echo $CTS_node_list | wc -w`)
+ SERIES=$(fibonacci "$(echo $CTS_node_list | wc -w)")
get_nodes() {
- local c_nodes
- c_nodes=`echo $CTS_node_list | awk -v n=$1 '
+ GN_C_NODES=$(echo $CTS_node_list | awk -v n="$1" '
{ for( i=1; i<=NF; i++ ) node[cnt++]=$i }
END{for( i=0; i<n; i++ ) print node[i] }
- '`
- if [ `echo $c_nodes | wc -w` != $1 ]; then
+ ')
+ if [ "$(echo $GN_C_NODES | wc -w)" != "$1" ]; then
msg "not enough nodes in $CTSCTRL"
exit 1
fi
- echo $c_nodes
+ echo $GN_C_NODES
}
node_cleanup() {
msg "CIB cleanup ($nodes)"
- local n
- for n in $nodes; do
- ssh $SSHOPTS $n 'rm @CRM_CONFIG_DIR@/*'
+ for NC_N in $nodes; do
+ ssh $SSHOPTS $NC_N 'rm @CRM_CONFIG_DIR@/*'
done
}
testnum() {
printf '%03d' $1
}
mkreports() {
msg "Creating reports for the CTS run"
- local ctsdir=$1
- grep "Running test " $ctsdir/ctsrun.out | tr -d \[\] |
+ MKR_CTS_DIR=$1
+ grep "Running test " $MKR_CTS_DIR/ctsrun.out | tr -d \[\] |
awk '{print $6,$NF}' |
while read type num; do
teststg="`testnum $num`-$type"
(
- cd $ctsdir
- crm_report $CRM_REPORT_OPTS -f cts:$num -n "$nodes" `pwd`/$teststg < /dev/null
+ cd $MKR_CTS_DIR || return
+ crm_report $CRM_REPORT_OPTS -f "cts:$num" -n "$nodes" "$(pwd)/$teststg" < /dev/null
)
done
}
runcts() {
- local odir=$1
+ RC_ODIR="$1"
msg "Running CTS"
- python $CTSDIR/CTSlab.py $CTSOPTS --nodes "$nodes" > $odir/ctsrun.out 2>&1 &
+ python "$CTSDIR/CTSlab.py" $CTSOPTS --nodes "$nodes" > "$RC_ODIR/ctsrun.out" 2>&1 &
ctspid=$!
- tail -f $odir/ctsrun.out &
+ tail -f "$RC_ODIR/ctsrun.out" &
tailpid=$!
wait $ctspid
kill $tailpid >/dev/null 2>&1
}
bench_re='CTS:.*runtime:'
diginfo() {
- local d v
- local ctsdir=$1
- local s="$2"
- filter=$3
+ DI_CTS_DIR="$1"
+ DI_S="$2"
+ filter="$3"
(
- cd $ctsdir
+ cd "$DI_CTS_DIR" || return
for r in [0-9]*.tar.bz2; do
tar xjf $r
- d=`basename $r .tar.bz2`
- for v in `grep $bench_re $d/ha-log.txt | eval $filter`; do
- s="$s,$v"
+ DI_D=$(basename "$r" .tar.bz2)
+ for DI_V in $(grep "$bench_re" "$DI_D/ha-log.txt" | eval "$filter"); do
+ DI_S="$DI_S,$DI_V"
done
- rm -r $d
+ rm -r "$DI_D"
done
- echo $s
+ echo $DI_S
)
}
printheader() {
diginfo $1 "" "awk '{print \$(NF-2)}'"
}
printstats() {
diginfo $1 "$clusize" "awk '{print \$(NF)}'"
}
printmedians() {
- local f=$1
- local s="$clusize"
- local middle=$((RUNS/2 + 1))
- set `head -1 $f | sed 's/,/ /g'`
- local cols=$#
- local i v
- for i in `seq 2 $cols`; do
- v=`awk -v i=$i -F, '{print $i}' < $f | sort -n | head -$middle | tail -1`
- s="$s,$v"
+ PM_F="$1"
+ PM_S="$clusize"
+ PM_MIDDLE=$((RUNS/2 + 1))
+ set $(head -1 "$PM_F" | sed 's/,/ /g')
+ PM_COLS=$#
+ for PM_I in $(seq 2 $PM_COLS); do
+ PM_V=$(awk -v i=$PM_I -F, '{print $i}' < $PM_F | sort -n | head -$PM_MIDDLE | tail -1)
+ PM_S="$PM_S,$PM_V"
done
- echo $s
+ echo $PM_S
}
rm -f $CSV
tmpf=`mktemp`
test -f "$tmpf" || {
msg "can't create temporary file"
exit 1
}
trap "rm -f $tmpf" 0
for clusize in $SERIES; do
nodes=`get_nodes $clusize`
outdir=$WORKDIR/$clusize
rm -rf $outdir
mkdir -p $outdir
rm -f $tmpf
node_cleanup
for i in `seq $RUNS`; do
- > $CTS_logfile
+ true > $CTS_logfile
mkdir -p $outdir/$i
runcts $outdir/$i
mkreports $outdir/$i
printstats $outdir/$i >> $tmpf
done
[ -f "$CSV" ] || printheader $outdir/1 > $CSV
printmedians $tmpf >> $CSV
cat $tmpf >> $STATS
msg "Statistics for $clusize-node cluster saved"
done
msg "Tests done for series $SERIES, output in $CSV and $STATS"
diff --git a/cts/cts-cli.in b/cts/cts-cli.in
index 03cb67e7d7..a23b1e4ba6 100755
--- a/cts/cts-cli.in
+++ b/cts/cts-cli.in
@@ -1,968 +1,979 @@
#!@BASH_PATH@
#
# Copyright 2008-2018 Andrew Beekhof <andrew@beekhof.net>
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
#
# Note on portable usage of sed: GNU/POSIX/*BSD sed have a limited subset of
# compatible functionality. Do not use the -i option, alternation (\|),
# \0, or character sequences such as \n or \s.
#
USAGE_TEXT="Usage: cts-cli [<options>]
Options:
--help Display this text, then exit
-V, --verbose Display any differences from expected output
-t 'TEST [...]' Run only specified tests (default: 'dates tools acls validity upgrade')
-p DIR Look for executables in DIR (may be specified multiple times)
-v, --valgrind Run all commands under valgrind
-s Save actual output as expected output"
# If readlink supports -e (i.e. GNU), use it
readlink -e / >/dev/null 2>/dev/null
if [ $? -eq 0 ]; then
- test_home="$(dirname $(readlink -e $0))"
+ test_home="$(dirname "$(readlink -e "$0")")"
else
- test_home="$(dirname $0)"
+ test_home="$(dirname "$0")"
fi
: ${shadow=cts-cli}
shadow_dir=$(mktemp -d ${TMPDIR:-/tmp}/cts-cli.shadow.XXXXXXXXXX)
num_errors=0
num_passed=0
-GREP_OPTIONS=
verbose=0
tests="dates tools acls validity upgrade"
do_save=0
VALGRIND_CMD=
VALGRIND_OPTS="
-q
--gen-suppressions=all
--show-reachable=no
--leak-check=full
--trace-children=no
--time-stamp=yes
--num-callers=20
--suppressions=$test_home/valgrind-pcmk.suppressions
"
# These constants must track crm_exit_t values
CRM_EX_OK=0
CRM_EX_ERROR=1
CRM_EX_INSUFFICIENT_PRIV=4
CRM_EX_USAGE=64
CRM_EX_CONFIG=78
CRM_EX_OLD=103
CRM_EX_NOSUCH=105
CRM_EX_UNSAFE=107
CRM_EX_EXISTS=108
CRM_EX_MULTIPLE=109
function test_assert() {
target=$1; shift
cib=$1; shift
app=`echo "$cmd" | sed 's/\ .*//'`
printf "* Running: $app - $desc\n" 1>&2
printf "=#=#=#= Begin test: $desc =#=#=#=\n"
eval $VALGRIND_CMD $cmd 2>&1
rc=$?
if [ x$cib != x0 ]; then
printf "=#=#=#= Current cib after: $desc =#=#=#=\n"
CIB_user=root cibadmin -Q
fi
printf "=#=#=#= End test: $desc - $(crm_error --exit $rc) ($rc) =#=#=#=\n"
if [ $rc -ne $target ]; then
num_errors=$(( $num_errors + 1 ))
printf "* Failed (rc=%.3d): %-14s - %s\n" $rc $app "$desc"
printf "* Failed (rc=%.3d): %-14s - %s\n" $rc $app "$desc (`which $app`)" 1>&2
        return
else
printf "* Passed: %-14s - %s\n" $app "$desc"
num_passed=$(( $num_passed + 1 ))
fi
}
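# Callers set $desc and $cmd, then invoke test_assert with the expected
# crm_exit_t value, e.g. (taken from test_tools below):
#   desc="Validate CIB"; cmd="cibadmin -Q"; test_assert $CRM_EX_OK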
function test_tools() {
- local TMPXML=$(mktemp ${TMPDIR:-/tmp}/cts-cli.tools.xml.XXXXXXXXXX)
- local TMPORIG=$(mktemp ${TMPDIR:-/tmp}/cts-cli.tools.existing.xml.XXXXXXXXXX)
+ local TMPXML
+ local TMPORIG
+
+ TMPXML=$(mktemp ${TMPDIR:-/tmp}/cts-cli.tools.xml.XXXXXXXXXX)
+ TMPORIG=$(mktemp ${TMPDIR:-/tmp}/cts-cli.tools.existing.xml.XXXXXXXXXX)
export CIB_shadow_dir="${shadow_dir}"
$VALGRIND_CMD crm_shadow --batch --force --create-empty $shadow 2>&1
export CIB_shadow=$shadow
desc="Validate CIB"
cmd="cibadmin -Q"
test_assert $CRM_EX_OK
desc="Configure something before erasing"
cmd="crm_attribute -n cluster-delay -v 60s"
test_assert $CRM_EX_OK
desc="Require --force for CIB erasure"
cmd="cibadmin -E"
test_assert $CRM_EX_UNSAFE
desc="Allow CIB erasure with --force"
cmd="cibadmin -E --force"
test_assert $CRM_EX_OK
desc="Query CIB"
cmd="cibadmin -Q > $TMPORIG"
test_assert $CRM_EX_OK
desc="Set cluster option"
cmd="crm_attribute -n cluster-delay -v 60s"
test_assert $CRM_EX_OK
desc="Query new cluster option"
cmd="cibadmin -Q -o crm_config | grep cib-bootstrap-options-cluster-delay"
test_assert $CRM_EX_OK
desc="Query cluster options"
cmd="cibadmin -Q -o crm_config > $TMPXML"
test_assert $CRM_EX_OK
desc="Set no-quorum policy"
cmd="crm_attribute -n no-quorum-policy -v ignore"
test_assert $CRM_EX_OK
desc="Delete nvpair"
cmd="cibadmin -D -o crm_config --xml-text '<nvpair id=\"cib-bootstrap-options-cluster-delay\"/>'"
test_assert $CRM_EX_OK
desc="Create operation should fail"
cmd="cibadmin -C -o crm_config --xml-file $TMPXML"
test_assert $CRM_EX_EXISTS
desc="Modify cluster options section"
cmd="cibadmin -M -o crm_config --xml-file $TMPXML"
test_assert $CRM_EX_OK
desc="Query updated cluster option"
cmd="cibadmin -Q -o crm_config | grep cib-bootstrap-options-cluster-delay"
test_assert $CRM_EX_OK
desc="Set duplicate cluster option"
cmd="crm_attribute -n cluster-delay -v 40s -s duplicate"
test_assert $CRM_EX_OK
desc="Setting multiply defined cluster option should fail"
cmd="crm_attribute -n cluster-delay -v 30s"
test_assert $CRM_EX_MULTIPLE
desc="Set cluster option with -s"
cmd="crm_attribute -n cluster-delay -v 30s -s duplicate"
test_assert $CRM_EX_OK
desc="Delete cluster option with -i"
cmd="crm_attribute -n cluster-delay -D -i cib-bootstrap-options-cluster-delay"
test_assert $CRM_EX_OK
desc="Create node1 and bring it online"
cmd="crm_simulate --live-check --in-place --node-up=node1"
test_assert $CRM_EX_OK
desc="Create node attribute"
cmd="crm_attribute -n ram -v 1024M -N node1 -t nodes"
test_assert $CRM_EX_OK
desc="Query new node attribute"
cmd="cibadmin -Q -o nodes | grep node1-ram"
test_assert $CRM_EX_OK
desc="Set a transient (fail-count) node attribute"
cmd="crm_attribute -n fail-count-foo -v 3 -N node1 -t status"
test_assert $CRM_EX_OK
desc="Query a fail count"
cmd="crm_failcount --query -r foo -N node1"
test_assert $CRM_EX_OK
desc="Delete a transient (fail-count) node attribute"
cmd="crm_attribute -n fail-count-foo -D -N node1 -t status"
test_assert $CRM_EX_OK
desc="Digest calculation"
cmd="cibadmin -Q | cibadmin -5 -p 2>&1 > /dev/null"
test_assert $CRM_EX_OK
# This update will fail because the CIB it restores carries older version
# numbers than the current CIB
desc="Replace operation should fail"
cmd="cibadmin -R --xml-file $TMPORIG"
test_assert $CRM_EX_OLD
desc="Default standby value"
cmd="crm_standby -N node1 -G"
test_assert $CRM_EX_OK
desc="Set standby status"
cmd="crm_standby -N node1 -v true"
test_assert $CRM_EX_OK
desc="Query standby value"
cmd="crm_standby -N node1 -G"
test_assert $CRM_EX_OK
desc="Delete standby value"
cmd="crm_standby -N node1 -D"
test_assert $CRM_EX_OK
desc="Create a resource"
cmd="cibadmin -C -o resources --xml-text '<primitive id=\"dummy\" class=\"ocf\" provider=\"pacemaker\" type=\"Dummy\"/>'"
test_assert $CRM_EX_OK
desc="Create a resource meta attribute"
cmd="crm_resource -r dummy --meta -p is-managed -v false"
test_assert $CRM_EX_OK
desc="Query a resource meta attribute"
cmd="crm_resource -r dummy --meta -g is-managed"
test_assert $CRM_EX_OK
desc="Remove a resource meta attribute"
cmd="crm_resource -r dummy --meta -d is-managed"
test_assert $CRM_EX_OK
desc="Create a resource attribute"
cmd="crm_resource -r dummy -p delay -v 10s"
test_assert $CRM_EX_OK
desc="List the configured resources"
cmd="crm_resource -L"
test_assert $CRM_EX_OK
desc="Require a destination when migrating a resource that is stopped"
cmd="crm_resource -r dummy -M"
test_assert $CRM_EX_USAGE
desc="Don't support migration to non-existent locations"
cmd="crm_resource -r dummy -M -N i.dont.exist"
test_assert $CRM_EX_NOSUCH
desc="Create a fencing resource"
cmd="cibadmin -C -o resources --xml-text '<primitive id=\"Fence\" class=\"stonith\" type=\"fence_true\"/>'"
test_assert $CRM_EX_OK
desc="Bring resources online"
cmd="crm_simulate --live-check --in-place -S"
test_assert $CRM_EX_OK
desc="Try to move a resource to its existing location"
cmd="crm_resource -r dummy --move --host node1"
test_assert $CRM_EX_EXISTS
desc="Move a resource from its existing location"
cmd="crm_resource -r dummy --move"
test_assert $CRM_EX_OK
desc="Clear out constraints generated by --move"
cmd="crm_resource -r dummy --clear"
test_assert $CRM_EX_OK
desc="Default ticket granted state"
cmd="crm_ticket -t ticketA -G granted -d false"
test_assert $CRM_EX_OK
desc="Set ticket granted state"
cmd="crm_ticket -t ticketA -r --force"
test_assert $CRM_EX_OK
desc="Query ticket granted state"
cmd="crm_ticket -t ticketA -G granted"
test_assert $CRM_EX_OK
desc="Delete ticket granted state"
cmd="crm_ticket -t ticketA -D granted --force"
test_assert $CRM_EX_OK
desc="Make a ticket standby"
cmd="crm_ticket -t ticketA -s"
test_assert $CRM_EX_OK
desc="Query ticket standby state"
cmd="crm_ticket -t ticketA -G standby"
test_assert $CRM_EX_OK
desc="Activate a ticket"
cmd="crm_ticket -t ticketA -a"
test_assert $CRM_EX_OK
desc="Delete ticket standby state"
cmd="crm_ticket -t ticketA -D standby"
test_assert $CRM_EX_OK
desc="Ban a resource on unknown node"
cmd="crm_resource -r dummy -B -N host1"
test_assert $CRM_EX_NOSUCH
desc="Create two more nodes and bring them online"
cmd="crm_simulate --live-check --in-place --node-up=node2 --node-up=node3"
test_assert $CRM_EX_OK
desc="Ban dummy from node1"
cmd="crm_resource -r dummy -B -N node1"
test_assert $CRM_EX_OK
desc="Ban dummy from node2"
cmd="crm_resource -r dummy -B -N node2"
test_assert $CRM_EX_OK
desc="Relocate resources due to ban"
cmd="crm_simulate --live-check --in-place -S"
test_assert $CRM_EX_OK
desc="Move dummy to node1"
cmd="crm_resource -r dummy -M -N node1"
test_assert $CRM_EX_OK
desc="Clear implicit constraints for dummy on node2"
cmd="crm_resource -r dummy -U -N node2"
test_assert $CRM_EX_OK
desc="Drop the status section"
cmd="cibadmin -R -o status --xml-text '<status/>'"
test_assert $CRM_EX_OK 0
desc="Create a clone"
cmd="cibadmin -C -o resources --xml-text '<clone id=\"test-clone\"><primitive id=\"test-primitive\" class=\"ocf\" provider=\"pacemaker\" type=\"Dummy\"/></clone>'"
test_assert $CRM_EX_OK 0
desc="Create a resource meta attribute"
cmd="crm_resource -r test-primitive --meta -p is-managed -v false"
test_assert $CRM_EX_OK
desc="Create a resource meta attribute in the primitive"
cmd="crm_resource -r test-primitive --meta -p is-managed -v false --force"
test_assert $CRM_EX_OK
desc="Update resource meta attribute with duplicates"
cmd="crm_resource -r test-clone --meta -p is-managed -v true"
test_assert $CRM_EX_OK
desc="Update resource meta attribute with duplicates (force clone)"
cmd="crm_resource -r test-clone --meta -p is-managed -v true --force"
test_assert $CRM_EX_OK
desc="Update child resource meta attribute with duplicates"
cmd="crm_resource -r test-primitive --meta -p is-managed -v false"
test_assert $CRM_EX_OK
desc="Delete resource meta attribute with duplicates"
cmd="crm_resource -r test-clone --meta -d is-managed"
test_assert $CRM_EX_OK
desc="Delete resource meta attribute in parent"
cmd="crm_resource -r test-primitive --meta -d is-managed"
test_assert $CRM_EX_OK
desc="Create a resource meta attribute in the primitive"
cmd="crm_resource -r test-primitive --meta -p is-managed -v false --force"
test_assert $CRM_EX_OK
desc="Update existing resource meta attribute"
cmd="crm_resource -r test-clone --meta -p is-managed -v true"
test_assert $CRM_EX_OK
desc="Create a resource meta attribute in the parent"
cmd="crm_resource -r test-clone --meta -p is-managed -v true --force"
test_assert $CRM_EX_OK
desc="Copy resources"
cmd="cibadmin -Q -o resources > $TMPXML"
test_assert $CRM_EX_OK 0
desc="Delete resource paremt meta attribute (force)"
cmd="crm_resource -r test-clone --meta -d is-managed --force"
test_assert $CRM_EX_OK
desc="Restore duplicates"
cmd="cibadmin -R -o resources --xml-file $TMPXML"
test_assert $CRM_EX_OK
desc="Delete resource child meta attribute"
cmd="crm_resource -r test-primitive --meta -d is-managed"
test_assert $CRM_EX_OK
unset CIB_shadow_dir
rm -f "$TMPXML" "$TMPORIG"
}
function test_dates() {
desc="2014-01-01 00:30:00 - 1 Hour"
cmd="iso8601 -d '2014-01-01 00:30:00Z' -D P-1H -E '2013-12-31 23:30:00Z'"
test_assert $CRM_EX_OK 0
for y in 06 07 08 09 10 11 12 13 14 15 16 17 18; do
desc="20$y-W01-7"
cmd="iso8601 -d '20$y-W01-7 00Z'"
test_assert $CRM_EX_OK 0
desc="20$y-W01-7 - round-trip"
cmd="iso8601 -d '20$y-W01-7 00Z' -W -E '20$y-W01-7 00:00:00Z'"
test_assert $CRM_EX_OK 0
desc="20$y-W01-1"
cmd="iso8601 -d '20$y-W01-1 00Z'"
test_assert $CRM_EX_OK 0
desc="20$y-W01-1 - round-trip"
cmd="iso8601 -d '20$y-W01-1 00Z' -W -E '20$y-W01-1 00:00:00Z'"
test_assert $CRM_EX_OK 0
done
desc="2009-W53-07"
cmd="iso8601 -d '2009-W53-7 00:00:00Z' -W -E '2009-W53-7 00:00:00Z'"
test_assert $CRM_EX_OK 0
desc="2009-01-31 + 1 Month"
cmd="iso8601 -d '2009-01-31 00:00:00Z' -D P1M -E '2009-02-28 00:00:00Z'"
test_assert $CRM_EX_OK 0
desc="2009-01-31 + 2 Months"
cmd="iso8601 -d '2009-01-31 00:00:00Z' -D P2M -E '2009-03-31 00:00:00Z'"
test_assert $CRM_EX_OK 0
desc="2009-01-31 + 3 Months"
cmd="iso8601 -d '2009-01-31 00:00:00Z' -D P3M -E '2009-04-30 00:00:00Z'"
test_assert $CRM_EX_OK 0
desc="2009-03-31 - 1 Month"
cmd="iso8601 -d '2009-03-31 00:00:00Z' -D P-1M -E '2009-02-28 00:00:00Z'"
test_assert $CRM_EX_OK 0
}
function test_acl_loop() {
- local TMPXML="$1"
+ local TMPXML
+
+ TMPXML="$1"
# Make sure we're rejecting things for the right reasons
export PCMK_trace_functions=__xml_acl_check,__xml_acl_post_process
export PCMK_stderr=1
CIB_user=root cibadmin --replace --xml-text '<resources/>'
export CIB_user=unknownguy
desc="$CIB_user: Query configuration"
cmd="cibadmin -Q"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
desc="$CIB_user: Set enable-acl"
cmd="crm_attribute -n enable-acl -v false"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
desc="$CIB_user: Set stonith-enabled"
cmd="crm_attribute -n stonith-enabled -v false"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
desc="$CIB_user: Create a resource"
cmd="cibadmin -C -o resources --xml-text '<primitive id=\"dummy\" class=\"ocf\" provider=\"pacemaker\" type=\"Dummy\"/>'"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
export CIB_user=l33t-haxor
desc="$CIB_user: Query configuration"
cmd="cibadmin -Q"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
desc="$CIB_user: Set enable-acl"
cmd="crm_attribute -n enable-acl -v false"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
desc="$CIB_user: Set stonith-enabled"
cmd="crm_attribute -n stonith-enabled -v false"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
desc="$CIB_user: Create a resource"
cmd="cibadmin -C -o resources --xml-text '<primitive id=\"dummy\" class=\"ocf\" provider=\"pacemaker\" type=\"Dummy\"/>'"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
export CIB_user=niceguy
desc="$CIB_user: Query configuration"
cmd="cibadmin -Q"
test_assert $CRM_EX_OK 0
desc="$CIB_user: Set enable-acl"
cmd="crm_attribute -n enable-acl -v false"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
desc="$CIB_user: Set stonith-enabled"
cmd="crm_attribute -n stonith-enabled -v false"
test_assert $CRM_EX_OK
desc="$CIB_user: Create a resource"
cmd="cibadmin -C -o resources --xml-text '<primitive id=\"dummy\" class=\"ocf\" provider=\"pacemaker\" type=\"Dummy\"/>'"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
export CIB_user=root
desc="$CIB_user: Query configuration"
cmd="cibadmin -Q"
test_assert $CRM_EX_OK 0
desc="$CIB_user: Set stonith-enabled"
cmd="crm_attribute -n stonith-enabled -v true"
test_assert $CRM_EX_OK
desc="$CIB_user: Create a resource"
cmd="cibadmin -C -o resources --xml-text '<primitive id=\"dummy\" class=\"ocf\" provider=\"pacemaker\" type=\"Dummy\"/>'"
test_assert $CRM_EX_OK
export CIB_user=l33t-haxor
desc="$CIB_user: Create a resource meta attribute"
cmd="crm_resource -r dummy --meta -p target-role -v Stopped"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
desc="$CIB_user: Query a resource meta attribute"
cmd="crm_resource -r dummy --meta -g target-role"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
desc="$CIB_user: Remove a resource meta attribute"
cmd="crm_resource -r dummy --meta -d target-role"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
export CIB_user=niceguy
desc="$CIB_user: Create a resource meta attribute"
cmd="crm_resource -r dummy --meta -p target-role -v Stopped"
test_assert $CRM_EX_OK
desc="$CIB_user: Query a resource meta attribute"
cmd="crm_resource -r dummy --meta -g target-role"
test_assert $CRM_EX_OK
desc="$CIB_user: Remove a resource meta attribute"
cmd="crm_resource -r dummy --meta -d target-role"
test_assert $CRM_EX_OK
desc="$CIB_user: Create a resource meta attribute"
cmd="crm_resource -r dummy --meta -p target-role -v Started"
test_assert $CRM_EX_OK
export CIB_user=badidea
desc="$CIB_user: Query configuration - implied deny"
cmd="cibadmin -Q"
test_assert $CRM_EX_OK 0
export CIB_user=betteridea
desc="$CIB_user: Query configuration - explicit deny"
cmd="cibadmin -Q"
test_assert $CRM_EX_OK 0
CIB_user=root cibadmin -Q > "$TMPXML"
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin --delete --xml-text '<acls/>'
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin -Ql
export CIB_user=niceguy
desc="$CIB_user: Replace - remove acls"
cmd="cibadmin --replace --xml-file $TMPXML"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
CIB_user=root cibadmin -Q > "$TMPXML"
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin -C -o resources --xml-text '<primitive id="dummy2" class="ocf" provider="pacemaker" type="Dummy"/>'
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin -Ql
desc="$CIB_user: Replace - create resource"
cmd="cibadmin --replace --xml-file $TMPXML"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
CIB_user=root cibadmin -Q > "$TMPXML"
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" crm_attribute -n enable-acl -v false
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin -Ql
desc="$CIB_user: Replace - modify attribute (deny)"
cmd="cibadmin --replace --xml-file $TMPXML"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
CIB_user=root cibadmin -Q > "$TMPXML"
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin --replace --xml-text '<nvpair id="cib-bootstrap-options-enable-acl" name="enable-acl"/>'
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin -Ql
desc="$CIB_user: Replace - delete attribute (deny)"
cmd="cibadmin --replace --xml-file $TMPXML"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
CIB_user=root cibadmin -Q > "$TMPXML"
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin --modify --xml-text '<primitive id="dummy" description="nothing interesting"/>'
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin -Ql
desc="$CIB_user: Replace - create attribute (deny)"
cmd="cibadmin --replace --xml-file $TMPXML"
test_assert $CRM_EX_INSUFFICIENT_PRIV 0
CIB_user=bob
CIB_user=root cibadmin -Q > "$TMPXML"
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin --modify --xml-text '<primitive id="dummy" description="nothing interesting"/>'
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin -Ql
desc="$CIB_user: Replace - create attribute (allow)"
cmd="cibadmin --replace -o resources --xml-file $TMPXML"
test_assert $CRM_EX_OK 0
CIB_user=root cibadmin -Q > "$TMPXML"
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin --modify --xml-text '<primitive id="dummy" description="something interesting"/>'
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin -Ql
desc="$CIB_user: Replace - modify attribute (allow)"
cmd="cibadmin --replace -o resources --xml-file $TMPXML"
test_assert $CRM_EX_OK 0
CIB_user=root cibadmin -Q > "$TMPXML"
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin --replace -o resources --xml-text '<primitive id="dummy" class="ocf" provider="pacemaker" type="Dummy"/>'
CIB_user=root CIB_file="$TMPXML" CIB_shadow="" cibadmin -Ql
desc="$CIB_user: Replace - delete attribute (allow)"
cmd="cibadmin --replace -o resources --xml-file $TMPXML"
test_assert $CRM_EX_OK 0
}
function test_acls() {
local SHADOWPATH
- local TMPXML=$(mktemp ${TMPDIR:-/tmp}/cts-cli.acls.xml.XXXXXXXXXX)
+ local TMPXML
+
+ TMPXML=$(mktemp ${TMPDIR:-/tmp}/cts-cli.acls.xml.XXXXXXXXXX)
export CIB_shadow_dir="${shadow_dir}"
$VALGRIND_CMD crm_shadow --batch --force --create-empty $shadow --validate-with pacemaker-1.3 2>&1
export CIB_shadow=$shadow
cat <<EOF > "$TMPXML"
<acls>
<acl_user id="l33t-haxor">
<deny id="crook-nothing" xpath="/cib"/>
</acl_user>
<acl_user id="niceguy">
<role_ref id="observer"/>
</acl_user>
<acl_user id="bob">
<role_ref id="admin"/>
</acl_user>
<acl_role id="observer">
<read id="observer-read-1" xpath="/cib"/>
<write id="observer-write-1" xpath="//nvpair[@name=&apos;stonith-enabled&apos;]"/>
<write id="observer-write-2" xpath="//nvpair[@name=&apos;target-role&apos;]"/>
</acl_role>
<acl_role id="admin">
<read id="admin-read-1" xpath="/cib"/>
<write id="admin-write-1" xpath="//resources"/>
</acl_role>
</acls>
EOF
desc="Configure some ACLs"
cmd="cibadmin -M -o acls --xml-file $TMPXML"
test_assert $CRM_EX_OK
desc="Enable ACLs"
cmd="crm_attribute -n enable-acl -v true"
test_assert $CRM_EX_OK
desc="Set cluster option"
cmd="crm_attribute -n no-quorum-policy -v ignore"
test_assert $CRM_EX_OK
desc="New ACL"
cmd="cibadmin --create -o acls --xml-text '<acl_user id=\"badidea\"><read id=\"badidea-resources\" xpath=\"//meta_attributes\"/></acl_user>'"
test_assert $CRM_EX_OK
desc="Another ACL"
cmd="cibadmin --create -o acls --xml-text '<acl_user id=\"betteridea\"><read id=\"betteridea-resources\" xpath=\"//meta_attributes\"/></acl_user>'"
test_assert $CRM_EX_OK
desc="Updated ACL"
cmd="cibadmin --replace -o acls --xml-text '<acl_user id=\"betteridea\"><deny id=\"betteridea-nothing\" xpath=\"/cib\"/><read id=\"betteridea-resources\" xpath=\"//meta_attributes\"/></acl_user>'"
test_assert $CRM_EX_OK
test_acl_loop "$TMPXML"
printf "\n\n !#!#!#!#! Upgrading to latest CIB schema and re-testing !#!#!#!#!\n"
printf "\nUpgrading to latest CIB schema and re-testing\n" 1>&2
export CIB_user=root
desc="$CIB_user: Upgrade to latest CIB schema"
cmd="cibadmin --upgrade --force -V"
test_assert $CRM_EX_OK
SHADOWPATH="$(crm_shadow --file)"
# sed -i isn't portable :-(
cp -p "$SHADOWPATH" "${SHADOWPATH}.$$" # to keep permissions
sed -e 's/epoch=.2/epoch=\"6/g' -e 's/admin_epoch=.1/admin_epoch=\"0/g' \
"$SHADOWPATH" > "${SHADOWPATH}.$$"
mv -- "${SHADOWPATH}.$$" "$SHADOWPATH"
test_acl_loop "$TMPXML"
unset CIB_shadow_dir
rm -f "$TMPXML"
}
function test_validity() {
- local TMPGOOD=$(mktemp ${TMPDIR:-/tmp}/cts-cli.validity.good.xml.XXXXXXXXXX)
- local TMPBAD=$(mktemp ${TMPDIR:-/tmp}/cts-cli.validity.bad.xml.XXXXXXXXXX)
+ local TMPGOOD
+ local TMPBAD
+
+ TMPGOOD=$(mktemp ${TMPDIR:-/tmp}/cts-cli.validity.good.xml.XXXXXXXXXX)
+ TMPBAD=$(mktemp ${TMPDIR:-/tmp}/cts-cli.validity.bad.xml.XXXXXXXXXX)
export CIB_shadow_dir="${shadow_dir}"
$VALGRIND_CMD crm_shadow --batch --force --create-empty $shadow --validate-with pacemaker-1.2 2>&1
export CIB_shadow=$shadow
export PCMK_trace_functions=apply_upgrade,update_validation,cli_config_update
export PCMK_stderr=1
cibadmin -C -o resources --xml-text '<primitive id="dummy1" class="ocf" provider="pacemaker" type="Dummy"/>'
cibadmin -C -o resources --xml-text '<primitive id="dummy2" class="ocf" provider="pacemaker" type="Dummy"/>'
cibadmin -C -o constraints --xml-text '<rsc_order id="ord_1-2" first="dummy1" first-action="start" then="dummy2"/>'
cibadmin -Q > "$TMPGOOD"
desc="Try to make resulting CIB invalid (enum violation)"
cmd="cibadmin -M -o constraints --xml-text '<rsc_order id=\"ord_1-2\" first=\"dummy1\" first-action=\"break\" then=\"dummy2\"/>'"
test_assert $CRM_EX_CONFIG
sed 's|"start"|"break"|' "$TMPGOOD" > "$TMPBAD"
desc="Run crm_simulate with invalid CIB (enum violation)"
cmd="crm_simulate -x $TMPBAD -S"
test_assert $CRM_EX_CONFIG 0
desc="Try to make resulting CIB invalid (unrecognized validate-with)"
cmd="cibadmin -M --xml-text '<cib validate-with=\"pacemaker-9999.0\"/>'"
test_assert $CRM_EX_CONFIG
sed 's|"pacemaker-1.2"|"pacemaker-9999.0"|' "$TMPGOOD" > "$TMPBAD"
desc="Run crm_simulate with invalid CIB (unrecognized validate-with)"
cmd="crm_simulate -x $TMPBAD -S"
test_assert $CRM_EX_CONFIG 0
desc="Try to make resulting CIB invalid, but possibly recoverable (valid with X.Y+1)"
cmd="cibadmin -C -o configuration --xml-text '<tags/>'"
test_assert $CRM_EX_CONFIG
sed 's|</configuration>|<tags/></configuration>|' "$TMPGOOD" > "$TMPBAD"
desc="Run crm_simulate with invalid, but possibly recoverable CIB (valid with X.Y+1)"
cmd="crm_simulate -x $TMPBAD -S"
test_assert $CRM_EX_OK 0
sed 's|[ ][ ]*validate-with="[^"]*"||' "$TMPGOOD" > "$TMPBAD"
desc="Make resulting CIB valid, although without validate-with attribute"
cmd="cibadmin -R --xml-file $TMPBAD"
test_assert $CRM_EX_OK
desc="Run crm_simulate with valid CIB, but without validate-with attribute"
cmd="crm_simulate -x $TMPBAD -S"
test_assert $CRM_EX_OK 0
# this will just disable validation and accept the config, outputting
# validation errors
sed -e 's|[ ][ ]*validate-with="[^"]*"||' \
-e 's|\([ ][ ]*epoch="[^"]*\)"|\10"|' -e 's|"start"|"break"|' \
"$TMPGOOD" > "$TMPBAD"
desc="Make resulting CIB invalid, and without validate-with attribute"
cmd="cibadmin -R --xml-file $TMPBAD"
test_assert $CRM_EX_OK
desc="Run crm_simulate with invalid CIB, also without validate-with attribute"
cmd="crm_simulate -x $TMPBAD -S"
test_assert $CRM_EX_OK 0
unset CIB_shadow_dir
rm -f "$TMPGOOD" "$TMPBAD"
}
test_upgrade() {
- local TMPXML=$(mktemp ${TMPDIR:-/tmp}/cts-cli.tools.xml.XXXXXXXXXX)
+ local TMPXML
+
+ TMPXML=$(mktemp ${TMPDIR:-/tmp}/cts-cli.tools.xml.XXXXXXXXXX)
export CIB_shadow_dir="${shadow_dir}"
$VALGRIND_CMD crm_shadow --batch --force --create-empty $shadow --validate-with pacemaker-2.10 2>&1
export CIB_shadow=$shadow
desc="Set stonith-enabled=false"
cmd="crm_attribute -n stonith-enabled -v false"
test_assert $CRM_EX_OK
cat <<EOF > "$TMPXML"
<resources>
<primitive id="mySmartFuse" class="ocf" provider="experiment" type="SmartFuse">
<operations>
<op id="mySmartFuse-start" name="start" interval="0" timeout="40s"/>
<op id="mySmartFuse-monitor-inputpower" name="monitor" interval="30s">
<instance_attributes id="mySmartFuse-inputpower-instanceparams">
<nvpair id="mySmartFuse-inputpower-requires" name="requires" value="inputpower"/>
</instance_attributes>
</op>
<op id="mySmartFuse-monitor-outputpower" name="monitor" interval="2s">
<instance_attributes id="mySmartFuse-outputpower-instanceparams">
<nvpair id="mySmartFuse-outputpower-requires" name="requires" value="outputpower"/>
</instance_attributes>
</op>
</operations>
<instance_attributes id="mySmartFuse-params">
<nvpair id="mySmartFuse-params-ip" name="ip" value="192.0.2.10"/>
</instance_attributes>
<!-- a bit hairy but valid -->
<instance_attributes id-ref="mySmartFuse-outputpower-instanceparams"/>
</primitive>
</resources>
EOF
desc="Configure the initial resource"
cmd="cibadmin -M -o resources --xml-file $TMPXML"
test_assert $CRM_EX_OK
desc="Upgrade to latest CIB schema (trigger 2.10.xsl + the wrapping)"
cmd="cibadmin --upgrade --force -V -V"
test_assert $CRM_EX_OK
desc="Query a resource instance attribute (shall survive)"
cmd="crm_resource -r mySmartFuse -g requires"
test_assert $CRM_EX_OK
unset CIB_shadow_dir
rm -f "$TMPXML"
}
# Process command-line arguments
while [ $# -gt 0 ]; do
case "$1" in
-t)
tests="$2"
shift 2
;;
-V|--verbose)
verbose=1
shift
;;
-v|--valgrind)
export G_SLICE=always-malloc
VALGRIND_CMD="valgrind $VALGRIND_OPTS"
shift
;;
-s)
do_save=1
shift
;;
-p)
export PATH="$2:$PATH"
shift
;;
--help)
echo "$USAGE_TEXT"
exit $CRM_EX_OK
;;
*)
echo "error: unknown option $1"
echo
echo "$USAGE_TEXT"
exit $CRM_EX_USAGE
;;
esac
done
for t in $tests; do
case "$t" in
dates) ;;
tools) ;;
acls) ;;
validity) ;;
upgrade) ;;
*)
echo "error: unknown test $t"
echo
echo "$USAGE_TEXT"
exit $CRM_EX_USAGE
;;
esac
done
# Check whether we're running from source directory
SRCDIR=$(dirname $test_home)
if [ -x "$SRCDIR/tools/crm_simulate" ]; then
export PATH="$SRCDIR/tools:$PATH"
echo "Using local binaries from: $SRCDIR/tools"
if [ -x "$SRCDIR/xml" ]; then
export PCMK_schema_directory="$SRCDIR/xml"
echo "Using local schemas from: $PCMK_schema_directory"
fi
fi
for t in $tests; do
echo "Testing $t"
TMPFILE=$(mktemp ${TMPDIR:-/tmp}/cts-cli.$t.XXXXXXXXXX)
eval TMPFILE_$t="$TMPFILE"
test_$t > "$TMPFILE"
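# Strip volatile fields (timestamps, feature sets, source file/line
# references) so the output can be compared against the stored expected files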
sed -e 's/cib-last-written.*>/>/'\
-e 's/ last-run=\"[0-9]*\"//'\
-e 's/crm_feature_set="[^"]*" //'\
-e 's/validate-with="[^"]*" //'\
-e 's/Created new pacemaker-.* configuration/Created new pacemaker configuration/'\
-e 's/.*\(__xml_.*\)@.*\.c:[0-9][0-9]*)/\1/g' \
-e 's/.*\(unpack_.*\)@.*\.c:[0-9][0-9]*)/\1/g' \
-e 's/.*\(update_validation\)@.*\.c:[0-9][0-9]*)/\1/g' \
-e 's/.*\(apply_upgrade\)@.*\.c:[0-9][0-9]*)/\1/g' \
-e 's/ last-rc-change=\"[0-9]*\"//'\
-e 's|^/tmp/cts-cli\.validity\.bad.xml\.[^:]*:|validity.bad.xml:|'\
-e 's/^Entity: line [0-9][0-9]*: //'\
-e 's/\(validation ([0-9][0-9]* of \)[0-9][0-9]*\().*\)/\1X\2/' \
"$TMPFILE" > "${TMPFILE}.$$"
mv -- "${TMPFILE}.$$" "$TMPFILE"
if [ $do_save -eq 1 ]; then
cp "$TMPFILE" $test_home/cli/regression.$t.exp
fi
done
rm -rf "${shadow_dir}"
failed=0
if [ $verbose -eq 1 ]; then
echo -e "\n\nResults"
fi
for t in $tests; do
eval TMPFILE="\$TMPFILE_$t"
if [ $verbose -eq 1 ]; then
diff -wu $test_home/cli/regression.$t.exp "$TMPFILE"
else
diff -w $test_home/cli/regression.$t.exp "$TMPFILE" >/dev/null 2>&1
fi
if [ $? -ne 0 ]; then
failed=1
fi
done
echo -e "\n\nSummary"
for t in $tests; do
eval TMPFILE="\$TMPFILE_$t"
- grep -e "^*" "$TMPFILE"
+ grep -e '^\*' "$TMPFILE"
done
if [ $num_errors -ne 0 ]; then
echo "$num_errors tests failed; see output in:"
for t in $tests; do
eval TMPFILE="\$TMPFILE_$t"
echo " $TMPFILE"
done
exit $CRM_EX_ERROR
elif [ $failed -eq 1 ]; then
echo "$num_passed tests passed but output was unexpected; see output in:"
for t in $tests; do
eval TMPFILE="\$TMPFILE_$t"
echo " $TMPFILE"
done
exit $CRM_EX_DIGEST
else
echo $num_passed tests passed
for t in $tests; do
eval TMPFILE="\$TMPFILE_$t"
rm -f "$TMPFILE"
done
crm_shadow --force --delete $shadow >/dev/null 2>&1
exit $CRM_EX_OK
fi
diff --git a/cts/cts-coverage.in b/cts/cts-coverage.in
index 0fdfe918fd..ba831810a2 100644
--- a/cts/cts-coverage.in
+++ b/cts/cts-coverage.in
@@ -1,62 +1,68 @@
#!@BASH_PATH@
+#
+# Copyright 2012-2018 Andrew Beekhof <andrew@beekhof.net>
+#
+# This source code is licensed under the GNU General Public License version 2
+# or later (GPLv2+) WITHOUT ANY WARRANTY.
+#
-start=$PWD
-test_home=`dirname $0`
+start="$PWD"
+test_home=$(dirname "$0")
test_dir="@datadir@/@PACKAGE@/tests"
if [ "$test_home" != "$test_dir" ]; then
# Running against the source tree
- GCOV_BASE=@abs_top_srcdir@
+ GCOV_BASE="@abs_top_srcdir@"
test_dir="@abs_top_srcdir@/cts"
- cd @abs_top_srcdir@
+ cd "@abs_top_srcdir@" || exit 1
grep with-gcov config.log
- if [ $? = 0 ]; then
+ if [ $? -eq 0 ]; then
echo "Pacemaker was built with gcov support"
else
echo "Re-building with gcov support"
last=`grep --color=never "$.*configure" config.log | tail -n 1 | sed s:.*configure:./configure: | sed s:--no-create:--with-gcov:`
eval $last
fi
#sudo make core core-install
else
GCOV_BASE=@localstatedir@/lib/pacemaker/gcov/
mkdir -p $GCOV_BASE
export GCOV_PREFIX_STRIP=4
export GCOV_PREFIX=$GCOV_BASE
top=`find / -name crm_internal.h 2>/dev/null | grep debug | head -n 1`
if [ "x$top" = x ]; then
echo "Could not locate the pacemaker headers"
exit 1
fi
- cd `dirname $top`
- cd ..
+ cd "$(dirname "$top")" || exit 1
+ cd .. || exit 1
echo "Creating the directory structure in $GCOV_BASE from $PWD"
# The .gcno files will already be there for sources,
# but we still need to create the include/ subtree
find . -type d -exec mkdir -p $GCOV_BASE/\{\} \;
echo "Now linking the source files into place"
find . -type f -name "*.c" -exec ln -s $PWD/\{\} $GCOV_BASE\{\} \;
find . -type f -name "*.h" -exec ln -s $PWD/\{\} $GCOV_BASE\{\} \;
find . -type f -name "*.debug" -exec ln -s $PWD/\{\} $GCOV_BASE\{\} \;
fi
-cd $start
+cd "$start" || exit 1
lcov -d $GCOV_BASE -z
# Run all active regression tests
$test_dir/cts-regression
lcov -d $GCOV_BASE -c -o pacemaker.info
rm -rf html
mkdir html
genhtml -o html pacemaker.info
diff --git a/cts/cts-regression.in b/cts/cts-regression.in
index d458a5e989..19d8612a73 100755
--- a/cts/cts-regression.in
+++ b/cts/cts-regression.in
@@ -1,215 +1,214 @@
#!@BASH_PATH@
#
# cts-regression
#
# Convenience wrapper for running any of the Pacemaker regression tests
#
# Copyright 2012-2018 Andrew Beekhof <andrew@beekhof.net>
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
USAGE_TEXT="Usage: cts-regression [<options>] [<test> ...]
Options:
--help Display this text, then exit
-V, --verbose Increase test verbosity
-v, --valgrind Run test commands under valgrind
Tests (default tests are 'scheduler cli'):
scheduler Action scheduler
cli Command-line tools
exec Local resource agent executor
pacemaker_remote Resource agent executor in remote mode
fencing Fencer
all Synonym for 'scheduler cli exec fencing'"
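# Example invocation (hypothetical): run the scheduler and CLI suites with
# the test commands under valgrind:
#   cts-regression -v scheduler cli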
# If readlink supports -e (i.e. GNU), use it
readlink -e / >/dev/null 2>/dev/null
if [ $? -eq 0 ]; then
- test_home="$(dirname $(readlink -e $0))"
+ test_home="$(dirname "$(readlink -e "$0")")"
else
- test_home="$(dirname $0)"
+ test_home="$(dirname "$0")"
fi
valgrind=""
verbose=""
tests=""
# These constants must track crm_exit_t values
CRM_EX_OK=0
CRM_EX_ERROR=1
CRM_EX_NOT_INSTALLED=5
CRM_EX_USAGE=64
function info() {
printf "$*\n"
}
function error() {
printf " * ERROR: $*\n"
}
function run_as_root() {
CMD="$1"
shift
- ARGS="$@"
+ ARGS="$*" # assumes arguments don't need quoting
# Test might not be executable if run from source directory
chmod a+x $CMD
CMD="$CMD $ARGS $verbose"
if [ $EUID -eq 0 ]; then
$CMD
elif [ -z $TRAVIS ]; then
# sudo doesn't work in buildbot, su doesn't work in travis
echo "Enter the root password..."
su root -c "$CMD"
else
echo "Enter the root password if prompted..."
sudo -- $CMD
fi
}
add_test() {
local TEST="$1"
case "$TEST" in
scheduler|exec|pacemaker_remote|fencing|cli)
if [[ ! $tests =~ $TEST ]]; then
tests="$tests $TEST"
fi
;;
*)
error "unknown test: $TEST"
echo
echo "$USAGE_TEXT"
exit $CRM_EX_USAGE
;;
esac
}
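# Note: the "[[ $tests =~ $TEST ]]" duplicate check above is an unanchored
# regex match; it is safe here only because no known test name is a
# substring of another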
run_test() {
local t="$1"
info "Executing the $t regression tests"
info "============================================================"
case $t in
scheduler)
if [ -x $test_home/cts-scheduler ]; then
$test_home/cts-scheduler $verbose $valgrind
rc=$?
else
error "scheduler regression test not found"
rc=$CRM_EX_NOT_INSTALLED
fi
;;
exec)
if [ -x $test_home/cts-exec ]; then
run_as_root $test_home/cts-exec
rc=$?
else
error "executor regression test not found"
rc=$CRM_EX_NOT_INSTALLED
fi
;;
pacemaker_remote)
if [ -x $test_home/cts-exec ]; then
run_as_root $test_home/cts-exec -R
rc=$?
else
error "pacemaker_remote regression test not found"
rc=$CRM_EX_NOT_INSTALLED
fi
;;
fencing)
if [ -x $test_home/cts-fencing ]; then
run_as_root $test_home/cts-fencing
rc=$?
else
error "fencing regression test not found"
rc=$CRM_EX_NOT_INSTALLED
fi
;;
cli)
if [ -x $test_home/cts-cli ]; then
$test_home/cts-cli $verbose $valgrind
rc=$?
else
error "cli regression test not found"
rc=$CRM_EX_NOT_INSTALLED
fi
;;
esac
info "============================================================"
info ""
info ""
return $rc
}
run_tests() {
- TESTS="$@"
local TEST
local TEST_RC
local FAILED
FAILED=""
- for TEST in $TESTS; do
+ for TEST in "$@"; do
run_test $TEST
TEST_RC=$?
if [ $TEST_RC -ne 0 ]; then
info "$TEST regression tests failed ($TEST_RC)"
FAILED="$FAILED $TEST"
fi
done
if [ -n "$FAILED" ]; then
error "failed regression tests: $FAILED"
return $CRM_EX_ERROR
fi
return $CRM_EX_OK
}
while [ $# -gt 0 ] ; do
case "$1" in
--help)
echo "$USAGE_TEXT"
exit $CRM_EX_OK
;;
-V|--verbose)
verbose="-V"
shift
;;
-v|--valgrind)
valgrind="-v"
shift
;;
scheduler|exec|pacemaker_remote|fencing|cli)
add_test $1
shift
;;
all)
add_test scheduler
add_test cli
add_test exec
add_test fencing
shift
;;
*)
error "unknown option: $1"
echo
echo "$USAGE_TEXT"
exit $CRM_EX_USAGE
;;
esac
done
if [ -z "$tests" ]; then
add_test scheduler
add_test cli
fi
run_tests $tests
diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in
index 9a6fb6b3a3..d3c1778343 100644
--- a/cts/cts-scheduler.in
+++ b/cts/cts-scheduler.in
@@ -1,1292 +1,1294 @@
#!@BASH_PATH@
#
# Copyright 2004-2018 Andrew Beekhof <andrew@beekhof.net>
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
USAGE_TEXT="Usage: cts-scheduler [<options>]
Options:
--help Display this text, then exit
-V, --verbose Display any differences from expected output
--run TEST Run only single specified test
--update Update expected results with actual results
-b, --binary PATH Specify path to crm_simulate
-i, --io-dir PATH Specify path to regression test data directory
-v, --valgrind Run all commands under valgrind
--valgrind-dhat Run all commands under valgrind with heap analyzer
--valgrind-skip-output If running under valgrind, don't display output
--testcmd-options Additional options for command under test"
SBINDIR="@sbindir@"
BUILDDIR="@abs_top_builddir@"
CRM_SCHEMA_DIRECTORY="@CRM_SCHEMA_DIRECTORY@"
# If readlink supports -e (i.e. GNU), use it
readlink -e / >/dev/null 2>/dev/null
if [ $? -eq 0 ]; then
- test_home="$(dirname $(readlink -e $0))"
+ test_home="$(dirname "$(readlink -e "$0")")"
else
- test_home="$(dirname $0)"
+ test_home="$(dirname "$0")"
fi
io_dir="$test_home/scheduler"
failed="$test_home/.regression.failed.diff"
test_binary=
testcmd_options=
single_test=
verbose=0
num_failed=0
num_tests=0
VALGRIND_CMD=""
VALGRIND_OPTS="-q
--gen-suppressions=all
--log-file=%q{valgrind_output}
--time-stamp=yes
--trace-children=no
--show-reachable=no
--leak-check=full
--num-callers=20
--suppressions=$test_home/valgrind-pcmk.suppressions"
VALGRIND_DHAT_OPTS="--tool=exp-dhat
--log-file=%q{valgrind_output}
--time-stamp=yes
--trace-children=no
--show-top-n=100
--num-callers=4"
diff_opts="--ignore-all-space --ignore-blank-lines -u -N"
# These constants must track crm_exit_t values
CRM_EX_OK=0
CRM_EX_ERROR=1
CRM_EX_NOT_INSTALLED=5
CRM_EX_USAGE=64
CRM_EX_NOINPUT=66
EXITCODE=$CRM_EX_OK
function info() {
printf "$*\n"
}
function error() {
printf " * ERROR: $*\n"
}
function failed() {
printf " * FAILED: $*\n"
}
function show_test() {
name=$1; shift
printf " Test %-25s $*\n" "$name:"
}
# Normalize scheduler output for comparison
normalize() {
for NORMALIZE_FILE in "$@"; do
# sed -i is not portable :-(
sed -e 's/crm_feature_set="[^"]*"//' \
-e 's/batch-limit="[0-9]*"//' \
"$NORMALIZE_FILE" > "${NORMALIZE_FILE}.$$"
mv -- "${NORMALIZE_FILE}.$$" "$NORMALIZE_FILE"
done
}
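# Usage sketch: do_test below calls 'normalize "$expected" "$output"' to
# rewrite both files in place before diffing them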
info "Test home is:\t$test_home"
create_mode="false"
while [ $# -gt 0 ] ; do
case "$1" in
-V|--verbose)
verbose=1
shift
;;
-v|--valgrind)
export G_SLICE=always-malloc
VALGRIND_CMD="valgrind $VALGRIND_OPTS"
shift
;;
--valgrind-dhat)
VALGRIND_CMD="valgrind $VALGRIND_DHAT_OPTS"
shift
;;
--valgrind-skip-output)
VALGRIND_SKIP_OUTPUT=1
shift
;;
--update)
create_mode="true"
shift
;;
--run)
single_test=$(basename "$2" ".xml")
shift 2
+ break # any remaining arguments will be passed to test command
;;
-b|--binary)
test_binary="$2"
shift 2
;;
-i|--io-dir)
io_dir="$2"
shift 2
;;
--help)
echo "$USAGE_TEXT"
exit $CRM_EX_OK
;;
--testcmd-options)
testcmd_options=$2
shift 2
;;
*)
error "unknown option: $1"
exit $CRM_EX_USAGE
;;
esac
done
if [ -z "$PCMK_schema_directory" ]; then
if [ -d "$BUILDDIR/xml" ]; then
export PCMK_schema_directory="$BUILDDIR/xml"
elif [ -d "$CRM_SCHEMA_DIRECTORY" ]; then
export PCMK_schema_directory="$CRM_SCHEMA_DIRECTORY"
fi
fi
if [ -z "$test_binary" ]; then
if [ -x "$BUILDDIR/tools/crm_simulate" ]; then
test_binary="$BUILDDIR/tools/crm_simulate"
elif [ -x "$SBINDIR/crm_simulate" ]; then
test_binary="$SBINDIR/crm_simulate"
fi
fi
if [ ! -x "$test_binary" ]; then
error "Test binary $test_binary not found"
exit $CRM_EX_NOT_INSTALLED
fi
info "Test binary is:\t$test_binary"
if [ -n "$PCMK_schema_directory" ]; then
info "Schema home is:\t$PCMK_schema_directory"
fi
if [ "x$VALGRIND_CMD" != "x" ]; then
info "Activating memory testing with valgrind";
fi
info " "
test_cmd="$VALGRIND_CMD $test_binary $testcmd_options"
#echo $test_cmd
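# Presumably so non-root users get a writable shadow CIB location: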
-if [ `whoami` != root ]; then
+if [ "$(whoami)" != "root" ]; then
declare -x CIB_shadow_dir=/tmp
fi
do_test() {
did_fail=0
expected_rc=0
num_tests=$(( $num_tests + 1 ))
base=$1; shift
name=$1; shift
input=$io_dir/${base}.xml
output=$io_dir/${base}.out
expected=$io_dir/${base}.exp
- dot_png=$io_dir/${base}.png
dot_expected=$io_dir/${base}.dot
dot_output=$io_dir/${base}.pe.dot
scores=$io_dir/${base}.scores
score_output=$io_dir/${base}.scores.pe
stderr_expected=$io_dir/${base}.stderr
stderr_output=$io_dir/${base}.stderr.pe
summary=$io_dir/${base}.summary
summary_output=$io_dir/${base}.summary.pe
valgrind_output=$io_dir/${base}.valgrind
export valgrind_output
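# A test may pass "--rc <code>" as its first extra argument to declare an
# expected nonzero exit status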
if [ "x$1" = "x--rc" ]; then
expected_rc=$2
shift; shift;
fi
show_test "$base" "$name"
if [ ! -f $input ]; then
error "No input";
did_fail=1
num_failed=$(( $num_failed + 1 ))
return $CRM_EX_NOINPUT;
fi
- if [ "$create_mode" != "true" -a ! -f $expected ]; then
+ if [ "$create_mode" != "true" ] && [ ! -f "$expected" ]; then
error "no stored output";
return $CRM_EX_NOINPUT;
fi
# ../admin/crm_verify -X $input
if [ ! -z "$single_test" ]; then
- echo CIB_shadow_dir=$io_dir $test_cmd -x $input -D $dot_output -G $output -S $*
- CIB_shadow_dir=$io_dir $test_cmd -x $input -D $dot_output -G $output -S $* 2>&1 | tee $summary_output
+ echo "CIB_shadow_dir=\"$io_dir\" $test_cmd -x \"$input\" -D \"$dot_output\" -G \"$output\" -S" "$@"
+ CIB_shadow_dir="$io_dir" $test_cmd -x "$input" -D "$dot_output" \
+ -G "$output" -S "$@" 2>&1 | tee "$summary_output"
else
- CIB_shadow_dir=$io_dir $test_cmd -x $input -S &> $summary_output
+ CIB_shadow_dir="$io_dir" $test_cmd -x "$input" -S &> "$summary_output"
fi
- CIB_shadow_dir=$io_dir $test_cmd -x $input -D $dot_output -G $output -SQ -s $* 2> $stderr_output > $score_output
+ CIB_shadow_dir="$io_dir" $test_cmd -x "$input" -D "$dot_output" \
+ -G "$output" -SQ -s "$@" 2> "$stderr_output" > "$score_output"
rc=$?
if [ $rc -ne $expected_rc ]; then
failed "Test returned: $rc";
did_fail=1
- echo "CIB_shadow_dir=$io_dir $test_cmd -x $input -D $dot_output -G $output -SQ -s $*"
+ echo "CIB_shadow_dir=\"$io_dir\" $test_cmd -x \"$input\" -D \"$dot_output\" -G \"$output\" -SQ -s" "$@"
fi
if [ -z "$VALGRIND_SKIP_OUTPUT" ]; then
if [ -s "${valgrind_output}" ]; then
error "Valgrind reported errors";
did_fail=1
cat ${valgrind_output}
fi
rm -f ${valgrind_output}
fi
if [ -s core ]; then
error "Core-file detected: core.${base}";
did_fail=1
rm -f $test_home/core.$base
mv core $test_home/core.$base
fi
if [ -e "$stderr_expected" ]; then
diff $diff_opts $stderr_expected $stderr_output >/dev/null
rc2=$?
if [ $rc2 -ne 0 ]; then
failed "stderr changed";
diff $diff_opts $stderr_expected $stderr_output 2>/dev/null >> $failed
echo "" >> $failed
did_fail=1
fi
elif [ -s "$stderr_output" ]; then
error "Output was written to stderr"
did_fail=1
cat $stderr_output
fi
rm -f $stderr_output
if [ ! -s $output ]; then
error "No graph produced";
did_fail=1
num_failed=$(( $num_failed + 1 ))
rm -f $output
return $CRM_EX_ERROR;
fi
if [ ! -s $dot_output ]; then
error "No dot-file summary produced";
did_fail=1
num_failed=$(( $num_failed + 1 ))
rm -f $output
return $CRM_EX_ERROR;
else
echo "digraph \"g\" {" > $dot_output.sort
- LC_ALL=POSIX sort -u $dot_output | grep -v -e ^}$ -e digraph >> $dot_output.sort
+ LC_ALL=POSIX sort -u $dot_output | grep -v -e '^}$' -e digraph >> $dot_output.sort
echo "}" >> $dot_output.sort
mv -f $dot_output.sort $dot_output
fi
if [ ! -s $score_output ]; then
error "No allocation scores produced";
did_fail=1
num_failed=$(( $num_failed + 1 ))
rm $output
return $CRM_EX_ERROR;
else
LC_ALL=POSIX sort $score_output > $score_output.sorted
mv -f $score_output.sorted $score_output
fi
if [ "$create_mode" = "true" ]; then
cp "$output" "$expected"
cp "$dot_output" "$dot_expected"
cp "$score_output" "$scores"
cp "$summary_output" "$summary"
info " Updated expected outputs"
fi
diff $diff_opts $summary $summary_output >/dev/null
rc2=$?
if [ $rc2 -ne 0 ]; then
failed "summary changed";
diff $diff_opts $summary $summary_output 2>/dev/null >> $failed
echo "" >> $failed
did_fail=1
fi
diff $diff_opts $dot_expected $dot_output >/dev/null
rc=$?
if [ $rc -ne 0 ]; then
failed "dot-file summary changed";
diff $diff_opts $dot_expected $dot_output 2>/dev/null >> $failed
echo "" >> $failed
did_fail=1
else
rm -f $dot_output
fi
normalize "$expected" "$output"
diff $diff_opts $expected $output >/dev/null
rc2=$?
if [ $rc2 -ne 0 ]; then
failed "xml-file changed";
diff $diff_opts $expected $output 2>/dev/null >> $failed
echo "" >> $failed
did_fail=1
fi
diff $diff_opts $scores $score_output >/dev/null
rc=$?
if [ $rc -ne 0 ]; then
failed "scores-file changed";
diff $diff_opts $scores $score_output 2>/dev/null >> $failed
echo "" >> $failed
did_fail=1
fi
rm -f $output $score_output $summary_output
if [ $did_fail -eq 1 ]; then
num_failed=$(( $num_failed + 1 ))
return $CRM_EX_ERROR
fi
return $CRM_EX_OK
}
function test_results {
if [ $num_failed -ne 0 ]; then
if [ -s "$failed" ]; then
if [ $verbose -eq 1 ]; then
error "Results of $num_failed failed tests (out of $num_tests)...."
cat $failed
else
error "Results of $num_failed failed tests (out of $num_tests) are in $failed...."
error "Use $0 -V to display them automatically."
fi
else
error "$num_failed (of $num_tests) tests failed (no diff results)"
rm $failed
fi
EXITCODE=$CRM_EX_ERROR
fi
}
# zero out the error log
-> $failed
+true > $failed
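# ("true > file" truncates the file just like a bare "> file", but spelling
# out the no-op command is clearer and satisfies linters that flag a bare
# redirection)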
if [ -n "$single_test" ]; then
- do_test $single_test "Single shot" $*
+ do_test "$single_test" "Single shot" "$@"
TEST_RC=$?
- cat $failed
+ cat "$failed"
exit $TEST_RC
fi
DO_VERSIONED_TESTS=0
create_mode=true
# info Creating the following tests from $io_dir
# do_test order-expired-failure "Order failcount cleanup after remote fencing"
create_mode=false
info Performing the following tests from $io_dir
echo ""
do_test simple1 "Offline "
do_test simple2 "Start "
do_test simple3 "Start 2 "
do_test simple4 "Start Failed"
do_test simple6 "Stop Start "
do_test simple7 "Shutdown "
#do_test simple8 "Stonith "
#do_test simple9 "Lower version"
#do_test simple10 "Higher version"
do_test simple11 "Priority (ne)"
do_test simple12 "Priority (eq)"
do_test simple8 "Stickiness"
echo ""
do_test group1 "Group "
do_test group2 "Group + Native "
do_test group3 "Group + Group "
do_test group4 "Group + Native (nothing)"
do_test group5 "Group + Native (move) "
do_test group6 "Group + Group (move) "
do_test group7 "Group colocation"
do_test group13 "Group colocation (can't run)"
do_test group8 "Group anti-colocation"
do_test group9 "Group recovery"
do_test group10 "Group partial recovery"
do_test group11 "Group target_role"
do_test group14 "Group stop (graph terminated)"
do_test group15 "Negative group colocation"
do_test bug-1573 "Partial stop of a group with two children"
do_test bug-1718 "Mandatory group ordering - Stop group_FUN"
do_test bug-lf-2613 "Move group on failure"
do_test bug-lf-2619 "Move group on clone failure"
do_test group-fail "Ensure stop order is preserved for partially active groups"
do_test group-unmanaged "No need to restart r115 because r114 is unmanaged"
do_test group-unmanaged-stopped "Make sure r115 is stopped when r114 fails"
do_test group-dependents "Account for the location preferences of things colocated with a group"
echo ""
do_test rsc_dep1 "Must not "
do_test rsc_dep3 "Must "
do_test rsc_dep5 "Must not 3 "
do_test rsc_dep7 "Must 3 "
do_test rsc_dep10 "Must (but can't)"
do_test rsc_dep2 "Must (running) "
do_test rsc_dep8 "Must (running : alt) "
do_test rsc_dep4 "Must (running + move)"
do_test asymmetric "Asymmetric - require explicit location constraints"
echo ""
do_test orphan-0 "Orphan ignore"
do_test orphan-1 "Orphan stop"
do_test orphan-2 "Orphan stop, remove failcount"
echo ""
do_test params-0 "Params: No change"
do_test params-1 "Params: Changed"
do_test params-2 "Params: Resource definition"
do_test params-4 "Params: Reload"
do_test params-5 "Params: Restart based on probe digest"
do_test novell-251689 "Resource definition change + target_role=stopped"
do_test bug-lf-2106 "Restart all anonymous clone instances after config change"
do_test params-6 "Params: Detect reload in previously migrated resource"
do_test nvpair-id-ref "Support id-ref in nvpair with optional name"
do_test not-reschedule-unneeded-monitor "Do not reschedule unneeded monitors while resource definitions have changed"
do_test reload-becomes-restart "Cancel reload if restart becomes required"
echo ""
do_test target-0 "Target Role : baseline"
do_test target-1 "Target Role : master"
do_test target-2 "Target Role : invalid"
echo ""
do_test base-score "Set a node's default score for all nodes"
echo ""
do_test date-1 "Dates" -t "2005-020"
do_test date-2 "Date Spec - Pass" -t "2005-020T12:30"
do_test date-3 "Date Spec - Fail" -t "2005-020T11:30"
do_test origin "Timing of recurring operations" -t "2014-05-07 00:28:00"
do_test probe-0 "Probe (anon clone)"
do_test probe-1 "Pending Probe"
do_test probe-2 "Correctly re-probe cloned groups"
do_test probe-3 "Probe (pending node)"
do_test probe-4 "Probe (pending node + stopped resource)"
do_test standby "Standby"
do_test comments "Comments"
echo ""
do_test one-or-more-0 "Everything starts"
do_test one-or-more-1 "Nothing starts because of A"
do_test one-or-more-2 "D can start because of C"
do_test one-or-more-3 "D cannot start because of B and C"
do_test one-or-more-4 "D cannot start because of target-role"
do_test one-or-more-5 "Start A and F even though C and D are stopped"
do_test one-or-more-6 "Leave A running even though B is stopped"
do_test one-or-more-7 "Leave A running even though C is stopped"
do_test bug-5140-require-all-false "Allow basegrp:0 to stop"
do_test clone-require-all-1 "clone B starts node 3 and 4"
do_test clone-require-all-2 "clone B remains stopped everywhere"
do_test clone-require-all-3 "clone B stops everywhere because A stops everywhere"
do_test clone-require-all-4 "clone B remains on node 3 and 4 with only one instance of A remaining."
do_test clone-require-all-5 "clone B starts on node 1 3 and 4"
do_test clone-require-all-6 "clone B remains active after shutting down instances of A"
do_test clone-require-all-7 "clone A and B both start at the same time. all instances of A start before B."
do_test clone-require-all-no-interleave-1 "C starts everywhere after A and B"
do_test clone-require-all-no-interleave-2 "C starts on nodes 1, 2, and 4 with only one active instance of B"
do_test clone-require-all-no-interleave-3 "C remains active when instance of B is stopped on one node and started on another."
do_test one-or-more-unrunnable-instances "Avoid dependencies on instances that won't ever be started"
echo ""
do_test order1 "Order start 1 "
do_test order2 "Order start 2 "
do_test order3 "Order stop "
do_test order4 "Order (multiple) "
do_test order5 "Order (move) "
do_test order6 "Order (move w/ restart) "
do_test order7 "Order (mandatory) "
do_test order-optional "Order (score=0) "
do_test order-required "Order (score=INFINITY) "
do_test bug-lf-2171 "Prevent group start when clone is stopped"
do_test order-clone "Clone ordering should be able to prevent startup of dependent clones"
do_test order-sets "Ordering for resource sets"
do_test order-serialize "Serialize resources without inhibiting migration"
do_test order-serialize-set "Serialize a set of resources without inhibiting migration"
do_test clone-order-primitive "Order clone start after a primitive"
do_test clone-order-16instances "Verify ordering of 16 cloned resources"
do_test order-optional-keyword "Order (optional keyword)"
do_test order-mandatory "Order (mandatory keyword)"
do_test bug-lf-2493 "Don't imply colocation requirements when applying ordering constraints with clones"
do_test ordered-set-basic-startup "Constraint set with default order settings."
do_test ordered-set-natural "Allow natural set ordering"
do_test order-wrong-kind "Order (error)"
echo ""
do_test coloc-loop "Colocation - loop"
do_test coloc-many-one "Colocation - many-to-one"
do_test coloc-list "Colocation - many-to-one with list"
do_test coloc-group "Colocation - groups"
do_test coloc-slave-anti "Anti-colocation with slave shouldn't prevent master colocation"
do_test coloc-attr "Colocation based on node attributes"
do_test coloc-negative-group "Negative colocation with a group"
do_test coloc-intra-set "Intra-set colocation"
do_test bug-lf-2435 "Colocation sets with a negative score"
do_test coloc-clone-stays-active "Ensure clones don't get stopped/demoted because a dependent must stop"
do_test coloc_fp_logic "Verify floating point calculations in colocation are working"
do_test colo_master_w_native "cl#5070 - Verify promotion order is affected when colocating master to native rsc."
do_test colo_slave_w_native "cl#5070 - Verify promotion order is affected when colocating slave to native rsc."
do_test anti-colocation-order "cl#5187 - Prevent resources in an anti-colocation from even temporarily running on a same node"
do_test anti-colocation-master "Organize order of actions for master resources in anti-colocations"
do_test anti-colocation-slave "Organize order of actions for slave resources in anti-colocations"
do_test enforce-colo1 "Always enforce B with A INFINITY."
do_test complex_enforce_colo "Always enforce B with A INFINITY. (make sure heat-engine stops)"
echo ""
do_test rsc-sets-seq-true "Resource Sets - sequential=false"
do_test rsc-sets-seq-false "Resource Sets - sequential=true"
do_test rsc-sets-clone "Resource Sets - Clone"
do_test rsc-sets-master "Resource Sets - Master"
do_test rsc-sets-clone-1 "Resource Sets - Clone (lf#2404)"
#echo ""
#do_test agent1 "version: lt (empty)"
#do_test agent2 "version: eq "
#do_test agent3 "version: gt "
echo ""
do_test attrs1 "string: eq (and) "
do_test attrs2 "string: lt / gt (and)"
do_test attrs3 "string: ne (or) "
do_test attrs4 "string: exists "
do_test attrs5 "string: not_exists "
do_test attrs6 "is_dc: true "
do_test attrs7 "is_dc: false "
do_test attrs8 "score_attribute "
do_test per-node-attrs "Per node resource parameters"
echo ""
do_test mon-rsc-1 "Schedule Monitor - start"
do_test mon-rsc-2 "Schedule Monitor - move "
do_test mon-rsc-3 "Schedule Monitor - pending start "
do_test mon-rsc-4 "Schedule Monitor - move/pending start"
echo ""
do_test rec-rsc-0 "Resource Recover - no start "
do_test rec-rsc-1 "Resource Recover - start "
do_test rec-rsc-2 "Resource Recover - monitor "
do_test rec-rsc-3 "Resource Recover - stop - ignore"
do_test rec-rsc-4 "Resource Recover - stop - block "
do_test rec-rsc-5 "Resource Recover - stop - fence "
do_test rec-rsc-6 "Resource Recover - multiple - restart"
do_test rec-rsc-7 "Resource Recover - multiple - stop "
do_test rec-rsc-8 "Resource Recover - multiple - block "
do_test rec-rsc-9 "Resource Recover - group/group"
do_test monitor-recovery "on-fail=block + resource recovery detected by recurring monitor"
do_test stop-failure-no-quorum "Stop failure without quorum"
do_test stop-failure-no-fencing "Stop failure without fencing available"
do_test stop-failure-with-fencing "Stop failure with fencing available"
do_test multiple-active-block-group "Support of multiple-active=block for resource groups"
do_test multiple-monitor-one-failed "Consider resource failed if any of the configured monitor operations failed"
echo ""
do_test quorum-1 "No quorum - ignore"
do_test quorum-2 "No quorum - freeze"
do_test quorum-3 "No quorum - stop "
do_test quorum-4 "No quorum - start anyway"
do_test quorum-5 "No quorum - start anyway (group)"
do_test quorum-6 "No quorum - start anyway (clone)"
do_test bug-cl-5212 "No promotion with no-quorum-policy=freeze"
do_test suicide-needed-inquorate "no-quorum-policy=suicide: suicide necessary"
do_test suicide-not-needed-initial-quorum "no-quorum-policy=suicide: suicide not necessary at initial quorum"
do_test suicide-not-needed-never-quorate "no-quorum-policy=suicide: suicide not necessary if never quorate"
do_test suicide-not-needed-quorate "no-quorum-policy=suicide: suicide necessary if quorate"
echo ""
do_test rec-node-1 "Node Recover - Startup - no fence"
do_test rec-node-2 "Node Recover - Startup - fence "
do_test rec-node-3 "Node Recover - HA down - no fence"
do_test rec-node-4 "Node Recover - HA down - fence "
do_test rec-node-5 "Node Recover - CRM down - no fence"
do_test rec-node-6 "Node Recover - CRM down - fence "
do_test rec-node-7 "Node Recover - no quorum - ignore "
do_test rec-node-8 "Node Recover - no quorum - freeze "
do_test rec-node-9 "Node Recover - no quorum - stop "
do_test rec-node-10 "Node Recover - no quorum - stop w/fence"
do_test rec-node-11 "Node Recover - CRM down w/ group - fence "
do_test rec-node-12 "Node Recover - nothing active - fence "
do_test rec-node-13 "Node Recover - failed resource + shutdown - fence "
do_test rec-node-15 "Node Recover - unknown lrm section"
do_test rec-node-14 "Serialize all stonith actions"
echo ""
do_test multi1 "Multiple Active (stop/start)"
echo ""
do_test migrate-begin "Normal migration"
do_test migrate-success "Completed migration"
do_test migrate-partial-1 "Completed migration, missing stop on source"
do_test migrate-partial-2 "Successful migrate_to only"
do_test migrate-partial-3 "Successful migrate_to only, target down"
do_test migrate-partial-4 "Migrate from the correct host after migrate_to+migrate_from"
do_test bug-5186-partial-migrate "Handle partial migration when src node loses membership"
do_test migrate-fail-2 "Failed migrate_from"
do_test migrate-fail-3 "Failed migrate_from + stop on source"
do_test migrate-fail-4 "Failed migrate_from + stop on target - ideally we wouldn't need to re-stop on target"
do_test migrate-fail-5 "Failed migrate_from + stop on source and target"
do_test migrate-fail-6 "Failed migrate_to"
do_test migrate-fail-7 "Failed migrate_to + stop on source"
do_test migrate-fail-8 "Failed migrate_to + stop on target - ideally we wouldn't need to re-stop on target"
do_test migrate-fail-9 "Failed migrate_to + stop on source and target"
do_test migrate-stop "Migration in a stopping stack"
do_test migrate-start "Migration in a starting stack"
do_test migrate-stop_start "Migration in a restarting stack"
do_test migrate-stop-complex "Migration in a complex stopping stack"
do_test migrate-start-complex "Migration in a complex starting stack"
do_test migrate-stop-start-complex "Migration in a complex moving stack"
do_test migrate-shutdown "Order the post-migration 'stop' before node shutdown"
do_test migrate-1 "Migrate (migrate)"
do_test migrate-2 "Migrate (stable)"
do_test migrate-3 "Migrate (failed migrate_to)"
do_test migrate-4 "Migrate (failed migrate_from)"
do_test novell-252693 "Migration in a stopping stack"
do_test novell-252693-2 "Migration in a starting stack"
do_test novell-252693-3 "Non-Migration in a starting and stopping stack"
do_test bug-1820 "Migration in a group"
do_test bug-1820-1 "Non-migration in a group"
do_test migrate-5 "Primitive migration with a clone"
do_test migrate-fencing "Migration after Fencing"
do_test migrate-both-vms "Migrate two VMs that have no colocation"
do_test migration-behind-migrating-remote "Migrate resource behind migrating remote connection"
do_test 1-a-then-bm-move-b "Advanced migrate logic. A then B. migrate B."
do_test 2-am-then-b-move-a "Advanced migrate logic, A then B, migrate A without stopping B"
do_test 3-am-then-bm-both-migrate "Advanced migrate logic. A then B. migrate both"
do_test 4-am-then-bm-b-not-migratable "Advanced migrate logic, A then B, B not migratable"
do_test 5-am-then-bm-a-not-migratable "Advanced migrate logic. A then B. move both, a not migratable"
do_test 6-migrate-group "Advanced migrate logic, migrate a group"
do_test 7-migrate-group-one-unmigratable "Advanced migrate logic, migrate group mixed with allow-migrate true/false"
do_test 8-am-then-bm-a-migrating-b-stopping "Advanced migrate logic, A then B, A migrating, B stopping"
do_test 9-am-then-bm-b-migrating-a-stopping "Advanced migrate logic, A then B, B migrate, A stopping"
do_test 10-a-then-bm-b-move-a-clone "Advanced migrate logic, A clone then B, migrate B while stopping A"
do_test 11-a-then-bm-b-move-a-clone-starting "Advanced migrate logic, A clone then B, B moving while A is start/stopping"
do_test a-promote-then-b-migrate "A promote then B start. migrate B"
do_test a-demote-then-b-migrate "A demote then B stop. migrate B"
if [ $DO_VERSIONED_TESTS -eq 1 ]; then
do_test migrate-versioned "Disable migration for versioned resources"
fi
#echo ""
#do_test complex1 "Complex "
do_test bug-lf-2422 "Dependency on partially active group - stop ocfs:*"
echo ""
do_test clone-anon-probe-1 "Probe the correct (anonymous) clone instance for each node"
do_test clone-anon-probe-2 "Avoid needless re-probing of anonymous clones"
do_test clone-anon-failcount "Merge failcounts for anonymous clones"
do_test inc0 "Incarnation start"
do_test inc1 "Incarnation start order"
do_test inc2 "Incarnation silent restart, stop, move"
do_test inc3 "Inter-incarnation ordering, silent restart, stop, move"
do_test inc4 "Inter-incarnation ordering, silent restart, stop, move (ordered)"
do_test inc5 "Inter-incarnation ordering, silent restart, stop, move (restart 1)"
do_test inc6 "Inter-incarnation ordering, silent restart, stop, move (restart 2)"
do_test inc7 "Clone colocation"
do_test inc8 "Clone anti-colocation"
do_test inc9 "Non-unique clone"
do_test inc10 "Non-unique clone (stop)"
do_test inc11 "Primitive colocation with clones"
do_test inc12 "Clone shutdown"
do_test cloned-group "Make sure only the correct number of cloned groups are started"
do_test cloned-group-stop "Ensure stopping qpidd also stops glance and cinder"
do_test clone-no-shuffle "Don't prioritize allocation of instances that must be moved"
do_test clone-max-zero "Orphan processing with clone-max=0"
do_test clone-anon-dup "Bug LF#2087 - Correctly parse the state of anonymous clones that are active more than once per node"
do_test bug-lf-2160 "Don't shuffle clones due to colocation"
do_test bug-lf-2213 "clone-node-max enforcement for cloned groups"
do_test bug-lf-2153 "Clone ordering constraints"
do_test bug-lf-2361 "Ensure clones observe mandatory ordering constraints if the LHS is unrunnable"
do_test bug-lf-2317 "Avoid needless restart of primitive depending on a clone"
do_test clone-colocate-instance-1 "Colocation with a specific clone instance (negative example)"
do_test clone-colocate-instance-2 "Colocation with a specific clone instance"
do_test clone-order-instance "Ordering with specific clone instances"
do_test bug-lf-2453 "Enforce mandatory clone ordering without colocation"
do_test bug-lf-2508 "Correctly reconstruct the status of anonymous cloned groups"
do_test bug-lf-2544 "Balanced clone placement"
do_test bug-lf-2445 "Redistribute clones with node-max > 1 and stickiness = 0"
do_test bug-lf-2574 "Avoid clone shuffle"
do_test bug-lf-2581 "Avoid group restart due to unrelated clone (re)start"
do_test bug-cl-5168 "Don't shuffle clones"
do_test bug-cl-5170 "Prevent clone from starting with on-fail=block"
do_test clone-fail-block-colocation "Move colocated group when failed clone has on-fail=block"
do_test clone-interleave-1 "Clone-3 cannot start on pcmk-1 due to interleaved ordering (no colocation)"
do_test clone-interleave-2 "Clone-3 must stop on pcmk-1 due to interleaved ordering (no colocation)"
do_test clone-interleave-3 "Clone-3 must be recovered on pcmk-1 due to interleaved ordering (no colocation)"
do_test rebalance-unique-clones "Rebalance unique clone instances with no stickiness"
do_test clone-requires-quorum-recovery "Clone with requires=quorum on failed node needing recovery"
do_test clone-requires-quorum "Clone with requires=quorum with presumed-inactive instance on failed node"
echo ""
do_test cloned_start_one "order first clone then clone... first clone_min=2"
do_test cloned_start_two "order first clone then clone... first clone_min=2"
do_test cloned_stop_one "order first clone then clone... first clone_min=2"
do_test cloned_stop_two "order first clone then clone... first clone_min=2"
do_test clone_min_interleave_start_one "order first clone then clone... first clone_min=2 and then has interleave=true"
do_test clone_min_interleave_start_two "order first clone then clone... first clone_min=2 and then has interleave=true"
do_test clone_min_interleave_stop_one "order first clone then clone... first clone_min=2 and then has interleave=true"
do_test clone_min_interleave_stop_two "order first clone then clone... first clone_min=2 and then has interleave=true"
do_test clone_min_start_one "order first clone then primitive... first clone_min=2"
do_test clone_min_start_two "order first clone then primitive... first clone_min=2"
do_test clone_min_stop_all "order first clone then primitive... first clone_min=2"
do_test clone_min_stop_one "order first clone then primitive... first clone_min=2"
do_test clone_min_stop_two "order first clone then primitive... first clone_min=2"
echo ""
do_test unfence-startup "Clean unfencing"
do_test unfence-definition "Unfencing when the agent changes"
do_test unfence-parameters "Unfencing when the agent parameters change"
do_test unfence-device "Unfencing when a cluster has only fence devices"
echo ""
do_test master-0 "Stopped -> Slave"
do_test master-1 "Stopped -> Promote"
do_test master-2 "Stopped -> Promote : notify"
do_test master-3 "Stopped -> Promote : master location"
do_test master-4 "Started -> Promote : master location"
do_test master-5 "Promoted -> Promoted"
do_test master-6 "Promoted -> Promoted (2)"
do_test master-7 "Promoted -> Fenced"
do_test master-8 "Promoted -> Fenced -> Moved"
do_test master-9 "Stopped + Promotable + No quorum"
do_test master-10 "Stopped -> Promotable : notify with monitor"
do_test master-11 "Stopped -> Promote : colocation"
do_test novell-239082 "Demote/Promote ordering"
do_test novell-239087 "Stable master placement"
do_test master-12 "Promotion based solely on rsc_location constraints"
do_test master-13 "Include preferences of colocated resources when placing master"
do_test master-demote "Ordering when actions depend on demoting a slave resource"
do_test master-ordering "Prevent resources from starting that need a master"
do_test bug-1765 "Master-Master Colocation (don't stop the slaves)"
do_test master-group "Promotion of cloned groups"
do_test bug-lf-1852 "Don't shuffle master/slave instances unnecessarily"
do_test master-failed-demote "Don't retry failed demote actions"
do_test master-failed-demote-2 "Don't retry failed demote actions (notify=false)"
do_test master-depend "Ensure resources that depend on the master don't get allocated until the master does"
do_test master-reattach "Re-attach to a running master"
do_test master-allow-start "Don't include master score if it would prevent allocation"
do_test master-colocation "Allow master instance placement to be influenced by colocation constraints"
do_test master-pseudo "Make sure promote/demote pseudo actions are created correctly"
do_test master-role "Prevent target-role from promoting more than master-max instances"
do_test bug-lf-2358 "Master-Master anti-colocation"
do_test master-promotion-constraint "Mandatory master colocation constraints"
do_test unmanaged-master "Ensure role is preserved for unmanaged resources"
do_test master-unmanaged-monitor "Start the correct monitor operation for unmanaged masters"
do_test master-demote-2 "Demote does not clear past failure"
do_test master-move "Move master based on failure of colocated group"
do_test master-probed-score "Observe the promotion score of probed resources"
do_test colocation_constraint_stops_master "cl#5054 - Ensure master is demoted when stopped by colocation constraint"
do_test colocation_constraint_stops_slave "cl#5054 - Ensure slave is not demoted when stopped by colocation constraint"
do_test order_constraint_stops_master "cl#5054 - Ensure master is demoted when stopped by order constraint"
do_test order_constraint_stops_slave "cl#5054 - Ensure slave is not demoted when stopped by order constraint"
do_test master_monitor_restart "cl#5072 - Ensure master monitor operation will start after promotion."
do_test bug-rh-880249 "Handle replacement of an m/s resource with a primitive"
do_test bug-5143-ms-shuffle "Prevent master shuffling due to promotion score"
do_test master-demote-block "Block promotion if demote fails with on-fail=block"
do_test master-dependent-ban "Don't stop instances from being active because a dependent is banned from that host"
do_test master-stop "Stop instances due to location constraint with role=Started"
do_test master-partially-demoted-group "Allow partially demoted group to finish demoting"
do_test bug-cl-5213 "Ensure role colocation with -INFINITY is enforced"
do_test bug-cl-5219 "Allow unrelated resources with a common colocation target to remain promoted"
do_test master-asymmetrical-order "Fix the behaviors of multi-state resources with asymmetrical ordering"
do_test master-notify "Master promotion with notifies"
do_test master-score-startup "Use permanent master scores without LRM history"
do_test failed-demote-recovery "Recover resource in slave role after demote fails"
do_test failed-demote-recovery-master "Recover resource in master role after demote fails"
echo ""
do_test history-1 "Correctly parse stateful-1 resource state"
echo ""
do_test managed-0 "Managed (reference)"
do_test managed-1 "Not managed - down "
do_test managed-2 "Not managed - up "
do_test bug-5028 "Shutdown should block if anything depends on an unmanaged resource"
do_test bug-5028-detach "Ensure detach still works"
do_test bug-5028-bottom "Ensure shutdown still blocks if the blocked resource is at the bottom of the stack"
do_test unmanaged-stop-1 "cl#5155 - Block the stop of resources if any depending resource is unmanaged "
do_test unmanaged-stop-2 "cl#5155 - Block the stop of resources if the first resource in a mandatory stop order is unmanaged "
do_test unmanaged-stop-3 "cl#5155 - Block the stop of resources if any depending resource in a group is unmanaged "
do_test unmanaged-stop-4 "cl#5155 - Block the stop of resources if any depending resource in the middle of a group is unmanaged "
do_test unmanaged-block-restart "Block restart of resources if any dependent resource in a group is unmanaged"
echo ""
do_test interleave-0 "Interleave (reference)"
do_test interleave-1 "coloc - not interleaved"
do_test interleave-2 "coloc - interleaved "
do_test interleave-3 "coloc - interleaved (2)"
do_test interleave-pseudo-stop "Interleaved clone during stonith"
do_test interleave-stop "Interleaved clone during stop"
do_test interleave-restart "Interleaved clone during dependency restart"
echo ""
do_test notify-0 "Notify reference"
do_test notify-1 "Notify simple"
do_test notify-2 "Notify simple, confirm"
do_test notify-3 "Notify move, confirm"
do_test novell-239079 "Notification priority"
#do_test notify-2 "Notify - 764"
do_test notifs-for-unrunnable "Don't schedule notifications for an unrunnable action"
echo ""
do_test 594 "OSDL #594 - Unrunnable actions scheduled in transition"
do_test 662 "OSDL #662 - Two resources start on one node when incarnation_node_max = 1"
do_test 696 "OSDL #696 - CRM starts stonith RA without monitor"
do_test 726 "OSDL #726 - Attempting to schedule rsc_posic041_monitor_5000 _after_ a stop"
do_test 735 "OSDL #735 - Correctly detect that rsc_hadev1 is stopped on hadev3"
do_test 764 "OSDL #764 - Missing monitor op for DoFencing:child_DoFencing:1"
do_test 797 "OSDL #797 - Assert triggered: task_id_i > max_call_id"
do_test 829 "OSDL #829"
do_test 994 "OSDL #994 - Stopping the last resource in a resource group causes the entire group to be restarted"
do_test 994-2 "OSDL #994 - with a dependent resource"
do_test 1360 "OSDL #1360 - Clone stickiness"
do_test 1484 "OSDL #1484 - on_fail=stop"
do_test 1494 "OSDL #1494 - Clone stability"
do_test unrunnable-1 "Unrunnable"
do_test unrunnable-2 "Unrunnable 2"
do_test stonith-0 "Stonith loop - 1"
do_test stonith-1 "Stonith loop - 2"
do_test stonith-2 "Stonith loop - 3"
do_test stonith-3 "Stonith startup"
do_test stonith-4 "Stonith node state"
do_test bug-1572-1 "Recovery of groups depending on master/slave"
do_test bug-1572-2 "Recovery of groups depending on master/slave when the master is never re-promoted"
do_test bug-1685 "Depends-on-master ordering"
do_test bug-1822 "Don't promote partially active groups"
do_test bug-pm-11 "New resource added to a m/s group"
do_test bug-pm-12 "Recover only the failed portion of a cloned group"
do_test bug-n-387749 "Don't shuffle clone instances"
do_test bug-n-385265 "Don't ignore the failure stickiness of group children - resource_idvscommon should stay stopped"
do_test bug-n-385265-2 "Ensure groups are migrated instead of remaining partially active on the current node"
do_test bug-lf-1920 "Correctly handle probes that find active resources"
do_test bnc-515172 "Location constraint with multiple expressions"
do_test colocate-primitive-with-clone "Optional colocation with a clone"
do_test use-after-free-merge "Use-after-free in native_merge_weights"
do_test bug-lf-2551 "STONITH ordering for stop"
do_test bug-lf-2606 "Stonith implies demote"
do_test bug-lf-2474 "Ensure resource op timeout takes precedence over op_defaults"
do_test bug-suse-707150 "Prevent vm-01 from starting due to colocation/ordering"
do_test bug-5014-A-start-B-start "Verify when A starts B starts using symmetrical=false"
do_test bug-5014-A-stop-B-started "Verify when A stops B does not stop if it has already started using symmetric=false"
do_test bug-5014-A-stopped-B-stopped "Verify when A is stopped and B has not started, B does not start before A using symmetric=false"
do_test bug-5014-CthenAthenB-C-stopped "Verify when C then A is symmetrical=true, A then B is symmetric=false, and C is stopped that nothing starts."
do_test bug-5014-CLONE-A-start-B-start "Verify when A starts B starts using clone resources with symmetric=false"
do_test bug-5014-CLONE-A-stop-B-started "Verify when A stops B does not stop if it has already started using clone resources with symmetric=false."
do_test bug-5014-GROUP-A-start-B-start "Verify when A starts B starts when using group resources with symmetric=false."
do_test bug-5014-GROUP-A-stopped-B-started "Verify when A stops B does not stop if it has already started using group resources with symmetric=false."
do_test bug-5014-GROUP-A-stopped-B-stopped "Verify when A is stopped and B has not started, B does not start before A using group resources with symmetric=false."
do_test bug-5014-ordered-set-symmetrical-false "Verify ordered sets work with symmetrical=false"
do_test bug-5014-ordered-set-symmetrical-true "Verify ordered sets work with symmetrical=true"
do_test bug-5007-masterslave_colocation "Verify use of colocation scores other than INFINITY and -INFINITY work on multi-state resources."
do_test bug-5038 "Prevent restart of anonymous clones when clone-max decreases"
do_test bug-5025-1 "Automatically clean up failcount after resource config change with reload"
do_test bug-5025-2 "Make sure clear failcount action isn't set when config does not change."
do_test bug-5025-3 "Automatically clean up failcount after resource config change with restart"
do_test bug-5025-4 "Clear failcount when last failure is a start op and rsc attributes changed."
do_test failcount "Ensure failcounts are correctly expired"
do_test failcount-block "Ensure failcounts are not expired when on-fail=block is present"
do_test per-op-failcount "Ensure per-operation failcount is handled and not passed to fence agent"
do_test on-fail-ignore "Ensure on-fail=ignore works even beyond migration-threshold"
do_test monitor-onfail-restart "bug-5058 - Monitor failure with on-fail set to restart"
do_test monitor-onfail-stop "bug-5058 - Monitor failure with on-fail set to stop"
do_test bug-5059 "No need to restart p_stateful1:*"
do_test bug-5069-op-enabled "Test on-fail=ignore with failure when monitor is enabled."
do_test bug-5069-op-disabled "Test on-fail=ignore with failure when monitor is disabled."
do_test obsolete-lrm-resource "cl#5115 - Do not use obsolete lrm_resource sections"
do_test expire-non-blocked-failure "Ignore failure-timeout only if the failed operation has on-fail=block"
do_test asymmetrical-order-move "Respect asymmetrical ordering when trying to move resources"
do_test start-then-stop-with-unfence "Avoid graph loop with start-then-stop constraint plus unfencing"
do_test order-expired-failure "Order failcount cleanup after remote fencing"
do_test ignore_stonith_rsc_order1 "cl#5056- Ignore order constraint between stonith and non-stonith rsc."
do_test ignore_stonith_rsc_order2 "cl#5056- Ignore order constraint with group rsc containing mixed stonith and non-stonith."
do_test ignore_stonith_rsc_order3 "cl#5056- Ignore order constraint, stonith clone and mixed group"
do_test ignore_stonith_rsc_order4 "cl#5056- Ignore order constraint, stonith clone and clone with nested mixed group"
do_test honor_stonith_rsc_order1 "cl#5056- Honor order constraint, stonith clone and pure stonith group(single rsc)."
do_test honor_stonith_rsc_order2 "cl#5056- Honor order constraint, stonith clone and pure stonith group(multiple rsc)"
do_test honor_stonith_rsc_order3 "cl#5056- Honor order constraint, stonith clones with nested pure stonith group."
do_test honor_stonith_rsc_order4 "cl#5056- Honor order constraint, between two native stonith rscs."
do_test multiply-active-stonith "Multiply active stonith"
do_test probe-timeout "cl#5099 - Default probe timeout"
do_test concurrent-fencing "Allow performing fencing operations in parallel"
echo ""
do_test systemhealth1 "System Health () #1"
do_test systemhealth2 "System Health () #2"
do_test systemhealth3 "System Health () #3"
do_test systemhealthn1 "System Health (None) #1"
do_test systemhealthn2 "System Health (None) #2"
do_test systemhealthn3 "System Health (None) #3"
do_test systemhealthm1 "System Health (Migrate On Red) #1"
do_test systemhealthm2 "System Health (Migrate On Red) #2"
do_test systemhealthm3 "System Health (Migrate On Red) #3"
do_test systemhealtho1 "System Health (Only Green) #1"
do_test systemhealtho2 "System Health (Only Green) #2"
do_test systemhealtho3 "System Health (Only Green) #3"
do_test systemhealthp1 "System Health (Progressive) #1"
do_test systemhealthp2 "System Health (Progressive) #2"
do_test systemhealthp3 "System Health (Progressive) #3"
echo ""
do_test utilization "Placement Strategy - utilization"
do_test minimal "Placement Strategy - minimal"
do_test balanced "Placement Strategy - balanced"
echo ""
do_test placement-stickiness "Optimized Placement Strategy - stickiness"
do_test placement-priority "Optimized Placement Strategy - priority"
do_test placement-location "Optimized Placement Strategy - location"
do_test placement-capacity "Optimized Placement Strategy - capacity"
echo ""
do_test utilization-order1 "Utilization Order - Simple"
do_test utilization-order2 "Utilization Order - Complex"
do_test utilization-order3 "Utilization Order - Migrate"
do_test utilization-order4 "Utilization Order - Live Migration (bnc#695440)"
do_test utilization-shuffle "Don't displace prmExPostgreSQLDB2 on act2, Start prmExPostgreSQLDB1 on act3"
do_test load-stopped-loop "Avoid transition loop due to load_stopped (cl#5044)"
do_test load-stopped-loop-2 "cl#5235 - Prevent graph loops that can be introduced by load_stopped -> migrate_to ordering"
echo ""
do_test colocated-utilization-primitive-1 "Colocated Utilization - Primitive"
do_test colocated-utilization-primitive-2 "Colocated Utilization - Choose the most capable node"
do_test colocated-utilization-group "Colocated Utilization - Group"
do_test colocated-utilization-clone "Colocated Utilization - Clone"
do_test utilization-check-allowed-nodes "Only check the capacities of the nodes that can run the resource"
echo ""
do_test reprobe-target_rc "Ensure correct target_rc for reprobe of inactive resources"
do_test node-maintenance-1 "cl#5128 - Node maintenance"
do_test node-maintenance-2 "cl#5128 - Node maintenance (coming out of maintenance mode)"
do_test shutdown-maintenance-node "Do not fence a maintenance node if it shuts down cleanly"
do_test rsc-maintenance "Per-resource maintenance"
echo ""
do_test not-installed-agent "The resource agent is missing"
do_test not-installed-tools "Something the resource agent needs is missing"
echo ""
do_test stopped-monitor-00 "Stopped Monitor - initial start"
do_test stopped-monitor-01 "Stopped Monitor - failed started"
do_test stopped-monitor-02 "Stopped Monitor - started multi-up"
do_test stopped-monitor-03 "Stopped Monitor - stop started"
do_test stopped-monitor-04 "Stopped Monitor - failed stop"
do_test stopped-monitor-05 "Stopped Monitor - start unmanaged"
do_test stopped-monitor-06 "Stopped Monitor - unmanaged multi-up"
do_test stopped-monitor-07 "Stopped Monitor - start unmanaged multi-up"
do_test stopped-monitor-08 "Stopped Monitor - migrate"
do_test stopped-monitor-09 "Stopped Monitor - unmanage started"
do_test stopped-monitor-10 "Stopped Monitor - unmanaged started multi-up"
do_test stopped-monitor-11 "Stopped Monitor - stop unmanaged started"
-do_test stopped-monitor-12 "Stopped Monitor - unmanaged started multi-up (targer-role="Stopped")"
+do_test stopped-monitor-12 "Stopped Monitor - unmanaged started multi-up (target-role=Stopped)"
do_test stopped-monitor-20 "Stopped Monitor - initial stop"
do_test stopped-monitor-21 "Stopped Monitor - stopped single-up"
do_test stopped-monitor-22 "Stopped Monitor - stopped multi-up"
do_test stopped-monitor-23 "Stopped Monitor - start stopped"
do_test stopped-monitor-24 "Stopped Monitor - unmanage stopped"
do_test stopped-monitor-25 "Stopped Monitor - unmanaged stopped multi-up"
do_test stopped-monitor-26 "Stopped Monitor - start unmanaged stopped"
-do_test stopped-monitor-27 "Stopped Monitor - unmanaged stopped multi-up (target-role="Started")"
+do_test stopped-monitor-27 "Stopped Monitor - unmanaged stopped multi-up (target-role=Started)"
do_test stopped-monitor-30 "Stopped Monitor - new node started"
do_test stopped-monitor-31 "Stopped Monitor - new node stopped"
echo ""
# This is a combo test to check:
# - probe timeout defaults to the minimum-interval monitor's
# - duplicate recurring operations are ignored
# - if timeout spec is bad, the default timeout is used
# - failure is blocked with on-fail=block even if ISO8601 interval is specified
# - started/stopped role monitors are started/stopped on right nodes
do_test intervals "Recurring monitor interval handling"
echo""
do_test ticket-primitive-1 "Ticket - Primitive (loss-policy=stop, initial)"
do_test ticket-primitive-2 "Ticket - Primitive (loss-policy=stop, granted)"
do_test ticket-primitive-3 "Ticket - Primitive (loss-policy=stop, revoked)"
do_test ticket-primitive-4 "Ticket - Primitive (loss-policy=demote, initial)"
do_test ticket-primitive-5 "Ticket - Primitive (loss-policy=demote, granted)"
do_test ticket-primitive-6 "Ticket - Primitive (loss-policy=demote, revoked)"
do_test ticket-primitive-7 "Ticket - Primitive (loss-policy=fence, initial)"
do_test ticket-primitive-8 "Ticket - Primitive (loss-policy=fence, granted)"
do_test ticket-primitive-9 "Ticket - Primitive (loss-policy=fence, revoked)"
do_test ticket-primitive-10 "Ticket - Primitive (loss-policy=freeze, initial)"
do_test ticket-primitive-11 "Ticket - Primitive (loss-policy=freeze, granted)"
do_test ticket-primitive-12 "Ticket - Primitive (loss-policy=freeze, revoked)"
do_test ticket-primitive-13 "Ticket - Primitive (loss-policy=stop, standby, granted)"
do_test ticket-primitive-14 "Ticket - Primitive (loss-policy=stop, granted, standby)"
do_test ticket-primitive-15 "Ticket - Primitive (loss-policy=stop, standby, revoked)"
do_test ticket-primitive-16 "Ticket - Primitive (loss-policy=demote, standby, granted)"
do_test ticket-primitive-17 "Ticket - Primitive (loss-policy=demote, granted, standby)"
do_test ticket-primitive-18 "Ticket - Primitive (loss-policy=demote, standby, revoked)"
do_test ticket-primitive-19 "Ticket - Primitive (loss-policy=fence, standby, granted)"
do_test ticket-primitive-20 "Ticket - Primitive (loss-policy=fence, granted, standby)"
do_test ticket-primitive-21 "Ticket - Primitive (loss-policy=fence, standby, revoked)"
do_test ticket-primitive-22 "Ticket - Primitive (loss-policy=freeze, standby, granted)"
do_test ticket-primitive-23 "Ticket - Primitive (loss-policy=freeze, granted, standby)"
do_test ticket-primitive-24 "Ticket - Primitive (loss-policy=freeze, standby, revoked)"
echo""
do_test ticket-group-1 "Ticket - Group (loss-policy=stop, initial)"
do_test ticket-group-2 "Ticket - Group (loss-policy=stop, granted)"
do_test ticket-group-3 "Ticket - Group (loss-policy=stop, revoked)"
do_test ticket-group-4 "Ticket - Group (loss-policy=demote, initial)"
do_test ticket-group-5 "Ticket - Group (loss-policy=demote, granted)"
do_test ticket-group-6 "Ticket - Group (loss-policy=demote, revoked)"
do_test ticket-group-7 "Ticket - Group (loss-policy=fence, initial)"
do_test ticket-group-8 "Ticket - Group (loss-policy=fence, granted)"
do_test ticket-group-9 "Ticket - Group (loss-policy=fence, revoked)"
do_test ticket-group-10 "Ticket - Group (loss-policy=freeze, initial)"
do_test ticket-group-11 "Ticket - Group (loss-policy=freeze, granted)"
do_test ticket-group-12 "Ticket - Group (loss-policy=freeze, revoked)"
do_test ticket-group-13 "Ticket - Group (loss-policy=stop, standby, granted)"
do_test ticket-group-14 "Ticket - Group (loss-policy=stop, granted, standby)"
do_test ticket-group-15 "Ticket - Group (loss-policy=stop, standby, revoked)"
do_test ticket-group-16 "Ticket - Group (loss-policy=demote, standby, granted)"
do_test ticket-group-17 "Ticket - Group (loss-policy=demote, granted, standby)"
do_test ticket-group-18 "Ticket - Group (loss-policy=demote, standby, revoked)"
do_test ticket-group-19 "Ticket - Group (loss-policy=fence, standby, granted)"
do_test ticket-group-20 "Ticket - Group (loss-policy=fence, granted, standby)"
do_test ticket-group-21 "Ticket - Group (loss-policy=fence, standby, revoked)"
do_test ticket-group-22 "Ticket - Group (loss-policy=freeze, standby, granted)"
do_test ticket-group-23 "Ticket - Group (loss-policy=freeze, granted, standby)"
do_test ticket-group-24 "Ticket - Group (loss-policy=freeze, standby, revoked)"
echo""
do_test ticket-clone-1 "Ticket - Clone (loss-policy=stop, initial)"
do_test ticket-clone-2 "Ticket - Clone (loss-policy=stop, granted)"
do_test ticket-clone-3 "Ticket - Clone (loss-policy=stop, revoked)"
do_test ticket-clone-4 "Ticket - Clone (loss-policy=demote, initial)"
do_test ticket-clone-5 "Ticket - Clone (loss-policy=demote, granted)"
do_test ticket-clone-6 "Ticket - Clone (loss-policy=demote, revoked)"
do_test ticket-clone-7 "Ticket - Clone (loss-policy=fence, initial)"
do_test ticket-clone-8 "Ticket - Clone (loss-policy=fence, granted)"
do_test ticket-clone-9 "Ticket - Clone (loss-policy=fence, revoked)"
do_test ticket-clone-10 "Ticket - Clone (loss-policy=freeze, initial)"
do_test ticket-clone-11 "Ticket - Clone (loss-policy=freeze, granted)"
do_test ticket-clone-12 "Ticket - Clone (loss-policy=freeze, revoked)"
do_test ticket-clone-13 "Ticket - Clone (loss-policy=stop, standby, granted)"
do_test ticket-clone-14 "Ticket - Clone (loss-policy=stop, granted, standby)"
do_test ticket-clone-15 "Ticket - Clone (loss-policy=stop, standby, revoked)"
do_test ticket-clone-16 "Ticket - Clone (loss-policy=demote, standby, granted)"
do_test ticket-clone-17 "Ticket - Clone (loss-policy=demote, granted, standby)"
do_test ticket-clone-18 "Ticket - Clone (loss-policy=demote, standby, revoked)"
do_test ticket-clone-19 "Ticket - Clone (loss-policy=fence, standby, granted)"
do_test ticket-clone-20 "Ticket - Clone (loss-policy=fence, granted, standby)"
do_test ticket-clone-21 "Ticket - Clone (loss-policy=fence, standby, revoked)"
do_test ticket-clone-22 "Ticket - Clone (loss-policy=freeze, standby, granted)"
do_test ticket-clone-23 "Ticket - Clone (loss-policy=freeze, granted, standby)"
do_test ticket-clone-24 "Ticket - Clone (loss-policy=freeze, standby, revoked)"
echo""
do_test ticket-master-1 "Ticket - Master (loss-policy=stop, initial)"
do_test ticket-master-2 "Ticket - Master (loss-policy=stop, granted)"
do_test ticket-master-3 "Ticket - Master (loss-policy=stop, revoked)"
do_test ticket-master-4 "Ticket - Master (loss-policy=demote, initial)"
do_test ticket-master-5 "Ticket - Master (loss-policy=demote, granted)"
do_test ticket-master-6 "Ticket - Master (loss-policy=demote, revoked)"
do_test ticket-master-7 "Ticket - Master (loss-policy=fence, initial)"
do_test ticket-master-8 "Ticket - Master (loss-policy=fence, granted)"
do_test ticket-master-9 "Ticket - Master (loss-policy=fence, revoked)"
do_test ticket-master-10 "Ticket - Master (loss-policy=freeze, initial)"
do_test ticket-master-11 "Ticket - Master (loss-policy=freeze, granted)"
do_test ticket-master-12 "Ticket - Master (loss-policy=freeze, revoked)"
do_test ticket-master-13 "Ticket - Master (loss-policy=stop, standby, granted)"
do_test ticket-master-14 "Ticket - Master (loss-policy=stop, granted, standby)"
do_test ticket-master-15 "Ticket - Master (loss-policy=stop, standby, revoked)"
do_test ticket-master-16 "Ticket - Master (loss-policy=demote, standby, granted)"
do_test ticket-master-17 "Ticket - Master (loss-policy=demote, granted, standby)"
do_test ticket-master-18 "Ticket - Master (loss-policy=demote, standby, revoked)"
do_test ticket-master-19 "Ticket - Master (loss-policy=fence, standby, granted)"
do_test ticket-master-20 "Ticket - Master (loss-policy=fence, granted, standby)"
do_test ticket-master-21 "Ticket - Master (loss-policy=fence, standby, revoked)"
do_test ticket-master-22 "Ticket - Master (loss-policy=freeze, standby, granted)"
do_test ticket-master-23 "Ticket - Master (loss-policy=freeze, granted, standby)"
do_test ticket-master-24 "Ticket - Master (loss-policy=freeze, standby, revoked)"
echo ""
do_test ticket-rsc-sets-1 "Ticket - Resource sets (1 ticket, initial)"
do_test ticket-rsc-sets-2 "Ticket - Resource sets (1 ticket, granted)"
do_test ticket-rsc-sets-3 "Ticket - Resource sets (1 ticket, revoked)"
do_test ticket-rsc-sets-4 "Ticket - Resource sets (2 tickets, initial)"
do_test ticket-rsc-sets-5 "Ticket - Resource sets (2 tickets, granted)"
do_test ticket-rsc-sets-6 "Ticket - Resource sets (2 tickets, granted)"
do_test ticket-rsc-sets-7 "Ticket - Resource sets (2 tickets, revoked)"
do_test ticket-rsc-sets-8 "Ticket - Resource sets (1 ticket, standby, granted)"
do_test ticket-rsc-sets-9 "Ticket - Resource sets (1 ticket, granted, standby)"
do_test ticket-rsc-sets-10 "Ticket - Resource sets (1 ticket, standby, revoked)"
do_test ticket-rsc-sets-11 "Ticket - Resource sets (2 tickets, standby, granted)"
do_test ticket-rsc-sets-12 "Ticket - Resource sets (2 tickets, standby, granted)"
do_test ticket-rsc-sets-13 "Ticket - Resource sets (2 tickets, granted, standby)"
do_test ticket-rsc-sets-14 "Ticket - Resource sets (2 tickets, standby, revoked)"
do_test cluster-specific-params "Cluster-specific instance attributes based on rules"
do_test site-specific-params "Site-specific instance attributes based on rules"
echo ""
do_test template-1 "Template - 1"
do_test template-2 "Template - 2"
do_test template-3 "Template - 3 (merge operations)"
do_test template-coloc-1 "Template - Colocation 1"
do_test template-coloc-2 "Template - Colocation 2"
do_test template-coloc-3 "Template - Colocation 3"
do_test template-order-1 "Template - Order 1"
do_test template-order-2 "Template - Order 2"
do_test template-order-3 "Template - Order 3"
do_test template-ticket "Template - Ticket"
do_test template-rsc-sets-1 "Template - Resource Sets 1"
do_test template-rsc-sets-2 "Template - Resource Sets 2"
do_test template-rsc-sets-3 "Template - Resource Sets 3"
do_test template-rsc-sets-4 "Template - Resource Sets 4"
do_test template-clone-primitive "Cloned primitive from template"
do_test template-clone-group "Cloned group from template"
do_test location-sets-templates "Resource sets and templates - Location"
do_test tags-coloc-order-1 "Tags - Colocation and Order (Simple)"
do_test tags-coloc-order-2 "Tags - Colocation and Order (Resource Sets with Templates)"
do_test tags-location "Tags - Location"
do_test tags-ticket "Tags - Ticket"
echo ""
do_test container-1 "Container - initial"
do_test container-2 "Container - monitor failed"
do_test container-3 "Container - stop failed"
do_test container-4 "Container - reached migration-threshold"
do_test container-group-1 "Container in group - initial"
do_test container-group-2 "Container in group - monitor failed"
do_test container-group-3 "Container in group - stop failed"
do_test container-group-4 "Container in group - reached migration-threshold"
do_test container-is-remote-node "Place resource within container when container is remote-node"
do_test bug-rh-1097457 "Kill user defined container/contents ordering"
do_test bug-cl-5247 "Graph loop when recovering m/s resource in a container"
do_test bundle-order-startup "Bundle startup ordering"
do_test bundle-order-partial-start "Bundle startup ordering when some dependencies are already running"
do_test bundle-order-partial-start-2 "Bundle startup ordering when some dependencies and the container are already running"
do_test bundle-order-stop "Bundle stop ordering"
do_test bundle-order-partial-stop "Bundle stop ordering when some dependencies are already stopped"
do_test bundle-order-stop-on-remote "Stop nested resource after bringing up the connection"
do_test bundle-order-startup-clone "Prevent startup because bundle isn't promoted"
do_test bundle-order-startup-clone-2 "Bundle startup with clones"
do_test bundle-order-stop-clone "Stop bundle because clone is stopping"
do_test bundle-nested-colocation "Colocation of nested connection resources"
do_test bundle-order-fencing "Order pseudo bundle fencing after parent node fencing if both are happening"
do_test bundle-probe-order-1 "order 1"
do_test bundle-probe-order-2 "order 2"
do_test bundle-probe-order-3 "order 3"
do_test bundle-probe-remotes "Ensure remotes get probed too"
do_test bundle-replicas-change "Change bundle from 1 replica to multiple"
echo ""
do_test whitebox-fail1 "Fail whitebox container rsc."
do_test whitebox-fail2 "Fail cluster connection to guest node"
do_test whitebox-fail3 "Failed containers should not run nested on remote nodes."
do_test whitebox-start "Start whitebox container with resources assigned to it"
do_test whitebox-stop "Stop whitebox container with resources assigned to it"
do_test whitebox-move "Move whitebox container with resources assigned to it"
do_test whitebox-asymmetric "Verify connection rsc opts-in based on container resource"
do_test whitebox-ms-ordering "Verify promote/demote can not occur before connection is established"
do_test whitebox-ms-ordering-move "Stop/Start cycle within a moving container"
do_test whitebox-orphaned "Properly shutdown orphaned whitebox container"
do_test whitebox-orphan-ms "Properly tear down orphan ms resources on remote-nodes"
do_test whitebox-unexpectedly-running "Recover container nodes the cluster did not start."
do_test whitebox-migrate1 "Migrate both container and connection resource"
do_test whitebox-imply-stop-on-fence "Imply stop action on container node rsc when host node is fenced"
do_test whitebox-nested-group "Verify guest remote-node works nested in a group"
do_test guest-node-host-dies "Verify guest node is recovered if host goes away"
echo ""
do_test remote-startup-probes "Baremetal remote-node startup probes"
do_test remote-startup "Start up a newly discovered remote-node with no status."
do_test remote-fence-unclean "Fence unclean baremetal remote-node"
do_test remote-fence-unclean2 "Fence baremetal remote-node after cluster node fails and connection can not be recovered"
do_test remote-fence-unclean-3 "Probe failed remote nodes (triggers fencing)"
do_test remote-move "Move remote-node connection resource"
do_test remote-disable "Disable a baremetal remote-node"
do_test remote-probe-disable "Probe then stop a baremetal remote-node"
do_test remote-orphaned "Properly shutdown orphaned connection resource"
do_test remote-orphaned2 "Verify we can handle orphaned remote connections with active resources on the remote"
do_test remote-recover "Recover connection resource after cluster-node fails."
do_test remote-stale-node-entry "Make sure we properly handle leftover remote-node entries in the node section"
do_test remote-partial-migrate "Make sure partial migrations are handled before ops on the remote node."
do_test remote-partial-migrate2 "Make sure partial migration target is preferred for remote connection."
do_test remote-recover-fail "Make sure start failure causes fencing if rsc are active on remote."
do_test remote-start-fail "Make sure a start failure does not result in fencing if no active resources are on remote."
do_test remote-unclean2 "Make sure monitor failure always results in fencing, even if no rsc are active on remote."
do_test remote-fence-before-reconnect "Fence before clearing recurring monitor failure"
do_test remote-recovery "Recover remote connections before attempting demotion"
do_test remote-recover-connection "Optimistic recovery of only the connection"
do_test remote-recover-all "Fencing when the connection has no home"
do_test remote-recover-no-resources "Fencing when the connection has no home and no active resources"
do_test remote-recover-unknown "Fencing when the connection has no home and the remote has no operation history"
do_test remote-reconnect-delay "Waiting for remote reconnect interval to expire"
do_test remote-connection-unrecoverable "Remote connection host must be fenced, with connection unrecoverable"
echo ""
do_test resource-discovery "Exercises resource-discovery location constraint option."
do_test rsc-discovery-per-node "Disable resource discovery per node"
if [ $DO_VERSIONED_TESTS -eq 1 ]; then
echo ""
do_test versioned-resources "Start resources with #ra-version rules"
do_test restart-versioned "Restart resources on #ra-version change"
do_test reload-versioned "Reload resources on #ra-version change"
echo ""
do_test versioned-operations-1 "Use #ra-version to configure operations of native resources"
do_test versioned-operations-2 "Use #ra-version to configure operations of stonith resources"
do_test versioned-operations-3 "Use #ra-version to configure operations of master/slave resources"
do_test versioned-operations-4 "Use #ra-version to configure operations of groups of the resources"
fi
echo ""
test_results
exit $EXITCODE
diff --git a/cts/cts-support.in b/cts/cts-support.in
index f3ab7924b4..ca87ff7a41 100644
--- a/cts/cts-support.in
+++ b/cts/cts-support.in
@@ -1,128 +1,128 @@
#!/bin/sh
#
# Installer for support files needed by Pacemaker's Cluster Test Suite
#
# Copyright 2018 Red Hat, Inc.
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
USAGE_TEXT="Usage: $0 <install|uninstall|--help>"
HELP_TEXT="$USAGE_TEXT
Commands (must be run as root):
install Install support files needed by Pacemaker CTS
uninstall Remove support files needed by Pacemaker CTS"
# These constants must track crm_exit_t values
CRM_EX_OK=0
CRM_EX_ERROR=1
CRM_EX_USAGE=64
UNIT_DIR="@systemdunitdir@"
LIBEXEC_DIR="@libexecdir@/pacemaker"
INIT_DIR="@INITDIR@"
DATA_DIR="@datadir@/pacemaker/tests/cts"
UPSTART_DIR="/etc/init"
DUMMY_DAEMON="pacemaker-cts-dummyd"
DUMMY_DAEMON_UNIT="pacemaker-cts-dummyd@.service"
LSB_DUMMY="LSBDummy"
UPSTART_DUMMY="pacemaker-cts-dummyd.conf"
# If the install directory doesn't exist, assume we're in a build directory.
if [ ! -d "$DATA_DIR" ]; then
# If readlink supports -e (i.e. GNU), use it.
readlink -e / >/dev/null 2>/dev/null
if [ $? -eq 0 ]; then
- DATA_DIR="$(dirname $(readlink -e $0))"
+ DATA_DIR="$(dirname "$(readlink -e "$0")")"
else
- DATA_DIR="$(dirname $0)"
+ DATA_DIR="$(dirname "$0")"
fi
fi
usage() {
- echo "Error: $@"
+ echo "Error:" "$@"
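# Passing "$@" as separate arguments avoids mixing a string and an array in
# one word, which shellcheck flags as SC2145.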
echo "$USAGE_TEXT"
exit $CRM_EX_USAGE
}
must_be_root() {
if ! [ "$(id -u)" = "0" ]; then
usage "this command must be run as root"
return $CRM_EX_ERROR
fi
return $CRM_EX_OK
}
support_uninstall() {
must_be_root || return $CRM_EX_ERROR
if [ -e "$UNIT_DIR/$DUMMY_DAEMON_UNIT" ]; then
echo "Removing $UNIT_DIR/$DUMMY_DAEMON_UNIT ..."
rm -f "$UNIT_DIR/$DUMMY_DAEMON_UNIT"
systemctl daemon-reload # Ignore failure
fi
for FILE in \
"$LIBEXEC_DIR/$DUMMY_DAEMON" \
"$UPSTART_DIR/$UPSTART_DUMMY" \
"$INIT_DIR/$LSB_DUMMY"
do
if [ -e "$FILE" ]; then
echo "Removing $FILE ..."
rm -f "$FILE"
fi
done
return $CRM_EX_OK
}
support_install() {
support_uninstall || return $CRM_EX_ERROR
- cd "$DATA_DIR"
+ cd "$DATA_DIR" || return $CRM_EX_ERROR
if [ -d "$UNIT_DIR" ]; then
echo "Installing $DUMMY_DAEMON ..."
mkdir -p "$LIBEXEC_DIR"
install -m 0755 "$DUMMY_DAEMON" "$LIBEXEC_DIR" || return $CRM_EX_ERROR
echo "Installing $DUMMY_DAEMON_UNIT ..."
install -m 0644 "$DUMMY_DAEMON_UNIT" "$UNIT_DIR" || return $CRM_EX_ERROR
systemctl daemon-reload # Ignore failure
fi
echo "Installing $LSB_DUMMY to $INIT_DIR ..."
mkdir -p "$INIT_DIR"
install -m 0755 "$LSB_DUMMY" "$INIT_DIR" || return $CRM_EX_ERROR
if [ -d "$UPSTART_DIR" -a -f "$UPSTART_DUMMY" ]; then
echo "Installing $UPSTART_DUMMY to $UPSTART_DIR ..."
install -m 0644 "$UPSTART_DUMMY" "$UPSTART_DIR" || return $CRM_EX_ERROR
fi
return $CRM_EX_OK
}
COMMAND=""
while [ $# -gt 0 ] ; do
case "$1" in
--help)
echo "$HELP_TEXT"
exit $CRM_EX_OK
;;
install|uninstall)
COMMAND="$1"
shift
;;
*)
usage "unknown option '$1'"
;;
esac
done
case "$COMMAND" in
install) support_install ;;
uninstall) support_uninstall ;;
*) usage "must specify command" ;;
esac
diff --git a/cts/cts.in b/cts/cts.in
index 8b5074991f..eb908dc88b 100755
--- a/cts/cts.in
+++ b/cts/cts.in
@@ -1,328 +1,328 @@
#!@BASH_PATH@
#
# Copyright 2012-2018 Andrew Beekhof <andrew@beekhof.net>
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
if [ -e $PWD/cts/CTSlab.py ]; then
cts_root=$PWD/cts
elif [ -e $PWD/CTSlab.py ]; then
cts_root=$PWD
else
cts_root=`dirname $0`
fi
logfile=0
summary=0
verbose=0
watch=0
saved=0
tests=""
install=0
clean=0
build=0
kill=0
run=0
boot=0
setup=0
target=rhel-7
cmd=""
trace=""
custom_log=""
patterns="-e CTS:"
helpmsg=$(cat <<EOF
Usage: %s [options] {[{init|local-init|setup} [TARGET]] | [OTHER-CMDS]}
[--]help, -h show help screen and exit
-x turn on debugging
-a show relevant screen sessions and exit
-c,-g CLUSTER_NAME set the cluster name
-S show summary from the last CTS run
-s show summary for the current log (see -l)
-v increase verbosity
-p (currently unused)
-e PATTERN grep pattern to apply when 'summary' or 'watch' requested
-l print the filename of the log that would be operated on
-w continuous (filtered) monitoring of the log file
-f,-sf FILE show summary for the provided log
-t TEST, [0-9]* add a test to the working set
[--]build [???] request building Pacemaker
[--]kill request termination of cluster software
[--]run request CTS run (passing remaining arguments through)
[--]boot, start request CTS run (with --boot option)
[--]clean request cleaning up after CTS run
[--]install, --inst request installing packages to get ready to run CTS
[--]setup request initialization to get ready to run CTS
trace-ls, tls list traced functions
trace-add, tadd FUNC add a function to the list of traced ones
trace-rm, trm FUNC remove a function from the list of traced ones
trace-set, tset FUNC set function(s) as the only ones to be traced
(f|fedora|r|rhel).* specify target distro
init, local-init [local] initialize CTS environment
--wget [local] download up-to-date CTS helpers
-- delimits tests that follow
EOF
)
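# Illustrative invocations (assuming a dsh group named "mycluster" exists):
#   cts -g mycluster run      # kick off a 500-iteration CTS run on that cluster
#   cts -g mycluster -w       # continuously watch the filtered cluster log
#   cts -S                    # summarize the most recent saved CTS run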
# Modify an uninstalled source checkout to allow running CTS there
local_init() {
local_root="$(dirname "$cts_root")"
if [ ! -r "$cts_root/CTSvars.py.in" ]; then
echo "$local_root does not appear to be a source code directory"
exit 1
elif [ ! -r "$cts_root/CTSvars.py" ]; then
echo "You must run configure and make first"
exit 1
fi
# Update CTS_home and Fencing_Home in CTSvars.py
sed -e "s:@datadir@/@PACKAGE@/tests/cts:$local_root/cts:" \
"$cts_root/CTSvars.py" > "$cts_root/CTSvars.py.$$"
mv -- "$cts_root/CTSvars.py.$$" "$cts_root/CTSvars.py"
files="extra/cluster-init extra/cluster-helper extra/cluster-clean"
for f in $files; do
cp "$local_root/$f" "$cts_root/"
done
# Update report_data in a local copy of crm_report
sed -e "s:@datadir@/@PACKAGE@:$local_root/tools:" \
"$local_root/tools/crm_report" > "$cts_root/crm_report"
chmod +x "$cts_root/crm_report"
# Install the necessary helpers to system locations (must be done as root)
"$cts_root/cts-support" install
echo "Make sure you add $cts_root to your PATH and set a value for \$cluster_name in .bashrc"
exit 0
}
while true; do
case $1 in
-h|--help|help) printf "${helpmsg}\n" "$0"; exit;;
-x) set -x; shift;;
-a)
screen -ls | grep cts
exit 0;;
-c|-g) cluster_name=$2; shift; shift;;
-S) summary=1; saved=1; shift;;
-s) summary=1; shift;;
-v) verbose=`expr $verbose + 1`; shift;;
-p) shift;;
-e) patterns="$patterns -e `echo $2 | sed 's/ /\\\W/g'`"; shift; shift;;
-l) logfile=1; shift;;
-w) watch=1; shift;;
-f|-sf) summary=1; custom_log=$2; shift; shift;;
-t) tests="$tests $2"; shift; shift;;
[0-9]*) tests="$tests $1"; shift;;
--build|build) build=1; shift;;
--kill|kill) kill=1; shift; break;;
--run|run) run=1; shift; break;;
--boot|boot|start) boot=1; clean=1; shift; break;;
--clean|clean) clean=1; shift;;
--inst|--install|install) install=1; clean=1; shift;;
--setup|setup) setup=1; shift;;
trace-ls|tls) cmd=$1; shift;;
trace-add|tadd|trace-rm|trm|trace-set|tset) cmd=$1; trace=$2; shift; shift;;
- f*|fedora*)
+ f*)
target="fedora-`echo $1 | sed -e s/fedora// -e s/-// -e s/f//`"
shift;;
r|rhel) target="rhel-7"; shift;;
- r*|rhel*)
+ r*)
target="rhel-`echo $1 | sed -e s/rhel// -e s/-// -e s/r//`"
shift;;
init|local-init) local_init ;;
--wget)
files="cluster-helper cluster-init cluster-clean"
for f in $files; do
rm -f $cts_root/$f
echo "Downloading helper script $f from GitHub"
wget -O $cts_root/$f https://raw.github.com/ClusterLabs/pacemaker/master/extra/$f
chmod +x $cts_root/$f
done
shift
;;
--) shift; tests="$tests $*"; break;;
"") break;;
*) echo "Unknown argument: $1"; exit 1;;
esac
done
# Add the location of this script
export PATH="$PATH:$cts_root"
which cluster-helper &>/dev/null
if [ $? != 0 ]; then
echo $0 needs the cluster-helper script to be in your path
echo You can obtain it from: https://raw.github.com/ClusterLabs/pacemaker/master/extra/cluster-helper
exit 1
fi
which cluster-clean &>/dev/null
if [ $? != 0 ]; then
echo $0 needs the cluster-clean script to be in your path
echo You can obtain it from: https://raw.github.com/ClusterLabs/pacemaker/master/extra/cluster-clean
exit 1
fi
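# Two separate [ ] commands joined by || replace the obsolescent "-o" test
# operator, whose result POSIX leaves unspecified once a test has >4 arguments.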
-if [ "x$cluster_name" = x -o "x$cluster_name" = xpick ]; then
+if [ "x$cluster_name" = x ] || [ "x$cluster_name" = xpick ]; then
clusters=`ls -1 ~/.dsh/group/[a-z]+[0-9] | sed s/.*group.// | tr '\n' ' ' `
echo "custom) interactively define a cluster"
for i in $clusters; do
echo "$i) `cluster-helper --list short -g $i`"
done
read -p "Choose a cluster [custom]: " cluster_name
echo
fi
if [ -z $cluster_name ]; then
cluster_name=custom
fi
case $cluster_name in
custom)
read -p "Cluster name: " cluster_name
read -p "Cluster hosts: " cluster_hosts
read -p "Cluster log file: " cluster_log
cluster-helper add -g "$cluster_name" -w "$cluster_hosts"
;;
*)
cluster_hosts=`cluster-helper --list short -g $cluster_name`
cluster_log=~/cluster-$cluster_name.log;
;;
esac
if [ x$cmd != x ]; then
config=/etc/sysconfig/pacemaker
case $cmd in
trace-ls|tls)
cluster-helper -g $cluster_name -- grep PCMK_trace_functions $config
;;
trace-add|tadd)
echo "Adding $trace to PCMK_trace_functions"
cluster-helper -g $cluster_name -- sed -i "s/.*PCMK_trace_functions=/PCMK_trace_functions=$trace,/" $config
;;
trace-rm|trm)
echo "Removing $trace from PCMK_trace_functions"
cluster-helper -g $cluster_name -- sed -i "s/.*PCMK_trace_functions=\\\\\\(.*\\\\\\)$trace,\\\\\\(.*\\\\\\)/PCMK_trace_functions=\\\\\\1\\\\\\2/" $config
;;
trace-set|tset)
echo "Setting PCMK_trace_functions to '$trace'"
cluster-helper -g $cluster_name -- sed -i "s/.*PCMK_trace_functions.*/PCMK_trace_functions=$trace/" $config
;;
esac
exit 0
fi
-if [ $build = 1 -a $run = 1 ]; then
+if [ $build = 1 ] && [ $run = 1 ]; then
install=1
clean=1
fi
if [ $build = 1 ]; then
which build-pcmk
if [ $? != 0 ]; then
echo "You'll need to write/obtain build-pcmk in order to build pacemaker from here. Skipping"
else
build-pcmk r7
rc=$?
if [ $rc != 0 ]; then
echo "Build failed: $rc"
exit $rc
fi
fi
fi
if [ $clean = 1 ]; then
rm -f $cluster_log; cluster-clean -g $cluster_name --kill
elif [ $kill = 1 ]; then
cluster-clean -g $cluster_name --kill-only
exit 0
fi
if [ $install = 1 ]; then
cluster-helper -g $cluster_name -- yum install -y pacemaker pacemaker-debuginfo pacemaker-cts libqb libqb-debuginfo
fi
if [ $setup = 1 ]; then
cluster-init -g $cluster_name $target -u --test
exit 0
elif [ $boot = 1 ]; then
$cts_root/CTSlab.py -r -c -g $cluster_name --boot
rc=$?
if [ $rc = 0 ]; then
echo "The cluster is ready..."
fi
exit $rc
elif [ $run = 1 ]; then
$cts_root/CTSlab.py -r -c -g $cluster_name 500 "$@"
exit $?
elif [ $clean = 1 ]; then
exit 0
fi
screen -ls | grep cts-$cluster_name &>/dev/null
active=$?
if [ ! -z $custom_log ]; then
cluster_log=$custom_log
fi
-if [ "x$tests" != x -a "x$tests" != "x " ]; then
+if [ "x$tests" != x ] && [ "x$tests" != "x " ]; then
for t in $tests; do
echo "crm_report --cts-log $cluster_log -d -T $t"
crm_report --cts-log $cluster_log -d -T $t
done
elif [ $logfile = 1 ]; then
echo $cluster_log
elif [ $summary = 1 ]; then
files=$cluster_log
if [ $saved = 1 ]; then
files=`ls -1tr ~/CTS-*/cluster-log.txt`
fi
for f in $files; do
echo $f
case $verbose in
0) cat -n $f | grep $patterns | grep -v "CTS: debug:"
;;
1) cat -n $f | grep $patterns | grep -v "CTS:.* cmd:"
;;
*) cat -n $f | grep $patterns
;;
esac
echo ""
done
elif [ $watch = 1 ]; then
case $verbose in
0) tail -F $cluster_log | grep $patterns | grep -v "CTS: debug:"
;;
1) tail -F $cluster_log | grep $patterns | grep -v "CTS:.* cmd:"
;;
*) tail -F $cluster_log | grep $patterns
;;
esac
elif [ $active = 0 ]; then
screen -x cts-$cluster_name
else
touch $cluster_log
# . ~/.bashrc
export cluster_name cluster_hosts cluster_log
screen -S cts-$cluster_name bash
fi
diff --git a/cts/lxc_autogen.sh.in b/cts/lxc_autogen.sh.in
index 1479296af8..62c62d960e 100644
--- a/cts/lxc_autogen.sh.in
+++ b/cts/lxc_autogen.sh.in
@@ -1,532 +1,532 @@
#!@BASH_PATH@
#
# Copyright 2013-2018 David Vossel <davidvossel@gmail.com>
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
containers="2"
download=0
share_configs=0
# different from the default libvirt network, in case this is run nested in a KVM instance
addr="192.168.123.1"
restore=0
restore_pcmk=0
restore_all=0
generate=0
key_gen=0
cib=0
anywhere=0
add_master=0
verify=0
working_dir="@CRM_CONFIG_CTS@/lxc"
run_dirs="/run /var/run /usr/var/run"
SSH_CMD_OPTS="
-o StrictHostKeyChecking=no
-o ConnectTimeout=30
-o BatchMode=yes
-l root
-T
"
# must be on one line b/c used inside quotes
SSH_RSYNC_OPTS="-o UserKnownHostsFile=/dev/null -o BatchMode=yes -o StrictHostKeyChecking=no"
function helptext() {
echo "lxc_autogen.sh - A tool for generating libvirt lxc containers for testing purposes."
echo ""
echo "Usage: lxc-autogen [options]"
echo ""
echo "Options:"
echo "-g, --generate Generate libvirt lxc environment in the directory this script is run from."
echo "-k, --key-gen Generate pacemaker remote key only."
echo "-r, --restore-libvirt Restore the default network, and libvirt config to before this script ran."
echo "-p, --restore-cib Remove cib entries this script generated."
echo "-R, --restore-all Restore both libvirt and cib plus clean working directory. This will leave libvirt xml files though so rsc can be stopped properly."
echo ""
echo "-A, --allow-anywhere Allow the containers to live anywhere in the cluster"
echo "-a, --add-cib Add remote-node entries for each lxc instance into the cib"
echo "-m, --add-master Add master resource shared between remote-nodes"
echo "-d, --download-agent Download and install the latest VirtualDomain agent."
echo "-s, --share-configs Synchronize on all known cluster nodes"
echo "-c, --containers Specify the number of containers to generate, defaults to $containers. Used with -g"
echo "-n, --network What network to override default libvirt network to. Example: -n 192.168.123.1. Used with -g"
echo "-v, --verify Verify environment is capable of running lxc"
echo ""
exit $1
}
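# Typical lifecycle (illustrative, using the options documented above):
#   lxc_autogen.sh -g -c 2    # generate two containers in the working directory
#   lxc_autogen.sh -a -m      # add remote-node entries plus the shared master to the cib
#   lxc_autogen.sh -R         # remove the cib entries and restore libvirt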
while true ; do
case "$1" in
--help|-h|-\?) helptext 0;;
-c|--containers) containers="$2"; shift; shift;;
-d|--download-agent) download=1; shift;;
-s|--share-configs) share_configs=1; shift;;
-n|--network) addr="$2"; shift; shift;;
-r|--restore-libvirt) restore=1; shift;;
-p|--restore-cib) restore_pcmk=1; shift;;
-R|--restore-all)
restore_all=1
restore=1
restore_pcmk=1
shift;;
-g|--generate) generate=1; key_gen=1; shift;;
-k|--key-gen) key_gen=1; shift;;
-a|--add-cib) cib=1; shift;;
-A|--allow-anywhere) anywhere=1; shift;;
-m|--add-master) add_master=1; shift;;
-v|--verify) verify=1; shift;;
"") break;;
*) helptext 1;;
esac
done
if [ $verify -eq 1 ]; then
# verify virsh tool is available and that
# we can connect to lxc driver.
virsh -c lxc:/// list --all > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "Could not connect 'virsh -c lxc:///' check that libvirt lxc driver is installed"
# yum install -y libvirt-daemon-driver-lxc libvirt-daemon-lxc libvirt-login-shell
exit 1
fi
cat /etc/selinux/config | grep -e "SELINUX.*=.*permissive" -e "SELINUX.*=.*enforcing" > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "/etc/selinux/config must have SELINUX set to permissive or enforcing mode."
exit 1
fi
ps x > /tmp/lxc-autogen-libvirt-test.txt
grep "libvirtd" /tmp/lxc-autogen-libvirt-test.txt
if [ $? -ne 0 ]; then
rm -f /tmp/lxc-autogen-libvirt-test.txt
echo "libvirtd isn't up."
exit 1
fi
rm -f /tmp/lxc-autogen-libvirt-test.txt
which rsync > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "rsync is required"
fi
which pacemaker-remoted > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "pacemaker-remoted is required"
fi
fi
# Strip the last octet off addr (e.g. 192.168.123.1 -> 192.168.123)
addr=$(echo $addr | awk -F. '{print $1"."$2"."$3}')
this_node()
{
crm_node -n
}
other_nodes()
{
crm_node -l | awk "\$2 != \"$(this_node)\" {print \$2}"
}
make_directory()
{
# argument must be full path
DIR="$1"
mkdir -p "$DIR"
if [ $share_configs -eq 1 ]; then
for node in $(other_nodes); do
ssh $SSH_CMD_OPTS $node mkdir -p "$DIR"
done
fi
}
sync_file()
{
TARGET="$1"
if [ $share_configs -eq 1 ]; then
for node in $(other_nodes); do
rsync -ave "ssh $SSH_RSYNC_OPTS" "$TARGET" "${node}:${TARGET}"
done
fi
}
download_agent()
{
wget https://raw.github.com/ClusterLabs/resource-agents/master/heartbeat/VirtualDomain
chmod 755 VirtualDomain
mv -f VirtualDomain /usr/lib/ocf/resource.d/heartbeat/VirtualDomain
sync_file /usr/lib/ocf/resource.d/heartbeat/VirtualDomain
}
set_network()
{
rm -f cur_network.xml
cat << END >> cur_network.xml
<network>
<name>default</name>
<uuid>41ebdb84-7134-1111-a136-91f0f1119225</uuid>
<forward mode='nat'/>
<bridge name='virbr0' stp='on' delay='0' />
<mac address='52:54:00:A8:12:35'/>
<ip address='$addr.1' netmask='255.255.255.0'>
<dhcp>
<range start='$addr.2' end='$addr.254' />
</dhcp>
</ip>
</network>
END
sync_file ${working_dir}/cur_network.xml
}
distribute_configs()
{
for node in $(other_nodes); do
rsync -ave "ssh $SSH_RSYNC_OPTS" ${working_dir}/lxc*.xml ${node}:${working_dir}
rsync -ave "ssh $SSH_RSYNC_OPTS" ${working_dir}/lxc*-filesystem ${node}:${working_dir}
done
}
start_network()
{
NODE="$1"
ssh $SSH_CMD_OPTS $NODE <<-EOF
cd $working_dir
virsh net-info default >/dev/null 2>&1
if [ \$? -eq 0 ]; then
if [ ! -f restore_default.xml ]; then
virsh net-dumpxml default > restore_default.xml
fi
virsh net-destroy default
virsh net-undefine default
fi
virsh net-define cur_network.xml
virsh net-start default
virsh net-autostart default
EOF
}
start_network_all()
{
- start_network $(this_node)
+ start_network "$(this_node)"
if [ $share_configs -eq 1 ]; then
for node in $(other_nodes); do
- start_network $node
+ start_network "$node"
done
fi
}
add_hosts_entry()
{
IP="$1"
HNAME="$2"
echo $IP $HNAME >>/etc/hosts
if [ $share_configs -eq 1 ]; then
for node in $(other_nodes); do
ssh $SSH_CMD_OPTS $node "echo $IP $HNAME >>/etc/hosts"
done
fi
}
generate_key()
{
if [ ! -e /etc/pacemaker/authkey ]; then
make_directory /etc/pacemaker
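# 4 KiB of random data; the cluster nodes and pacemaker-remoted inside each
# container must share this key for remote connections to authenticate.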
dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1
sync_file /etc/pacemaker/authkey
fi
}
generate()
{
set_network
# Generate libvirt domains in xml
for (( c=1; c <= $containers; c++ ))
do
# Clean any previous definition
rm -rf lxc$c.xml lxc$c-filesystem
# Create a basic filesystem with run directories
for dir in $run_dirs; do
mkdir -p lxc$c-filesystem/$dir
done
# Create libvirt definition
suffix=$((10 + $c))
prefix=$(echo $addr | awk -F. '{print $1"."$2}')
subnet=$(echo $addr | awk -F. '{print $3}')
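# The host part starts at .11 (10 + container index); if it runs past 255,
# carry into the next /24 subnet so each container still gets a unique address.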
while [ $suffix -gt 255 ]; do
subnet=$(($subnet + 1))
suffix=$(($suffix - 255))
done
cip=$prefix.$subnet.$suffix
cat << END >> lxc$c.xml
<domain type='lxc'>
<name>lxc$c</name>
<memory unit='KiB'>200704</memory>
<os>
<type>exe</type>
<init>$working_dir/lxc$c-filesystem/launch-helper</init>
</os>
<devices>
<console type='pty'/>
<filesystem type='ram'>
<source usage='150528'/>
<target dir='/dev/shm'/>
</filesystem>
END
for dir in $run_dirs; do
cat << END >> lxc$c.xml
<filesystem type='mount'>
<source dir='$working_dir/lxc$c-filesystem${dir}'/>
<target dir='$dir'/>
</filesystem>
END
done
cat << END >> lxc$c.xml
<interface type='network'>
<mac address='52:54:$(($RANDOM % 9))$(($RANDOM % 9)):$(($RANDOM % 9))$(($RANDOM % 9)):$(($RANDOM % 9))$(($RANDOM % 9)):$(($RANDOM % 9))$(($RANDOM % 9))'/>
<source network='default'/>
</interface>
</devices>
</domain>
END
# Create CIB definition
rm -f container$c.cib
cat << END >> container$c.cib
<primitive class="ocf" id="container$c" provider="heartbeat" type="VirtualDomain">
<instance_attributes id="container$c-instance_attributes">
<nvpair id="container$c-instance_attributes-force_stop" name="force_stop" value="true"/>
<nvpair id="container$c-instance_attributes-hypervisor" name="hypervisor" value="lxc:///"/>
<nvpair id="container$c-instance_attributes-config" name="config" value="$working_dir/lxc$c.xml"/>
</instance_attributes>
<utilization id="container$c-utilization">
<nvpair id="container$c-utilization-cpu" name="cpu" value="1"/>
<nvpair id="container$c-utilization-hv_memory" name="hv_memory" value="100"/>
</utilization>
<meta_attributes id="container$c-meta_attributes">
<nvpair id="container$c-meta_attributes-remote-node" name="remote-node" value="lxc$c"/>
</meta_attributes>
</primitive>
END
# Create container init
rm -f lxc$c-filesystem/launch-helper
cat << END >> lxc$c-filesystem/launch-helper
#!@BASH_PATH@
ip -f inet addr add $cip/24 dev eth0
ip link set eth0 up
ip route add default via $addr.1
hostname lxc$c
df > $working_dir/lxc$c-filesystem/disk_usage.txt
export PCMK_debugfile=@CRM_LOG_DIR@/pacemaker_remote_lxc$c.log
/usr/sbin/pacemaker-remoted
END
chmod 711 lxc$c-filesystem/launch-helper
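# launch-helper is the container's init process: it configures eth0 with the
# static address computed above, routes via the host bridge ($addr.1), sets
# the hostname, then starts pacemaker-remoted so the container can join the
# cluster as a remote node.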
add_hosts_entry $cip lxc$c
done
# Create CIB fragment for a master-slave resource
rm -f lxc-ms.cib
cat << END >> lxc-ms.cib
<master id="lxc-ms-master">
<primitive class="ocf" id="lxc-ms" provider="pacemaker" type="Stateful">
<instance_attributes id="lxc-ms-instance_attributes"/>
<operations>
<op id="lxc-ms-monitor-interval-10s" interval="10s" name="monitor"/>
</operations>
</primitive>
<meta_attributes id="lxc-ms-meta_attributes">
<nvpair id="lxc-ms-meta_attributes-master-max" name="master-max" value="1"/>
<nvpair id="lxc-ms-meta_attributes-clone-max" name="clone-max" value="$containers"/>
</meta_attributes>
</master>
END
}
apply_cib_master()
{
cibadmin -Q > cur.cib
export CIB_file=cur.cib
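# While CIB_file is set, cibadmin operates on this local copy rather than
# the live cluster; all changes are then pushed back at once with the
# --replace call below.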
cibadmin -o resources -Mc -x lxc-ms.cib
for tmp in $(ls lxc*.xml | sed -e 's/\.xml//g'); do
echo "<rsc_location id=\"lxc-ms-location-${tmp}\" node=\"${tmp}\" rsc=\"lxc-ms-master\" score=\"INFINITY\"/>" > tmp_constraint
cibadmin -o constraints -Mc -x tmp_constraint
done
# Make sure the version changes even if the content doesn't
cibadmin -B
unset CIB_file
cibadmin --replace -o configuration --xml-file cur.cib
rm -f cur.cib
}
apply_cib_entries()
{
cibadmin -Q > cur.cib
export CIB_file=cur.cib
- for tmp in $(ls container*.cib); do
+ for tmp in container*.cib; do
cibadmin -o resources -Mc -x $tmp
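# Extract the remote-node name (e.g. lxc1) from the nvpair so the location
# constraints below can reference the container's node entry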
remote_node=$(cat ${tmp} | grep remote-node | sed -n -e 's/^.*value=\"\(.*\)\".*/\1/p')
if [ $anywhere -eq 0 ]; then
tmp=$(echo $tmp | sed -e 's/\.cib//g')
- crm_resource -M -r $tmp -H $(this_node)
+ crm_resource -M -r "$tmp" -H "$(this_node)"
fi
echo "<rsc_location id=\"lxc-ping-location-${remote_node}\" node=\"${remote_node}\" rsc=\"Connectivity\" score=\"-INFINITY\"/>" > tmp_constraint
# It's fine if applying this constraint fails; it only helps CTS when the
# connectivity resources are in use, since those resources fail the remote nodes.
cibadmin -o constraints -Mc -x tmp_constraint > /dev/null 2>&1
for rsc in $(crm_resource -l | grep rsc_ ); do
echo "<rsc_location id=\"lxc-${rsc}-location-${remote_node}\" node=\"${remote_node}\" rsc=\"${rsc}\" score=\"-INFINITY\"/>" > tmp_constraint
cibadmin -o constraints -Mc -x tmp_constraint > /dev/null 2>&1
done
rm -f tmp_constraint
done
# Make sure the version changes even if the content doesn't
cibadmin -B
unset CIB_file
cibadmin --replace -o configuration --xml-file cur.cib
rm -f cur.cib
}
restore_cib()
{
cibadmin -Q > cur.cib
export CIB_file=cur.cib
for tmp in $(ls lxc*.xml | sed -e 's/\.xml//g'); do
echo "<rsc_location id=\"lxc-ms-location-${tmp}\" node=\"${tmp}\" rsc=\"lxc-ms-master\" score=\"INFINITY\"/>" > tmp_constraint
cibadmin -o constraints -D -x tmp_constraint
echo "<rsc_location id=\"lxc-ping-location-${tmp}\" node=\"${tmp}\" rsc=\"Connectivity\" score=\"-INFINITY\"/>" > tmp_constraint
cibadmin -o constraints -D -x tmp_constraint
for rsc in $(crm_resource -l | grep rsc_ ); do
echo "<rsc_location id=\"lxc-${rsc}-location-${tmp}\" node=\"${tmp}\" rsc=\"${rsc}\" score=\"-INFINITY\"/>" > tmp_constraint
cibadmin -o constraints -D -x tmp_constraint
done
rm -f tmp_constraint
done
cibadmin -o resources -D -x lxc-ms.cib
- for tmp in $(ls container*.cib); do
+ for tmp in container*.cib; do
tmp=$(echo $tmp | sed -e 's/\.cib//g')
- crm_resource -U -r $tmp -H $(this_node)
- crm_resource -D -r $tmp -t primitive
+ crm_resource -U -r "$tmp" -H "$(this_node)"
+ crm_resource -D -r "$tmp" -t primitive
done
# Make sure the version changes even if the content doesn't
cibadmin -B
unset CIB_file
cibadmin --replace -o configuration --xml-file cur.cib
rm -f cur.cib
# Allow the cluster to stabilize before continuing
crm_resource --wait
# Purge nodes from caches and CIB status section
for tmp in $(ls lxc*.xml | sed -e 's/\.xml//g'); do
crm_node --force --remove $tmp
done
}
restore_network()
{
NODE="$1"
ssh $SSH_CMD_OPTS $NODE <<-EOF
cd $working_dir
for tmp in \$(ls lxc*.xml | sed -e 's/\.xml//g'); do
virsh -c lxc:/// destroy \$tmp >/dev/null 2>&1
virsh -c lxc:/// undefine \$tmp >/dev/null 2>&1
sed -i.bak "/...\....\....\..* \${tmp}/d" /etc/hosts
done
virsh net-destroy default >/dev/null 2>&1
virsh net-undefine default >/dev/null 2>&1
if [ -f restore_default.xml ]; then
virsh net-define restore_default.xml
virsh net-start default
rm restore_default.xml
fi
EOF
echo "Containers destroyed and default network restored on $NODE"
}
restore_libvirt()
{
- restore_network $(this_node)
+ restore_network "$(this_node)"
if [ $share_configs -eq 1 ]; then
for node in $(other_nodes); do
restore_network $node
done
fi
}
restore_files()
{
- ls | grep -v "lxc.\.xml" | xargs rm -rf
+ find . -maxdepth 1 -not -name "lxc*.xml" -a -not -name . -exec rm -rf "{}" ";"
if [ $share_configs -eq 1 ]; then
for node in $(other_nodes); do
ssh $SSH_CMD_OPTS $node rm -rf \
$working_dir/lxc*-filesystem \
$working_dir/cur_network.xml
done
fi
}
make_directory $working_dir
-cd $working_dir
+cd $working_dir || exit 1
# Generate files as requested
if [ $download -eq 1 ]; then
download_agent
fi
if [ $key_gen -eq 1 ]; then
generate_key
fi
if [ $generate -eq 1 ]; then
generate
fi
if [ $share_configs -eq 1 ]; then
distribute_configs
fi
if [ $generate -eq 1 ]; then
start_network_all
fi
# Update cluster as requested
if [ $cib -eq 1 ]; then
apply_cib_entries
fi
if [ $add_master -eq 1 ]; then
apply_cib_master
fi
# Restore original state as requested
if [ $restore_pcmk -eq 1 ]; then
restore_cib
fi
if [ $restore -eq 1 ]; then
restore_libvirt
fi
if [ $restore_all -eq 1 ]; then
restore_files
fi
diff --git a/cts/scheduler/origin.exp b/cts/scheduler/origin.exp
index 780be47616..cda2b5808d 100644
--- a/cts/scheduler/origin.exp
+++ b/cts/scheduler/origin.exp
@@ -1,11 +1,11 @@
<transition_graph cluster-delay="60s" stonith-timeout="60s" failed-stop-offset="INFINITY" failed-start-offset="INFINITY" transition_id="0">
<synapse id="0">
<action_set>
<rsc_op id="4" operation="monitor" operation_key="resD_monitor_3600000" on_node="node1" on_node_uuid="node1">
<primitive id="resD" class="ocf" provider="heartbeat" type="Dummy"/>
- <attributes CRM_meta_interval="3600000" CRM_meta_interval_origin="2014-06-01 00:35:00" CRM_meta_name="monitor" CRM_meta_on_node="node1" CRM_meta_on_node_uuid="node1" CRM_meta_start_delay="2100000" CRM_meta_timeout="60000" />
+ <attributes CRM_meta_interval="3600000" CRM_meta_interval_origin="2014-06-01 00:35:00" CRM_meta_name="monitor" CRM_meta_on_node="node1" CRM_meta_on_node_uuid="node1" CRM_meta_start_delay="420000" CRM_meta_timeout="60000" />
</rsc_op>
</action_set>
<inputs/>
</synapse>
</transition_graph>
diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 167c66e131..9748a3a6ec 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -1,2675 +1,2675 @@
/*
* Copyright 2004-2018 Andrew Beekhof <andrew@beekhof.net>
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <crm/crm.h>
#include <crm/services.h>
#include <crm/msg_xml.h>
#include <crm/common/xml.h>
#include <pacemaker-controld.h>
#include <controld_fsa.h>
#include <controld_messages.h>
#include <controld_callbacks.h>
#include <controld_lrm.h>
#include <regex.h>
#include <crm/pengine/rules.h>
#define START_DELAY_THRESHOLD 5 * 60 * 1000
#define MAX_LRM_REG_FAILS 30
#define s_if_plural(i) (((i) == 1)? "" : "s")
struct delete_event_s {
int rc;
const char *rsc;
lrm_state_t *lrm_state;
};
static gboolean is_rsc_active(lrm_state_t * lrm_state, const char *rsc_id);
static gboolean build_active_RAs(lrm_state_t * lrm_state, xmlNode * rsc_list);
static gboolean stop_recurring_actions(gpointer key, gpointer value, gpointer user_data);
static int delete_rsc_status(lrm_state_t * lrm_state, const char *rsc_id, int call_options,
const char *user_name);
static lrmd_event_data_t *construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op,
const char *rsc_id, const char *operation);
static void do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operation,
xmlNode * msg, xmlNode * request);
void send_direct_ack(const char *to_host, const char *to_sys,
lrmd_rsc_info_t * rsc, lrmd_event_data_t * op, const char *rsc_id);
static gboolean lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state,
int log_level);
static int do_update_resource(const char *node_name, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op);
static void
lrm_connection_destroy(void)
{
if (is_set(fsa_input_register, R_LRM_CONNECTED)) {
crm_crit("Connection to executor failed");
register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL);
clear_bit(fsa_input_register, R_LRM_CONNECTED);
} else {
crm_info("Disconnected from executor");
}
}
static char *
make_stop_id(const char *rsc, int call_id)
{
return crm_strdup_printf("%s:%d", rsc, call_id);
}
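/* Resource parameters arrive in one flat hash table; meta-attribute keys
 * carry the CRM_META ("CRM_meta") prefix, so these two helpers split the
 * table into instance parameters and meta-attributes respectively.
 */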
static void
copy_instance_keys(gpointer key, gpointer value, gpointer user_data)
{
if (strstr(key, CRM_META "_") == NULL) {
g_hash_table_replace(user_data, strdup((const char *)key), strdup((const char *)value));
}
}
static void
copy_meta_keys(gpointer key, gpointer value, gpointer user_data)
{
if (strstr(key, CRM_META "_") != NULL) {
g_hash_table_replace(user_data, strdup((const char *)key), strdup((const char *)value));
}
}
/*!
* \internal
* \brief Remove a recurring operation from a resource's history
*
* \param[in,out] history Resource history to modify
* \param[in] op Operation to remove
*
* \return TRUE if the operation was found and removed, FALSE otherwise
*/
static gboolean
history_remove_recurring_op(rsc_history_t *history, const lrmd_event_data_t *op)
{
GList *iter;
for (iter = history->recurring_op_list; iter != NULL; iter = iter->next) {
lrmd_event_data_t *existing = iter->data;
if ((op->interval_ms == existing->interval_ms)
&& crm_str_eq(op->rsc_id, existing->rsc_id, TRUE)
&& safe_str_eq(op->op_type, existing->op_type)) {
history->recurring_op_list = g_list_delete_link(history->recurring_op_list, iter);
lrmd_free_event(existing);
return TRUE;
}
}
return FALSE;
}
/*!
* \internal
* \brief Free all recurring operations in resource history
*
* \param[in,out] history Resource history to modify
*/
static void
history_free_recurring_ops(rsc_history_t *history)
{
GList *iter;
for (iter = history->recurring_op_list; iter != NULL; iter = iter->next) {
lrmd_free_event(iter->data);
}
g_list_free(history->recurring_op_list);
history->recurring_op_list = NULL;
}
/*!
* \internal
* \brief Free resource history
*
* \param[in,out] history Resource history to free
*/
void
history_free(gpointer data)
{
rsc_history_t *history = (rsc_history_t*)data;
if (history->stop_params) {
g_hash_table_destroy(history->stop_params);
}
/* Don't need to free history->rsc.id because it's set to history->id */
free(history->rsc.type);
free(history->rsc.standard);
free(history->rsc.provider);
lrmd_free_event(history->failed);
lrmd_free_event(history->last);
free(history->id);
history_free_recurring_ops(history);
free(history);
}
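/* Maintain the in-memory history for one resource: the last non-recurring
 * result, the last failure, the parameters a subsequent stop would need,
 * and the list of active recurring operations.
 */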
static void
update_history_cache(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op)
{
int target_rc = 0;
rsc_history_t *entry = NULL;
if (op->rsc_deleted) {
crm_debug("Purged history for '%s' after %s", op->rsc_id, op->op_type);
delete_rsc_status(lrm_state, op->rsc_id, cib_quorum_override, NULL);
return;
}
if (safe_str_eq(op->op_type, RSC_NOTIFY)) {
return;
}
crm_debug("Updating history for '%s' with %s op", op->rsc_id, op->op_type);
entry = g_hash_table_lookup(lrm_state->resource_history, op->rsc_id);
if (entry == NULL && rsc) {
entry = calloc(1, sizeof(rsc_history_t));
entry->id = strdup(op->rsc_id);
g_hash_table_insert(lrm_state->resource_history, entry->id, entry);
entry->rsc.id = entry->id;
entry->rsc.type = strdup(rsc->type);
entry->rsc.standard = strdup(rsc->standard);
if (rsc->provider) {
entry->rsc.provider = strdup(rsc->provider);
} else {
entry->rsc.provider = NULL;
}
} else if (entry == NULL) {
crm_info("Resource %s no longer exists, not updating cache", op->rsc_id);
return;
}
entry->last_callid = op->call_id;
target_rc = rsc_op_expected_rc(op);
if (op->op_status == PCMK_LRM_OP_CANCELLED) {
if (op->interval_ms > 0) {
crm_trace("Removing cancelled recurring op: " CRM_OP_FMT,
op->rsc_id, op->op_type, op->interval_ms);
history_remove_recurring_op(entry, op);
return;
} else {
crm_trace("Skipping " CRM_OP_FMT " rc=%d, status=%d",
op->rsc_id, op->op_type, op->interval_ms, op->rc,
op->op_status);
}
} else if (did_rsc_op_fail(op, target_rc)) {
/* Store failed monitors here, otherwise the block below will cause them
* to be forgotten when a stop happens.
*/
if (entry->failed) {
lrmd_free_event(entry->failed);
}
entry->failed = lrmd_copy_event(op);
} else if (op->interval_ms == 0) {
if (entry->last) {
lrmd_free_event(entry->last);
}
entry->last = lrmd_copy_event(op);
if (op->params &&
(safe_str_eq(CRMD_ACTION_START, op->op_type) ||
safe_str_eq("reload", op->op_type) ||
safe_str_eq(CRMD_ACTION_STATUS, op->op_type))) {
if (entry->stop_params) {
g_hash_table_destroy(entry->stop_params);
}
entry->stop_params = crm_str_table_new();
g_hash_table_foreach(op->params, copy_instance_keys, entry->stop_params);
}
}
if (op->interval_ms > 0) {
/* Ensure there are no duplicates */
history_remove_recurring_op(entry, op);
crm_trace("Adding recurring op: " CRM_OP_FMT,
op->rsc_id, op->op_type, op->interval_ms);
entry->recurring_op_list = g_list_prepend(entry->recurring_op_list, lrmd_copy_event(op));
} else if (entry->recurring_op_list && safe_str_eq(op->op_type, RSC_STATUS) == FALSE) {
crm_trace("Dropping %d recurring ops because of: " CRM_OP_FMT,
g_list_length(entry->recurring_op_list), op->rsc_id,
op->op_type, op->interval_ms);
history_free_recurring_ops(entry);
}
}
/*!
* \internal
* \brief Send a direct OK ack for a resource task
*
* \param[in] lrm_state LRM connection
* \param[in] input Input message being ack'ed
* \param[in] rsc_id ID of affected resource
* \param[in] rsc Affected resource (if available)
* \param[in] task Operation task being ack'ed
* \param[in] ack_host Name of host to send ack to
* \param[in] ack_sys IPC system name to ack
*/
static void
send_task_ok_ack(lrm_state_t *lrm_state, ha_msg_input_t *input,
const char *rsc_id, lrmd_rsc_info_t *rsc, const char *task,
const char *ack_host, const char *ack_sys)
{
lrmd_event_data_t *op = construct_op(lrm_state, input->xml, rsc_id, task);
op->rc = PCMK_OCF_OK;
op->op_status = PCMK_LRM_OP_DONE;
send_direct_ack(ack_host, ack_sys, rsc, op, rsc_id);
lrmd_free_event(op);
}
void
lrm_op_callback(lrmd_event_data_t * op)
{
const char *nodename = NULL;
lrm_state_t *lrm_state = NULL;
CRM_CHECK(op != NULL, return);
/* determine the node name for this connection. */
nodename = op->remote_nodename ? op->remote_nodename : fsa_our_uname;
if (op->type == lrmd_event_disconnect && (safe_str_eq(nodename, fsa_our_uname))) {
/* If this is the local executor IPC connection, set the right bits in the
* controller when the connection goes down.
*/
lrm_connection_destroy();
return;
} else if (op->type != lrmd_event_exec_complete) {
/* we only need to process execution results */
return;
}
lrm_state = lrm_state_find(nodename);
CRM_ASSERT(lrm_state != NULL);
process_lrm_event(lrm_state, op, NULL);
}
/* A_LRM_CONNECT */
void
do_lrm_control(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state,
enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
/* This only pertains to local executor connections. Remote connections are
* handled as resources within the scheduler. Connecting and disconnecting
* from remote executor instances is handled differently.
*/
lrm_state_t *lrm_state = NULL;
if(fsa_our_uname == NULL) {
return; /* Nothing to do */
}
lrm_state = lrm_state_find_or_create(fsa_our_uname);
if (lrm_state == NULL) {
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
return;
}
if (action & A_LRM_DISCONNECT) {
if (lrm_state_verify_stopped(lrm_state, cur_state, LOG_INFO) == FALSE) {
if (action == A_LRM_DISCONNECT) {
crmd_fsa_stall(FALSE);
return;
}
}
clear_bit(fsa_input_register, R_LRM_CONNECTED);
crm_info("Disconnecting from the executor");
lrm_state_disconnect(lrm_state);
lrm_state_reset_tables(lrm_state, FALSE);
crm_notice("Disconnected from the executor");
}
if (action & A_LRM_CONNECT) {
int ret = pcmk_ok;
crm_debug("Connecting to the executor");
ret = lrm_state_ipc_connect(lrm_state);
if (ret != pcmk_ok) {
if (lrm_state->num_lrm_register_fails < MAX_LRM_REG_FAILS) {
crm_warn("Failed to connect to the executor %d time%s (%d max)",
lrm_state->num_lrm_register_fails,
s_if_plural(lrm_state->num_lrm_register_fails),
MAX_LRM_REG_FAILS);
crm_timer_start(wait_timer);
crmd_fsa_stall(FALSE);
return;
}
}
if (ret != pcmk_ok) {
crm_err("Failed to connect to the executor the max allowed %d time%s",
lrm_state->num_lrm_register_fails,
s_if_plural(lrm_state->num_lrm_register_fails));
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
return;
}
set_bit(fsa_input_register, R_LRM_CONNECTED);
crm_info("Connection to the executor established");
}
if (action & ~(A_LRM_CONNECT | A_LRM_DISCONNECT)) {
crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__);
}
}
static gboolean
lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state, int log_level)
{
int counter = 0;
gboolean rc = TRUE;
const char *when = "lrm disconnect";
GHashTableIter gIter;
const char *key = NULL;
rsc_history_t *entry = NULL;
struct recurring_op_s *pending = NULL;
crm_debug("Checking for active resources before exit");
if (cur_state == S_TERMINATE) {
log_level = LOG_ERR;
when = "shutdown";
} else if (is_set(fsa_input_register, R_SHUTDOWN)) {
when = "shutdown... waiting";
}
if (lrm_state->pending_ops && lrm_state_is_connected(lrm_state) == TRUE) {
guint removed = g_hash_table_foreach_remove(
lrm_state->pending_ops, stop_recurring_actions, lrm_state);
guint nremaining = g_hash_table_size(lrm_state->pending_ops);
if (removed || nremaining) {
crm_notice("Stopped %u recurring operation%s at %s (%u remaining)",
removed, s_if_plural(removed), when, nremaining);
}
}
if (lrm_state->pending_ops) {
g_hash_table_iter_init(&gIter, lrm_state->pending_ops);
while (g_hash_table_iter_next(&gIter, NULL, (void **)&pending)) {
/* Ignore recurring actions in the shutdown calculations */
if (pending->interval_ms == 0) {
counter++;
}
}
}
if (counter > 0) {
do_crm_log(log_level, "%d pending executor operation%s at %s",
counter, s_if_plural(counter), when);
if (cur_state == S_TERMINATE || !is_set(fsa_input_register, R_SENT_RSC_STOP)) {
g_hash_table_iter_init(&gIter, lrm_state->pending_ops);
while (g_hash_table_iter_next(&gIter, (gpointer*)&key, (gpointer*)&pending)) {
do_crm_log(log_level, "Pending action: %s (%s)", key, pending->op_key);
}
} else {
rc = FALSE;
}
return rc;
}
if (lrm_state->resource_history == NULL) {
return rc;
}
if (is_set(fsa_input_register, R_SHUTDOWN)) {
/* At this point we're not waiting, we're just shutting down */
when = "shutdown";
}
counter = 0;
g_hash_table_iter_init(&gIter, lrm_state->resource_history);
while (g_hash_table_iter_next(&gIter, NULL, (gpointer*)&entry)) {
if (is_rsc_active(lrm_state, entry->id) == FALSE) {
continue;
}
counter++;
if (log_level == LOG_ERR) {
crm_info("Found %s active at %s", entry->id, when);
} else {
crm_trace("Found %s active at %s", entry->id, when);
}
if (lrm_state->pending_ops) {
GHashTableIter hIter;
g_hash_table_iter_init(&hIter, lrm_state->pending_ops);
while (g_hash_table_iter_next(&hIter, (gpointer*)&key, (gpointer*)&pending)) {
if (crm_str_eq(entry->id, pending->rsc_id, TRUE)) {
crm_notice("%sction %s (%s) incomplete at %s",
pending->interval_ms == 0 ? "A" : "Recurring a",
key, pending->op_key, when);
}
}
}
}
if (counter) {
crm_err("%d resource%s active at %s",
counter, (counter == 1)? " was" : "s were", when);
}
return rc;
}
static char *
build_parameter_list(const lrmd_event_data_t *op,
const struct ra_metadata_s *metadata,
xmlNode *result, enum ra_param_flags_e param_type,
bool invert_for_xml)
{
int len = 0;
int max = 0;
char *list = NULL;
GList *iter = NULL;
/* Newer resource agents support the "private" parameter attribute to
* indicate sensitive parameters. For backward compatibility with older
* agents, this list is used if the agent doesn't specify any as "private".
*/
const char *secure_terms[] = {
"password",
"passwd",
"user",
};
if (is_not_set(metadata->ra_flags, ra_uses_private)
&& (param_type == ra_param_private)) {
max = DIMOF(secure_terms);
}
for (iter = metadata->ra_params; iter != NULL; iter = iter->next) {
struct ra_param_s *param = (struct ra_param_s *) iter->data;
bool accept = FALSE;
if (is_set(param->rap_flags, param_type)) {
accept = TRUE;
} else if (max) {
for (int lpc = 0; lpc < max; lpc++) {
if (safe_str_eq(secure_terms[lpc], param->rap_name)) {
accept = TRUE;
break;
}
}
}
if (accept) {
int start = len;
crm_trace("Attr %s is %s", param->rap_name, ra_param_flag2text(param_type));
len += strlen(param->rap_name) + 2; // include spaces around
list = realloc_safe(list, len + 1); // include null terminator
// spaces before and after make parsing simpler
sprintf(list + start, " %s ", param->rap_name);
} else {
crm_trace("Rejecting %s for %s", param->rap_name, ra_param_flag2text(param_type));
}
if (result && (invert_for_xml? !accept : accept)) {
const char *v = g_hash_table_lookup(op->params, param->rap_name);
if (v != NULL) {
crm_trace("Adding attr %s=%s to the xml result", param->rap_name, v);
crm_xml_add(result, param->rap_name, v);
}
}
}
return list;
}
static void
append_restart_list(lrmd_event_data_t *op, struct ra_metadata_s *metadata,
xmlNode *update, const char *version)
{
char *list = NULL;
char *digest = NULL;
xmlNode *restart = NULL;
CRM_LOG_ASSERT(op->params != NULL);
if (op->interval_ms > 0) {
/* monitors are not reloadable */
return;
}
if (is_set(metadata->ra_flags, ra_supports_reload)) {
restart = create_xml_node(NULL, XML_TAG_PARAMS);
/* Add any parameters with unique="1" to the "op-force-restart" list.
*
* (Currently, we abuse "unique=0" to indicate reloadability. This is
* nonstandard and should eventually be replaced once the OCF standard
* is updated with something better.)
*/
list = build_parameter_list(op, metadata, restart, ra_param_unique,
FALSE);
} else {
/* Resource does not support reloads */
return;
}
digest = calculate_operation_digest(restart, version);
/* Add "op-force-restart" and "op-restart-digest" to indicate the resource supports reload,
* no matter if it actually supports any parameters with unique="1"). */
crm_xml_add(update, XML_LRM_ATTR_OP_RESTART, list? list: "");
crm_xml_add(update, XML_LRM_ATTR_RESTART_DIGEST, digest);
crm_trace("%s: %s, %s", op->rsc_id, digest, list);
crm_log_xml_trace(restart, "restart digest source");
free_xml(restart);
free(digest);
free(list);
}
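/* The scheduler later compares op-restart-digest against a digest of the
 * currently configured restart-type parameters; if only reloadable
 * parameters changed, the resource can be reloaded instead of restarted.
 */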
static void
append_secure_list(lrmd_event_data_t *op, struct ra_metadata_s *metadata,
xmlNode *update, const char *version)
{
char *list = NULL;
char *digest = NULL;
xmlNode *secure = NULL;
CRM_LOG_ASSERT(op->params != NULL);
/*
* To keep XML_LRM_ATTR_OP_SECURE short, we want it to contain the
* secure parameters but XML_LRM_ATTR_SECURE_DIGEST to be based on
* the insecure ones
*/
secure = create_xml_node(NULL, XML_TAG_PARAMS);
list = build_parameter_list(op, metadata, secure, ra_param_private, TRUE);
if (list != NULL) {
digest = calculate_operation_digest(secure, version);
crm_xml_add(update, XML_LRM_ATTR_OP_SECURE, list);
crm_xml_add(update, XML_LRM_ATTR_SECURE_DIGEST, digest);
crm_trace("%s: %s, %s", op->rsc_id, digest, list);
crm_log_xml_trace(secure, "secure digest source");
} else {
crm_trace("%s: no secure parameters", op->rsc_id);
}
free_xml(secure);
free(digest);
free(list);
}
static gboolean
build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op,
const char *node_name, const char *src)
{
int target_rc = 0;
xmlNode *xml_op = NULL;
struct ra_metadata_s *metadata = NULL;
const char *caller_version = NULL;
lrm_state_t *lrm_state = NULL;
if (op == NULL) {
return FALSE;
}
target_rc = rsc_op_expected_rc(op);
/* There is a small risk in formerly mixed-version clusters that the update
 * we build here will be sub-optimal.
 *
 * However, with our upgrade policy, the update we send should still be
 * completely supported anyway.
 */
caller_version = g_hash_table_lookup(op->params, XML_ATTR_CRM_VERSION);
CRM_LOG_ASSERT(caller_version != NULL);
if(caller_version == NULL) {
caller_version = CRM_FEATURE_SET;
}
crm_trace("Building %s operation update with originator version: %s", op->rsc_id, caller_version);
xml_op = create_operation_update(parent, op, caller_version, target_rc, fsa_our_uname, src, LOG_DEBUG);
if (xml_op == NULL) {
return TRUE;
}
if ((rsc == NULL) || (op->params == NULL)
|| !crm_op_needs_metadata(rsc->standard, op->op_type)) {
crm_trace("No digests needed for %s action on %s (params=%p rsc=%p)",
op->op_type, op->rsc_id, op->params, rsc);
return TRUE;
}
lrm_state = lrm_state_find(node_name);
if (lrm_state == NULL) {
crm_warn("Cannot calculate digests for operation " CRM_OP_FMT
" because we have no connection to executor for %s",
op->rsc_id, op->op_type, op->interval_ms, node_name);
return TRUE;
}
metadata = metadata_cache_get(lrm_state->metadata_cache, rsc);
if (metadata == NULL) {
/* For now, we always collect resource agent meta-data via a local,
* synchronous, direct execution of the agent. This has multiple issues:
* the executor should execute agents, not the controller; meta-data for
* Pacemaker Remote nodes should be collected on those nodes, not
* locally; and the meta-data call shouldn't eat into the timeout of the
* real action being performed.
*
* These issues are planned to be addressed by having the scheduler
* schedule a meta-data cache check at the beginning of each transition.
* Once that is working, this block will only be a fallback in case the
* initial collection fails.
*/
char *metadata_str = NULL;
int rc = lrm_state_get_metadata(lrm_state, rsc->standard,
rsc->provider, rsc->type,
&metadata_str, 0);
if (rc != pcmk_ok) {
crm_warn("Failed to get metadata for %s (%s:%s:%s)",
rsc->id, rsc->standard, rsc->provider, rsc->type);
return TRUE;
}
metadata = metadata_cache_update(lrm_state->metadata_cache, rsc,
metadata_str);
free(metadata_str);
if (metadata == NULL) {
crm_warn("Failed to update metadata for %s (%s:%s:%s)",
rsc->id, rsc->standard, rsc->provider, rsc->type);
return TRUE;
}
}
#if ENABLE_VERSIONED_ATTRS
crm_xml_add(xml_op, XML_ATTR_RA_VERSION, metadata->ra_version);
#endif
crm_trace("Including additional digests for %s::%s:%s", rsc->standard, rsc->provider, rsc->type);
append_restart_list(op, metadata, xml_op, caller_version);
append_secure_list(op, metadata, xml_op, caller_version);
return TRUE;
}
static gboolean
is_rsc_active(lrm_state_t * lrm_state, const char *rsc_id)
{
rsc_history_t *entry = NULL;
entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id);
if (entry == NULL || entry->last == NULL) {
return FALSE;
}
crm_trace("Processing %s: %s.%d=%d", rsc_id, entry->last->op_type,
entry->last->interval_ms, entry->last->rc);
if (entry->last->rc == PCMK_OCF_OK && safe_str_eq(entry->last->op_type, CRMD_ACTION_STOP)) {
return FALSE;
} else if (entry->last->rc == PCMK_OCF_OK
&& safe_str_eq(entry->last->op_type, CRMD_ACTION_MIGRATE)) {
/* a stricter check is too complex...
* leave that to the PE
*/
return FALSE;
} else if (entry->last->rc == PCMK_OCF_NOT_RUNNING) {
return FALSE;
} else if ((entry->last->interval_ms == 0)
&& (entry->last->rc == PCMK_OCF_NOT_CONFIGURED)) {
/* Badly configured resources can't be reliably stopped */
return FALSE;
}
return TRUE;
}
static gboolean
build_active_RAs(lrm_state_t * lrm_state, xmlNode * rsc_list)
{
GHashTableIter iter;
rsc_history_t *entry = NULL;
g_hash_table_iter_init(&iter, lrm_state->resource_history);
while (g_hash_table_iter_next(&iter, NULL, (void **)&entry)) {
GList *gIter = NULL;
xmlNode *xml_rsc = create_xml_node(rsc_list, XML_LRM_TAG_RESOURCE);
crm_xml_add(xml_rsc, XML_ATTR_ID, entry->id);
crm_xml_add(xml_rsc, XML_ATTR_TYPE, entry->rsc.type);
crm_xml_add(xml_rsc, XML_AGENT_ATTR_CLASS, entry->rsc.standard);
crm_xml_add(xml_rsc, XML_AGENT_ATTR_PROVIDER, entry->rsc.provider);
if (entry->last && entry->last->params) {
const char *container = g_hash_table_lookup(entry->last->params, CRM_META"_"XML_RSC_ATTR_CONTAINER);
if (container) {
crm_trace("Resource %s is a part of container resource %s", entry->id, container);
crm_xml_add(xml_rsc, XML_RSC_ATTR_CONTAINER, container);
}
}
build_operation_update(xml_rsc, &(entry->rsc), entry->failed, lrm_state->node_name, __FUNCTION__);
build_operation_update(xml_rsc, &(entry->rsc), entry->last, lrm_state->node_name, __FUNCTION__);
for (gIter = entry->recurring_op_list; gIter != NULL; gIter = gIter->next) {
build_operation_update(xml_rsc, &(entry->rsc), gIter->data, lrm_state->node_name, __FUNCTION__);
}
}
return FALSE;
}
static xmlNode *
do_lrm_query_internal(lrm_state_t *lrm_state, int update_flags)
{
xmlNode *xml_state = NULL;
xmlNode *xml_data = NULL;
xmlNode *rsc_list = NULL;
crm_node_t *peer = NULL;
peer = crm_get_peer_full(0, lrm_state->node_name, CRM_GET_PEER_ANY);
CRM_CHECK(peer != NULL, return NULL);
xml_state = create_node_state_update(peer, update_flags, NULL,
__FUNCTION__);
if (xml_state == NULL) {
return NULL;
}
xml_data = create_xml_node(xml_state, XML_CIB_TAG_LRM);
crm_xml_add(xml_data, XML_ATTR_ID, peer->uuid);
rsc_list = create_xml_node(xml_data, XML_LRM_TAG_RESOURCES);
/* Build a list of active (not always running) resources */
build_active_RAs(lrm_state, rsc_list);
crm_log_xml_trace(xml_state, "Current executor state");
return xml_state;
}
xmlNode *
do_lrm_query(gboolean is_replace, const char *node_name)
{
lrm_state_t *lrm_state = lrm_state_find(node_name);
if (!lrm_state) {
crm_err("Could not find executor state for node %s", node_name);
return NULL;
}
return do_lrm_query_internal(lrm_state,
node_update_cluster|node_update_peer);
}
static void
notify_deleted(lrm_state_t * lrm_state, ha_msg_input_t * input, const char *rsc_id, int rc)
{
lrmd_event_data_t *op = NULL;
const char *from_sys = crm_element_value(input->msg, F_CRM_SYS_FROM);
const char *from_host = crm_element_value(input->msg, F_CRM_HOST_FROM);
crm_info("Notifying %s on %s that %s was%s deleted",
from_sys, (from_host? from_host : "localhost"), rsc_id,
((rc == pcmk_ok)? "" : " not"));
op = construct_op(lrm_state, input->xml, rsc_id, CRMD_ACTION_DELETE);
if (rc == pcmk_ok) {
op->op_status = PCMK_LRM_OP_DONE;
op->rc = PCMK_OCF_OK;
} else {
op->op_status = PCMK_LRM_OP_ERROR;
op->rc = PCMK_OCF_UNKNOWN_ERROR;
}
send_direct_ack(from_host, from_sys, NULL, op, rsc_id);
lrmd_free_event(op);
if (safe_str_neq(from_sys, CRM_SYSTEM_TENGINE)) {
/* this isn't expected - trigger a new transition */
time_t now = time(NULL);
char *now_s = crm_itoa(now);
crm_debug("Triggering a refresh after %s deleted %s from the executor",
from_sys, rsc_id);
update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL,
"last-lrm-refresh", now_s, FALSE, NULL, NULL);
free(now_s);
}
}
static gboolean
lrm_remove_deleted_rsc(gpointer key, gpointer value, gpointer user_data)
{
struct delete_event_s *event = user_data;
struct pending_deletion_op_s *op = value;
if (crm_str_eq(event->rsc, op->rsc, TRUE)) {
notify_deleted(event->lrm_state, op->input, event->rsc, event->rc);
return TRUE;
}
return FALSE;
}
static gboolean
lrm_remove_deleted_op(gpointer key, gpointer value, gpointer user_data)
{
const char *rsc = user_data;
struct recurring_op_s *pending = value;
if (crm_str_eq(rsc, pending->rsc_id, TRUE)) {
crm_info("Removing op %s:%d for deleted resource %s",
pending->op_key, pending->call_id, rsc);
return TRUE;
}
return FALSE;
}
/*
* Remove the rsc from the CIB
*
* Avoids refreshing the entire LRM section of this host
*/
#define rsc_template "//"XML_CIB_TAG_STATE"[@uname='%s']//"XML_LRM_TAG_RESOURCE"[@id='%s']"
static int
delete_rsc_status(lrm_state_t * lrm_state, const char *rsc_id, int call_options,
const char *user_name)
{
char *rsc_xpath = NULL;
int rc = pcmk_ok;
CRM_CHECK(rsc_id != NULL, return -ENXIO);
rsc_xpath = crm_strdup_printf(rsc_template, lrm_state->node_name, rsc_id);
rc = cib_internal_op(fsa_cib_conn, CIB_OP_DELETE, NULL, rsc_xpath,
NULL, NULL, call_options | cib_xpath, user_name);
free(rsc_xpath);
return rc;
}
static void
delete_rsc_entry(lrm_state_t * lrm_state, ha_msg_input_t * input, const char *rsc_id,
GHashTableIter * rsc_gIter, int rc, const char *user_name)
{
struct delete_event_s event;
CRM_CHECK(rsc_id != NULL, return);
if (rc == pcmk_ok) {
char *rsc_id_copy = strdup(rsc_id);
if (rsc_gIter)
g_hash_table_iter_remove(rsc_gIter);
else
g_hash_table_remove(lrm_state->resource_history, rsc_id_copy);
crm_debug("sync: Sending delete op for %s", rsc_id_copy);
delete_rsc_status(lrm_state, rsc_id_copy, cib_quorum_override, user_name);
g_hash_table_foreach_remove(lrm_state->pending_ops, lrm_remove_deleted_op, rsc_id_copy);
free(rsc_id_copy);
}
if (input) {
notify_deleted(lrm_state, input, rsc_id, rc);
}
event.rc = rc;
event.rsc = rsc_id;
event.lrm_state = lrm_state;
g_hash_table_foreach_remove(lrm_state->deletion_ops, lrm_remove_deleted_rsc, &event);
}
/*!
* \internal
* \brief Erase an LRM history entry from the CIB, given the operation data
*
* \param[in] lrm_state LRM state of the desired node
* \param[in] op Operation whose history should be deleted
*/
static void
erase_lrm_history_by_op(lrm_state_t *lrm_state, lrmd_event_data_t *op)
{
xmlNode *xml_top = NULL;
CRM_CHECK(op != NULL, return);
xml_top = create_xml_node(NULL, XML_LRM_TAG_RSC_OP);
crm_xml_add_int(xml_top, XML_LRM_ATTR_CALLID, op->call_id);
crm_xml_add(xml_top, XML_ATTR_TRANSITION_KEY, op->user_data);
if (op->interval_ms > 0) {
char *op_id = generate_op_key(op->rsc_id, op->op_type, op->interval_ms);
/* Avoid deleting last_failure too (if it was a result of this recurring op failing) */
crm_xml_add(xml_top, XML_ATTR_ID, op_id);
free(op_id);
}
crm_debug("Erasing resource operation history for " CRM_OP_FMT " (call=%d)",
op->rsc_id, op->op_type, op->interval_ms, op->call_id);
fsa_cib_conn->cmds->remove(fsa_cib_conn, XML_CIB_TAG_STATUS, xml_top,
cib_quorum_override);
crm_log_xml_trace(xml_top, "op:cancel");
free_xml(xml_top);
}
/* Define xpath to find LRM resource history entry by node and resource */
#define XPATH_HISTORY \
"/" XML_TAG_CIB "/" XML_CIB_TAG_STATUS \
"/" XML_CIB_TAG_STATE "[@" XML_ATTR_UNAME "='%s']" \
"/" XML_CIB_TAG_LRM "/" XML_LRM_TAG_RESOURCES \
"/" XML_LRM_TAG_RESOURCE "[@" XML_ATTR_ID "='%s']" \
"/" XML_LRM_TAG_RSC_OP
/* ... and also by operation key */
#define XPATH_HISTORY_ID XPATH_HISTORY \
"[@" XML_ATTR_ID "='%s']"
/* ... and also by operation key and operation call ID */
#define XPATH_HISTORY_CALL XPATH_HISTORY \
"[@" XML_ATTR_ID "='%s' and @" XML_LRM_ATTR_CALLID "='%d']"
/* ... and also by operation key and original operation key */
#define XPATH_HISTORY_ORIG XPATH_HISTORY \
"[@" XML_ATTR_ID "='%s' and @" XML_LRM_ATTR_TASK_KEY "='%s']"
/*!
* \internal
* \brief Erase an LRM history entry from the CIB, given operation identifiers
*
* \param[in] lrm_state LRM state of the node to clear history for
* \param[in] rsc_id Name of resource to clear history for
* \param[in] key Operation key of operation to clear history for
* \param[in] orig_op If specified, delete only if it has this original op
* \param[in] call_id If specified, delete entry only if it has this call ID
*/
static void
erase_lrm_history_by_id(lrm_state_t *lrm_state, const char *rsc_id,
const char *key, const char *orig_op, int call_id)
{
char *op_xpath = NULL;
CRM_CHECK((rsc_id != NULL) && (key != NULL), return);
if (call_id > 0) {
op_xpath = crm_strdup_printf(XPATH_HISTORY_CALL,
lrm_state->node_name, rsc_id, key,
call_id);
} else if (orig_op) {
op_xpath = crm_strdup_printf(XPATH_HISTORY_ORIG,
lrm_state->node_name, rsc_id, key,
orig_op);
} else {
op_xpath = crm_strdup_printf(XPATH_HISTORY_ID,
lrm_state->node_name, rsc_id, key);
}
crm_debug("Erasing resource operation history for %s on %s (call=%d)",
key, rsc_id, call_id);
fsa_cib_conn->cmds->remove(fsa_cib_conn, op_xpath, NULL,
cib_quorum_override | cib_xpath);
free(op_xpath);
}
static inline gboolean
last_failed_matches_op(rsc_history_t *entry, const char *op, guint interval_ms)
{
    if ((entry == NULL) || (entry->failed == NULL)) {
        return FALSE;
    }
if (op == NULL) {
return TRUE;
}
return (safe_str_eq(op, entry->failed->op_type)
&& (interval_ms == entry->failed->interval_ms));
}
/*!
* \internal
* \brief Clear a resource's last failure
*
* Erase a resource's last failure on a particular node from both the
* LRM resource history in the CIB, and the resource history remembered
* for the LRM state.
*
* \param[in] rsc_id Resource name
* \param[in] node_name Node name
* \param[in] operation If specified, only clear if matching this operation
* \param[in] interval_ms If operation is specified, it has this interval
*/
void
lrm_clear_last_failure(const char *rsc_id, const char *node_name,
const char *operation, guint interval_ms)
{
char *op_key = NULL;
char *orig_op_key = NULL;
lrm_state_t *lrm_state = NULL;
lrm_state = lrm_state_find(node_name);
if (lrm_state == NULL) {
return;
}
/* Erase from CIB */
op_key = generate_op_key(rsc_id, "last_failure", 0);
if (operation) {
orig_op_key = generate_op_key(rsc_id, operation, interval_ms);
}
erase_lrm_history_by_id(lrm_state, rsc_id, op_key, orig_op_key, 0);
free(op_key);
free(orig_op_key);
/* Remove from memory */
if (lrm_state->resource_history) {
rsc_history_t *entry = g_hash_table_lookup(lrm_state->resource_history,
rsc_id);
if (last_failed_matches_op(entry, operation, interval_ms)) {
lrmd_free_event(entry->failed);
entry->failed = NULL;
}
}
}
/* Returns: gboolean - cancellation is in progress */
static gboolean
cancel_op(lrm_state_t * lrm_state, const char *rsc_id, const char *key, int op, gboolean remove)
{
int rc = pcmk_ok;
char *local_key = NULL;
struct recurring_op_s *pending = NULL;
CRM_CHECK(op != 0, return FALSE);
CRM_CHECK(rsc_id != NULL, return FALSE);
if (key == NULL) {
local_key = make_stop_id(rsc_id, op);
key = local_key;
}
pending = g_hash_table_lookup(lrm_state->pending_ops, key);
if (pending) {
if (remove && pending->remove == FALSE) {
pending->remove = TRUE;
crm_debug("Scheduling %s for removal", key);
}
if (pending->cancelled) {
crm_debug("Operation %s already cancelled", key);
free(local_key);
return FALSE;
}
pending->cancelled = TRUE;
} else {
crm_info("No pending op found for %s", key);
free(local_key);
return FALSE;
}
crm_debug("Cancelling op %d for %s (%s)", op, rsc_id, key);
rc = lrm_state_cancel(lrm_state, pending->rsc_id, pending->op_type,
pending->interval_ms);
if (rc == pcmk_ok) {
crm_debug("Op %d for %s (%s): cancelled", op, rsc_id, key);
free(local_key);
return TRUE;
}
crm_debug("Op %d for %s (%s): Nothing to cancel", op, rsc_id, key);
/* The caller needs to make sure the entry is
* removed from the pending_ops list
*
* Usually by returning TRUE inside the worker function
* supplied to g_hash_table_foreach_remove()
*
* Not removing the entry from pending_ops will block
* the node from shutting down
*/
free(local_key);
return FALSE;
}
struct cancel_data {
gboolean done;
gboolean remove;
const char *key;
lrmd_rsc_info_t *rsc;
lrm_state_t *lrm_state;
};
static gboolean
cancel_action_by_key(gpointer key, gpointer value, gpointer user_data)
{
gboolean remove = FALSE;
struct cancel_data *data = user_data;
struct recurring_op_s *op = (struct recurring_op_s *)value;
if (crm_str_eq(op->op_key, data->key, TRUE)) {
data->done = TRUE;
remove = !cancel_op(data->lrm_state, data->rsc->id, key, op->call_id, data->remove);
}
return remove;
}
static gboolean
cancel_op_key(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *key, gboolean remove)
{
guint removed = 0;
struct cancel_data data;
CRM_CHECK(rsc != NULL, return FALSE);
CRM_CHECK(key != NULL, return FALSE);
data.key = key;
data.rsc = rsc;
data.done = FALSE;
data.remove = remove;
data.lrm_state = lrm_state;
removed = g_hash_table_foreach_remove(lrm_state->pending_ops, cancel_action_by_key, &data);
crm_trace("Removed %u op cache entries, new size: %u",
removed, g_hash_table_size(lrm_state->pending_ops));
return data.done;
}
/*!
* \internal
* \brief Retrieve resource information from LRM
*
* \param[in] lrm_state LRM connection to use
* \param[in] rsc_xml XML containing resource configuration
* \param[in] do_create If true, register resource with LRM if not already
* \param[out] rsc_info Where to store resource information obtained from LRM
*
* \retval pcmk_ok Success (and rsc_info holds newly allocated result)
* \retval -EINVAL Required information is missing from arguments
* \retval -ENOTCONN No active connection to LRM
* \retval -ENODEV Resource not found
* \retval -errno Error communicating with executor when registering resource
*
* \note Caller is responsible for freeing result on success.
*/
static int
get_lrm_resource(lrm_state_t *lrm_state, xmlNode *rsc_xml, gboolean do_create,
lrmd_rsc_info_t **rsc_info)
{
const char *id = ID(rsc_xml);
CRM_CHECK(lrm_state && rsc_xml && rsc_info, return -EINVAL);
CRM_CHECK(id, return -EINVAL);
if (lrm_state_is_connected(lrm_state) == FALSE) {
return -ENOTCONN;
}
crm_trace("Retrieving resource information for %s from the executor", id);
*rsc_info = lrm_state_get_rsc_info(lrm_state, id, 0);
// If resource isn't known by ID, try clone name, if provided
if (!*rsc_info) {
const char *long_id = crm_element_value(rsc_xml, XML_ATTR_ID_LONG);
if (long_id) {
*rsc_info = lrm_state_get_rsc_info(lrm_state, long_id, 0);
}
}
if ((*rsc_info == NULL) && do_create) {
const char *class = crm_element_value(rsc_xml, XML_AGENT_ATTR_CLASS);
const char *provider = crm_element_value(rsc_xml, XML_AGENT_ATTR_PROVIDER);
const char *type = crm_element_value(rsc_xml, XML_ATTR_TYPE);
int rc;
crm_trace("Registering resource %s with the executor", id);
rc = lrm_state_register_rsc(lrm_state, id, class, provider, type,
lrmd_opt_drop_recurring);
if (rc != pcmk_ok) {
fsa_data_t *msg_data = NULL;
crm_err("Could not register resource %s with the executor on %s: %s "
CRM_XS " rc=%d",
id, lrm_state->node_name, pcmk_strerror(rc), rc);
/* Register this as an internal error if this involves the local
* executor. Otherwise, we're likely dealing with an unresponsive
* remote node, which is not an FSA failure.
*/
if (lrm_state_is_local(lrm_state) == TRUE) {
register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
}
return rc;
}
*rsc_info = lrm_state_get_rsc_info(lrm_state, id, 0);
}
return *rsc_info? pcmk_ok : -ENODEV;
}
static void
delete_resource(lrm_state_t * lrm_state,
const char *id,
lrmd_rsc_info_t * rsc,
GHashTableIter * gIter,
const char *sys,
const char *host,
const char *user,
ha_msg_input_t * request,
gboolean unregister)
{
int rc = pcmk_ok;
crm_info("Removing resource %s for %s (%s) on %s", id, sys, user ? user : "internal", host);
if (rsc && unregister) {
rc = lrm_state_unregister_rsc(lrm_state, id, 0);
}
if (rc == pcmk_ok) {
crm_trace("Resource '%s' deleted", id);
} else if (rc == -EINPROGRESS) {
crm_info("Deletion of resource '%s' pending", id);
if (request) {
struct pending_deletion_op_s *op = NULL;
char *ref = crm_element_value_copy(request->msg, XML_ATTR_REFERENCE);
op = calloc(1, sizeof(struct pending_deletion_op_s));
op->rsc = strdup(rsc->id);
op->input = copy_ha_msg_input(request);
g_hash_table_insert(lrm_state->deletion_ops, ref, op);
}
return;
} else {
crm_warn("Deletion of resource '%s' for %s (%s) on %s failed: %d",
id, sys, user ? user : "internal", host, rc);
}
delete_rsc_entry(lrm_state, request, id, gIter, rc, user);
}
static int
get_fake_call_id(lrm_state_t *lrm_state, const char *rsc_id)
{
int call_id = 999999999;
rsc_history_t *entry = NULL;
if(lrm_state) {
entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id);
}
    /* Make sure the call ID is greater than the last successful operation's;
     * otherwise the failure will not trigger recovery of the resource, because
     * it could appear that the failure occurred before the successful start. */
if (entry) {
call_id = entry->last_callid + 1;
}
if (call_id < 0) {
call_id = 1;
}
return call_id;
}
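/* Synthesize timing and status for a result that never actually ran: stamp
 * it with the current time and a call ID newer than the last real operation
 * so it sorts after genuine history entries.
 */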
static void
fake_op_status(lrm_state_t *lrm_state, lrmd_event_data_t *op, int op_status,
enum ocf_exitcode op_exitcode)
{
op->call_id = get_fake_call_id(lrm_state, op->rsc_id);
op->t_run = time(NULL);
op->t_rcchange = op->t_run;
op->op_status = op_status;
op->rc = op_exitcode;
}
static void
force_reprobe(lrm_state_t *lrm_state, const char *from_sys,
const char *from_host, const char *user_name,
gboolean is_remote_node)
{
GHashTableIter gIter;
rsc_history_t *entry = NULL;
crm_info("Clearing resource history on node %s", lrm_state->node_name);
g_hash_table_iter_init(&gIter, lrm_state->resource_history);
while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
/* only unregister the resource during a reprobe if it is not a remote connection
* resource. otherwise unregistering the connection will terminate remote-node
* membership */
gboolean unregister = TRUE;
if (is_remote_lrmd_ra(NULL, NULL, entry->id)) {
lrm_state_t *remote_lrm_state = lrm_state_find(entry->id);
if (remote_lrm_state) {
/* when forcing a reprobe, make sure to clear remote node before
* clearing the remote node's connection resource */
force_reprobe(remote_lrm_state, from_sys, from_host, user_name, TRUE);
}
unregister = FALSE;
}
delete_resource(lrm_state, entry->id, &entry->rsc, &gIter, from_sys, from_host,
user_name, NULL, unregister);
}
/* Now delete the copy in the CIB */
erase_status_tag(lrm_state->node_name, XML_CIB_TAG_LRM, cib_scope_local);
/* Finally, _delete_ the value in pacemaker-attrd -- setting it to FALSE
* would result in the scheduler sending us back here again
*/
update_attrd(lrm_state->node_name, CRM_OP_PROBED, NULL, user_name, is_remote_node);
}
static void
synthesize_lrmd_failure(lrm_state_t *lrm_state, xmlNode *action, int rc)
{
lrmd_event_data_t *op = NULL;
lrmd_rsc_info_t *rsc_info = NULL;
const char *operation = crm_element_value(action, XML_LRM_ATTR_TASK);
const char *target_node = crm_element_value(action, XML_LRM_ATTR_TARGET);
xmlNode *xml_rsc = find_xml_node(action, XML_CIB_TAG_RESOURCE, TRUE);
if ((xml_rsc == NULL) || (ID(xml_rsc) == NULL)) {
/* @TODO Should we do something else, like direct ack? */
crm_info("Can't fake %s failure (%d) on %s without resource configuration",
crm_element_value(action, XML_LRM_ATTR_TASK_KEY), rc,
target_node);
return;
} else if(operation == NULL) {
/* This probably came from crm_resource -C, nothing to do */
crm_info("Can't fake %s failure (%d) on %s without operation",
ID(xml_rsc), rc, target_node);
return;
}
op = construct_op(lrm_state, action, ID(xml_rsc), operation);
if (safe_str_eq(operation, RSC_NOTIFY)) { // Notifications can't fail
fake_op_status(lrm_state, op, PCMK_LRM_OP_DONE, PCMK_OCF_OK);
} else {
fake_op_status(lrm_state, op, PCMK_LRM_OP_ERROR, rc);
}
crm_info("Faking " CRM_OP_FMT " result (%d) on %s",
op->rsc_id, op->op_type, op->interval_ms, op->rc, target_node);
/* Process the result as if it came from the LRM, if possible
* (i.e. resource info can be obtained from the lrm_state).
*/
if (lrm_state) {
rsc_info = lrm_state_get_rsc_info(lrm_state, op->rsc_id, 0);
}
if (rsc_info) {
lrmd_free_rsc_info(rsc_info);
process_lrm_event(lrm_state, op, NULL);
} else if (controld_action_is_recordable(op->op_type)) {
/* If we can't process the result normally, at least write it to the CIB
* if possible, so the scheduler can act on it.
*/
const char *standard = crm_element_value(xml_rsc, XML_AGENT_ATTR_CLASS);
const char *provider = crm_element_value(xml_rsc, XML_AGENT_ATTR_PROVIDER);
const char *type = crm_element_value(xml_rsc, XML_ATTR_TYPE);
if (standard && type) {
rsc_info = lrmd_new_rsc_info(op->rsc_id, standard, provider, type);
do_update_resource(target_node, rsc_info, op);
lrmd_free_rsc_info(rsc_info);
} else {
// @TODO Should we direct ack?
crm_info("Can't fake %s failure (%d) on %s without resource standard and type",
crm_element_value(action, XML_LRM_ATTR_TASK_KEY), rc,
target_node);
}
}
lrmd_free_event(op);
}
/*!
* \internal
* \brief Get target of an LRM operation
*
* \param[in] xml LRM operation data XML
*
* \return LRM operation target node name (local node or Pacemaker Remote node)
*/
static const char *
lrm_op_target(xmlNode *xml)
{
const char *target = NULL;
if (xml) {
target = crm_element_value(xml, XML_LRM_ATTR_TARGET);
}
if (target == NULL) {
target = fsa_our_uname;
}
return target;
}
static void
fail_lrm_resource(xmlNode *xml, lrm_state_t *lrm_state, const char *user_name,
const char *from_host, const char *from_sys)
{
lrmd_event_data_t *op = NULL;
lrmd_rsc_info_t *rsc = NULL;
xmlNode *xml_rsc = find_xml_node(xml, XML_CIB_TAG_RESOURCE, TRUE);
CRM_CHECK(xml_rsc != NULL, return);
/* The executor simply executes operations and reports the results, without
* any concept of success or failure, so to fail a resource, we must fake
* what a failure looks like.
*
* To do this, we create a fake executor operation event for the resource,
* and pass that event to the executor client callback so it will be
* processed as if it came from the executor.
*/
op = construct_op(lrm_state, xml, ID(xml_rsc), "asyncmon");
fake_op_status(lrm_state, op, PCMK_LRM_OP_DONE, PCMK_OCF_UNKNOWN_ERROR);
free((char*) op->user_data);
op->user_data = NULL;
op->interval_ms = 0;
#if ENABLE_ACL
if (user_name && is_privileged(user_name) == FALSE) {
crm_err("%s does not have permission to fail %s", user_name, ID(xml_rsc));
send_direct_ack(from_host, from_sys, NULL, op, ID(xml_rsc));
lrmd_free_event(op);
return;
}
#endif
if (get_lrm_resource(lrm_state, xml_rsc, TRUE, &rsc) == pcmk_ok) {
crm_info("Failing resource %s...", rsc->id);
process_lrm_event(lrm_state, op, NULL);
op->op_status = PCMK_LRM_OP_DONE;
op->rc = PCMK_OCF_OK;
lrmd_free_rsc_info(rsc);
} else {
crm_info("Cannot find/create resource in order to fail it...");
crm_log_xml_warn(xml, "bad input");
}
send_direct_ack(from_host, from_sys, NULL, op, ID(xml_rsc));
lrmd_free_event(op);
}
static void
handle_refresh_op(lrm_state_t *lrm_state, const char *user_name,
const char *from_host, const char *from_sys)
{
int rc = pcmk_ok;
xmlNode *fragment = do_lrm_query_internal(lrm_state, node_update_all);
fsa_cib_update(XML_CIB_TAG_STATUS, fragment, cib_quorum_override, rc, user_name);
crm_info("Forced a local resource history refresh: call=%d", rc);
if (safe_str_neq(CRM_SYSTEM_CRMD, from_sys)) {
xmlNode *reply = create_request(CRM_OP_INVOKE_LRM, fragment, from_host,
from_sys, CRM_SYSTEM_LRMD,
fsa_our_uuid);
crm_debug("ACK'ing refresh from %s (%s)", from_sys, from_host);
if (relay_message(reply, TRUE) == FALSE) {
crm_log_xml_err(reply, "Unable to route reply");
}
free_xml(reply);
}
free_xml(fragment);
}
static void
handle_query_op(xmlNode *msg, lrm_state_t *lrm_state)
{
xmlNode *data = do_lrm_query_internal(lrm_state, node_update_all);
xmlNode *reply = create_reply(msg, data);
if (relay_message(reply, TRUE) == FALSE) {
crm_err("Unable to route reply");
crm_log_xml_err(reply, "reply");
}
free_xml(reply);
free_xml(data);
}
static void
handle_reprobe_op(lrm_state_t *lrm_state, const char *from_sys,
const char *from_host, const char *user_name,
gboolean is_remote_node)
{
crm_notice("Forcing the status of all resources to be redetected");
force_reprobe(lrm_state, from_sys, from_host, user_name, is_remote_node);
if (safe_str_neq(CRM_SYSTEM_PENGINE, from_sys)
&& safe_str_neq(CRM_SYSTEM_TENGINE, from_sys)) {
xmlNode *reply = create_request(CRM_OP_INVOKE_LRM, NULL, from_host,
from_sys, CRM_SYSTEM_LRMD,
fsa_our_uuid);
crm_debug("ACK'ing re-probe from %s (%s)", from_sys, from_host);
if (relay_message(reply, TRUE) == FALSE) {
crm_log_xml_err(reply, "Unable to route reply");
}
free_xml(reply);
}
}
static bool do_lrm_cancel(ha_msg_input_t *input, lrm_state_t *lrm_state,
lrmd_rsc_info_t *rsc, const char *from_host, const char *from_sys)
{
char *op_key = NULL;
char *meta_key = NULL;
int call = 0;
const char *call_id = NULL;
const char *op_task = NULL;
const char *interval_ms_s = NULL;
gboolean in_progress = FALSE;
xmlNode *params = find_xml_node(input->xml, XML_TAG_ATTRS, TRUE);
CRM_CHECK(params != NULL, return FALSE);
meta_key = crm_meta_name(XML_LRM_ATTR_INTERVAL_MS);
interval_ms_s = crm_element_value(params, meta_key);
free(meta_key);
CRM_CHECK(interval_ms_s != NULL, return FALSE);
meta_key = crm_meta_name(XML_LRM_ATTR_TASK);
op_task = crm_element_value(params, meta_key);
free(meta_key);
CRM_CHECK(op_task != NULL, return FALSE);
meta_key = crm_meta_name(XML_LRM_ATTR_CALLID);
call_id = crm_element_value(params, meta_key);
free(meta_key);
op_key = generate_op_key(rsc->id, op_task, crm_parse_ms(interval_ms_s));
crm_debug("Scheduler requested op %s (call=%s) be cancelled",
op_key, (call_id? call_id : "NA"));
call = crm_parse_int(call_id, "0");
if (call == 0) {
// Normal case when the scheduler cancels a recurring op
in_progress = cancel_op_key(lrm_state, rsc, op_key, TRUE);
} else {
// Normal case when the scheduler cancels an orphan op
in_progress = cancel_op(lrm_state, rsc->id, NULL, call, TRUE);
}
// Acknowledge cancellation operation if for a remote connection resource
if (!in_progress || is_remote_lrmd_ra(NULL, NULL, rsc->id)) {
char *op_id = make_stop_id(rsc->id, call);
if (is_remote_lrmd_ra(NULL, NULL, rsc->id) == FALSE) {
crm_info("Nothing known about operation %d for %s", call, op_key);
}
erase_lrm_history_by_id(lrm_state, rsc->id, op_key, NULL, call);
send_task_ok_ack(lrm_state, input, rsc->id, rsc, op_task,
from_host, from_sys);
/* needed at least for cancellation of a remote operation */
g_hash_table_remove(lrm_state->pending_ops, op_id);
free(op_id);
} else {
/* No ack is needed since abcdaa8, but peers with older versions
* in a rolling upgrade need one. We didn't bump the feature set
* at that commit, so we can only compare against the previous
* CRM version (3.0.8). If any peers have feature set 3.0.9 but
* not abcdaa8, they will time out waiting for the ack (no
* released versions of Pacemaker are affected).
*/
const char *peer_version = crm_element_value(params, XML_ATTR_CRM_VERSION);
if (compare_version(peer_version, "3.0.8") <= 0) {
crm_info("Sending compatibility ack for %s cancellation to %s (CRM version %s)",
op_key, from_host, peer_version);
send_task_ok_ack(lrm_state, input, rsc->id, rsc, op_task,
from_host, from_sys);
}
}
free(op_key);
return TRUE;
}
static void
do_lrm_delete(ha_msg_input_t *input, lrm_state_t *lrm_state,
lrmd_rsc_info_t *rsc, const char *from_sys, const char *from_host,
bool crm_rsc_delete, const char *user_name)
{
gboolean unregister = TRUE;
#if ENABLE_ACL
int cib_rc = delete_rsc_status(lrm_state, rsc->id,
cib_dryrun|cib_sync_call, user_name);
if (cib_rc != pcmk_ok) {
lrmd_event_data_t *op = NULL;
crm_err("Could not delete resource status of %s for %s (user %s) on %s: %s"
CRM_XS " rc=%d",
rsc->id, from_sys, (user_name? user_name : "unknown"),
from_host, pcmk_strerror(cib_rc), cib_rc);
op = construct_op(lrm_state, input->xml, rsc->id, CRMD_ACTION_DELETE);
op->op_status = PCMK_LRM_OP_ERROR;
if (cib_rc == -EACCES) {
op->rc = PCMK_OCF_INSUFFICIENT_PRIV;
} else {
op->rc = PCMK_OCF_UNKNOWN_ERROR;
}
send_direct_ack(from_host, from_sys, NULL, op, rsc->id);
lrmd_free_event(op);
return;
}
#endif
if (crm_rsc_delete && is_remote_lrmd_ra(NULL, NULL, rsc->id)) {
unregister = FALSE;
}
delete_resource(lrm_state, rsc->id, rsc, NULL, from_sys, from_host,
user_name, input, unregister);
}
/* A_LRM_INVOKE */
void
do_lrm_invoke(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state,
enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
lrm_state_t *lrm_state = NULL;
const char *crm_op = NULL;
const char *from_sys = NULL;
const char *from_host = NULL;
const char *operation = NULL;
ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg);
const char *user_name = NULL;
const char *target_node = NULL;
gboolean is_remote_node = FALSE;
bool crm_rsc_delete = FALSE;
target_node = lrm_op_target(input->xml);
is_remote_node = safe_str_neq(target_node, fsa_our_uname);
lrm_state = lrm_state_find(target_node);
if ((lrm_state == NULL) && is_remote_node) {
crm_err("Failing action because local node has never had connection to remote node %s",
target_node);
synthesize_lrmd_failure(NULL, input->xml, PCMK_OCF_CONNECTION_DIED);
return;
}
CRM_ASSERT(lrm_state != NULL);
#if ENABLE_ACL
user_name = crm_acl_get_set_user(input->msg, F_CRM_USER, NULL);
crm_trace("Executor command from user '%s'", user_name);
#endif
crm_op = crm_element_value(input->msg, F_CRM_TASK);
from_sys = crm_element_value(input->msg, F_CRM_SYS_FROM);
if (safe_str_neq(from_sys, CRM_SYSTEM_TENGINE)) {
from_host = crm_element_value(input->msg, F_CRM_HOST_FROM);
}
crm_trace("Executor %s command from %s", crm_op, from_sys);
if (safe_str_eq(crm_op, CRM_OP_LRM_DELETE)) {
crm_rsc_delete = TRUE; // Only crm_resource uses this op
operation = CRMD_ACTION_DELETE;
} else if (safe_str_eq(crm_op, CRM_OP_LRM_FAIL)) {
fail_lrm_resource(input->xml, lrm_state, user_name, from_host,
from_sys);
return;
} else if (input->xml != NULL) {
operation = crm_element_value(input->xml, XML_LRM_ATTR_TASK);
}
if (safe_str_eq(crm_op, CRM_OP_LRM_REFRESH)) {
handle_refresh_op(lrm_state, user_name, from_host, from_sys);
} else if (safe_str_eq(crm_op, CRM_OP_LRM_QUERY)) {
handle_query_op(input->msg, lrm_state);
} else if (safe_str_eq(operation, CRM_OP_PROBED)) {
update_attrd(lrm_state->node_name, CRM_OP_PROBED, XML_BOOLEAN_TRUE,
user_name, is_remote_node);
} else if (safe_str_eq(operation, CRM_OP_REPROBE)
|| safe_str_eq(crm_op, CRM_OP_REPROBE)) {
handle_reprobe_op(lrm_state, from_sys, from_host, user_name,
is_remote_node);
} else if (operation != NULL) {
lrmd_rsc_info_t *rsc = NULL;
xmlNode *xml_rsc = find_xml_node(input->xml, XML_CIB_TAG_RESOURCE, TRUE);
gboolean create_rsc = safe_str_neq(operation, CRMD_ACTION_DELETE);
int rc;
// We can't return anything meaningful without a resource ID
CRM_CHECK(xml_rsc && ID(xml_rsc), return);
rc = get_lrm_resource(lrm_state, xml_rsc, create_rsc, &rsc);
if (rc == -ENOTCONN) {
synthesize_lrmd_failure(lrm_state, input->xml,
PCMK_OCF_CONNECTION_DIED);
return;
} else if ((rc < 0) && !create_rsc) {
/* Delete of malformed or nonexistent resource
* (deleting something that does not exist is a success)
*/
crm_notice("Not registering resource '%s' for a %s event "
CRM_XS " get-rc=%d (%s) transition-key=%s",
ID(xml_rsc), operation,
rc, pcmk_strerror(rc), ID(input->xml));
delete_rsc_entry(lrm_state, input, ID(xml_rsc), NULL, pcmk_ok,
user_name);
send_task_ok_ack(lrm_state, input, ID(xml_rsc), NULL, operation,
from_host, from_sys);
return;
} else if (rc == -EINVAL) {
// Resource operation on malformed resource
crm_err("Invalid resource definition for %s", ID(xml_rsc));
crm_log_xml_warn(input->msg, "invalid resource");
synthesize_lrmd_failure(lrm_state, input->xml,
PCMK_OCF_NOT_CONFIGURED); // fatal error
return;
} else if (rc < 0) {
// Error communicating with the executor
crm_err("Could not register resource '%s' with executor: %s "
CRM_XS " rc=%d",
ID(xml_rsc), pcmk_strerror(rc), rc);
crm_log_xml_warn(input->msg, "failed registration");
synthesize_lrmd_failure(lrm_state, input->xml,
PCMK_OCF_INVALID_PARAM); // hard error
return;
}
if (safe_str_eq(operation, CRMD_ACTION_CANCEL)) {
if (!do_lrm_cancel(input, lrm_state, rsc, from_host, from_sys)) {
crm_log_xml_warn(input->xml, "Bad command");
}
} else if (safe_str_eq(operation, CRMD_ACTION_DELETE)) {
do_lrm_delete(input, lrm_state, rsc, from_sys, from_host,
crm_rsc_delete, user_name);
} else {
do_lrm_rsc_op(lrm_state, rsc, operation, input->xml, input->msg);
}
lrmd_free_rsc_info(rsc);
} else {
crm_err("Cannot perform operation %s of unknown type", crm_str(crm_op));
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
}
}
static lrmd_event_data_t *
construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op, const char *rsc_id, const char *operation)
{
lrmd_event_data_t *op = NULL;
const char *op_delay = NULL;
const char *op_timeout = NULL;
const char *interval_ms_s = NULL;
GHashTable *params = NULL;
const char *transition = NULL;
CRM_ASSERT(rsc_id && operation);
op = calloc(1, sizeof(lrmd_event_data_t));
CRM_ASSERT(op != NULL);
op->type = lrmd_event_exec_complete;
op->op_type = strdup(operation);
op->op_status = PCMK_LRM_OP_PENDING;
op->rc = -1;
op->rsc_id = strdup(rsc_id);
op->interval_ms = 0;
op->timeout = 0;
op->start_delay = 0;
if (rsc_op == NULL) {
CRM_LOG_ASSERT(safe_str_eq(CRMD_ACTION_STOP, operation));
op->user_data = NULL;
/* the stop_all_resources() case: by definition there is no DC
 * (or they'd be shutting us down), so we should put our own
 * version here
 */
op->params = crm_str_table_new();
g_hash_table_insert(op->params, strdup(XML_ATTR_CRM_VERSION), strdup(CRM_FEATURE_SET));
crm_trace("Constructed %s op for %s", operation, rsc_id);
return op;
}
params = xml2list(rsc_op);
g_hash_table_remove(params, CRM_META "_op_target_rc");
op_delay = crm_meta_value(params, XML_OP_ATTR_START_DELAY);
op_timeout = crm_meta_value(params, XML_ATTR_TIMEOUT);
interval_ms_s = crm_meta_value(params, XML_LRM_ATTR_INTERVAL_MS);
op->interval_ms = crm_parse_ms(interval_ms_s);
op->timeout = crm_parse_int(op_timeout, "0");
op->start_delay = crm_parse_int(op_delay, "0");
#if ENABLE_VERSIONED_ATTRS
// Resolve any versioned parameters
if (lrm_state && safe_str_neq(op->op_type, RSC_METADATA)
&& safe_str_neq(op->op_type, CRMD_ACTION_DELETE)
&& !is_remote_lrmd_ra(NULL, NULL, rsc_id)) {
// Resource info *should* already be cached, so we don't get executor call
lrmd_rsc_info_t *rsc = lrm_state_get_rsc_info(lrm_state, rsc_id, 0);
struct ra_metadata_s *metadata;
metadata = metadata_cache_get(lrm_state->metadata_cache, rsc);
if (metadata) {
xmlNode *versioned_attrs = NULL;
GHashTable *hash = NULL;
char *key = NULL;
char *value = NULL;
GHashTableIter iter;
versioned_attrs = first_named_child(rsc_op, XML_TAG_OP_VER_ATTRS);
hash = pe_unpack_versioned_parameters(versioned_attrs, metadata->ra_version);
g_hash_table_iter_init(&iter, hash);
while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
g_hash_table_iter_steal(&iter);
g_hash_table_replace(params, key, value);
}
g_hash_table_destroy(hash);
versioned_attrs = first_named_child(rsc_op, XML_TAG_OP_VER_META);
hash = pe_unpack_versioned_parameters(versioned_attrs, metadata->ra_version);
g_hash_table_iter_init(&iter, hash);
while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
g_hash_table_replace(params, crm_meta_name(key), strdup(value));
if (safe_str_eq(key, XML_ATTR_TIMEOUT)) {
op->timeout = crm_parse_int(value, "0");
} else if (safe_str_eq(key, XML_OP_ATTR_START_DELAY)) {
op->start_delay = crm_parse_int(value, "0");
}
}
g_hash_table_destroy(hash);
versioned_attrs = first_named_child(rsc_op, XML_TAG_RSC_VER_ATTRS);
hash = pe_unpack_versioned_parameters(versioned_attrs, metadata->ra_version);
g_hash_table_iter_init(&iter, hash);
while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
g_hash_table_iter_steal(&iter);
g_hash_table_replace(params, key, value);
}
g_hash_table_destroy(hash);
}
lrmd_free_rsc_info(rsc);
}
#endif
if (safe_str_neq(operation, RSC_STOP)) {
op->params = params;
} else {
rsc_history_t *entry = NULL;
if (lrm_state) {
entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id);
}
/* If we do not have stop parameters cached, use
* whatever we are given */
if (!entry || !entry->stop_params) {
op->params = params;
} else {
/* Copy the cached parameter list so that we stop the resource
* with the old attributes, not the new ones */
op->params = crm_str_table_new();
g_hash_table_foreach(params, copy_meta_keys, op->params);
g_hash_table_foreach(entry->stop_params, copy_instance_keys, op->params);
g_hash_table_destroy(params);
params = NULL;
}
}
/* sanity */
if (op->timeout <= 0) {
op->timeout = op->interval_ms;
}
if (op->start_delay < 0) {
op->start_delay = 0;
}
transition = crm_element_value(rsc_op, XML_ATTR_TRANSITION_KEY);
CRM_CHECK(transition != NULL, return op);
op->user_data = strdup(transition);
if (op->interval_ms != 0) {
if (safe_str_eq(operation, CRMD_ACTION_START)
|| safe_str_eq(operation, CRMD_ACTION_STOP)) {
crm_err("Start and Stop actions cannot have an interval: %u",
op->interval_ms);
op->interval_ms = 0;
}
}
crm_trace("Constructed %s op for %s: interval=%u",
operation, rsc_id, op->interval_ms);
return op;
}
void
send_direct_ack(const char *to_host, const char *to_sys,
lrmd_rsc_info_t * rsc, lrmd_event_data_t * op, const char *rsc_id)
{
xmlNode *reply = NULL;
xmlNode *update, *iter;
crm_node_t *peer = NULL;
CRM_CHECK(op != NULL, return);
if (op->rsc_id == NULL) {
CRM_ASSERT(rsc_id != NULL);
op->rsc_id = strdup(rsc_id);
}
if (to_sys == NULL) {
to_sys = CRM_SYSTEM_TENGINE;
}
peer = crm_get_peer(0, fsa_our_uname);
update = create_node_state_update(peer, node_update_none, NULL,
__FUNCTION__);
iter = create_xml_node(update, XML_CIB_TAG_LRM);
crm_xml_add(iter, XML_ATTR_ID, fsa_our_uuid);
iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES);
iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE);
crm_xml_add(iter, XML_ATTR_ID, op->rsc_id);
build_operation_update(iter, rsc, op, fsa_our_uname, __FUNCTION__);
reply = create_request(CRM_OP_INVOKE_LRM, update, to_host, to_sys, CRM_SYSTEM_LRMD, NULL);
crm_log_xml_trace(update, "ACK Update");
crm_debug("ACK'ing resource op " CRM_OP_FMT " from %s: %s",
op->rsc_id, op->op_type, op->interval_ms, op->user_data,
crm_element_value(reply, XML_ATTR_REFERENCE));
if (relay_message(reply, TRUE) == FALSE) {
crm_log_xml_err(reply, "Unable to route reply");
}
free_xml(update);
free_xml(reply);
}
gboolean
verify_stopped(enum crmd_fsa_state cur_state, int log_level)
{
gboolean res = TRUE;
GList *lrm_state_list = lrm_state_get_list();
GList *state_entry;
for (state_entry = lrm_state_list; state_entry != NULL; state_entry = state_entry->next) {
lrm_state_t *lrm_state = state_entry->data;
if (!lrm_state_verify_stopped(lrm_state, cur_state, log_level)) {
/* keep iterating through all even when false is returned */
res = FALSE;
}
}
set_bit(fsa_input_register, R_SENT_RSC_STOP);
g_list_free(lrm_state_list); lrm_state_list = NULL;
return res;
}
struct stop_recurring_action_s {
lrmd_rsc_info_t *rsc;
lrm_state_t *lrm_state;
};
static gboolean
stop_recurring_action_by_rsc(gpointer key, gpointer value, gpointer user_data)
{
gboolean remove = FALSE;
struct stop_recurring_action_s *event = user_data;
struct recurring_op_s *op = (struct recurring_op_s *)value;
if ((op->interval_ms != 0)
&& crm_str_eq(op->rsc_id, event->rsc->id, TRUE)) {
crm_debug("Cancelling op %d for %s (%s)", op->call_id, op->rsc_id, (char*)key);
remove = !cancel_op(event->lrm_state, event->rsc->id, key, op->call_id, FALSE);
}
return remove;
}
static gboolean
stop_recurring_actions(gpointer key, gpointer value, gpointer user_data)
{
gboolean remove = FALSE;
lrm_state_t *lrm_state = user_data;
struct recurring_op_s *op = (struct recurring_op_s *)value;
if (op->interval_ms != 0) {
crm_info("Cancelling op %d for %s (%s)", op->call_id, op->rsc_id,
(const char *) key);
remove = !cancel_op(lrm_state, op->rsc_id, key, op->call_id, FALSE);
}
return remove;
}
static void
record_pending_op(const char *node_name, lrmd_rsc_info_t *rsc, lrmd_event_data_t *op)
{
const char *record_pending = NULL;
CRM_CHECK(node_name != NULL, return);
CRM_CHECK(rsc != NULL, return);
CRM_CHECK(op != NULL, return);
// Never record certain operation types as pending
if ((op->op_type == NULL) || (op->params == NULL)
|| !controld_action_is_recordable(op->op_type)) {
return;
}
// defaults to true
record_pending = crm_meta_value(op->params, XML_OP_ATTR_PENDING);
if (record_pending && !crm_is_true(record_pending)) {
return;
}
op->call_id = -1;
op->op_status = PCMK_LRM_OP_PENDING;
op->rc = PCMK_OCF_UNKNOWN;
op->t_run = time(NULL);
op->t_rcchange = op->t_run;
/* write a "pending" entry to the CIB, inhibit notification */
crm_debug("Recording pending op " CRM_OP_FMT " on %s in the CIB",
op->rsc_id, op->op_type, op->interval_ms, node_name);
do_update_resource(node_name, rsc, op);
}
static void
do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operation, xmlNode * msg,
xmlNode * request)
{
int call_id = 0;
char *op_id = NULL;
lrmd_event_data_t *op = NULL;
lrmd_key_value_t *params = NULL;
fsa_data_t *msg_data = NULL;
const char *transition = NULL;
gboolean stop_recurring = FALSE;
bool send_nack = FALSE;
CRM_CHECK(rsc != NULL, return);
CRM_CHECK(operation != NULL, return);
if (msg != NULL) {
transition = crm_element_value(msg, XML_ATTR_TRANSITION_KEY);
if (transition == NULL) {
crm_log_xml_err(msg, "Missing transition number");
}
}
op = construct_op(lrm_state, msg, rsc->id, operation);
CRM_CHECK(op != NULL, return);
if (is_remote_lrmd_ra(NULL, NULL, rsc->id)
&& (op->interval_ms == 0)
&& strcmp(operation, CRMD_ACTION_MIGRATE) == 0) {
/* pcmk remote connections are a special use case.
* We never ever want to stop monitoring a connection resource until
* the entire migration has completed. If the connection is unexpectedly
* severed, even during a migration, this is an event we must detect. */
stop_recurring = FALSE;
} else if ((op->interval_ms == 0)
&& strcmp(operation, CRMD_ACTION_STATUS) != 0
&& strcmp(operation, CRMD_ACTION_NOTIFY) != 0) {
/* stop any previous monitor operations before changing the resource state */
stop_recurring = TRUE;
}
if (stop_recurring == TRUE) {
guint removed = 0;
struct stop_recurring_action_s data;
data.rsc = rsc;
data.lrm_state = lrm_state;
removed = g_hash_table_foreach_remove(
lrm_state->pending_ops, stop_recurring_action_by_rsc, &data);
if (removed) {
crm_debug("Stopped %u recurring operation%s in preparation for " CRM_OP_FMT,
removed, s_if_plural(removed),
rsc->id, operation, op->interval_ms);
}
}
/* now do the op */
crm_info("Performing key=%s op=" CRM_OP_FMT,
transition, rsc->id, operation, op->interval_ms);
if (is_set(fsa_input_register, R_SHUTDOWN) && safe_str_eq(operation, RSC_START)) {
register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL);
send_nack = TRUE;
} else if (fsa_state != S_NOT_DC
&& fsa_state != S_POLICY_ENGINE /* Recalculating */
&& fsa_state != S_TRANSITION_ENGINE
&& safe_str_neq(operation, CRMD_ACTION_STOP)) {
send_nack = TRUE;
}
if(send_nack) {
crm_notice("Discarding attempt to perform action %s on %s in state %s (shutdown=%s)",
operation, rsc->id, fsa_state2string(fsa_state),
is_set(fsa_input_register, R_SHUTDOWN)?"true":"false");
op->rc = CRM_DIRECT_NACK_RC;
op->op_status = PCMK_LRM_OP_ERROR;
send_direct_ack(NULL, NULL, rsc, op, rsc->id);
lrmd_free_event(op);
free(op_id);
return;
}
record_pending_op(lrm_state->node_name, rsc, op);
op_id = generate_op_key(rsc->id, op->op_type, op->interval_ms);
if (op->interval_ms > 0) {
/* cancel it so we can then restart it without conflict */
cancel_op_key(lrm_state, rsc, op_id, FALSE);
}
if (op->params) {
char *key = NULL;
char *value = NULL;
GHashTableIter iter;
g_hash_table_iter_init(&iter, op->params);
while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
params = lrmd_key_value_add(params, key, value);
}
}
call_id = lrm_state_exec(lrm_state, rsc->id, op->op_type, op->user_data,
op->interval_ms, op->timeout, op->start_delay,
params);
if (call_id <= 0 && lrm_state_is_local(lrm_state)) {
crm_err("Operation %s on %s failed: %d", operation, rsc->id, call_id);
register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
} else if (call_id <= 0) {
crm_err("Operation %s on resource %s failed to execute on remote node %s: %d",
operation, rsc->id, lrm_state->node_name, call_id);
fake_op_status(lrm_state, op, PCMK_LRM_OP_DONE, PCMK_OCF_UNKNOWN_ERROR);
process_lrm_event(lrm_state, op, NULL);
} else {
/* record all operations so we can wait
* for them to complete during shutdown
*/
char *call_id_s = make_stop_id(rsc->id, call_id);
struct recurring_op_s *pending = NULL;
pending = calloc(1, sizeof(struct recurring_op_s));
crm_trace("Recording pending op: %d - %s %s", call_id, op_id, call_id_s);
pending->call_id = call_id;
pending->interval_ms = op->interval_ms;
pending->op_type = strdup(operation);
pending->op_key = strdup(op_id);
pending->rsc_id = strdup(rsc->id);
pending->start_time = time(NULL);
- pending->user_data = strdup(op->user_data);
+ pending->user_data = op->user_data? strdup(op->user_data) : NULL;
g_hash_table_replace(lrm_state->pending_ops, call_id_s, pending);
if ((op->interval_ms > 0)
&& (op->start_delay > START_DELAY_THRESHOLD)) {
char *uuid = NULL;
int dummy = 0, target_rc = 0;
crm_info("Faking confirmation of %s: execution postponed for over 5 minutes", op_id);
decode_transition_key(op->user_data, &uuid, &dummy, &dummy, &target_rc);
free(uuid);
op->rc = target_rc;
op->op_status = PCMK_LRM_OP_DONE;
send_direct_ack(NULL, NULL, rsc, op, rsc->id);
}
pending->params = op->params;
op->params = NULL;
}
free(op_id);
lrmd_free_event(op);
return;
}
int last_resource_update = 0;
static void
cib_rsc_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
{
switch (rc) {
case pcmk_ok:
case -pcmk_err_diff_failed:
case -pcmk_err_diff_resync:
crm_trace("Resource update %d complete: rc=%d", call_id, rc);
break;
default:
crm_warn("Resource update %d failed: (rc=%d) %s", call_id, rc, pcmk_strerror(rc));
}
if (call_id == last_resource_update) {
last_resource_update = 0;
trigger_fsa(fsa_source);
}
}
static int
do_update_resource(const char *node_name, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op)
{
/*
<status>
<node_state id=uname>
<lrm>
<lrm_resources>
<lrm_resource id=...>
</...>
*/
int rc = pcmk_ok;
xmlNode *update, *iter = NULL;
int call_opt = crmd_cib_smart_opt();
const char *uuid = NULL;
CRM_CHECK(op != NULL, return 0);
iter = create_xml_node(iter, XML_CIB_TAG_STATUS);
update = iter;
iter = create_xml_node(iter, XML_CIB_TAG_STATE);
if (safe_str_eq(node_name, fsa_our_uname)) {
uuid = fsa_our_uuid;
} else {
/* remote nodes uuid and uname are equal */
uuid = node_name;
crm_xml_add(iter, XML_NODE_IS_REMOTE, "true");
}
CRM_LOG_ASSERT(uuid != NULL);
if(uuid == NULL) {
rc = -EINVAL;
goto done;
}
crm_xml_add(iter, XML_ATTR_UUID, uuid);
crm_xml_add(iter, XML_ATTR_UNAME, node_name);
crm_xml_add(iter, XML_ATTR_ORIGIN, __FUNCTION__);
iter = create_xml_node(iter, XML_CIB_TAG_LRM);
crm_xml_add(iter, XML_ATTR_ID, uuid);
iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES);
iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE);
crm_xml_add(iter, XML_ATTR_ID, op->rsc_id);
build_operation_update(iter, rsc, op, node_name, __FUNCTION__);
if (rsc) {
const char *container = NULL;
crm_xml_add(iter, XML_ATTR_TYPE, rsc->type);
crm_xml_add(iter, XML_AGENT_ATTR_CLASS, rsc->standard);
crm_xml_add(iter, XML_AGENT_ATTR_PROVIDER, rsc->provider);
if (op->params) {
container = g_hash_table_lookup(op->params, CRM_META"_"XML_RSC_ATTR_CONTAINER);
}
if (container) {
crm_trace("Resource %s is a part of container resource %s", op->rsc_id, container);
crm_xml_add(iter, XML_RSC_ATTR_CONTAINER, container);
}
} else {
crm_warn("Resource %s no longer exists in the executor", op->rsc_id);
send_direct_ack(NULL, NULL, rsc, op, op->rsc_id);
goto cleanup;
}
crm_log_xml_trace(update, __FUNCTION__);
/* make it an asynchronous call and be done with it
*
* Best case:
* the resource state will be discovered during
* the next signup or election.
*
* Bad case:
* we are shutting down and there is no DC at the time,
* but then why were we shutting down then anyway?
* (probably because of an internal error)
*
* Worst case:
* we get shot for having resources "running" that really weren't
*
* the alternative however means blocking here for too long, which
* isn't acceptable
*/
fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, rc, NULL);
if (rc > 0) {
last_resource_update = rc;
}
done:
/* the return code is a call number, not an error code */
crm_trace("Sent resource state update message: %d for %s=%u on %s",
rc, op->op_type, op->interval_ms, op->rsc_id);
fsa_register_cib_callback(rc, FALSE, NULL, cib_rsc_callback);
cleanup:
free_xml(update);
return rc;
}
void
do_lrm_event(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t * msg_data)
{
CRM_CHECK(FALSE, return);
}
static char *
unescape_newlines(const char *string)
{
char *pch = NULL;
char *ret = NULL;
static const char *escaped_newline = "\\n";
if (!string) {
return NULL;
}
ret = strdup(string);
pch = strstr(ret, escaped_newline);
while (pch != NULL) {
/* 2 chars for 2 chars, null-termination irrelevant */
memcpy(pch, "\n ", 2 * sizeof(char));
pch = strstr(pch, escaped_newline);
}
return ret;
}
gboolean
process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurring_op_s *pending)
{
char *op_id = NULL;
char *op_key = NULL;
int update_id = 0;
gboolean remove = FALSE;
gboolean removed = FALSE;
lrmd_rsc_info_t *rsc = NULL;
CRM_CHECK(op != NULL, return FALSE);
CRM_CHECK(op->rsc_id != NULL, return FALSE);
op_id = make_stop_id(op->rsc_id, op->call_id);
op_key = generate_op_key(op->rsc_id, op->op_type, op->interval_ms);
rsc = lrm_state_get_rsc_info(lrm_state, op->rsc_id, 0);
if(pending == NULL) {
remove = TRUE;
pending = g_hash_table_lookup(lrm_state->pending_ops, op_id);
}
if (op->op_status == PCMK_LRM_OP_ERROR) {
switch(op->rc) {
case PCMK_OCF_NOT_RUNNING:
case PCMK_OCF_RUNNING_MASTER:
case PCMK_OCF_DEGRADED:
case PCMK_OCF_DEGRADED_MASTER:
// Leave it to the TE/scheduler to decide if this is an error
op->op_status = PCMK_LRM_OP_DONE;
break;
default:
/* Nothing to do */
break;
}
}
if (op->op_status != PCMK_LRM_OP_CANCELLED) {
if (controld_action_is_recordable(op->op_type)) {
update_id = do_update_resource(lrm_state->node_name, rsc, op);
} else {
send_direct_ack(NULL, NULL, NULL, op, op->rsc_id);
}
} else if (op->interval_ms == 0) {
/* This will occur when "crm resource cleanup" is called while actions are in-flight */
crm_err("Op %s (call=%d): Cancelled", op_key, op->call_id);
send_direct_ack(NULL, NULL, NULL, op, op->rsc_id);
} else if (pending == NULL) {
/* We don't need to do anything for cancelled ops
* that are not in our pending op list. There are no
* transition actions waiting on these operations. */
} else if (op->user_data == NULL) {
/* At this point we have a pending entry, but no transition
* key present in the user_data field. Report this. */
crm_err("Op %s (call=%d): No user data", op_key, op->call_id);
} else if (pending->remove) {
/* The tengine canceled this op, we have been waiting for the cancel to finish. */
erase_lrm_history_by_op(lrm_state, op);
} else if (op->rsc_deleted) {
/* The tengine initiated this op, but it was cancelled outside of the
* tengine's control during a resource cleanup/re-probe request. The tengine
* must be alerted that this operation completed, otherwise the tengine
* will continue waiting for this update to occur until it is timed out.
* We don't want this update going to the cib though, so use a direct ack. */
crm_trace("Op %s (call=%d): cancelled due to rsc deletion", op_key, op->call_id);
send_direct_ack(NULL, NULL, NULL, op, op->rsc_id);
} else {
/* Before a stop is called, no need to direct ack */
crm_trace("Op %s (call=%d): no delete event required", op_key, op->call_id);
}
if(remove == FALSE) {
/* The caller will do this afterwards, but keep the logging consistent */
removed = TRUE;
} else if ((op->interval_ms == 0)
&& g_hash_table_remove(lrm_state->pending_ops, op_id)) {
removed = TRUE;
crm_trace("Op %s (call=%d, stop-id=%s, remaining=%u): Confirmed",
op_key, op->call_id, op_id, g_hash_table_size(lrm_state->pending_ops));
} else if ((op->interval_ms != 0)
&& (op->op_status == PCMK_LRM_OP_CANCELLED)) {
removed = TRUE;
g_hash_table_remove(lrm_state->pending_ops, op_id);
}
switch (op->op_status) {
case PCMK_LRM_OP_CANCELLED:
crm_info("Result of %s operation for %s on %s: %s "
CRM_XS " call=%d key=%s confirmed=%s",
crm_action_str(op->op_type, op->interval_ms),
op->rsc_id, lrm_state->node_name,
services_lrm_status_str(op->op_status),
op->call_id, op_key, (removed? "true" : "false"));
break;
case PCMK_LRM_OP_DONE:
do_crm_log((op->interval_ms? LOG_INFO : LOG_NOTICE),
"Result of %s operation for %s on %s: %d (%s) "
CRM_XS " call=%d key=%s confirmed=%s cib-update=%d",
crm_action_str(op->op_type, op->interval_ms),
op->rsc_id, lrm_state->node_name,
op->rc, services_ocf_exitcode_str(op->rc),
op->call_id, op_key, (removed? "true" : "false"),
update_id);
break;
case PCMK_LRM_OP_TIMEOUT:
crm_err("Result of %s operation for %s on %s: %s "
CRM_XS " call=%d key=%s timeout=%dms",
crm_action_str(op->op_type, op->interval_ms),
op->rsc_id, lrm_state->node_name,
services_lrm_status_str(op->op_status),
op->call_id, op_key, op->timeout);
break;
default:
crm_err("Result of %s operation for %s on %s: %s "
CRM_XS " call=%d key=%s confirmed=%s status=%d cib-update=%d",
crm_action_str(op->op_type, op->interval_ms),
op->rsc_id, lrm_state->node_name,
services_lrm_status_str(op->op_status), op->call_id, op_key,
(removed? "true" : "false"), op->op_status, update_id);
}
if (op->output) {
char *prefix =
crm_strdup_printf("%s-" CRM_OP_FMT ":%d", lrm_state->node_name,
op->rsc_id, op->op_type, op->interval_ms,
op->call_id);
if (op->rc) {
crm_log_output(LOG_NOTICE, prefix, op->output);
} else {
crm_log_output(LOG_DEBUG, prefix, op->output);
}
free(prefix);
}
if (safe_str_neq(op->op_type, RSC_METADATA)) {
crmd_alert_resource_op(lrm_state->node_name, op);
} else if (op->rc == PCMK_OCF_OK) {
char *metadata = unescape_newlines(op->output);
metadata_cache_update(lrm_state->metadata_cache, rsc, metadata);
free(metadata);
}
if (op->rsc_deleted) {
crm_info("Deletion of resource '%s' complete after %s", op->rsc_id, op_key);
delete_rsc_entry(lrm_state, NULL, op->rsc_id, NULL, pcmk_ok, NULL);
}
/* If a shutdown was escalated while operations were pending,
* then the FSA will be stalled right now... allow it to continue
*/
mainloop_set_trigger(fsa_source);
update_history_cache(lrm_state, rsc, op);
lrmd_free_rsc_info(rsc);
free(op_key);
free(op_id);
return TRUE;
}
diff --git a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt
index 096e9c10e3..345ccaa042 100644
--- a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt
+++ b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt
@@ -1,1450 +1,1454 @@
= Advanced Resource Types =
[[group-resources]]
== Groups - A Syntactic Shortcut ==
indexterm:[Group Resources]
indexterm:[Resource,Groups]
One of the most common elements of a cluster is a set of resources
that need to be located together, start sequentially, and stop in the
reverse order. To simplify this configuration, we support the concept
of groups.
.A group of two primitive resources
======
[source,XML]
-------
<group id="shortcut">
<primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat">
<instance_attributes id="params-public-ip">
<nvpair id="public-ip-addr" name="ip" value="192.0.2.2"/>
</instance_attributes>
</primitive>
<primitive id="Email" class="lsb" type="exim"/>
</group>
-------
======
Although the example above contains only two resources, there is no
limit to the number of resources a group can contain. The example is
also sufficient to explain the fundamental properties of a group:
* Resources are started in the order in which they appear (+Public-IP+
first, then +Email+)
* Resources are stopped in the reverse of the order in which they appear
(+Email+ first, then +Public-IP+)
If a resource in the group can't run anywhere, then nothing after it
is allowed to run, either.
* If +Public-IP+ can't run anywhere, neither can +Email+;
* but if +Email+ can't run anywhere, this does not affect +Public-IP+
in any way
The group above is logically equivalent to writing:
.How the cluster sees a group resource
======
[source,XML]
-------
<configuration>
<resources>
<primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat">
<instance_attributes id="params-public-ip">
<nvpair id="public-ip-addr" name="ip" value="192.0.2.2"/>
</instance_attributes>
</primitive>
<primitive id="Email" class="lsb" type="exim"/>
</resources>
<constraints>
<rsc_colocation id="xxx" rsc="Email" with-rsc="Public-IP" score="INFINITY"/>
<rsc_order id="yyy" first="Public-IP" then="Email"/>
</constraints>
</configuration>
-------
======
Obviously, as the group grows bigger, the reduction in configuration
effort can become significant.
Another (typical) example of a group is a DRBD volume, the filesystem
mount, an IP address, and an application that uses them.
=== Group Properties ===
.Properties of a Group Resource
[width="95%",cols="3m,5<",options="header",align="center"]
|=========================================================
|Field
|Description
|id
|A unique name for the group
indexterm:[id,Group Resource Property]
indexterm:[Resource,Group Property,id]
|=========================================================
=== Group Options ===
Groups inherit the +priority+, +target-role+, and +is-managed+ properties
from primitive resources. See <<s-resource-options>> for information about
those properties.
=== Group Instance Attributes ===
Groups have no instance attributes. However, any that are set for the group
object will be inherited by the group's children.
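For illustration only, a minimal sketch (with hypothetical ids, and a
hypothetical agent parameter) of an instance attribute set on the group
itself; each child then behaves as if the attribute had been set on it
directly:
.Instance attributes set on a group (inherited by its children)
======
[source,XML]
-------
<group id="shortcut">
<instance_attributes id="shortcut-attrs">
<!-- "example-option" is a hypothetical agent parameter -->
<nvpair id="shortcut-attrs-option" name="example-option" value="example-value"/>
</instance_attributes>
<primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat"/>
<primitive id="Email" class="lsb" type="exim"/>
</group>
-------
======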
=== Group Contents ===
Groups may only contain a collection of cluster resources (see
<<primitive-resource>>). To refer to a child of a group resource, just use
the child's +id+ instead of the group's.
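As a quick sketch reusing the hypothetical +shortcut+ group from above,
a constraint may refer to the child +Email+ directly by its own +id+:
.Referring to a group child by its own id
======
[source,XML]
-------
<constraints>
<!-- Email is a child of the hypothetical "shortcut" group -->
<rsc_location id="email-prefers-node2" rsc="Email" node="node2" score="200"/>
</constraints>
-------
======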
=== Group Constraints ===
Although it is possible to reference a group's children in
constraints, it is usually preferable to reference the group itself.
.Some constraints involving groups
======
[source,XML]
-------
<constraints>
<rsc_location id="group-prefers-node1" rsc="shortcut" node="node1" score="500"/>
<rsc_colocation id="webserver-with-group" rsc="Webserver" with-rsc="shortcut"/>
<rsc_order id="start-group-then-webserver" first="Webserver" then="shortcut"/>
</constraints>
-------
======
=== Group Stickiness ===
indexterm:[resource-stickiness,Groups]
Stickiness, the measure of how much a resource wants to stay where it
is, is additive in groups. Every active resource of the group will
contribute its stickiness value to the group's total. So if the
default +resource-stickiness+ is 100, and a group has seven members,
five of which are active, then the group as a whole will prefer its
current location with a score of 500.
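As a minimal sketch (hypothetical ids), an explicit +resource-stickiness+
can be set in the group's meta-attributes, which the members then inherit:
.Setting stickiness via a group's meta-attributes
======
[source,XML]
-------
<group id="shortcut">
<meta_attributes id="shortcut-meta">
<!-- inherited by each member, so each active member contributes 100 -->
<nvpair id="shortcut-stickiness" name="resource-stickiness" value="100"/>
</meta_attributes>
<primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat"/>
<primitive id="Email" class="lsb" type="exim"/>
</group>
-------
======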
[[s-resource-clone]]
== Clones - Resources That Can Have Multiple Active Instances ==
indexterm:[Clone Resources]
indexterm:[Resource,Clones]
'Clone' resources are resources that can have more than one copy active at the
same time. This allows you, for example, to run a copy of a daemon on every
node. You can clone any primitive or group resource.
footnote:[
Of course, the service must support running multiple instances.
]
=== Anonymous versus Unique Clones ===
A clone resource is configured to be either 'anonymous' or 'globally unique'.
Anonymous clones are the simplest. These behave completely identically
everywhere they are running. Because of this, there can be only one instance of
an anonymous clone active per node.
The instances of globally unique clones are distinct entities. All instances
are launched identically, but one instance of the clone is not identical to any
other instance, whether running on the same node or a different node. As an
example, a cloned IP address can use special kernel functionality such that
each instance handles a subset of requests for the same IP address.
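As a hedged sketch of the cloned IP address case just described
(hypothetical ids; assumes the +ocf:heartbeat:IPaddr2+ agent):
.A globally unique clone
======
[source,XML]
-------
<clone id="ip-clone">
<meta_attributes id="ip-clone-meta">
<!-- each instance is a distinct entity handling a subset of requests -->
<nvpair id="ip-clone-unique" name="globally-unique" value="true"/>
<nvpair id="ip-clone-node-max" name="clone-node-max" value="2"/>
</meta_attributes>
<primitive id="cluster-ip" class="ocf" provider="heartbeat" type="IPaddr2">
<instance_attributes id="cluster-ip-attrs">
<nvpair id="cluster-ip-addr" name="ip" value="192.0.2.10"/>
</instance_attributes>
</primitive>
</clone>
-------
======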
[[s-resource-promotable]]
=== Promotable clones ===
indexterm:[Promotable Clone Resources]
indexterm:[Resource,Promotable]
If a clone is 'promotable', its instances can perform a special role that
Pacemaker will manage via the +promote+ and +demote+ actions of the resource
agent.
Services that support such a special role have various terms for the special
role and the default role: primary and secondary, master and replica,
controller and worker, etc. Pacemaker uses the terms 'master' and 'slave',
footnote:[
These are historical terms that will eventually be replaced, but the extensive
use of them and the need for backward compatibility makes it a long process.
You may see examples using a +master+ tag instead of a +clone+ tag with the
+promotable+ meta-attribute set to +true+; the +master+ tag is supported, but
deprecated, and will be removed in a future version. You may also see such
services referred to as 'multi-state' or 'stateful'; these mean the same thing
as 'promotable'.
]
but is agnostic to what the service calls them or what they do.
All that Pacemaker cares about is that an instance comes up in the default role
when started, and that the resource agent supports the +promote+ and +demote+
actions
to manage entering and exiting the special role.
=== Clone Properties ===
.Properties of a Clone Resource
[width="95%",cols="3m,5<",options="header",align="center"]
|=========================================================
|Field
|Description
|id
|A unique name for the clone
indexterm:[id,Clone Property]
indexterm:[Clone,Property,id]
|=========================================================
=== Clone Options ===
<<s-resource-options,Options>> inherited from primitive resources:
+priority, target-role, is-managed+
.Clone-specific configuration options
[width="95%",cols="1m,1,3<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|globally-unique
|false
|If +true+, each clone instance performs a distinct function
indexterm:[globally-unique,Clone Option]
indexterm:[Clone,Option,globally-unique]
|clone-max
|number of nodes in cluster
|The maximum number of clone instances that can be started across the entire
cluster
indexterm:[clone-max,Clone Option]
indexterm:[Clone,Option,clone-max]
|clone-node-max
|1
|If +globally-unique+ is +true+, the maximum number of clone instances that can
be started on a single node
indexterm:[clone-node-max,Clone Option]
indexterm:[Clone,Option,clone-node-max]
|clone-min
|0
|Require at least this number of clone instances to be runnable before allowing
resources depending on the clone to be runnable. A value of 0 means require
all clone instances to be runnable.
indexterm:[clone-min,Clone Option]
indexterm:[Clone,Option,clone-min]
|notify
|false
|Call the resource agent's +notify+ action for all active instances, before and
after starting or stopping any clone instance. The resource agent must support
this action. Allowed values: +false+, +true+
indexterm:[notify,Clone Option]
indexterm:[Clone,Option,notify]
|ordered
|false
|If +true+, clone instances must be started sequentially instead of in parallel.
Allowed values: +false+, +true+
indexterm:[ordered,Clone Option]
indexterm:[Clone,Option,ordered]
|interleave
|false
|When this clone is ordered relative to another clone, if this option is
+false+ (the default), the ordering is relative to 'all' instances of the
other clone, whereas if this option is +true+, the ordering is relative only
to instances on the same node.
Allowed values: +false+, +true+
indexterm:[interleave,Clone Option]
indexterm:[Clone,Option,interleave]
|promotable
|false
|If +true+, clone instances can perform a special role that Pacemaker will
manage via the resource agent's +promote+ and +demote+ actions. The resource
agent must support these actions.
Allowed values: +false+, +true+
indexterm:[promotable,Clone Option]
indexterm:[Clone,Option,promotable]
|promoted-max
|1
|If +promotable+ is +true+, the number of instances that can be promoted at one
time across the entire cluster
indexterm:[promoted-max,Clone Option]
indexterm:[Clone,Option,promoted-max]
|promoted-node-max
|1
|If +promotable+ is +true+ and +globally-unique+ is +false+, the number of
clone instances that can be promoted at one time on a single node
indexterm:[promoted-node-max,Clone Option]
indexterm:[Clone,Option,promoted-node-max]
|=========================================================
For backward compatibility, +master-max+ and +master-node-max+ are accepted as
aliases for +promoted-max+ and +promoted-node-max+, but are deprecated since
2.0.0, and support for them will be removed in a future version.
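To tie the options above together, here is a minimal sketch (ids, provider,
and agent names are hypothetical placeholders) of a promotable clone that
also caps its total and per-node instance counts:
.A promotable clone with explicit instance limits
======
[source,XML]
-------
<clone id="db-clone">
<meta_attributes id="db-clone-meta">
<nvpair id="db-clone-promotable" name="promotable" value="true"/>
<nvpair id="db-clone-promoted-max" name="promoted-max" value="1"/>
<nvpair id="db-clone-max" name="clone-max" value="3"/>
</meta_attributes>
<!-- "myCorp"/"myDB" stand in for an agent supporting promote/demote -->
<primitive id="db" class="ocf" provider="myCorp" type="myDB"/>
</clone>
-------
======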
=== Clone Contents ===
Clones must contain exactly one primitive or group resource.
.A clone that runs a web server on all nodes
====
[source,XML]
----
<clone id="apache-clone">
<primitive id="apache" class="lsb" type="apache">
<operations>
<op id="apache-monitor" name="monitor" interval="30"/>
</operations>
</primitive>
</clone>
----
====
[WARNING]
You should never reference the name of a clone's child (the primitive or group
resource being cloned). If you think you need to do this, you probably need to
re-evaluate your design.
=== Clone Instance Attributes ===
Clones have no instance attributes; however, any that are set here will be
inherited by the clone's child.
=== Clone Constraints ===
In most cases, a clone will have a single instance on each active cluster
node. If this is not the case, you can indicate which nodes the
cluster should preferentially assign copies to with resource location
constraints. These constraints are written no differently from those
for primitive resources except that the clone's +id+ is used.
.Some constraints involving clones
======
[source,XML]
-------
<constraints>
<rsc_location id="clone-prefers-node1" rsc="apache-clone" node="node1" score="500"/>
<rsc_colocation id="stats-with-clone" rsc="apache-stats" with="apache-clone"/>
<rsc_order id="start-clone-then-stats" first="apache-clone" then="apache-stats"/>
</constraints>
-------
======
Ordering constraints behave slightly differently for clones. In the
example above, +apache-stats+ will wait until all copies of +apache-clone+
that need to be started have done so before being started itself.
Only if _no_ copies can be started will +apache-stats+ be prevented
from being active. Additionally, the clone will wait for
+apache-stats+ to be stopped before stopping itself.
Colocation of a primitive or group resource with a clone means that
the resource can run on any node with an active instance of the clone.
The cluster will choose an instance based on where the clone is running and
the resource's own location preferences.
Colocation between clones is also possible. If one clone +A+ is colocated
with another clone +B+, the set of allowed locations for +A+ is limited to
nodes on which +B+ is (or will be) active. Placement is then performed
normally.
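For example (a sketch with hypothetical clone ids), limiting one clone to
nodes where another is active:
.Colocating one clone with another
======
[source,XML]
-------
<constraints>
<rsc_colocation id="clone-A-with-clone-B" rsc="A-clone"
with-rsc="B-clone" score="INFINITY"/>
</constraints>
-------
======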
==== Promotable Clone Constraints ====
For promotable clone resources, the +first-action+ and/or +then-action+ fields
for ordering constraints may be set to +promote+ or +demote+ to constrain the
master role, and colocation constraints may contain +rsc-role+ and/or
+with-rsc-role+ fields.
.Additional colocation constraint options for promotable clone resources
[width="95%",cols="1m,1,3<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|rsc-role
|Started
|An additional attribute of colocation constraints that specifies the
role that +rsc+ must be in. Allowed values: +Started+, +Master+,
+Slave+.
indexterm:[rsc-role,Colocation Constraints]
indexterm:[Constraints,Colocation,rsc-role]
|with-rsc-role
|Started
|An additional attribute of colocation constraints that specifies the
role that +with-rsc+ must be in. Allowed values: +Started+,
+Master+, +Slave+.
indexterm:[with-rsc-role,Colocation Constraints]
indexterm:[Constraints,Colocation,with-rsc-role]
|=========================================================
.Constraints involving promotable clone resources
======
[source,XML]
-------
<constraints>
<rsc_location id="db-prefers-node1" rsc="database" node="node1" score="500"/>
<rsc_colocation id="backup-with-db-slave" rsc="backup"
with-rsc="database" with-rsc-role="Slave"/>
<rsc_colocation id="myapp-with-db-master" rsc="myApp"
with-rsc="database" with-rsc-role="Master"/>
<rsc_order id="start-db-before-backup" first="database" then="backup"/>
<rsc_order id="promote-db-then-app" first="database" first-action="promote"
then="myApp" then-action="start"/>
</constraints>
-------
======
In the example above, +myApp+ will wait until one of the database
copies has been started and promoted to master before being started
itself on the same node. Only if no copies can be promoted will +myApp+ be
prevented from being active. Additionally, the cluster will wait for
+myApp+ to be stopped before demoting the database.
Colocation of a primitive or group resource with a promotable clone
resource means that it can run on any node with an active instance of
the promotable clone resource that has the specified role (+master+ or
+slave+). In the example above, the cluster will choose a location based on
where database is running as a +master+, and if there are multiple
+master+ instances it will also factor in +myApp+'s own location
preferences when deciding which location to choose.
Colocation with regular clones and other promotable clone resources is also
possible. In such cases, the set of allowed locations for the +rsc+
clone is (after role filtering) limited to nodes on which the
+with-rsc+ promotable clone resource is (or will be) in the specified role.
Placement is then performed as normal.
==== Using Promotable Clone Resources in Colocation Sets ====
.Additional colocation set options relevant to promotable clone resources
[width="95%",cols="1m,1,6<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|role
|Started
|The role that 'all members' of the set must be in. Allowed values: +Started+, +Master+,
+Slave+.
indexterm:[role,Colocation Constraints]
indexterm:[Constraints,Colocation,role]
|=========================================================
In the following example +B+'s master must be located on the same node as +A+'s master.
Additionally, resources +C+ and +D+ must be located on the same node as +A+'s
and +B+'s masters.
.Colocate C and D with A's and B's master instances
======
[source,XML]
-------
<constraints>
<rsc_colocation id="coloc-1" score="INFINITY" >
<resource_set id="colocated-set-example-1" sequential="true" role="Master">
<resource_ref id="A"/>
<resource_ref id="B"/>
</resource_set>
<resource_set id="colocated-set-example-2" sequential="true">
<resource_ref id="C"/>
<resource_ref id="D"/>
</resource_set>
</rsc_colocation>
</constraints>
-------
======
==== Using Promotable Clone Resources in Ordered Sets ====
.Additional ordered set options relevant to promotable clone resources
[width="95%",cols="1m,1,3<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|action
|value of +first-action+
|An additional attribute of ordering constraint sets that specifies the
action that applies to 'all members' of the set. Allowed
values: +start+, +stop+, +promote+, +demote+.
indexterm:[action,Ordering Constraints]
indexterm:[Constraints,Ordering,action]
|=========================================================
.Start C and D after first promoting A and B
======
[source,XML]
-------
<constraints>
<rsc_order id="order-1" score="INFINITY" >
<resource_set id="ordered-set-1" sequential="true" action="promote">
<resource_ref id="A"/>
<resource_ref id="B"/>
</resource_set>
<resource_set id="ordered-set-2" sequential="true" action="start">
<resource_ref id="C"/>
<resource_ref id="D"/>
</resource_set>
</rsc_order>
</constraints>
-------
======
In the above example, +B+ cannot be promoted to a master role until +A+ has
been promoted. Additionally, resources +C+ and +D+ must wait until +A+ and +B+
have been promoted before they can start.
[[s-clone-stickiness]]
=== Clone Stickiness ===
indexterm:[resource-stickiness,Clones]
To achieve a stable allocation pattern, clones are slightly sticky by
default. If no value for +resource-stickiness+ is provided, the clone
will use a value of 1. Being a small value, it causes minimal
disturbance to the score calculations of other resources but is enough
to prevent Pacemaker from needlessly moving copies around the cluster.
[NOTE]
====
For globally unique clones, this may result in multiple instances of the
clone staying on a single node, even after another eligible node becomes
active (for example, after being put into standby mode then made active again).
If you do not want this behavior, specify a +resource-stickiness+ of 0
for the clone temporarily and let the cluster adjust, then set it back
to 1 if you want the default behavior to apply again.
====
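A sketch of the workaround described in the note, temporarily zeroing the
clone's stickiness (hypothetical ids):
.Temporarily disabling stickiness for a clone
======
[source,XML]
-------
<clone id="apache-clone">
<meta_attributes id="apache-clone-meta">
<!-- set back to 1 (the clone default) once the cluster has rebalanced -->
<nvpair id="apache-clone-stickiness" name="resource-stickiness" value="0"/>
</meta_attributes>
<primitive id="apache" class="lsb" type="apache"/>
</clone>
-------
======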
=== Clone Resource Agent Requirements ===
Any resource can be used as an anonymous clone, as it requires no
additional support from the resource agent. Whether it makes sense to
do so depends on your resource and its resource agent.
==== Resource Agent Requirements for Globally Unique Clones ====
Globally unique clones require additional support in the resource agent. In
particular, it must only respond with +$\{OCF_SUCCESS}+ if the node has that
exact instance active. All other probes for instances of the clone should
result in +$\{OCF_NOT_RUNNING}+ (or one of the other OCF error codes if
they are failed).
Individual instances of a clone are identified by appending a colon and a
numerical offset, e.g. +apache:2+.
Resource agents can find out how many copies there are by examining
the +OCF_RESKEY_CRM_meta_clone_max+ environment variable and which
instance it is by examining +OCF_RESKEY_CRM_meta_clone+.
The resource agent must not make any assumptions (based on
+OCF_RESKEY_CRM_meta_clone+) about which numerical instances are active. In
particular, the list of active copies will not always be an unbroken
sequence, nor always start at 0.
==== Resource Agent Requirements for Promotable Clones ====
Promotable clone resources require two extra actions, +demote+ and +promote+,
which are responsible for changing the state of the resource. Like +start+ and
+stop+, they should return +$\{OCF_SUCCESS}+ if they completed successfully or
a relevant error code if they did not.
The states can mean whatever you wish, but when the resource is
started, it must come up in the mode called +slave+. From there the
cluster will decide which instances to promote to +master+.
In addition to the clone requirements for monitor actions, agents must
also _accurately_ report which state they are in. The cluster relies
on the agent to report its status (including role) accurately and does
not indicate to the agent what role it currently believes it to be in.
.Role implications of OCF return codes
[width="95%",cols="1,1<",options="header",align="center"]
|=========================================================
|Monitor Return Code
|Description
|OCF_NOT_RUNNING
|Stopped
indexterm:[Return Code,OCF_NOT_RUNNING]
|OCF_SUCCESS
|Running (Slave)
indexterm:[Return Code,OCF_SUCCESS]
|OCF_RUNNING_MASTER
|Running (Master)
indexterm:[Return Code,OCF_RUNNING_MASTER]
|OCF_FAILED_MASTER
|Failed (Master)
indexterm:[Return Code,OCF_FAILED_MASTER]
|Other
|Failed (Slave)
|=========================================================
==== Clone Notifications ====
If the clone has the +notify+ meta-attribute set to +true+, and the resource
agent supports the +notify+ action, Pacemaker will call the action when
appropriate, passing a number of extra variables which, when combined with
additional context, can be used to calculate the current state of the cluster
and what is about to happen to it.
.Environment variables supplied with Clone notify actions
[width="95%",cols="5,3<",options="header",align="center"]
|=========================================================
|Variable
|Description
|OCF_RESKEY_CRM_meta_notify_type
|Allowed values: +pre+, +post+
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,type]
indexterm:[type,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_operation
|Allowed values: +start+, +stop+
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,operation]
indexterm:[operation,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_start_resource
|Resources to be started
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,start_resource]
indexterm:[start_resource,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_stop_resource
|Resources to be stopped
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,stop_resource]
indexterm:[stop_resource,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_active_resource
|Resources that are running
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,active_resource]
indexterm:[active_resource,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_inactive_resource
|Resources that are not running
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,inactive_resource]
indexterm:[inactive_resource,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_start_uname
|Nodes on which resources will be started
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,start_uname]
indexterm:[start_uname,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_stop_uname
|Nodes on which resources will be stopped
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,stop_uname]
indexterm:[stop_uname,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_active_uname
|Nodes on which resources are running
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,active_uname]
indexterm:[active_uname,Notification Environment Variable]
|=========================================================
The variables come in pairs, such as
+OCF_RESKEY_CRM_meta_notify_start_resource+ and
+OCF_RESKEY_CRM_meta_notify_start_uname+ and should be treated as an
array of whitespace-separated elements.
+OCF_RESKEY_CRM_meta_notify_inactive_resource+ is an exception as the
matching +uname+ variable does not exist since inactive resources
are not running on any node.
Thus in order to indicate that +clone:0+ will be started on +sles-1+,
+clone:2+ will be started on +sles-3+, and +clone:3+ will be started
on +sles-2+, the cluster would set
.Notification variables
======
[source,Bash]
-------
OCF_RESKEY_CRM_meta_notify_start_resource="clone:0 clone:2 clone:3"
OCF_RESKEY_CRM_meta_notify_start_uname="sles-1 sles-3 sles-2"
-------
======
==== Interpretation of Notification Variables ====
.Pre-notification (stop):
* Active resources: +$OCF_RESKEY_CRM_meta_notify_active_resource+
* Inactive resources: +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
* Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
.Post-notification (stop) / Pre-notification (start):
* Active resources
** +$OCF_RESKEY_CRM_meta_notify_active_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Inactive resources
** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
.Post-notification (start):
* Active resources:
** +$OCF_RESKEY_CRM_meta_notify_active_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Inactive resources:
** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
==== Extra Notifications for Promotable Clones ====
.Extra environment variables supplied for promotable clones
[width="95%",cols="5,3<",options="header",align="center"]
|=========================================================
|Variable
|Description
|_OCF_RESKEY_CRM_meta_notify_master_resource_
|Resources that are running in +Master+ mode
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,master_resource]
indexterm:[master_resource,Notification Environment Variable]
|_OCF_RESKEY_CRM_meta_notify_slave_resource_
|Resources that are running in +Slave+ mode
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,slave_resource]
indexterm:[slave_resource,Notification Environment Variable]
|_OCF_RESKEY_CRM_meta_notify_promote_resource_
|Resources to be promoted
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,promote_resource]
indexterm:[promote_resource,Notification Environment Variable]
|_OCF_RESKEY_CRM_meta_notify_demote_resource_
|Resources to be demoted
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,demote_resource]
indexterm:[demote_resource,Notification Environment Variable]
|_OCF_RESKEY_CRM_meta_notify_promote_uname_
|Nodes on which resources will be promoted
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,promote_uname]
indexterm:[promote_uname,Notification Environment Variable]
|_OCF_RESKEY_CRM_meta_notify_demote_uname_
|Nodes on which resources will be demoted
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,demote_uname]
indexterm:[demote_uname,Notification Environment Variable]
|_OCF_RESKEY_CRM_meta_notify_master_uname_
|Nodes on which resources are running in +Master+ mode
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,master_uname]
indexterm:[master_uname,Notification Environment Variable]
|_OCF_RESKEY_CRM_meta_notify_slave_uname_
|Nodes on which resources are running in +Slave+ mode
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,slave_uname]
indexterm:[slave_uname,Notification Environment Variable]
|=========================================================
==== Interpretation of Promotable Notification Variables ====
.Pre-notification (demote):
* +Active+ resources: +$OCF_RESKEY_CRM_meta_notify_active_resource+
* +Master+ resources: +$OCF_RESKEY_CRM_meta_notify_master_resource+
* +Slave+ resources: +$OCF_RESKEY_CRM_meta_notify_slave_resource+
* Inactive resources: +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
* Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
.Post-notification (demote) / Pre-notification (stop):
* +Active+ resources: +$OCF_RESKEY_CRM_meta_notify_active_resource+
* +Master+ resources:
** +$OCF_RESKEY_CRM_meta_notify_master_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* +Slave+ resources: +$OCF_RESKEY_CRM_meta_notify_slave_resource+
* Inactive resources: +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
* Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
.Post-notification (stop) / Pre-notification (start)
* +Active+ resources:
** +$OCF_RESKEY_CRM_meta_notify_active_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* +Master+ resources:
** +$OCF_RESKEY_CRM_meta_notify_master_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* +Slave+ resources:
** +$OCF_RESKEY_CRM_meta_notify_slave_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Inactive resources:
** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
.Post-notification (start) / Pre-notification (promote)
* +Active+ resources:
** +$OCF_RESKEY_CRM_meta_notify_active_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+
* +Master+ resources:
** +$OCF_RESKEY_CRM_meta_notify_master_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* +Slave+ resources:
** +$OCF_RESKEY_CRM_meta_notify_slave_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Inactive resources:
** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
.Post-notification (promote):
* +Active+ resources:
** +$OCF_RESKEY_CRM_meta_notify_active_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+
* +Master+ resources:
** +$OCF_RESKEY_CRM_meta_notify_master_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* +Slave+ resources:
** +$OCF_RESKEY_CRM_meta_notify_slave_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* Inactive resources:
** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources that were promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
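A notification handler can combine these variables to compute the membership
changes in flight. A minimal sketch in shell (the helper name is illustrative,
not part of any API):
.Computing the post-demote master set in a notify action
====
[source,Bash]
----
# Sketch only: instances listed in master_resource that also appear in
# demote_resource will no longer be masters once the demote completes.
masters_after_demote() {
    for rsc in $OCF_RESKEY_CRM_meta_notify_master_resource; do
        case " $OCF_RESKEY_CRM_meta_notify_demote_resource " in
            *" $rsc "*) ;;        # scheduled for demotion -- omit
            *) echo "$rsc" ;;     # remains a master afterwards
        esac
    done
}
----
====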
=== Monitoring Promotable Clone Resources ===
The usual monitor actions are insufficient to monitor a promotable clone
resource, because Pacemaker needs to verify not only that the resource is
active, but also that its actual role matches its intended one.
Define two monitoring actions: the usual one will cover the slave role,
and an additional one with +role="Master"+ will cover the master role.
.Monitoring both states of a promotable clone resource
======
[source,XML]
-------
<clone id="myMasterRsc">
<meta_attributes id="myMasterRsc-meta">
<nvpair name="promotable" value="true"/>
</meta_attributes>
<primitive id="myRsc" class="ocf" type="myApp" provider="myCorp">
<operations>
<op id="public-ip-slave-check" name="monitor" interval="60"/>
<op id="public-ip-master-check" name="monitor" interval="61" role="Master"/>
</operations>
</primitive>
</clone>
-------
======
[IMPORTANT]
===========
It is crucial that _every_ monitor operation has a different interval!
Pacemaker currently differentiates between operations
only by resource and interval, so if (for example) a promotable clone resource
had the same monitor interval for both roles, Pacemaker would ignore the
role when checking the status -- which would cause unexpected return
codes, and therefore unnecessary complications.
===========
[[s-promotion-scores]]
=== Determining Which Instance is Promoted ===
Pacemaker can choose a promotable clone instance to be promoted in one of two
ways:
* Promotion scores: These are node attributes set via the `crm_master` utility.
A resource agent that supports promotable clones would generally call
`crm_master` from its start action; the tool automatically detects both the
resource and host, and should be used to set a preference for being promoted.
Based on these scores, +promoted-max+, and +promoted-node-max+, the instance(s)
with the highest preference will be promoted (a sketch of such a call follows
the example below).
* Constraints: Location constraints can indicate which nodes are most preferred
as masters.
.Explicitly preferring node1 to be promoted to master
======
[source,XML]
-------
<rsc_location id="master-location" rsc="myMasterRsc">
<rule id="master-rule" score="100" role="Master">
<expression id="master-exp" attribute="#uname" operation="eq" value="node1"/>
</rule>
</rsc_location>
-------
======
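When promotion scores are used instead, the agent itself records the
preference. A minimal sketch, assuming a hypothetical agent whose launch
helper (`my_app_launch`) is defined elsewhere:
.Recording a promotion preference from an agent's start action
====
[source,Bash]
----
# Sketch only: crm_master infers the resource and node from the agent's
# environment; "-l reboot" keeps the score until the node next reboots.
my_app_start() {
    my_app_launch || return $OCF_ERR_GENERIC   # hypothetical start logic
    crm_master -l reboot -v 100                # preference for promotion
    return $OCF_SUCCESS
}
----
====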
[[s-resource-bundle]]
== Bundles - Isolated Environments ==
indexterm:[bundle]
indexterm:[Resource,bundle]
indexterm:[Docker,bundle]
indexterm:[rkt,bundle]
Pacemaker supports a special syntax for launching a
https://en.wikipedia.org/wiki/Operating-system-level_virtualization[container]
with any infrastructure it requires: the 'bundle'.
Pacemaker bundles support https://www.docker.com/[Docker] and
https://coreos.com/rkt/[rkt] container technologies.
footnote:[Docker is a trademark of Docker, Inc. No endorsement by or
association with Docker, Inc. is implied.]
.A bundle for a containerized web server
====
[source,XML]
----
<bundle id="httpd-bundle">
<docker image="pcmk:http" replicas="3"/>
<network ip-range-start="192.168.122.131"
host-netmask="24"
host-interface="eth0">
<port-mapping id="httpd-port" port="80"/>
</network>
<storage>
<storage-mapping id="httpd-syslog"
source-dir="/dev/log"
target-dir="/dev/log"
options="rw"/>
<storage-mapping id="httpd-root"
source-dir="/srv/html"
target-dir="/var/www/html"
options="rw"/>
<storage-mapping id="httpd-logs"
source-dir-root="/var/log/pacemaker/bundles"
target-dir="/etc/httpd/logs"
options="rw"/>
</storage>
<primitive class="ocf" id="httpd" provider="heartbeat" type="apache"/>
</bundle>
----
====
=== Bundle Properties ===
.Properties of a Bundle
[width="95%",cols="3m,5<",options="header",align="center"]
|=========================================================
|Field
|Description
|id
|A unique name for the bundle (required)
indexterm:[id,bundle]
indexterm:[bundle,Property,id]
|description
|Arbitrary text (not used by Pacemaker)
indexterm:[description,bundle]
indexterm:[bundle,Property,description]
|=========================================================
A bundle must contain exactly one +<docker>+ or +<rkt>+ element.
=== Docker Properties ===
Before configuring a Docker bundle in Pacemaker, the user must install Docker
and supply a fully configured Docker image on every node allowed to run the
bundle.
Pacemaker will create an implicit +ocf:heartbeat:docker+ resource to manage
a bundle's Docker container. The user must ensure that resource agent is
installed on every node allowed to run the bundle.
.Properties of a Bundle's Docker Element
[width="95%",cols="3m,4,5<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|image
|
|Docker image tag (required)
indexterm:[image,Docker]
indexterm:[Docker,Property,image]
|replicas
|Value of +promoted-max+ if that is positive, else 1
|A positive integer specifying the number of container instances to launch
indexterm:[replicas,Docker]
indexterm:[Docker,Property,replicas]
|replicas-per-host
|1
|A positive integer specifying the number of container instances allowed to run
on a single node
indexterm:[replicas-per-host,Docker]
indexterm:[Docker,Property,replicas-per-host]
|promoted-max
|0
|A non-negative integer that, if positive, indicates that the containerized
service should be treated as a promotable service, with this many replicas
allowed to run the service in the master role
indexterm:[promoted-max,Docker]
indexterm:[Docker,Property,promoted-max]
|network
|
|If specified, this will be passed to +docker run+ as the
https://docs.docker.com/engine/reference/run/#network-settings[network setting]
for the Docker container.
indexterm:[network,Docker]
indexterm:[Docker,Property,network]
|run-command
|`/usr/sbin/pacemaker-remoted` if the bundle contains a +primitive+, otherwise none
|This command will be run inside the container when launching it ("PID 1"). If
the bundle contains a +primitive+, this command 'must' start pacemaker-remoted
(but could, for example, be a script that does other things as well). If the
container image has a pre-2.0.0 version of Pacemaker, set this to
+/usr/sbin/pacemaker_remoted+ (note the underscore instead of the dash).
indexterm:[run-command,Docker]
indexterm:[Docker,Property,run-command]
|options
|
|Extra command-line options to pass to `docker run`
indexterm:[options,Docker]
indexterm:[Docker,Property,options]
|=========================================================
For backward compatibility, +masters+ is accepted as an alias for
+promoted-max+, but is deprecated since 2.0.0, and support for it will be
removed in a future version.
=== rkt Properties ===
Before configuring a rkt bundle in Pacemaker, the user must install rkt
and supply a fully configured container image on every node allowed to run the
bundle.
Pacemaker will create an implicit +ocf:heartbeat:rkt+ resource to manage
a bundle's rkt container. The user must ensure that resource agent is
installed on every node allowed to run the bundle.
.Properties of a Bundle's rkt Element
[width="95%",cols="3m,4,5<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|image
|
|Container image tag (required)
indexterm:[image,rkt]
indexterm:[rkt,Property,image]
|replicas
|Value of +promoted-max+ if that is positive, else 1
|A positive integer specifying the number of container instances to launch
indexterm:[replicas,rkt]
indexterm:[rkt,Property,replicas]
|replicas-per-host
|1
|A positive integer specifying the number of container instances allowed to run
on a single node
indexterm:[replicas-per-host,rkt]
indexterm:[rkt,Property,replicas-per-host]
|promoted-max
|0
|A non-negative integer that, if positive, indicates that the containerized
service should be treated as a promotable service, with this many replicas
allowed to run the service in the master role
indexterm:[promoted-max,rkt]
indexterm:[rkt,Property,promoted-max]
|network
|
|If specified, this will be passed to +rkt run+ as the
network setting for the rkt container.
indexterm:[network,rkt]
indexterm:[rkt,Property,network]
|run-command
|`/usr/sbin/pacemaker-remoted` if the bundle contains a +primitive+, otherwise none
|This command will be run inside the container when launching it ("PID 1"). If
the bundle contains a +primitive+, this command 'must' start pacemaker-remoted
(but could, for example, be a script that does other things as well). If the
container image has a pre-2.0.0 version of Pacemaker, set this to
+/usr/sbin/pacemaker_remoted+ (note the underscore instead of the dash).
indexterm:[run-command,rkt]
indexterm:[rkt,Property,run-command]
|options
|
|Extra command-line options to pass to `rkt run`
indexterm:[options,rkt]
indexterm:[rkt,Property,options]
|=========================================================
For backward compatibility, +masters+ is accepted as an alias for
+promoted-max+, but is deprecated since 2.0.0, and support for it will be
removed in a future version.
=== Bundle Network Properties ===
A bundle may optionally contain one +<network>+ element.
indexterm:[bundle,network]
.Properties of a Bundle's Network Element
[width="95%",cols="2m,1,4<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|add-host
|TRUE
|If TRUE, and +ip-range-start+ is used, Pacemaker will automatically ensure
that +/etc/hosts+ inside the containers has entries for each
<<s-resource-bundle-note-replica-names,replica name>> and its assigned IP.
indexterm:[add-host,network]
indexterm:[network,Property,add-host]
|ip-range-start
|
|If specified, Pacemaker will create an implicit +ocf:heartbeat:IPaddr2+
resource for each container instance, starting with this IP address,
using up to +replicas+ sequential addresses. These addresses can be used
from the host's network to reach the service inside the container, though
they are not visible within the container itself. Only IPv4 addresses are
currently supported.
indexterm:[ip-range-start,network]
indexterm:[network,Property,ip-range-start]
|host-netmask
|32
|If +ip-range-start+ is specified, the IP addresses are created with this
CIDR netmask (as a number of bits).
indexterm:[host-netmask,network]
indexterm:[network,Property,host-netmask]
|host-interface
|
|If +ip-range-start+ is specified, the IP addresses are created on this
host interface (by default, it will be determined from the IP address).
indexterm:[host-interface,network]
indexterm:[network,Property,host-interface]
|control-port
|3121
|If the bundle contains a +primitive+, the cluster will use this integer TCP
port for communication with Pacemaker Remote inside the container. Changing
this is useful when the container is unable to listen on the default port,
for example, when the container uses the host's network rather than
+ip-range-start+ (in which case +replicas-per-host+ must be 1), or when the
bundle may run on a Pacemaker Remote node that is already listening on the
default port. Any PCMK_remote_port environment variable set on the host or in
the container is ignored for bundle connections.
indexterm:[control-port,network]
indexterm:[network,Property,control-port]
|=========================================================
[[s-resource-bundle-note-replica-names]]
[NOTE]
====
Replicas are named by the bundle id plus a dash and an integer counter starting
with zero. For example, if a bundle named +httpd-bundle+ has +replicas=2+, its
containers will be named +httpd-bundle-0+ and +httpd-bundle-1+.
====
Additionally, a +<network>+ element may optionally contain one or more
+<port-mapping>+ elements.
indexterm:[bundle,network,port-mapping]
.Properties of a Bundle's Port-Mapping Element
[width="95%",cols="2m,1,4<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|id
|
|A unique name for the port mapping (required)
indexterm:[id,port-mapping]
indexterm:[port-mapping,Property,id]
|port
|
|If this is specified, connections to this TCP port number on the host network
(on the container's assigned IP address, if +ip-range-start+ is specified)
will be forwarded to the container network. Exactly one of +port+ or +range+
must be specified in a +port-mapping+.
indexterm:[port,port-mapping]
indexterm:[port-mapping,Property,port]
|internal-port
|value of +port+
|If +port+ and this are specified, connections to +port+ on the host's network
will be forwarded to this port on the container network.
indexterm:[internal-port,port-mapping]
indexterm:[port-mapping,Property,internal-port]
|range
|
|If this is specified, connections to these TCP port numbers (expressed as
'first_port'-'last_port') on the host network (on the container's assigned IP
address, if +ip-range-start+ is specified) will be forwarded to the same ports
in the container network. Exactly one of +port+ or +range+ must be specified
in a +port-mapping+.
indexterm:[range,port-mapping]
indexterm:[port-mapping,Property,range]
|=========================================================
[NOTE]
====
If the bundle contains a +primitive+, Pacemaker will automatically map the
+control-port+, so it is not necessary to specify that port in a
+port-mapping+.
====
=== Bundle Storage Properties ===
A bundle may optionally contain one +<storage>+ element. A +<storage>+ element
has no properties of its own, but may contain one or more +<storage-mapping>+
elements.
indexterm:[bundle,storage,storage-mapping]
.Properties of a Bundle's Storage-Mapping Element
[width="95%",cols="2m,1,4<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|id
|
|A unique name for the storage mapping (required)
indexterm:[id,storage-mapping]
indexterm:[storage-mapping,Property,id]
|source-dir
|
|The absolute path on the host's filesystem that will be mapped into the
container. Exactly one of +source-dir+ and +source-dir-root+ must be specified
in a +storage-mapping+.
indexterm:[source-dir,storage-mapping]
indexterm:[storage-mapping,Property,source-dir]
|source-dir-root
|
|The start of a path on the host's filesystem that will be mapped into the
container, using a different subdirectory on the host for each container
instance. The subdirectory will be named the same as the
<<s-resource-bundle-note-replica-names,replica name>>.
Exactly one of +source-dir+ and +source-dir-root+ must be specified in a
+storage-mapping+.
indexterm:[source-dir-root,storage-mapping]
indexterm:[storage-mapping,Property,source-dir-root]
|target-dir
|
|The path name within the container where the host storage will be mapped
(required)
indexterm:[target-dir,storage-mapping]
indexterm:[storage-mapping,Property,target-dir]
|options
|
|File system mount options to use when mapping the storage
indexterm:[options,storage-mapping]
indexterm:[storage-mapping,Property,options]
|=========================================================
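To illustrate +source-dir-root+, the following sketch prints the per-replica
mappings that would result for the +httpd-bundle+ example above (the mount
wording is conceptual, not a literal command):
.Per-replica host directories with source-dir-root
====
[source,Bash]
----
# Sketch only: each replica gets its own host subdirectory, named after
# the replica, mounted at the common target-dir.
bundle=httpd-bundle
i=0
while [ "$i" -lt 3 ]; do    # replicas=3 in the example above
    echo "$bundle-$i: /var/log/pacemaker/bundles/$bundle-$i -> /etc/httpd/logs (rw)"
    i=$((i + 1))
done
----
====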
[NOTE]
====
Pacemaker does not define the behavior if the source directory does not already
exist on the host. However, it is expected that the container technology and/or
its resource agent will create the source directory in that case.
====
[NOTE]
====
If the bundle contains a +primitive+,
Pacemaker will automatically map the equivalent of
+source-dir=/etc/pacemaker/authkey target-dir=/etc/pacemaker/authkey+
and +source-dir-root=/var/log/pacemaker/bundles target-dir=/var/log+ into the
container, so it is not necessary to specify those paths in a
+storage-mapping+.
====
[IMPORTANT]
====
The +PCMK_authkey_location+ environment variable must not be set to anything
other than the default of `/etc/pacemaker/authkey` on any node in the cluster.
====
=== Bundle Primitive ===
A bundle may optionally contain one +<primitive>+ resource
(see <<s-resource-primitive>>). The primitive may have operations,
instance attributes and meta-attributes defined, as usual.
If a bundle contains a primitive resource, the container image must include
the Pacemaker Remote daemon, and at least one of +ip-range-start+ or
+control-port+ must be configured in the bundle. Pacemaker will create an
implicit +ocf:pacemaker:remote+ resource for the connection, launch
Pacemaker Remote within the container, and monitor and manage the primitive
resource via Pacemaker Remote.
If the bundle has more than one container instance (replica), the primitive
resource will function as an implicit clone (see <<s-resource-clone>>) --
a promotable clone if the bundle has +promoted-max+ greater than zero
(see <<s-resource-promotable>>).
[IMPORTANT]
====
Containers in bundles with a +primitive+ must have an accessible networking
environment, so that Pacemaker on the cluster nodes can contact
Pacemaker Remote inside the container. For example, the Docker option
`--net=none` should not be used with a +primitive+. The default (using a
distinct network space inside the container) works in combination with
+ip-range-start+. If the Docker option `--net=host` is used (making the
container share the host's network space), a unique +control-port+ should be
specified for each bundle. Any firewall must allow access to the
+control-port+.
====
[[s-bundle-attributes]]
=== Bundle Node Attributes ===
If the bundle has a +primitive+, the primitive's resource agent may want to set
node attributes such as <<s-promotion-scores,promotion scores>>. However, with
containers, it is not apparent which node should get the attribute.
If the container uses shared storage that is the same no matter which node the
container is hosted on, then it is appropriate to use the promotion score on the
bundle node itself.
On the other hand, if the container uses storage exported from the underlying host,
then it may be more appropriate to use the promotion score on the underlying host.
Since this depends on the particular situation, the
+container-attribute-target+ resource meta-attribute allows the user to specify
which approach to use. If it is set to +host+, then user-defined node attributes
will be checked on the underlying host. If it is anything else, the local node
(in this case the bundle node) is used as usual.
This only applies to user-defined attributes; the cluster will always check the
local node for cluster-defined attributes such as +#uname+.
If +container-attribute-target+ is +host+, the cluster will pass additional
environment variables to the primitive's resource agent that allow it to set
-node attributes appropriately: +container_attribute_target+ (identical to the
-meta-attribute value) and +physical_host+ (the name of the underlying host).
+node attributes appropriately: +CRM_meta_container_attribute_target+ (identical
+to the meta-attribute value) and +CRM_meta_physical_host+ (the name of the
+underlying host).
[NOTE]
====
-It is up to the resource agent to check for the additional variables and use
-them when setting node attributes.
+When called by a resource agent, the attrd_updater and crm_attribute commands
+will automatically check those environment variables and set attributes
+appropriately.
====
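In practice, a bundled agent can therefore set user-defined attributes in the
usual way and let the tools route the update. A minimal sketch (attribute name
and value are illustrative):
.Setting a user-defined node attribute from a bundled agent
====
[source,Bash]
----
# Sketch only: with container-attribute-target=host, attrd_updater itself
# consults the environment variables above and targets the physical host.
attrd_updater -n "my_app_generation" -U "42"
----
====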
=== Bundle Meta-Attributes ===
Any meta-attribute set on a bundle will be inherited by the bundle's
primitive and any resources implicitly created by Pacemaker for the bundle.
This includes options such as +priority+, +target-role+, and +is-managed+. See
<<s-resource-options>> for more information.
=== Limitations of Bundles ===
Restarting pacemaker while a bundle is unmanaged or the cluster is in
maintenance mode may cause the bundle to fail.
-Bundles may not be cloned or included in groups. This includes the bundle's
-primitive and any resources implicitly created by Pacemaker for the bundle.
+Bundles may not be explicitly cloned or included in groups. This includes the
+bundle's primitive and any resources implicitly created by Pacemaker for the
+bundle. (If +replicas+ is greater than 1, the bundle will behave like a clone
+implicitly.)
Bundles do not have instance attributes, utilization attributes, or operations,
though a bundle's primitive may have them.
A bundle with a primitive can run on a Pacemaker Remote node only if the bundle
uses a distinct +control-port+.
diff --git a/extra/resources/ClusterMon.in b/extra/resources/ClusterMon.in
index 04fb43f1e6..2cd2d57f23 100755
--- a/extra/resources/ClusterMon.in
+++ b/extra/resources/ClusterMon.in
@@ -1,276 +1,274 @@
#!@BASH_PATH@
#
#
# ClusterMon OCF RA.
# Starts crm_mon in background which logs cluster status as
# html to the specified file.
#
-# Copyright (c) 2004 SUSE LINUX AG, Lars Marowsky-Brée
+# Copyright 2004-2018 SUSE LINUX AG, Lars Marowsky-Brée
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
# OCF instance parameters:
# OCF_RESKEY_user
# OCF_RESKEY_pidfile
# OCF_RESKEY_update
# OCF_RESKEY_extra_options
# OCF_RESKEY_htmlfile
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
. ${OCF_FUNCTIONS}
: ${__OCF_ACTION=$1}
#######################################################################
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="ClusterMon" version="1.0">
<version>1.0</version>
<longdesc lang="en">
This is a ClusterMon Resource Agent.
It outputs the current cluster status to an HTML file.
</longdesc>
<shortdesc lang="en">Runs crm_mon in the background, recording the cluster status to an HTML file</shortdesc>
<parameters>
<parameter name="user" unique="0">
<longdesc lang="en">
The user we want to run crm_mon as
</longdesc>
<shortdesc lang="en">The user we want to run crm_mon as</shortdesc>
<content type="string" default="root" />
</parameter>
<parameter name="update" unique="0">
<longdesc lang="en">
How frequently (in milliseconds) the cluster status should be updated.
For compatibility with old documentation, values less than 1000 are treated
as seconds.
</longdesc>
<shortdesc lang="en">Update interval in milliseconds</shortdesc>
<content type="integer" default="15000" />
</parameter>
<parameter name="extra_options" unique="0">
<longdesc lang="en">
Additional options to pass to crm_mon, e.g. "-n -r".
</longdesc>
<shortdesc lang="en">Extra options</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="pidfile" unique="1">
<longdesc lang="en">
PID file location to ensure only one instance is running
</longdesc>
<shortdesc lang="en">PID file</shortdesc>
<content type="string" default="/tmp/ClusterMon_${OCF_RESOURCE_INSTANCE}.pid" />
</parameter>
<parameter name="htmlfile" unique="1" required="0">
<longdesc lang="en">
Location to write HTML output to.
</longdesc>
<shortdesc lang="en">HTML output</shortdesc>
<content type="string" default="/tmp/ClusterMon_${OCF_RESOURCE_INSTANCE}.html" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="monitor" depth="0" timeout="20" interval="10" />
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="30" />
</actions>
</resource-agent>
END
}
#######################################################################
ClusterMon_usage() {
cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set.
END
}
ClusterMon_exit() {
if [ $1 != 0 ]; then
exit $OCF_ERR_GENERIC
else
exit $OCF_SUCCESS
fi
}
ClusterMon_start() {
- cmd_prefix=""
- cmd_suffix=""
if [ ! -z $OCF_RESKEY_user ]; then
su - $OCF_RESKEY_user -c "$CMON_CMD"
else
$CMON_CMD
fi
ClusterMon_exit $?
}
ClusterMon_stop() {
if [ -f $OCF_RESKEY_pidfile ]; then
pid=`cat $OCF_RESKEY_pidfile`
if [ ! -z $pid ]; then
kill -s 9 $pid
rm -f $OCF_RESKEY_pidfile
fi
fi
ClusterMon_exit 0
}
ClusterMon_monitor() {
if [ -f $OCF_RESKEY_pidfile ]; then
pid=`cat $OCF_RESKEY_pidfile`
if [ ! -z $pid ]; then
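# Mask every character of the launch command that appears in "crm_mon"
# (plus separators) before using it as the ps column header, so the
# header line itself can never match the grep below.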
str=$(echo "su - $OCF_RESKEY_user -c \"$CMON_CMD\"" | tr 'crmon, \t' 'xxxxxxxx')
ps -o "args=${str}" -p $pid 2>/dev/null | \
grep -qE "[c]rm_mon.*${OCF_RESKEY_pidfile}"
rc=$?
case $rc in
0) exit $OCF_SUCCESS;;
1) exit $OCF_NOT_RUNNING;;
*) exit $OCF_ERR_GENERIC;;
esac
fi
fi
exit $OCF_NOT_RUNNING
}
CheckOptions() {
while getopts Vi:nrh:cdp: OPTION
do
case $OPTION in
V|n|r|c|d);;
i) ocf_log warn "You should not have specified the -i option, since OCF_RESKEY_update is set already!";;
h) ocf_log warn "You should not have specified the -h option, since OCF_RESKEY_htmlfile is set already!";;
p) ocf_log warn "You should not have specified the -p option, since OCF_RESKEY_pidfile is set already!";;
*) return $OCF_ERR_ARGS;;
esac
done
if [ $? -ne 0 ]; then
return $OCF_ERR_ARGS
fi
# We should have eaten all options at this stage
shift $(($OPTIND -1))
if [ $# -gt 0 ]; then
false
else
true
fi
}
ClusterMon_validate() {
# Existence of the user
if [ ! -z $OCF_RESKEY_user ]; then
getent passwd "$OCF_RESKEY_user" >/dev/null
if [ $? -eq 0 ]; then
: Yes, user exists. We can further check his permission on crm_mon if necessary
else
ocf_log err "The user $OCF_RESKEY_user does not exist!"
exit $OCF_ERR_ARGS
fi
fi
# Pidfile better be an absolute path
case $OCF_RESKEY_pidfile in
/*) ;;
*) ocf_log warn "You should have pidfile($OCF_RESKEY_pidfile) of absolute path!" ;;
esac
# Check the update interval
if ocf_is_decimal "$OCF_RESKEY_update" && [ $OCF_RESKEY_update -gt 0 ]; then
:
else
ocf_log err "Invalid update interval $OCF_RESKEY_update. It should be positive integer!"
exit $OCF_ERR_ARGS
fi
if CheckOptions $OCF_RESKEY_extra_options; then
:
else
ocf_log err "Invalid options $OCF_RESKEY_extra_options!"
exit $OCF_ERR_ARGS
fi
# Htmlfile better be an absolute path
case $OCF_RESKEY_htmlfile in
/*) ;;
*) ocf_log warn "You should have htmlfile($OCF_RESKEY_htmlfile) of absolute path!" ;;
esac
echo "Validate OK"
return $OCF_SUCCESS
}
if [ $# -ne 1 ]; then
ClusterMon_usage
exit $OCF_ERR_ARGS
fi
: ${OCF_RESKEY_update:="15000"}
: ${OCF_RESKEY_pidfile:="/tmp/ClusterMon_${OCF_RESOURCE_INSTANCE}.pid"}
: ${OCF_RESKEY_htmlfile:="/tmp/ClusterMon_${OCF_RESOURCE_INSTANCE}.html"}
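# The update interval is configured in milliseconds, but crm_mon's -i option
# takes seconds; values below 1000 are already treated as seconds (see the
# "update" parameter description above).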
if [ ${OCF_RESKEY_update} -ge 1000 ]; then
OCF_RESKEY_update=$(( $OCF_RESKEY_update / 1000 ))
fi
CMON_CMD="${HA_SBIN_DIR}/crm_mon -p $OCF_RESKEY_pidfile -d -i $OCF_RESKEY_update $OCF_RESKEY_extra_options -h $OCF_RESKEY_htmlfile"
case $__OCF_ACTION in
meta-data) meta_data
exit $OCF_SUCCESS
;;
start) ClusterMon_start
;;
stop) ClusterMon_stop
;;
monitor) ClusterMon_monitor
;;
validate-all) ClusterMon_validate
;;
usage|help) ClusterMon_usage
exit $OCF_SUCCESS
;;
*) ClusterMon_usage
exit $OCF_ERR_UNIMPLEMENTED
;;
esac
exit $?
diff --git a/extra/resources/Dummy b/extra/resources/Dummy
index 722b9108cd..bb311117e6 100755
--- a/extra/resources/Dummy
+++ b/extra/resources/Dummy
@@ -1,271 +1,269 @@
#!/bin/sh
#
# Dummy OCF RA. Does nothing but wait a few seconds, can be
# configured to fail occasionally.
#
# Copyright 2004-2018 SUSE LINUX AG, Lars Marowsky-Brée
# All Rights Reserved.
#
# This source code is licensed under the GNU General Public License version 2
# (GPLv2) WITHOUT ANY WARRANTY.
#
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
. ${OCF_FUNCTIONS}
: ${__OCF_ACTION=$1}
#######################################################################
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="Dummy" version="1.0">
<version>1.0</version>
<longdesc lang="en">
This is a Dummy Resource Agent. It does absolutely nothing except
keep track of whether it's running or not.
Its purpose in life is for testing and to serve as a template for RA writers.
NB: Please pay attention to the timeouts specified in the actions
section below. They should be meaningful for the kind of resource
the agent manages. They should be the minimum advised timeouts,
but they cannot cover _all_ possible resource
instances. So, try to be neither overly generous nor too stingy,
but moderate. The minimum timeouts should never be below 10 seconds.
</longdesc>
<shortdesc lang="en">Example stateless resource agent</shortdesc>
<parameters>
<parameter name="state" unique="1">
<longdesc lang="en">
Location to store the resource state in.
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string" default="${HA_VARRUN%%/}/Dummy-${OCF_RESOURCE_INSTANCE}.state" />
</parameter>
<parameter name="passwd" unique="1">
<longdesc lang="en">
Fake password field
</longdesc>
<shortdesc lang="en">Password</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="fake" unique="0">
<longdesc lang="en">
Fake attribute that can be changed to cause a reload
</longdesc>
<shortdesc lang="en">Fake attribute that can be changed to cause a reload</shortdesc>
<content type="string" default="dummy" />
</parameter>
<parameter name="op_sleep" unique="1">
<longdesc lang="en">
Number of seconds to sleep during operations. This can be used to test how
the cluster reacts to operation timeouts.
</longdesc>
<shortdesc lang="en">Operation sleep duration in seconds.</shortdesc>
<content type="string" default="0" />
</parameter>
<parameter name="fail_start_on" unique="0">
<longdesc lang="en">
Start actions will return failure if running on the host specified here, but
the resource will start successfully anyway (future monitor calls will find it
running). This can be used to test on-fail=ignore.
</longdesc>
<shortdesc lang="en">Report bogus start failure on specified host</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="envfile" unique="1">
<longdesc lang="en">
If this is set, the environment will be dumped to this file for every call.
</longdesc>
<shortdesc lang="en">Environment dump file</shortdesc>
<content type="string" default="" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="monitor" timeout="20" interval="10" depth="0"/>
<action name="reload" timeout="20" />
<action name="migrate_to" timeout="20" />
<action name="migrate_from" timeout="20" />
<action name="validate-all" timeout="20" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
END
}
#######################################################################
# don't exit on TERM, to test that pacemaker-execd makes sure that we do exit
trap sigterm_handler TERM
sigterm_handler() {
ocf_log info "They use TERM to bring us down. No such luck."
return
}
dummy_usage() {
cat <<END
usage: $0 {start|stop|monitor|migrate_to|migrate_from|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set.
END
}
dump_env() {
if [ "${OCF_RESKEY_envfile}" != "" ]; then
echo "### ${__OCF_ACTION} @ $(date) ###
$(env | sort)
###" >> "${OCF_RESKEY_envfile}"
fi
}
dummy_start() {
- local RETVAL
-
dummy_monitor
- RETVAL=$?
- if [ $RETVAL -eq $OCF_SUCCESS ]; then
+ DS_RETVAL=$?
+ if [ $DS_RETVAL -eq $OCF_SUCCESS ]; then
if [ "$(uname -n)" = "${OCF_RESKEY_fail_start_on}" ]; then
- RETVAL=$OCF_ERR_GENERIC
+ DS_RETVAL=$OCF_ERR_GENERIC
fi
- return $RETVAL
+ return $DS_RETVAL
fi
touch "${OCF_RESKEY_state}"
- RETVAL=$?
+ DS_RETVAL=$?
if [ "$(uname -n)" = "${OCF_RESKEY_fail_start_on}" ]; then
- RETVAL=$OCF_ERR_GENERIC
+ DS_RETVAL=$OCF_ERR_GENERIC
fi
- return $RETVAL
+ return $DS_RETVAL
}
dummy_stop() {
dummy_monitor --force
if [ $? -eq $OCF_SUCCESS ]; then
rm ${OCF_RESKEY_state}
fi
rm -f "${VERIFY_SERIALIZED_FILE}"
return $OCF_SUCCESS
}
dummy_monitor() {
# Monitor _MUST!_ differentiate correctly between running
# (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
# That is THREE states, not just yes/no.
if [ $OCF_RESKEY_op_sleep -ne 0 ]; then
- if [ "$1" = "" -a -f "${VERIFY_SERIALIZED_FILE}" ]; then
+ if [ "$1" = "" ] && [ -f "${VERIFY_SERIALIZED_FILE}" ]; then
# two monitor ops have occurred at the same time.
# This verifies a condition in pacemaker-execd regression tests.
ocf_log err "$VERIFY_SERIALIZED_FILE exists already"
return $OCF_ERR_GENERIC
fi
touch "${VERIFY_SERIALIZED_FILE}"
sleep ${OCF_RESKEY_op_sleep}
rm "${VERIFY_SERIALIZED_FILE}"
fi
if [ -f "${OCF_RESKEY_state}" ]; then
# Multiple monitor levels are defined to support various tests
case "$OCF_CHECK_LEVEL" in
10)
# monitor level with delay, useful for testing timeouts
sleep 30
;;
20)
# monitor level that fails intermittently
- n=$(expr $(dd if=/dev/urandom bs=1 count=1 2>/dev/null | od | head -1 | cut -f2 -d' ') % 5)
+ n=$(expr "$(dd if=/dev/urandom bs=1 count=1 2>/dev/null | od | head -1 | cut -f2 -d' ')" % 5)
if [ $n -eq 1 ]; then
ocf_exit_reason "smoke detected near CPU fan"
return $OCF_ERR_GENERIC
fi
;;
30)
# monitor level that always fails
ocf_exit_reason "hyperdrive quota reached"
return $OCF_ERR_GENERIC
;;
*)
;;
esac
return $OCF_SUCCESS
fi
return $OCF_NOT_RUNNING
}
dummy_validate() {
# Is the state directory writable?
state_dir=`dirname "$OCF_RESKEY_state"`
touch "$state_dir/$$"
if [ $? -ne 0 ]; then
return $OCF_ERR_ARGS
fi
rm "$state_dir/$$"
return $OCF_SUCCESS
}
: ${OCF_RESKEY_fake=dummy}
: ${OCF_RESKEY_op_sleep=0}
: ${OCF_RESKEY_CRM_meta_interval=0}
: ${OCF_RESKEY_CRM_meta_globally_unique:="false"}
if [ -z "$OCF_RESKEY_state" ]; then
OCF_RESKEY_state="${HA_VARRUN%%/}/Dummy-${OCF_RESOURCE_INSTANCE}.state"
if [ ${OCF_RESKEY_CRM_meta_globally_unique} = "false" ]; then
# Strip off the trailing clone marker (note + is not portable in sed)
OCF_RESKEY_state=`echo $OCF_RESKEY_state | sed s/:[0-9][0-9]*\.state/.state/`
fi
fi
VERIFY_SERIALIZED_FILE="${OCF_RESKEY_state}.serialized"
dump_env
case $__OCF_ACTION in
meta-data) meta_data
exit $OCF_SUCCESS
;;
start) dummy_start;;
stop) dummy_stop;;
monitor) dummy_monitor;;
migrate_to) ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} to ${OCF_RESKEY_CRM_meta_migrate_target}."
dummy_stop
;;
migrate_from) ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} from ${OCF_RESKEY_CRM_meta_migrate_source}."
dummy_start
;;
reload) ocf_log err "Reloading..."
dummy_start
;;
validate-all) dummy_validate;;
usage|help) dummy_usage
exit $OCF_SUCCESS
;;
*) dummy_usage
exit $OCF_ERR_UNIMPLEMENTED
;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc
diff --git a/extra/resources/HealthSMART.in b/extra/resources/HealthSMART.in
index 9e91298a9e..e02060581e 100755
--- a/extra/resources/HealthSMART.in
+++ b/extra/resources/HealthSMART.in
@@ -1,328 +1,328 @@
#!@BASH_PATH@
#
#
# HealthSMART OCF RA. Checks the S.M.A.R.T. status of all given
# drives and writes the #health-smart status into the CIB
#
# Copyright (c) 2009 Michael Schwartzkopff, 2010 Matthew Richardson
#
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
#######################################################################
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
. ${OCF_FUNCTIONS}
: ${__OCF_ACTION=$1}
#
SMARTCTL=/usr/sbin/smartctl
ATTRDUP=/usr/sbin/attrd_updater
#######################################################################
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="HealthSMART" version="0.1">
<version>1.0</version>
<longdesc lang="en">
System health agent that checks the S.M.A.R.T. status of the given drives and
updates the #health-smart attribute.
</longdesc>
<shortdesc lang="en">SMART health status</shortdesc>
<parameters>
<parameter name="state" unique="1">
<longdesc lang="en">
Location to store the resource state in.
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string" default="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state" />
</parameter>
<parameter name="drives" unique="0">
<longdesc lang="en">
The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda".
</longdesc>
<shortdesc lang="en">Drives to check</shortdesc>
<content type="string" default="/dev/sda" />
</parameter>
<parameter name="devices" unique="0">
<longdesc lang="en">
The device type(s) to assume for the drive(s) being tested as a SPACE separated list.
</longdesc>
<shortdesc lang="en">Device types</shortdesc>
<content type="string" />
</parameter>
<parameter name="temp_lower_limit" unique="0">
<longdesc lang="en">
Lower limit of the temperature in deg C of the drive(s). Below this limit the status will be red.
</longdesc>
<shortdesc lang="en">Lower limit for the red smart attribute</shortdesc>
<content type="string" default="0"/>
</parameter>
<parameter name="temp_upper_limit" unique="0">
<longdesc lang="en">
Upper limit of the temperature in deg C of the drive(s). If a drive reports
a temperature higher than this value, the status of #health-smart will be red.
</longdesc>
<shortdesc lang="en">Upper limit for red smart attribute</shortdesc>
<content type="string" default="60"/>
</parameter>
<parameter name="temp_warning" unique="0">
<longdesc lang="en">
Number of deg C below/above the upper/lower temp limits at which the status of #health-smart will change to yellow.
</longdesc>
<shortdesc lang="en">Deg C below/above the upper limits for yellow smart attribute</shortdesc>
<content type="string" default="5"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="10" />
<action name="stop" timeout="10" />
<action name="monitor" timeout="10" interval="10" start-delay="0" />
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="10" />
</actions>
</resource-agent>
END
}
#######################################################################
check_temperature() {
if [ $1 -lt ${lower_red_limit} ] ; then
ocf_log info "Drive ${DRIVE} ${DEVICE} too cold: ${1} C"
$ATTRDUP -n "#health-smart" -U "red" -d "5s"
return 1
fi
if [ $1 -gt ${upper_red_limit} ] ; then
ocf_log info "Drive ${DRIVE} ${DEVICE} too hot: ${1} C"
$ATTRDUP -n "#health-smart" -U "red" -d "5s"
return 1
fi
if [ $1 -lt ${lower_yellow_limit} ] ; then
ocf_log info "Drive ${DRIVE} ${DEVICE} quite cold: ${1} C"
$ATTRDUP -n "#health-smart" -U "yellow" -d "5s"
return 1
fi
if [ $1 -gt ${upper_yellow_limit} ] ; then
ocf_log info "Drive ${DRIVE} ${DEVICE} quite hot: ${1} C"
$ATTRDUP -n "#health-smart" -U "yellow" -d "5s"
return 1
fi
}
init_smart() {
#Set temperature defaults
if [ -z ${OCF_RESKEY_temp_warning} ]; then
yellow_threshold=5
else
yellow_threshold=${OCF_RESKEY_temp_warning}
fi
if [ -z ${OCF_RESKEY_temp_lower_limit} ] ; then
lower_red_limit=0
else
lower_red_limit=${OCF_RESKEY_temp_lower_limit}
fi
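# The yellow limits sit temp_warning deg C inside the respective red limits.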
lower_yellow_limit=$((${lower_red_limit}+${yellow_threshold}))
if [ -z ${OCF_RESKEY_temp_upper_limit} ] ; then
upper_red_limit=60
else
upper_red_limit=${OCF_RESKEY_temp_upper_limit}
fi
upper_yellow_limit=$((${upper_red_limit}-${yellow_threshold}))
#Set disk defaults
if [ -z "${OCF_RESKEY_drives}" ] ; then
DRIVES="/dev/sda"
else
DRIVES=${OCF_RESKEY_drives}
fi
#Test for presence of smartctl
if [ ! -x $SMARTCTL ] ; then
ocf_log err "${SMARTCTL} not installed."
exit $OCF_ERR_INSTALLED
fi
for DRIVE in $DRIVES; do
if [ "${OCF_RESKEY_devices}" ]; then
for DEVICE in ${OCF_RESKEY_devices}; do
$SMARTCTL -d $DEVICE -i ${DRIVE} | grep -q "SMART support is: Enabled"
if [ $? -ne "0" ] ; then
ocf_log err "S.M.A.R.T. not enabled for drive "${DRIVE}
exit $OCF_ERR_INSTALLED
fi
done
else
$SMARTCTL -i ${DRIVE} | grep -q "SMART support is: Enabled"
if [ $? -ne "0" ] ; then
ocf_log err "S.M.A.R.T. not enabled for drive "${DRIVE}
exit $OCF_ERR_INSTALLED
fi
fi
done
}
HealthSMART_usage() {
cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set.
END
}
HealthSMART_start() {
HealthSMART_monitor
if [ $? = $OCF_SUCCESS ]; then
return $OCF_SUCCESS
fi
touch ${OCF_RESKEY_state}
}
HealthSMART_stop() {
HealthSMART_monitor
if [ $? = $OCF_SUCCESS ]; then
rm ${OCF_RESKEY_state}
fi
return $OCF_SUCCESS
}
HealthSMART_monitor() {
init_smart
# Monitor _MUST!_ differentiate correctly between running
# (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
# That is THREE states, not just yes/no.
if [ -f ${OCF_RESKEY_state} ]; then
# Check overall S.M.A.R.T. status
for DRIVE in $DRIVES; do
if [ "${OCF_RESKEY_devices}" ]; then
for DEVICE in ${OCF_RESKEY_devices}; do
$SMARTCTL -d $DEVICE -H ${DRIVE} | grep -q "SMART overall-health self-assessment test result: PASSED"
if [ $? -ne "0" ]; then
$ATTRDUP -n "#health-smart" -U "red" -d "5s"
return $OCF_SUCCESS
fi
done
else
$SMARTCTL -H ${DRIVE} | grep -q "SMART overall-health self-assessment test result: PASSED"
if [ $? -ne "0" ]; then
$ATTRDUP -n "#health-smart" -U "red" -d "5s"
return $OCF_SUCCESS
fi
fi
# Check drive temperature(s)
if [ "${OCF_RESKEY_devices}" ]; then
for DEVICE in ${OCF_RESKEY_devices}; do
- check_temperature `$SMARTCTL -d $DEVICE -A ${DRIVE} | awk '/^194/ { print $10 }'`
- if [ $? != 0 ]; then
+ check_temperature "$("$SMARTCTL" -d "$DEVICE" -A "${DRIVE}" | awk '/^194/ { print $10 }')"
+ if [ $? -ne 0 ]; then
return $OCF_SUCCESS
fi
done
else
- check_temperature `$SMARTCTL -A ${DRIVE} | awk '/^194/ { print $10 }'`
- if [ $? != 0 ]; then
+ check_temperature "$("$SMARTCTL" -A "${DRIVE}" | awk '/^194/ { print $10 }')"
+ if [ $? -ne 0 ]; then
return $OCF_SUCCESS
fi
fi
done
$ATTRDUP -n "#health-smart" -U "green" -d "5s"
return $OCF_SUCCESS
fi
return $OCF_NOT_RUNNING
}
HealthSMART_validate() {
init_smart
# Is the state directory writable?
state_dir=`dirname "$OCF_RESKEY_state"`
touch "$state_dir/$$"
if [ $? != 0 ]; then
return $OCF_ERR_ARGS
fi
rm "$state_dir/$$"
return $OCF_SUCCESS
}
: ${OCF_RESKEY_CRM_meta_interval=0}
: ${OCF_RESKEY_CRM_meta_globally_unique:="true"}
if [ "x$OCF_RESKEY_state" = "x" ]; then
if [ ${OCF_RESKEY_CRM_meta_globally_unique} = "false" ]; then
state="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"
# Strip off the trailing clone marker
OCF_RESKEY_state=`echo $state | sed s/:[0-9][0-9]*\.state/.state/`
else
OCF_RESKEY_state="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"
fi
fi
case $__OCF_ACTION in
start) HealthSMART_start;;
stop) HealthSMART_stop;;
monitor) HealthSMART_monitor;;
validate-all) HealthSMART_validate;;
meta-data)
meta_data
exit $OCF_SUCCESS
;;
usage|help)
HealthSMART_usage
exit $OCF_SUCCESS
;;
*) HealthSMART_usage
exit $OCF_ERR_UNIMPLEMENTED
;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc
diff --git a/extra/resources/Stateful b/extra/resources/Stateful
index 6c2f1b6a6b..f1b7a40169 100755
--- a/extra/resources/Stateful
+++ b/extra/resources/Stateful
@@ -1,231 +1,231 @@
#!/bin/sh
#
# Example of a stateful OCF Resource Agent
# Copyright 2006-2018 Andrew Beekhof <andrew@beekhof.net>
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
. ${OCF_FUNCTIONS}
: ${__OCF_ACTION=$1}
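# Promotion scores set via $CRM_MASTER use the "reboot" lifetime, i.e. they
# are stored as transient attributes and cleared when the node restarts.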
CRM_MASTER="${HA_SBIN_DIR}/crm_master -l reboot"
#######################################################################
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="Stateful" version="1.1">
<version>1.0</version>
<longdesc lang="en">
This is an example resource agent that implements two states
</longdesc>
<shortdesc lang="en">Example stateful resource agent</shortdesc>
<parameters>
<parameter name="state" unique="1">
<longdesc lang="en">
Location to store the resource state in
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string" default="${HA_VARRUN%%/}/Stateful-${OCF_RESOURCE_INSTANCE}.state" />
</parameter>
<parameter name="envfile" unique="1">
<longdesc lang="en">
If this is set, the environment will be dumped to this file for every call.
</longdesc>
<shortdesc lang="en">Environment dump file</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="notify_delay" unique="0">
<longdesc lang="en">
The notify action will sleep for this many seconds before returning,
to simulate a long-running notify.
</longdesc>
<shortdesc lang="en">Notify delay in seconds</shortdesc>
<content type="string" default="" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="monitor" depth="0" timeout="20" interval="10" role="Master"/>
<action name="monitor" depth="0" timeout="20" interval="10" role="Slave"/>
<action name="notify" timeout="5" />
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="30" />
</actions>
</resource-agent>
END
exit $OCF_SUCCESS
}
#######################################################################
stateful_usage() {
cat <<END
usage: $0 {start|stop|promote|demote|monitor|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set.
END
exit $1
}
stateful_update() {
echo $1 > ${OCF_RESKEY_state}
}
stateful_check_state() {
target=$1
if [ -f ${OCF_RESKEY_state} ]; then
state=`cat ${OCF_RESKEY_state}`
if [ "x$target" = "x$state" ]; then
return 0
fi
else
if [ "x$target" = "x" ]; then
return 0
fi
fi
return 1
}
dump_env() {
if [ "${OCF_RESKEY_envfile}" != "" ]; then
echo "### ${__OCF_ACTION} @ $(date) ###
$(env | sort)
###" >> "${OCF_RESKEY_envfile}"
fi
}
stateful_start() {
stateful_check_state master
if [ $? = 0 ]; then
# CRM Error - Should never happen
return $OCF_RUNNING_MASTER
fi
stateful_update slave
$CRM_MASTER -v ${slave_score}
return 0
}
stateful_demote() {
stateful_check_state
if [ $? = 0 ]; then
# CRM Error - Should never happen
return $OCF_NOT_RUNNING
fi
stateful_update slave
$CRM_MASTER -v ${slave_score}
return 0
}
stateful_promote() {
stateful_check_state
if [ $? = 0 ]; then
return $OCF_NOT_RUNNING
fi
stateful_update master
$CRM_MASTER -v ${master_score}
return 0
}
stateful_stop() {
$CRM_MASTER -D
stateful_check_state master
if [ $? = 0 ]; then
# CRM Error - Should never happen
return $OCF_RUNNING_MASTER
fi
if [ -f ${OCF_RESKEY_state} ]; then
rm ${OCF_RESKEY_state}
fi
return 0
}
stateful_monitor() {
stateful_check_state "master"
if [ $? = 0 ]; then
if [ $OCF_RESKEY_CRM_meta_interval = 0 ]; then
# Restore the master setting during probes
$CRM_MASTER -v ${master_score}
fi
return $OCF_RUNNING_MASTER
fi
stateful_check_state "slave"
if [ $? = 0 ]; then
if [ $OCF_RESKEY_CRM_meta_interval = 0 ]; then
# Restore the master setting during probes
$CRM_MASTER -v ${slave_score}
fi
return $OCF_SUCCESS
fi
if [ -f ${OCF_RESKEY_state} ]; then
echo "File '${OCF_RESKEY_state}' exists but contains unexpected contents"
cat ${OCF_RESKEY_state}
return $OCF_ERR_GENERIC
fi
return 7
}
stateful_notify() {
if [ "${OCF_RESKEY_notify_delay}" != "0" ]; then
sleep "${OCF_RESKEY_notify_delay}"
fi
return $OCF_SUCCESS
}
stateful_validate() {
exit $OCF_SUCCESS
}
: ${slave_score=5}
: ${master_score=10}
: ${OCF_RESKEY_CRM_meta_interval=0}
-: ${OCF_RESKEY_CRM_notify_delay=0}
+: ${OCF_RESKEY_notify_delay=0}
: ${OCF_RESKEY_CRM_meta_globally_unique:="false"}
if [ "x$OCF_RESKEY_state" = "x" ]; then
if [ ${OCF_RESKEY_CRM_meta_globally_unique} = "false" ]; then
state="${HA_VARRUN%%/}/Stateful-${OCF_RESOURCE_INSTANCE}.state"
# Strip off the trailing clone marker
OCF_RESKEY_state=`echo $state | sed s/:[0-9][0-9]*\.state/.state/`
else
OCF_RESKEY_state="${HA_VARRUN%%/}/Stateful-${OCF_RESOURCE_INSTANCE}.state"
fi
fi
dump_env
case $__OCF_ACTION in
meta-data) meta_data;;
start) stateful_start;;
promote) stateful_promote;;
demote) stateful_demote;;
notify) stateful_notify ;;
stop) stateful_stop;;
monitor) stateful_monitor;;
validate-all) stateful_validate;;
usage|help) stateful_usage $OCF_SUCCESS;;
*) stateful_usage $OCF_ERR_UNIMPLEMENTED;;
esac
exit $?
diff --git a/extra/resources/SysInfo.in b/extra/resources/SysInfo.in
index 24c259cc1b..8cba0e5cbe 100755
--- a/extra/resources/SysInfo.in
+++ b/extra/resources/SysInfo.in
@@ -1,386 +1,382 @@
#!@BASH_PATH@
#
#
# SysInfo OCF Resource Agent
# It records (in the CIB) various attributes of a node
#
-# Copyright (c) 2004 SUSE LINUX AG, Lars Marowsky-Brée
+# Copyright 2004-2018 SUSE LINUX AG, Lars Marowsky-Brée
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
. ${OCF_FUNCTIONS}
: ${__OCF_ACTION=$1}
#######################################################################
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="SysInfo" version="1.0">
<version>1.0</version>
<longdesc lang="en">
This is a SysInfo Resource Agent.
It records (in the CIB) various attributes of a node
Sample Linux output:
arch: i686
os: Linux-2.4.26-gentoo-r14
free_swap: 1999
cpu_info: Intel(R) Celeron(R) CPU 2.40GHz
cpu_speed: 4771.02
cpu_cores: 1
cpu_load: 0.00
ram_total: 513
ram_free: 117
root_free: 2.4
#health_disk: red
Sample Darwin output:
arch: i386
os: Darwin-8.6.2
cpu_info: Intel Core Duo
cpu_speed: 2.16
cpu_cores: 2
cpu_load: 0.18
ram_total: 2016
ram_free: 787
root_free: 13
#health_disk: green
Units:
free_swap: MB
ram_*: MB
cpu_speed (Linux): bogomips
cpu_speed (Darwin): GHz
*_free: GB (or user-defined: disk_unit)
</longdesc>
<shortdesc lang="en">SysInfo resource agent</shortdesc>
<parameters>
<parameter name="pidfile" unique="1">
<longdesc lang="en">PID file</longdesc>
<shortdesc lang="en">PID file</shortdesc>
<content type="string" default="$OCF_RESKEY_pidfile" />
</parameter>
<parameter name="delay" unique="0">
<longdesc lang="en">Interval to allow values to stabilize</longdesc>
<shortdesc lang="en">Dampening Delay</shortdesc>
<content type="string" default="0s" />
</parameter>
<parameter name="disks" unique="0">
<longdesc lang="en">
Filesystems or Paths to be queried for free disk space as a SPACE
separated list - e.g "/dev/sda1 /tmp".
Results will be written to an attribute with leading slashes
removed, and other slashes replaced with underscore, and the word
'free' appended - e.g for /dev/sda1 it would be 'dev_sda1_free'.
Note: The root filesystem '/' is always queried to an attribute
named 'root_free'
</longdesc>
<shortdesc lang="en">List of Filesytems/Paths to query for free disk space</shortdesc>
<content type="string" />
</parameter>
<parameter name="disk_unit" unique="0">
<longdesc lang="en">
Unit to report disk free space in.
Can be one of: B, K, M, G, T, P (case-insensitive)
</longdesc>
<shortdesc lang="en">Unit to report disk free space in</shortdesc>
<content type="string" default="G"/>
</parameter>
<parameter name="min_disk_free" unique="0">
<longdesc lang="en">
The amount of free space required in monitored disks. If any
of the monitored disks has less than this amount of free space,
the node attribute "#health_disk" will change to "red",
and all resources will move away from the node. Set the node-health-strategy
property appropriately for this to take effect.
If the unit is not specified, it defaults to disk_unit.
</longdesc>
<shortdesc lang="en">minimum disk free space required</shortdesc>
<content type="string" default=""/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="20s" />
<action name="stop" timeout="20s" />
<action name="monitor" timeout="20s" interval="60s"/>
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="30" />
</actions>
</resource-agent>
END
}
#######################################################################
UpdateStat() {
name=$1; shift
value="$*"
printf "%s:\t%s\n" "$name" "$value"
if [ "$__OCF_ACTION" = "start" ] ; then
${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -S status -n $name -B "$value"
else
${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -S status -n $name -v "$value"
fi
}
SysInfoStats() {
UpdateStat arch "`uname -m`"
UpdateStat os "`uname -s`-`uname -r`"
case `uname -s` in
"Darwin")
mem=`top -l 1 | grep Mem: | awk '{print $10}'`
mem_used=`top -l 1 | grep Mem: | awk '{print $8}'`
mem=`SysInfo_mem_units $mem`
mem_used=`SysInfo_mem_units $mem_used`
mem_total=`expr $mem_used + $mem`
cpu_type=`system_profiler SPHardwareDataType | awk -F': ' '/^CPU Type/ {print $2; exit}'`
cpu_speed=`system_profiler SPHardwareDataType | awk -F': ' '/^CPU Speed/ {print $2; exit}'`
cpu_cores=`system_profiler SPHardwareDataType | awk -F': ' '/^Number Of/ {print $2; exit}'`
cpu_load=`uptime | awk '{ print $10 }'`
;;
"FreeBSD")
cpu_type=`sysctl -in hw.model`
cpu_speed=`sysctl -in dev.cpu.0.freq`
cpu_cores=`sysctl -in hw.ncpu`
cpu_load=`sysctl -in vm.loadavg | awk '{ print $4 }'`
free_pages=`sysctl -in vm.stats.vm.v_free_count`
page_count=`sysctl -in vm.stats.vm.v_page_count`
page_size=`sysctl -in vm.stats.vm.v_page_size`
mem=`expr $free_pages \* $page_size / 1024 / 1024`M
mem_total=`expr $page_count \* $page_size / 1024 / 1024`M
;;
"Linux")
if [ -f /proc/cpuinfo ]; then
cpu_type=`awk -F': ' '/model name/ {print $2; exit}' /proc/cpuinfo`
cpu_speed=`awk -F': ' '/bogomips/ {print $2; exit}' /proc/cpuinfo`
cpu_cores=`grep "^processor" /proc/cpuinfo | wc -l`
fi
cpu_load=`uptime | awk '{ print $10 }'`
if [ -f /proc/meminfo ]; then
# meminfo results are in kB
mem=`grep "SwapFree" /proc/meminfo | awk '{print $2"k"}'`
if [ ! -z $mem ]; then
- UpdateStat free_swap `SysInfo_mem_units $mem`
+ UpdateStat free_swap "$(SysInfo_mem_units "$mem")"
fi
mem=`grep "Inactive" /proc/meminfo | awk '{print $2"k"}'`
mem_total=`grep "MemTotal" /proc/meminfo | awk '{print $2"k"}'`
else
mem=`top -n 1 | grep Mem: | awk '{print $7}'`
fi
;;
*)
esac
if [ x != x"$cpu_type" ]; then
UpdateStat cpu_info "$cpu_type"
fi
if [ x != x"$cpu_speed" ]; then
UpdateStat cpu_speed "$cpu_speed"
fi
if [ x != x"$cpu_cores" ]; then
UpdateStat cpu_cores "$cpu_cores"
fi
if [ x != x"$cpu_load" ]; then
UpdateStat cpu_load "$cpu_load"
fi
if [ ! -z "$mem" ]; then
# Massage the memory values
- UpdateStat ram_total `SysInfo_mem_units $mem_total`
- UpdateStat ram_free `SysInfo_mem_units $mem`
+ UpdateStat ram_total "$(SysInfo_mem_units "$mem_total")"
+ UpdateStat ram_free "$(SysInfo_mem_units "$mem")"
fi
# Portability notes:
# o tail: explicit "-n" not available in Solaris; instead simplify
# 'tail -n <c>' to the equivalent 'tail -<c>'.
for disk in "/" ${OCF_RESKEY_disks}; do
unset disk_free disk_label
disk_free=`df -h ${disk} | tail -1 | awk '{print $4}'`
if [ x != x"$disk_free" ]; then
disk_label=`echo $disk | sed -e 's#^/$#root#;s#^/*##;s#/#_#g'`
disk_free=`SysInfo_hdd_units $disk_free`
UpdateStat ${disk_label}_free $disk_free
if [ -n "$MIN_FREE" ]; then
if [ $disk_free -le $MIN_FREE ]; then
UpdateStat "#health_disk" "red"
else
UpdateStat "#health_disk" "green"
fi
fi
fi
done
}
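# e.g. a hypothetical configuration with disks="/dev/sda1 /var/log" would
# produce the attributes root_free, dev_sda1_free, and var_log_free, plus
# "#health_disk" whenever min_disk_free is set.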
SysInfo_megabytes() {
# Size in megabytes
echo $1 | awk '{ n = $0;
sub(/[0-9]+(.[0-9]+)?/, "");
split(n, a, $0);
n=a[1];
if ($0 == "G" || $0 == "") { n *= 1024 };
if (/^kB?/) { n /= 1024 };
printf "%d\n", n }' # Intentionally round to an integer
}
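# Worked examples (hypothetical inputs, for illustration):
#   "513012k" -> /^kB?/ matches: 513012 / 1024 -> 500 (truncated by %d)
#   "2.5G"    -> "G" suffix:     2.5 * 1024    -> 2560
#   "512M"    -> "M" needs no conversion       -> 512
#   "4"       -> a bare number counts as gigabytes -> 4096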
SysInfo_mem_units() {
mem=$1
if [ -z $1 ]; then
return
fi
mem=$(SysInfo_megabytes "$1")
# Round to the next multiple of 50
r=$(($mem % 50))
if [ $r != 0 ]; then
mem=$(($mem + 50 - $r))
fi
echo $mem
}
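# Worked example (hypothetical input): SysInfo_mem_units "117M" gives
# 117 MB; 117 % 50 = 17, and 117 + 50 - 17 = 150, so "150" is echoed.
# Reported RAM values therefore always land on a 50MB boundary.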
SysInfo_hdd_units() {
# Defaults to size in gigabytes
case $OCF_RESKEY_disk_unit in
[Pp]) echo $(($(SysInfo_megabytes "$1") / 1024 / 1024 / 1024));;
[Tt]) echo $(($(SysInfo_megabytes "$1") / 1024 / 1024));;
[Gg]) echo $(($(SysInfo_megabytes "$1") / 1024));;
- [Mm]) echo $(SysInfo_megabytes "$1");;
+ [Mm]) echo "$(SysInfo_megabytes "$1")" ;;
[Kk]) echo $(($(SysInfo_megabytes "$1") * 1024));;
[Bb]) echo $(($(SysInfo_megabytes "$1") * 1024 * 1024));;
*)
ocf_log err "Invalid value for disk_unit: $OCF_RESKEY_disk_unit"
echo $(($(SysInfo_megabytes "$1") / 1024));;
esac
}
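# Worked example (hypothetical input): with OCF_RESKEY_disk_unit=G,
# SysInfo_hdd_units "2.4G" -> SysInfo_megabytes yields 2457, and
# 2457 / 1024 -> 2 (shell integer arithmetic truncates).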
SysInfo_usage() {
cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set.
END
}
SysInfo_start() {
echo $OCF_RESKEY_clone > $OCF_RESKEY_pidfile
SysInfoStats
exit $OCF_SUCCESS
}
SysInfo_stop() {
rm $OCF_RESKEY_pidfile
exit $OCF_SUCCESS
}
SysInfo_monitor() {
if [ -f $OCF_RESKEY_pidfile ]; then
clone=`cat $OCF_RESKEY_pidfile`
fi
if [ x$clone = x ]; then
rm $OCF_RESKEY_pidfile
exit $OCF_NOT_RUNNING
elif [ $clone = $OCF_RESKEY_clone ]; then
SysInfoStats
exit $OCF_SUCCESS
- elif [ x$OCF_RESKEY_CRM_meta_globally_unique = xtrue
- -o x$OCF_RESKEY_CRM_meta_globally_unique = xTrue
- -o x$OCF_RESKEY_CRM_meta_globally_unique = xyes
- -o x$OCF_RESKEY_CRM_meta_globally_unique = xYes
- ]; then
+ elif ocf_is_true "$OCF_RESKEY_CRM_meta_globally_unique"; then
SysInfoStats
exit $OCF_SUCCESS
fi
exit $OCF_NOT_RUNNING
}
SysInfo_validate() {
return $OCF_SUCCESS
}
if [ $# -ne 1 ]; then
SysInfo_usage
exit $OCF_ERR_ARGS
fi
: ${OCF_RESKEY_pidfile:="${HA_VARRUN%%/}/SysInfo-${OCF_RESOURCE_INSTANCE}"}
: ${OCF_RESKEY_disk_unit:="G"}
: ${OCF_RESKEY_clone:="0"}
if [ x != x${OCF_RESKEY_delay} ]; then
OCF_RESKEY_delay="-d ${OCF_RESKEY_delay}"
else
OCF_RESKEY_delay="-d 0"
fi
MIN_FREE=""
if [ -n "$OCF_RESKEY_min_disk_free" ]; then
ocf_is_decimal "$OCF_RESKEY_min_disk_free" &&
OCF_RESKEY_min_disk_free="$OCF_RESKEY_min_disk_free$OCF_RESKEY_disk_unit"
MIN_FREE=`SysInfo_hdd_units $OCF_RESKEY_min_disk_free`
fi
case $__OCF_ACTION in
meta-data) meta_data
exit $OCF_SUCCESS
;;
start) SysInfo_start
;;
stop) SysInfo_stop
;;
monitor) SysInfo_monitor
;;
validate-all) SysInfo_validate
;;
usage|help) SysInfo_usage
exit $OCF_SUCCESS
;;
*) SysInfo_usage
exit $OCF_ERR_UNIMPLEMENTED
;;
esac
exit $?
diff --git a/extra/resources/SystemHealth b/extra/resources/SystemHealth
index 3e76fc3221..4f5701843a 100755
--- a/extra/resources/SystemHealth
+++ b/extra/resources/SystemHealth
@@ -1,252 +1,252 @@
#!/bin/sh
#
# SystemHealth OCF RA.
#
-# Copyright (c) 2009 International Business Machines (IBM), Mark Hamzy
+# Copyright 2009-2018 International Business Machines (IBM), Mark Hamzy
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
. ${OCF_FUNCTIONS}
: ${__OCF_ACTION=$1}
#######################################################################
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="SystemHealth" version="0.1">
<version>1.0</version>
<longdesc lang="en">
This is a SystemHealth Resource Agent. It is used to monitor
the health of a system via IPMI.
</longdesc>
<shortdesc lang="en">SystemHealth resource agent</shortdesc>
<parameters>
</parameters>
<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="monitor" timeout="20" />
<action name="reload" timeout="20" />
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="20" />
</actions>
</resource-agent>
END
}
#######################################################################
SystemHealth_usage() {
cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set.
END
}
SystemHealth_check_tools() {
which servicelog_notify > /dev/null 2>&1
RC=$?
if [ $RC != 0 ]; then
ocf_log err "servicelog_notify not found!"
return $OCF_ERR_INSTALLED
fi
which ipmiservicelogd > /dev/null 2>&1
RC=$?
if [ $RC != 0 ]; then
ocf_log err "ipmiservicelogd not found!"
return $OCF_ERR_INSTALLED
fi
test -x $OCF_RESKEY_program
RC=$?
if [ $RC != 0 ]; then
ocf_log err "$OCF_RESKEY_program not found!"
return $OCF_ERR_INSTALLED
fi
}
SystemHealth_start() {
SystemHealth_monitor
RC=$?
if [ $RC = $OCF_ERR_GENERIC ]; then
return $OCF_ERR_GENERIC
elif [ $RC = $OCF_SUCCESS ]; then
ocf_log warn "starting an already started SystemHealth"
return $OCF_SUCCESS
fi
service ipmi start > /dev/null 2>&1
RC=$?
if [ $RC != 0 ]; then
ocf_log err "Could not start service IPMI!"
return $OCF_ERR_GENERIC
fi
ipmiservicelogd smi 0 > /dev/null 2>&1 &
RC=$?
if [ $RC != 0 ]; then
ocf_log err "Could not start ipmiservicelogd!"
return $OCF_ERR_GENERIC
fi
servicelog_notify --add --type=EVENT --command="$OCF_RESKEY_program" --method=num_arg --match='type=4' > /dev/null 2>&1
RC=$?
if [ $RC != 0 ]; then
ocf_log err "servicelog_notify register handler failed!"
return $OCF_ERR_GENERIC
fi
return $OCF_SUCCESS
}
SystemHealth_stop() {
SystemHealth_monitor
RC=$?
if [ $RC = $OCF_ERR_GENERIC ]; then
return $OCF_ERR_GENERIC
elif [ $RC = $OCF_SUCCESS ]; then
killall ipmiservicelogd
RC1=$?
if [ $RC1 != 0 ]; then
ocf_log err "Could not stop ipmiservicelogd!"
fi
servicelog_notify --remove --command="$OCF_RESKEY_program" > /dev/null 2>&1
RC2=$?
if [ $RC2 != 0 ]; then
ocf_log err "servicelog_notify remove handler failed!"
fi
- if [ $RC1 = 0 -a $RC2 = 0 ]; then
+ if [ $RC1 -eq 0 ] && [ $RC2 -eq 0 ]; then
return $OCF_SUCCESS
else
return $OCF_ERR_GENERIC
fi
elif [ $RC = $OCF_NOT_RUNNING ]; then
ocf_log warn "stopping an already stopped SystemHealth"
return $OCF_SUCCESS
else
ocf_log err "SystemHealth_stop: should not be here!"
return $OCF_ERR_GENERIC
fi
}
SystemHealth_monitor() {
# Monitor _MUST!_ differentiate correctly between running
# (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
# That is THREE states, not just yes/no.
if [ ! -f /var/run/ipmiservicelogd.pid0 ]; then
ocf_log debug "ipmiservicelogd is not running!"
return $OCF_NOT_RUNNING
fi
- ps -p `cat /var/run/ipmiservicelogd.pid0` > /dev/null 2>&1
+ ps -p "$(cat /var/run/ipmiservicelogd.pid0)" >/dev/null 2>&1
RC=$?
if [ $RC != 0 ]; then
ocf_log debug "ipmiservicelogd's pid `cat /var/run/ipmiservicelogd.pid0` is not running!"
rm /var/run/ipmiservicelogd.pid0
return $OCF_ERR_GENERIC
fi
servicelog_notify --list --command="$OCF_RESKEY_program" > /dev/null 2>&1
RC=$?
if [ $RC = 0 ]; then
return $OCF_SUCCESS
else
return $OCF_NOT_RUNNING
fi
}
SystemHealth_validate() {
SystemHealth_check_tools
RC=$?
if [ $RC != 0 ]; then
return $RC
fi
return $OCF_SUCCESS
}
: ${OCF_RESKEY_program=/usr/sbin/notifyServicelogEvent}
case $__OCF_ACTION in
meta-data) meta_data
exit $OCF_SUCCESS
;;
usage|help) SystemHealth_usage
exit $OCF_SUCCESS
;;
esac
SystemHealth_check_tools
RC=$?
if [ $RC != 0 ]; then
case $__OCF_ACTION in
stop) exit $OCF_SUCCESS;;
*) exit $RC;;
esac
fi
case $__OCF_ACTION in
start) SystemHealth_start;;
stop) SystemHealth_stop;;
monitor) SystemHealth_monitor;;
reload) ocf_log info "Reloading..."
SystemHealth_start
;;
validate-all) ;;
*) SystemHealth_usage
exit $OCF_ERR_UNIMPLEMENTED
;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc
diff --git a/extra/resources/attribute b/extra/resources/attribute
index 7444ec607e..80679d1d96 100755
--- a/extra/resources/attribute
+++ b/extra/resources/attribute
@@ -1,233 +1,233 @@
#!/bin/sh
#
# ocf:pacemaker:attribute resource agent
#
-# Copyright (C) 2016 Andrew Beekhof <andrew@beekhof.net>
+# Copyright 2016-2018 Andrew Beekhof <andrew@beekhof.net>
#
# This source code is licensed under GNU General Public License version 2 or
# later (GPLv2+) WITHOUT ANY WARRANTY.
#
USAGE="Usage: $0 {start|stop|monitor|migrate_to|migrate_from|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set."
# Load OCF helper functions
: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
. ${OCF_FUNCTIONS}
: ${__OCF_ACTION=$1}
# Ensure certain variables are set and not empty
: ${HA_VARRUN:="/var/run"}
: ${OCF_RESKEY_CRM_meta_globally_unique:="false"}
: ${OCF_RESOURCE_INSTANCE:="undef"}
DEFAULT_STATE_FILE="${HA_VARRUN%%/}/opa-${OCF_RESOURCE_INSTANCE}.state"
if [ ${OCF_RESKEY_CRM_meta_globally_unique} = "false" ]; then
# Strip off any trailing clone marker (note + is not portable in sed)
DEFAULT_STATE_FILE=$(echo "$DEFAULT_STATE_FILE" | sed s/:[0-9][0-9]*\.state/.state/)
fi
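# e.g. a hypothetical clone instance "myattr:2" would otherwise get
# ".../opa-myattr:2.state"; stripping the marker makes all anonymous clone
# instances share ".../opa-myattr.state".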
DEFAULT_ATTR_NAME="opa-${OCF_RESOURCE_INSTANCE}"
DEFAULT_ACTIVE_VALUE="1"
DEFAULT_INACTIVE_VALUE="0"
: ${OCF_RESKEY_state:="$DEFAULT_STATE_FILE"}
: ${OCF_RESKEY_name:="$DEFAULT_ATTR_NAME"}
# Values may be empty string
if [ -z ${OCF_RESKEY_active_value+x} ]; then
OCF_RESKEY_active_value="$DEFAULT_ACTIVE_VALUE"
fi
if [ -z ${OCF_RESKEY_inactive_value+x} ]; then
OCF_RESKEY_inactive_value="$DEFAULT_INACTIVE_VALUE"
fi
usage() {
USAGE_RC=$1
cat <<END
$USAGE
END
return $USAGE_RC
}
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="attribute" version="1.0">
<version>1.0</version>
<shortdesc lang="en">Manages a node attribute</shortdesc>
<longdesc lang="en">
This resource agent controls a node attribute for the node it's running on.
It sets the attribute one way when started, and another way when stopped,
according to the configuration parameters.
</longdesc>
<parameters>
<parameter name="state" unique="1">
<longdesc lang="en">
Full path of a temporary file to store the resource state in
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string" default="${DEFAULT_STATE_FILE}" />
</parameter>
<parameter name="name" unique="1">
<longdesc lang="en">
Name of node attribute to manage
</longdesc>
<shortdesc lang="en">Attribute name</shortdesc>
<content type="string" default="${DEFAULT_ATTR_NAME}" />
</parameter>
<parameter name="active_value" unique="0">
<longdesc lang="en">
Value to use for node attribute when resource becomes active (empty string is
discouraged, because monitor cannot distinguish it from a query error)
</longdesc>
<shortdesc lang="en">Attribute value when active</shortdesc>
<content type="string" default="$DEFAULT_ACTIVE_VALUE" />
</parameter>
<parameter name="inactive_value" unique="0">
<longdesc lang="en">
Value to use for node attribute when resource becomes inactive
</longdesc>
<shortdesc lang="en">Attribute value when inactive</shortdesc>
<content type="string" default="$DEFAULT_INACTIVE_VALUE" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="monitor" timeout="20" interval="10" depth="0"/>
<action name="reload" timeout="20" />
<action name="migrate_to" timeout="20" />
<action name="migrate_from" timeout="20" />
<action name="validate-all" timeout="20" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
END
return $OCF_SUCCESS
}
validate() {
if [ "$OCF_RESKEY_active_value" = "$OCF_RESKEY_inactive_value" ]; then
ocf_exit_reason "active value '%s' must be different from inactive value '%s'" \
"$OCF_RESKEY_active_value" "$OCF_RESKEY_inactive_value"
return $OCF_ERR_CONFIGURED
fi
VALIDATE_DIR=$(dirname "${OCF_RESKEY_state}")
if [ ! -d "$VALIDATE_DIR" ]; then
ocf_exit_reason "state file '$OCF_RESKEY_state' does not have a valid directory"
return $OCF_ERR_PERM
fi
- if [ ! -w "$VALIDATE_DIR" -o ! -x "$VALIDATE_DIR" ]; then
+ if [ ! -w "$VALIDATE_DIR" ] || [ ! -x "$VALIDATE_DIR" ]; then
ocf_exit_reason "insufficient privileges on directory of state file '$OCF_RESKEY_state'"
return $OCF_ERR_PERM
fi
return $OCF_SUCCESS
}
get_attribute() {
GET_LINE=$(attrd_updater -n "$OCF_RESKEY_name" -Q 2>/dev/null)
if [ $? -ne 0 ]; then
echo ""
else
echo "$GET_LINE" | sed -e "s/.* value=\"\(.*\)\"$/\1/"
fi
}
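# On success, attrd_updater -Q prints a line of the form (values here are
# hypothetical):
#   name="opa-myattr" host="node1" value="1"
# which the sed above reduces to just: 1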
set_attribute() {
attrd_updater -n "$OCF_RESKEY_name" -U "$1" 2>/dev/null
# TODO if above call is async, loop until get_attribute returns expected value
}
check_attribute() {
CHECK_VALUE=$(get_attribute)
CHECK_REASON=""
if [ ! -f "$OCF_RESKEY_state" ]; then
- if [ "$CHECK_VALUE" != "" -a "$CHECK_VALUE" != "$OCF_RESKEY_inactive_value" ]; then
+ if [ "$CHECK_VALUE" != "" ] && [ "$CHECK_VALUE" != "$OCF_RESKEY_inactive_value" ]; then
CHECK_REASON="Node attribute $OCF_RESKEY_name='$CHECK_VALUE' differs from expected value '$OCF_RESKEY_inactive_value'"
return $OCF_ERR_GENERIC
fi
return $OCF_NOT_RUNNING
fi
if [ "$CHECK_VALUE" != "$OCF_RESKEY_active_value" ]; then
CHECK_REASON="Node attribute $OCF_RESKEY_name='$CHECK_VALUE' differs from expected value '$OCF_RESKEY_active_value'"
return $OCF_ERR_GENERIC
fi
return $OCF_SUCCESS
}
monitor() {
check_attribute
MONITOR_RC=$?
if [ $MONITOR_RC -eq $OCF_ERR_GENERIC ]; then
ocf_exit_reason "$CHECK_REASON"
fi
return $MONITOR_RC
}
start() {
check_attribute
if [ $? -eq $OCF_SUCCESS ]; then
return $OCF_SUCCESS
fi
touch "${OCF_RESKEY_state}" 2>/dev/null
if [ $? -ne 0 ]; then
ocf_exit_reason "Unable to manage state file $OCF_RESKEY_state"
return $OCF_ERR_GENERIC
fi
set_attribute "${OCF_RESKEY_active_value}"
if [ $? -ne 0 ]; then
rm -f "${OCF_RESKEY_state}"
ocf_exit_reason "Unable to set node attribute $OCF_RESKEY_name='$OCF_RESKEY_active_value'"
return $OCF_ERR_GENERIC
fi
return $OCF_SUCCESS
}
stop() {
check_attribute
if [ $? -eq $OCF_NOT_RUNNING ]; then
return $OCF_SUCCESS
fi
rm -f ${OCF_RESKEY_state}
set_attribute "${OCF_RESKEY_inactive_value}"
if [ $? -ne 0 ]; then
ocf_exit_reason "Unable to set node attribute $OCF_RESKEY_name='$OCF_RESKEY_inactive_value'"
return $OCF_ERR_GENERIC
fi
return $OCF_SUCCESS
}
case $__OCF_ACTION in
meta-data) meta_data ;;
start) start ;;
stop) stop ;;
monitor) monitor ;;
# We don't do anything special for live migration, but we support it so that
# other resources that live migrate can depend on this one.
migrate_to) stop ;;
migrate_from) start ;;
reload) start ;;
validate-all) validate ;;
usage|help) usage $OCF_SUCCESS ;;
*) usage $OCF_ERR_UNIMPLEMENTED ;;
esac
exit $?
# vim: expandtab:tabstop=4:softtabstop=4:shiftwidth=4:textwidth=80
diff --git a/extra/resources/controld b/extra/resources/controld
index b4bd026c64..7c44845bfc 100755
--- a/extra/resources/controld
+++ b/extra/resources/controld
@@ -1,306 +1,298 @@
#!/bin/sh
#
# OCF resource agent for managing the DLM controld process
#
-# Copyright (c) 2009 Novell, Inc
+# Copyright 2009-2018 Novell, Inc
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
. ${OCF_FUNCTIONS}
: ${__OCF_ACTION=$1}
#######################################################################
if [ -e "$OCF_ROOT/resource.d/heartbeat/controld" ]; then
ocf_log info "Using heartbeat controld agent"
$OCF_ROOT/resource.d/heartbeat/controld $1
exit $?
fi
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="controld" version="1.1">
<version>1.0</version>
<longdesc lang="en">
This Resource Agent can control the dlm_controld services needed by cluster-aware file systems.
It assumes that dlm_controld is in your default PATH.
In most cases, it should be run as an anonymous clone.
</longdesc>
<shortdesc lang="en">DLM Agent for cluster file systems</shortdesc>
<parameters>
<parameter name="args" unique="1">
<longdesc lang="en">
Any additional options to start the dlm_controld service with
</longdesc>
<shortdesc lang="en">DLM Options</shortdesc>
<content type="string" default="-s 0" />
</parameter>
<parameter name="daemon" unique="1">
<longdesc lang="en">
The daemon to start - supports gfs_controld(.pcmk) and dlm_controld(.pcmk)
</longdesc>
<shortdesc lang="en">The daemon to start</shortdesc>
<content type="string" default="dlm_controld.pcmk" />
</parameter>
<parameter name="allow_stonith_disabled">
<longdesc lang="en">
Allow DLM start-up even if STONITH/fencing is disabled in the cluster.
Setting this option to true will cause cluster malfunction and hangs on
fail-over for DLM clients that require fencing (such as GFS2, OCFS2, and
cLVM2).
This option is for advanced use only.
</longdesc>
<shortdesc lang="en">Allow start-up even without STONITH/fencing</shortdesc>
<content type="string" default="false" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="90" />
<action name="stop" timeout="100" />
<action name="monitor" timeout="20" interval="10" depth="0" start-delay="0" />
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="30" />
</actions>
</resource-agent>
END
}
#######################################################################
CONFIGFS_DIR="/sys/kernel/config"
DLM_CONFIGFS_DIR="${CONFIGFS_DIR}/dlm"
DLM_SYSFS_DIR="/sys/kernel/dlm"
controld_usage() {
cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set.
END
}
check_uncontrolled_locks()
{
- local tmp
- tmp=$(ls $DLM_SYSFS_DIR 2>&1)
+ CUL_TMP=$(ls $DLM_SYSFS_DIR 2>&1)
if [ $? -eq 0 ]; then
- if [ -n "$tmp" ]; then
+ if [ -n "$CUL_TMP" ]; then
ocf_log err "Uncontrolled lockspace exists, system must reboot. Executing suicide fencing"
- stonith_admin --reboot=$(crm_node -n) --tag controld
+ stonith_admin --reboot="$(crm_node -n)" --tag controld
exit $OCF_ERR_GENERIC
fi
fi
}
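# i.e. if lockspace directories linger under /sys/kernel/dlm while the
# daemon is down, those locks can no longer be managed, and the only safe
# recovery is to fence ourselves.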
controld_start() {
controld_monitor; rc=$?
case $rc in
$OCF_SUCCESS) return $OCF_SUCCESS;;
$OCF_NOT_RUNNING) ;;
*) return $OCF_ERR_GENERIC;;
esac
# Ensure configfs is mounted
if [ ! -e "$CONFIGFS_DIR" ]; then
modprobe configfs
if [ ! -e "$CONFIGFS_DIR" ]; then
ocf_log err "$CONFIGFS_DIR not available"
return $OCF_ERR_INSTALLED
fi
fi
mount -t configfs | grep " $CONFIGFS_DIR " >/dev/null 2>/dev/null
if [ $? -ne 0 ]; then
mount -t configfs none "$CONFIGFS_DIR"
fi
# Ensure DLM is available
if [ ! -e "$DLM_CONFIGFS_DIR" ]; then
modprobe dlm
if [ ! -e "$DLM_CONFIGFS_DIR" ]; then
ocf_log err "$DLM_CONFIGFS_DIR not available"
return $OCF_ERR_INSTALLED
fi
fi
if ! ocf_is_true "$OCF_RESKEY_allow_stonith_disabled" && \
! ocf_is_true "`crm_attribute --type=crm_config --name=stonith-enabled --query --quiet --default=true`"; then
ocf_log err "The cluster property stonith-enabled may not be deactivated to use the DLM"
return $OCF_ERR_CONFIGURED
fi
${OCF_RESKEY_daemon} $OCF_RESKEY_args
while true
do
sleep 1
controld_monitor; rc=$?
case $rc in
$OCF_SUCCESS)
- local addr_list
- addr_list="$(cat "${DLM_CONFIGFS_DIR}"/cluster/comms/*/addr_list 2>/dev/null)"
- if [ $? -eq 0 ] && [ -n "$addr_list" ]; then
+ CS_ADDR_LIST="$(cat "${DLM_CONFIGFS_DIR}"/cluster/comms/*/addr_list 2>/dev/null)"
+ if [ $? -eq 0 ] && [ -n "$CS_ADDR_LIST" ]; then
return $OCF_SUCCESS
fi
;;
$OCF_NOT_RUNNING)
return $OCF_NOT_RUNNING
;;
*)
return $OCF_ERR_GENERIC
;;
esac
ocf_log debug "Waiting for ${OCF_RESKEY_daemon} to be ready"
done
}
controld_stop() {
controld_monitor; rc=$?
if [ $rc = $OCF_NOT_RUNNING ]; then
return $OCF_SUCCESS
fi
killall -TERM ${OCF_RESKEY_daemon}; rc=$?
if [ $rc != 0 ]; then
return $OCF_ERR_GENERIC
fi
rc=$OCF_SUCCESS
while [ $rc = $OCF_SUCCESS ]; do
controld_monitor; rc=$?
sleep 1
done
if [ $rc = $OCF_NOT_RUNNING ]; then
rc=$OCF_SUCCESS
fi
return $rc
}
controld_monitor() {
- local rc
- killall -0 ${OCF_RESKEY_daemon} >/dev/null 2>&1 ; rc=$?
+ killall -0 ${OCF_RESKEY_daemon} >/dev/null 2>&1 ; CM_RC=$?
- case $rc in
+ case $CM_RC in
0) smw=$(dlm_tool status -v | grep "stateful_merge_wait=" | cut -d= -f2)
if [ -n "$smw" ] && [ $smw -eq 1 ]; then
ocf_log err "DLM status is: stateful_merge_wait"
- rc=$OCF_ERR_GENERIC
+ CM_RC=$OCF_ERR_GENERIC
elif [ -z "$smw" ] && dlm_tool ls | grep -q "wait fencing" && \
! stonith_admin -H '*' -V | grep -q "wishes to"; then
ocf_log err "DLM status is: wait fencing"
- rc=$OCF_ERR_GENERIC
+ CM_RC=$OCF_ERR_GENERIC
else
- rc=$OCF_SUCCESS
+ CM_RC=$OCF_SUCCESS
fi
;;
- 1) rc=$OCF_NOT_RUNNING;;
- *) rc=$OCF_ERR_GENERIC;;
+ 1) CM_RC=$OCF_NOT_RUNNING;;
+ *) CM_RC=$OCF_ERR_GENERIC;;
esac
# If the DLM is not running successfully but DLM lockspace
# bits are left over, we must self-fence.
- if [ $rc -ne $OCF_SUCCESS ]; then
+ if [ $CM_RC -ne $OCF_SUCCESS ]; then
check_uncontrolled_locks
fi
- return $rc
+ return $CM_RC
}
controld_validate() {
check_binary killall
check_binary ${OCF_RESKEY_daemon}
case ${OCF_RESKEY_CRM_meta_globally_unique} in
yes|Yes|true|True|1)
ocf_log err "$OCF_RESOURCE_INSTANCE must be configured with the globally_unique=false meta attribute"
exit $OCF_ERR_CONFIGURED
;;
esac
[ -d /var/run/cluster ] || mkdir /var/run/cluster
return $OCF_SUCCESS
}
: ${OCF_RESKEY_sctp=false}
: ${OCF_RESKEY_CRM_meta_globally_unique:="false"}
-case "$HA_quorum_type" in
- pcmk) daemon_ext=".pcmk";;
- *) daemon_ext="";;
-esac
-
case "$OCF_RESOURCE_INSTANCE" in
*[gG][fF][sS]*)
: ${OCF_RESKEY_args=-g 0}
- : ${OCF_RESKEY_daemon=gfs_controld${daemon_ext}}
+ : ${OCF_RESKEY_daemon=gfs_controld}
;;
*[dD][lL][mM]*)
: ${OCF_RESKEY_args=-s 0}
- : ${OCF_RESKEY_daemon=dlm_controld${daemon_ext}}
+ : ${OCF_RESKEY_daemon=dlm_controld}
;;
*)
: ${OCF_RESKEY_args=-s 0}
- : ${OCF_RESKEY_daemon=dlm_controld${daemon_ext}}
+ : ${OCF_RESKEY_daemon=dlm_controld}
esac
case $__OCF_ACTION in
meta-data) meta_data
exit $OCF_SUCCESS
;;
start) controld_validate; controld_start;;
stop) controld_stop;;
monitor) controld_validate; controld_monitor;;
validate-all) controld_validate;;
usage|help) controld_usage
exit $OCF_SUCCESS
;;
*) controld_usage
exit $OCF_ERR_UNIMPLEMENTED
;;
esac
rc=$?
exit $rc
diff --git a/extra/resources/ifspeed.in b/extra/resources/ifspeed.in
index d5eee6c5f8..c60e55b59c 100755
--- a/extra/resources/ifspeed.in
+++ b/extra/resources/ifspeed.in
@@ -1,541 +1,538 @@
#!@BASH_PATH@
#
# OCF resource agent which monitors state of network interface and records it
# as a node attribute in the CIB based on the sum of speeds of its active (up,
# link detected, not blocked) underlying interfaces.
#
-# Copyright (c) 2011 Vladislav Bogdanov <bubble@hoster-ok.com>
+# Copyright 2011-2018 Vladislav Bogdanov <bubble@hoster-ok.com>
# Partially based on 'ping' RA by Andrew Beekhof
#
# Change on 2017 by Tomer Azran <tomerazran@gmail.com>:
# Add "ip" parameter to detect network interface name by ip address:
# http://lists.clusterlabs.org/pipermail/users/2017-August/006224.html
#
# OCF instance parameters:
# OCF_RESKEY_name: name of attribute to set in CIB
# OCF_RESKEY_ip ip address to check
# OCF_RESKEY_iface: network interface to monitor
# OCF_RESKEY_bridge_ports: if not null and OCF_RESKEY_iface is a bridge, list of
# bridge ports to consider.
# Default is all ports which have designated_bridge=root_id
# OCF_RESKEY_weight_base: Relative weight of 1Gbps. This can be used to tune
# value of resulting CIB attribute.
#
# Initialization:
: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
# If these aren't available, we can still show help,
# which is all that is needed to build the man pages.
[ -r "${OCF_FUNCTIONS}" ] && . "${OCF_FUNCTIONS}"
[ -r "${OCF_FUNCTIONS_DIR}/findif.sh" ] && . "${OCF_FUNCTIONS_DIR}/findif.sh"
: ${OCF_SUCCESS=0}
: ${__OCF_ACTION=$1}
FINDIF=findif
# Defaults
OCF_RESKEY_name_default="ifspeed"
OCF_RESKEY_bridge_ports_default="detect"
OCF_RESKEY_weight_base_default=1000
OCF_RESKEY_dampen_default=5
: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}}
: ${OCF_RESKEY_bridge_ports=${OCF_RESKEY_bridge_ports_default}}
: ${OCF_RESKEY_weight_base=${OCF_RESKEY_weight_base_default}}
: ${OCF_RESKEY_dampen=${OCF_RESKEY_dampen_default}}
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="ifspeed" version="1.0">
<version>1.0</version>
<longdesc lang="en">
Every time the monitor action is run, this resource agent records (in the CIB)
the (relative) speed of the network interface it monitors.
This RA can monitor physical interfaces, bonds, bridges, vlans and (hopefully)
any combination of them.
Examples:
*) Bridge on top of one 10Gbps interface (eth2) and 802.3ad bonding (bond0) built
on two 1Gbps interfaces (eth0 and eth1).
*) Active-backup bonding built on top of one physical interface and one vlan on
another interface.
For STP-enabled bridges, this RA tries to guess the network topology and by
default looks only at ports which are connected to the upstream switch. This
can be overridden by the 'bridge_ports' parameter. Active interfaces in this
case are those in the "forwarding" state.
For balancing bonds, this RA sums the speeds of the underlying "up" slave
interfaces (and applies a coefficient of 0.8 to the result).
For non-balancing bonds ('active-backup' and probably 'broadcast'), only the
speed of the currently active slave is used.
</longdesc>
<shortdesc lang="en">Network interface speed monitor</shortdesc>
<parameters>
<parameter name="name" unique="1">
<longdesc lang="en">
The name of the attribute to set. This is the name to be used in the constraints.
</longdesc>
<shortdesc lang="en">Attribute name</shortdesc>
<content type="string" default="${OCF_RESKEY_name_default}"/>
</parameter>
<parameter name="iface" unique="0" required="1">
<longdesc lang="en">
Network interface to monitor.
</longdesc>
<shortdesc lang="en">Network interface</shortdesc>
<content type="string" default=""/>
</parameter>
<parameter name="ip" unique="0" required="0">
<longdesc lang="en">
Try to detect the interface name by finding the interface that holds the IP
address. The address may be IPv4 (dotted quad notation) or IPv6 (colon
hexadecimal notation).
Example IPv4: "192.168.1.1".
Example IPv6: "2001:db8:DC28:0:0:FC57:D4C8:1FFF".
</longdesc>
<shortdesc lang="en">IPv4 or IPv6 address</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="bridge_ports" unique="0">
<longdesc lang="en">
If not null and OCF_RESKEY_iface is a bridge, list of bridge ports to consider.
Default is all ports which have designated_bridge=root_id.
</longdesc>
<shortdesc lang="en">Bridge ports</shortdesc>
<content type="string" default="${OCF_RESKEY_bridge_ports_default}"/>
</parameter>
<parameter name="weight_base" unique="0">
<longdesc lang="en">
Relative weight of 1Gbps in interface speed.
Can be used to tune how big the attribute value will be.
</longdesc>
<shortdesc lang="en">Weight of 1Gbps</shortdesc>
<content type="integer" default="${OCF_RESKEY_weight_base_default}"/>
</parameter>
<parameter name="dampen" unique="0">
<longdesc lang="en">
The time to wait (dampening) for further changes to occur.
</longdesc>
<shortdesc lang="en">Dampening interval</shortdesc>
<content type="integer" default="${OCF_RESKEY_dampen_default}"/>
</parameter>
<parameter name="debug" unique="0">
<longdesc lang="en">
Log what has been done more verbosely.
</longdesc>
<shortdesc lang="en">Verbose logging</shortdesc>
<content type="string" default="false"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="30" />
<action name="stop" timeout="30" />
<action name="monitor" depth="0" timeout="30" interval="10"/>
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="30" />
</actions>
</resource-agent>
END
}
usage() {
cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set.
END
}
start() {
monitor
if [ $? -eq $OCF_SUCCESS ]; then
return $OCF_SUCCESS
fi
ha_pseudo_resource ${ha_pseudo_resource_name} start
update
return $?
}
stop() {
ha_pseudo_resource ${ha_pseudo_resource_name} stop
attrd_updater -D -n ${OCF_RESKEY_name} -d ${OCF_RESKEY_dampen} ${attrd_options}
return $OCF_SUCCESS
}
monitor() {
local ret
ha_pseudo_resource ${ha_pseudo_resource_name} monitor
ret=$?
if [ ${ret} -eq $OCF_SUCCESS ] ; then
update
fi
return ${ret}
}
# This function tries to guess nic interface by IP
get_nic_name_by_ip(){
# $FINDIF takes its parameters from the environment.
# Its output is as follows:
# [NIC_NAME] netmask [NETMASK] broadcast [BROADCAST]
NICINFO=$( ${FINDIF} )
rc=$?
if [ $rc -eq 0 ];then
# Get NIC_NAME part of findif function output.
echo "${NICINFO%% *}"
else
echo ""
fi
}
validate() {
# Check the interface parameter
if [ -z "${OCF_RESKEY_iface}" ]; then
if [ -z "${OCF_RESKEY_ip}" ]; then
ocf_log err "Empty iface and ip parameters. Please specify either an interface name or valid ip address."
exit $OCF_ERR_CONFIGURED
else
ipcheck_ipv4 "${OCF_RESKEY_ip}"
if [ $? -eq 1 ] ; then
ipcheck_ipv6 "${OCF_RESKEY_ip}"
if [ $? -eq 1 ] ; then
ocf_exit_reason "ip parameter [${OCF_RESKEY_ip}] is not a valid ip address."
exit $OCF_ERR_CONFIGURED
fi
fi
fi
fi
return $OCF_SUCCESS
}
iface_get_speed() {
local iface=$1
local operstate
local carrier
- local bridge_iface_speed
- local bond_iface_speed
- local vlan_iface_speed
local speed
if [ ! -e "/sys/class/net/${iface}" ] ; then
echo "0"
elif iface_is_bridge ${iface} ; then # bridges do not have operstate
read carrier < "/sys/class/net/${iface}/carrier"
if [ "${carrier}" != "1" ] ; then
echo "0"
else
bridge_get_speed ${iface}
fi
else
read operstate < "/sys/class/net/${iface}/operstate"
read carrier < "/sys/class/net/${iface}/carrier"
if [ "${operstate}" != "up" ] || [ "${carrier}" != "1" ] ; then
echo "0"
elif iface_is_bond ${iface} ; then
bond_get_speed ${iface}
elif iface_is_vlan ${iface} ; then
- iface_get_speed $( vlan_get_phy ${iface} )
+ iface_get_speed "$(vlan_get_phy "${iface}")"
elif iface_is_hfi1 "${iface}" ; then
hfi1_get_speed "${iface}"
else
read speed < "/sys/class/net/${iface}/speed"
echo ${speed}
fi
fi
}
iface_is_vlan() {
local iface=$1
[ -e "/proc/net/vlan/${iface}" ] && return 0 || return 1
}
iface_is_bridge() {
local iface=$1
[ -e "/sys/class/net/${iface}/bridge" ] && return 0 || return 1
}
iface_is_bond() {
local iface=$1
[ -e "/sys/class/net/${iface}/bonding" ] && return 0 || return 1
}
iface_is_hfi1() {
local iface=$1
driver=$(readlink /sys/class/net/${iface}/device/driver)
[[ $(basename ${driver}) =~ "hfi1" ]] && return 0 || return 1
}
vlan_get_phy() {
local iface=$1
sed -ne "s/^${iface} .*| *//p" < /proc/net/vlan/config
}
bridge_is_stp_enabled() {
local iface=$1
local stp
read stp < "/sys/class/net/${iface}/bridge/stp_state"
[ "${stp}" = "1" ] && return 0 || return 1
}
bridge_get_root_ports() {
local bridge=$1
local root_id
local root_ports=""
local bridge_id
read root_id < "/sys/class/net/${bridge}/bridge/root_id"
for port in /sys/class/net/${bridge}/brif/* ; do
read bridge_id < "${port}/designated_bridge"
if [ "${bridge_id}" = "${root_id}" ] ; then
root_ports="${root_ports} ${port##*/}"
fi
done
root_ports=${root_ports# }
if [ -n "$2" ] ; then # Record value in specified var. This expects we were called not in a sub-shell.
- eval $2=\${root_ports}
+ eval "$2=\${root_ports}"
else # Expect sub-shell
echo ${root_ports}
fi
}
# From /include/linux/if_bridge.h:
#define BR_STATE_DISABLED 0
#define BR_STATE_LISTENING 1
#define BR_STATE_LEARNING 2
#define BR_STATE_FORWARDING 3
#define BR_STATE_BLOCKING 4
bridge_get_active_ports() {
local bridge=$1
shift 1
local ports="$*"
local active_ports=""
local port_state
local stp_state
local warn=0
bridge_is_stp_enabled ${bridge}
stp_state=$?
if [ -z "${ports}" ] || [ "${ports}" = "detect" ] ; then
bridge_get_root_ports ${bridge} ports
fi
for port in $ports ; do
if [ ! -e "/sys/class/net/${bridge}/brif/${port}" ] ; then
ocf_log warning "Port ${port} doesn't belong to bridge ${bridge}"
continue
fi
read port_state < "/sys/class/net/${bridge}/brif/${port}/state"
if [ "${port_state}" = "3" ] ; then
if [ -n "${active_ports}" ] && ${stp_state} ; then
warn=1
fi
active_ports="${active_ports} ${port}"
fi
done
if [ ${warn} -eq 1 ] ; then
ocf_log warning "More then one upstream port in bridge '${bridge}' is in forwarding state while STP is enabled: ${active_ports}"
fi
echo "${active_ports# }"
}
bridge_get_speed() {
local iface=$1
- local bridge_port_speed
local aggregate_speed=0
if ! iface_is_bridge ${iface} ; then
echo 0
return
fi
- local ports=$( bridge_get_active_ports ${iface} ${OCF_RESKEY_bridge_ports} )
- for port in ${ports} ; do
+ BGS_PORTS=$( bridge_get_active_ports "${iface}" "${OCF_RESKEY_bridge_ports}" )
+ for port in ${BGS_PORTS} ; do
: $(( aggregate_speed += $( iface_get_speed ${port} ) ))
done
if [ -n "$2" ] ; then # Record value in specified var. This expects we were called not in a sub-shell.
- eval $2=\${aggregate_speed}
+ eval "$2=\${aggregate_speed}"
else # Expect sub-shell
echo ${aggregate_speed}
fi
}
hfi1_get_speed() {
local iface=$1
local hfi1_speed
local hfi1_value
local hfi1_desc
- # Currently (9/14/2017 Intel Omni Path v10.5.0.0.155) Intel doesn't have Dual/Multiple ports Host Channel Adapters
- # and it's save to use such method to get a speed.
- # Example of output:
+ # Currently (9/14/2017 Intel Omni Path v10.5.0.0.155), Intel doesn't have
+ # dual- or multiple-port Host Channel Adapters, and it's safe to use this
+ # method to get the speed. Example output:
# [root@es-host0 ~]# cat /sys/class/net/ib0/device/infiniband/*/ports/*/rate
# 100 Gb/sec (4X EDR)
read hfi1_speed hfi1_value hfi1_desc < /sys/class/net/${iface}/device/infiniband/*/ports/*/rate
+ ocf_is_true ${OCF_RESKEY_debug} && ocf_log debug "Detected speed $hfi1_speed $hfi1_value $hfi1_desc"
# hfi1_value always in Gb/sec, so we need to convert hfi1_speed in Mb/sec
echo $(( hfi1_speed * 1000 ))
}
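# Worked example (using the sample output above): "100 Gb/sec (4X EDR)"
# reads in as hfi1_speed=100, so 100 * 1000 = 100000 Mb/sec is echoed.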
bond_get_slaves() {
local iface=$1
local slaves
read slaves < "/sys/class/net/${iface}/bonding/slaves"
if [ -n "$2" ] ; then # Record value in specified var. This expects we were called not in a sub-shell.
- eval $2=\${slaves}
+ eval "$2=\${slaves}"
else # Expect sub-shell
echo ${slaves}
fi
}
bond_get_active_iface() {
local iface=$1
local active
read active < "/sys/class/net/${iface}/bonding/active_slave"
if [ -n "$2" ] ; then # Record value in specified var. This expects we were called not in a sub-shell.
- eval $2=\${active}
+ eval "$2=\${active}"
else # Expect sub-shell
echo ${active}
fi
}
bond_is_balancing() {
local iface=$1
read mode mode_index < "/sys/class/net/${iface}/bonding/mode"
+ ocf_is_true ${OCF_RESKEY_debug} && ocf_log debug "Detected balancing $mode $mode_index"
case ${mode} in
"balance-rr"|"balance-xor"|"802.3ad"|"balance-tlb"|"balance-alb")
return 0
;;
*)
return 1
;;
esac
}
bond_get_speed() {
local iface=$1
local aggregate_speed=0
- local bond_slave_speed
local active_iface
local bond_slaves
if ! iface_is_bond ${iface} ; then
echo 0
return
fi
bond_get_slaves ${iface} bond_slaves
if bond_is_balancing ${iface} ; then
for slave in ${bond_slaves} ; do
: $(( aggregate_speed += $( iface_get_speed ${slave} ) ))
done
# A balancing bond cannot actually achieve the full speed*n of its
# slaves, so apply a 0.8 coefficient
: $(( aggregate_speed = aggregate_speed * 8 / 10 ))
else
bond_get_active_iface ${iface} active_iface
aggregate_speed=$( iface_get_speed $active_iface )
fi
if [ -n "$2" ] ; then # Record value in specified var. This expects we were called not in a sub-shell.
- eval $2=\${aggregate_speed}
+ eval "$2=\${aggregate_speed}"
else # Expect sub-shell
echo ${aggregate_speed}
fi
}
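# Worked example (hypothetical setup): an 802.3ad bond over two 1000Mb/s
# slaves sums to 2000 and is reported as 2000 * 8 / 10 = 1600, while an
# active-backup bond over the same slaves reports its active slave's 1000.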
update() {
local speed;
local nic=${OCF_RESKEY_iface};
if [ -z "${OCF_RESKEY_iface}" ]; then
nic=$( get_nic_name_by_ip )
if [ -z "${nic}" ];then
ocf_log err "Could not retrieve network interface name from ip address (${OCF_RESKEY_ip})"
exit $OCF_ERR_GENERIC
fi
fi
speed=$( iface_get_speed ${nic} )
: $(( score = speed * ${OCF_RESKEY_weight_base} / 1000 ))
if [ "$__OCF_ACTION" = "start" ] ; then
attrd_updater -n ${OCF_RESKEY_name} -B ${score} -d ${OCF_RESKEY_dampen} ${attrd_options}
else
attrd_updater -n ${OCF_RESKEY_name} -v ${score} -d ${OCF_RESKEY_dampen} ${attrd_options}
fi
rc=$?
case ${rc} in
0)
ocf_is_true ${OCF_RESKEY_debug} && ocf_log debug "Updated ${OCF_RESKEY_name} = ${score}"
;;
*)
ocf_log warn "Could not update ${OCF_RESKEY_name} = ${score}: rc=${rc}"
;;
esac
return ${rc}
}
case $__OCF_ACTION in
meta-data)
meta_data
exit $OCF_SUCCESS
;;
usage|help)
usage
exit $OCF_SUCCESS
;;
esac
-if [ `uname` != "Linux" ] ; then
+if [ "$(uname)" != "Linux" ] ; then
ocf_log err "This RA works only on linux."
exit $OCF_ERR_INSTALLED
fi
: ${ha_pseudo_resource_name:="ifspeed-${OCF_RESOURCE_INSTANCE}"}
attrd_options='-q'
if ocf_is_true ${OCF_RESKEY_debug} ; then
attrd_options=''
fi
validate || exit $?
case $__OCF_ACTION in
start)
start
;;
stop)
stop
;;
monitor)
monitor
;;
validate-all)
;;
*)
usage
exit $OCF_ERR_UNIMPLEMENTED
;;
esac
exit $?
diff --git a/extra/resources/ping b/extra/resources/ping
index 2d92d73a3c..7ae71d2837 100755
--- a/extra/resources/ping
+++ b/extra/resources/ping
@@ -1,434 +1,414 @@
#!/bin/sh
#
+# Ping OCF RA that utilizes the system ping
#
-# Ping OCF RA that utilizes the system ping
+# Copyright 2009-2018 Andrew Beekhof <andrew@beekhof.net>
#
-# Copyright (c) 2009 Andrew Beekhof
-# All Rights Reserved.
+# This source code is licensed under the GNU General Public License version 2
+# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of version 2 of the GNU General Public License as
-# published by the Free Software Foundation.
-#
-# This program is distributed in the hope that it would be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# Further, this software is distributed without any warranty that it is
-# free of the rightful claim of any third person regarding infringement
-# or the like. Any license provided herein, whether implied or
-# otherwise, applies only to this software file. Patent licenses, if
-# any, provided herein do not apply to combinations of this program with
-# other software, or any other product whatsoever.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write the Free Software Foundation,
-# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
-#
-
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
. ${OCF_FUNCTIONS}
: ${__OCF_ACTION=$1}
#######################################################################
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="ping" version="1.0">
<version>1.0</version>
<longdesc lang="en">
Every time the monitor action is run, this resource agent records (in the CIB) the current number of nodes the host can connect to using the system fping (preferred) or ping tool.
</longdesc>
<shortdesc lang="en">node connectivity</shortdesc>
<parameters>
<parameter name="pidfile" unique="0">
<longdesc lang="en">PID file</longdesc>
<shortdesc lang="en">PID file</shortdesc>
<content type="string" default="${HA_VARRUN%%/}/ping-${OCF_RESOURCE_INSTANCE}" />
</parameter>
<parameter name="dampen" unique="0">
<longdesc lang="en">
The time to wait (dampening) for further changes to occur
</longdesc>
<shortdesc lang="en">Dampening interval</shortdesc>
<content type="integer" default="5s"/>
</parameter>
<parameter name="name" unique="0">
<longdesc lang="en">
The name of the attributes to set. This is the name to be used in the constraints.
</longdesc>
<shortdesc lang="en">Attribute name</shortdesc>
<content type="string" default="pingd"/>
</parameter>
<parameter name="multiplier" unique="0">
<longdesc lang="en">
The number by which to multiply the number of connected ping nodes
</longdesc>
<shortdesc lang="en">Value multiplier</shortdesc>
<content type="integer" default="1"/>
</parameter>
<parameter name="host_list" unique="0" required="1">
<longdesc lang="en">
A space separated list of ping nodes to count.
</longdesc>
<shortdesc lang="en">Host list</shortdesc>
<content type="string" default=""/>
</parameter>
<parameter name="attempts" unique="0">
<longdesc lang="en">
Number of ping attempts, per host, before declaring it dead
</longdesc>
<shortdesc lang="en">no. of ping attempts</shortdesc>
<content type="integer" default="3"/>
</parameter>
<parameter name="timeout" unique="0">
<longdesc lang="en">
How long, in seconds, to wait before declaring a ping lost
</longdesc>
<shortdesc lang="en">ping timeout in seconds</shortdesc>
<content type="integer" default="2"/>
</parameter>
<parameter name="options" unique="0">
<longdesc lang="en">
A catch-all for any other options that need to be passed to ping.
</longdesc>
<shortdesc lang="en">Extra Options</shortdesc>
<content type="string" default=""/>
</parameter>
<parameter name="failure_score" unique="0">
<longdesc lang="en">
The resource is failed if the score is less than failure_score.
By default, it never fails.
</longdesc>
<shortdesc lang="en">failure_score</shortdesc>
<content type="integer" default=""/>
</parameter>
<parameter name="use_fping" unique="0">
<longdesc lang="en">
Use fping rather than ping, if found. If set to 0, fping
will not be used even if present.
</longdesc>
<shortdesc lang="en">Use fping if available</shortdesc>
<content type="boolean" default="1"/>
</parameter>
<parameter name="debug" unique="0">
<longdesc lang="en">
Enables verbose attrd_updater logging on every call.
</longdesc>
<shortdesc lang="en">Verbose logging</shortdesc>
<content type="string" default="false"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="60" />
<action name="stop" timeout="20" />
<action name="monitor" depth="0" timeout="60" interval="10"/>
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="30" />
</actions>
</resource-agent>
END
}
#######################################################################
ping_conditional_log() {
level=$1; shift
if [ ${OCF_RESKEY_debug} = "true" ]; then
ocf_log $level "$*"
fi
}
ping_usage() {
cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set.
END
}
ping_start() {
ping_monitor
if [ $? = $OCF_SUCCESS ]; then
return $OCF_SUCCESS
fi
touch ${OCF_RESKEY_pidfile}
ping_update
}
ping_stop() {
rm -f ${OCF_RESKEY_pidfile}
attrd_updater -D -n $OCF_RESKEY_name -d $OCF_RESKEY_dampen $attrd_options
return $OCF_SUCCESS
}
ping_monitor() {
if [ -f ${OCF_RESKEY_pidfile} ]; then
ping_update
if [ $? -eq 0 ]; then
return $OCF_SUCCESS
fi
return $OCF_ERR_GENERIC
fi
return $OCF_NOT_RUNNING
}
ping_validate() {
# Is the state directory writable?
state_dir=`dirname "$OCF_RESKEY_pidfile"`
touch "$state_dir/$$"
if [ $? != 0 ]; then
ocf_log err "Invalid location for 'state': $state_dir is not writable"
return $OCF_ERR_ARGS
fi
rm "$state_dir/$$"
# Pidfile better be an absolute path
case $OCF_RESKEY_pidfile in
/*) ;;
*) ocf_log warn "You should use an absolute path for pidfile not: $OCF_RESKEY_pidfile" ;;
esac
# Check the host list
if [ "x" = "x$OCF_RESKEY_host_list" ]; then
ocf_log err "Empty host_list. Please specify some nodes to ping"
exit $OCF_ERR_CONFIGURED
fi
# For fping allow only same IP versions or hostnames
if use_fping; then
hosts_family
if [ $? -eq 99 ]; then
ocf_log err "host_list can contain only host with same IP versions for fping"
exit $OCF_ERR_CONFIGURED
fi
fi
check_binary ping
return $OCF_SUCCESS
}
fping_check() {
p_exe=fping
hosts_family
case $? in
6) p_exe=fping6 ;;
99) ocf_log err "Ambiguous IP versions in host_list: '$OCF_RESKEY_host_list'"; exit $OCF_ERR_CONFIGURED;;
esac
active=0
- n=$OCF_RESKEY_attempts
timeout=`expr $OCF_RESKEY_timeout \* 1000 / $OCF_RESKEY_attempts`
cmd="$p_exe -r $OCF_RESKEY_attempts -t $timeout -B 1.0 $OCF_RESKEY_options $OCF_RESKEY_host_list"
output=`$cmd 2>&1`; rc=$?
active=`echo "$output" | grep "is alive" | wc -l`
case $rc in
0)
;;
1)
for h in `echo "$output" | grep "is unreachable" | awk '{print $1}'`; do
ping_conditional_log warn "$h is inactive"
done
;;
*)
ocf_log err "Unexpected result for '$cmd' $rc: `echo "$output" | tr '\n' ';'`"
;;
esac
return $active
}
ping_check() {
active=0
for host in $OCF_RESKEY_host_list; do
p_exe=ping
case `uname` in
Linux) p_args="-n -q -W $OCF_RESKEY_timeout -c $OCF_RESKEY_attempts";;
Darwin) p_args="-n -q -t $OCF_RESKEY_timeout -c $OCF_RESKEY_attempts -o";;
FreeBSD) p_args="-n -q -t $OCF_RESKEY_timeout -c $OCF_RESKEY_attempts -o";;
*) ocf_log err "Unknown host type: `uname`"; exit $OCF_ERR_INSTALLED;;
esac
case $host in
*:*) p_exe=ping6
esac
p_out=`$p_exe $p_args $OCF_RESKEY_options $host 2>&1`; rc=$?
case $rc in
0) active=`expr $active + 1`;;
1) ping_conditional_log warn "$host is inactive: $p_out";;
*) ocf_log err "Unexpected result for '$p_exe $p_args $OCF_RESKEY_options $host' $rc: $p_out";;
esac
done
return $active
}
ping_update() {
if use_fping; then
fping_check
active=$?
else
ping_check
active=$?
fi
score=`expr $active \* $OCF_RESKEY_multiplier`
if [ "$__OCF_ACTION" = "start" ] ; then
attrd_updater -n $OCF_RESKEY_name -B $score -d $OCF_RESKEY_dampen $attrd_options
else
attrd_updater -n $OCF_RESKEY_name -v $score -d $OCF_RESKEY_dampen $attrd_options
fi
rc=$?
case $rc in
0) ping_conditional_log debug "Updated $OCF_RESKEY_name = $score" ;;
*) ocf_log warn "Could not update $OCF_RESKEY_name = $score: rc=$rc";;
esac
if [ $rc -ne 0 ]; then
return $rc
fi
- if [ -n "$OCF_RESKEY_failure_score" -a "$score" -lt "$OCF_RESKEY_failure_score" ]; then
+ if [ -n "$OCF_RESKEY_failure_score" ] && [ "$score" -lt "$OCF_RESKEY_failure_score" ]; then
ocf_log warn "$OCF_RESKEY_name is less than failure_score($OCF_RESKEY_failure_score)"
return 1
fi
return 0
}
use_fping() {
ocf_is_true "$OCF_RESKEY_use_fping" && have_binary fping;
}
# return values:
# 4 IPv4
# 6 IPv6
# 0 indefinite (i.e. hostname)
host_family() {
case $1 in
*[0-9].*[0-9].*[0-9].*[0-9]) return 4 ;;
*:*) return 6 ;;
*) return 0 ;;
esac
}
# return values same as host_family plus
# 99 ambiguous families
hosts_family() {
# For fping allow only same IP versions or hostnames
family=0
for host in $OCF_RESKEY_host_list; do
host_family $host
f=$?
- if [ $family -ne 0 -a $f -ne 0 -a $f -ne $family ] ; then
+ if [ $family -ne 0 ] && [ $f -ne 0 ] && [ $f -ne $family ] ; then
family=99
break
fi
[ $f -ne 0 ] && family=$f
done
return $family
}
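# Worked examples (hypothetical host lists):
#   "192.168.1.1 192.168.1.2" -> 4
#   "2001:db8::1"             -> 6
#   "192.168.1.1 2001:db8::1" -> 99 (mixed families, rejected for fping)
#   "host1 192.168.1.1"       -> 4 (hostnames are indefinite, so no conflict)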
: ${OCF_RESKEY_name:="pingd"}
: ${OCF_RESKEY_dampen:="5s"}
: ${OCF_RESKEY_attempts:="3"}
: ${OCF_RESKEY_multiplier:="1"}
: ${OCF_RESKEY_debug:="false"}
: ${OCF_RESKEY_failure_score:="0"}
: ${OCF_RESKEY_use_fping:="1"}
: ${OCF_RESKEY_CRM_meta_timeout:="20000"}
: ${OCF_RESKEY_CRM_meta_globally_unique:="false"}
integer=`echo ${OCF_RESKEY_timeout} | egrep -o '[0-9]*'`
case ${OCF_RESKEY_timeout} in
*[0-9]ms|*[0-9]msec) OCF_RESKEY_timeout=`expr $integer / 1000`;;
*[0-9]m|*[0-9]min) OCF_RESKEY_timeout=`expr $integer \* 60`;;
*[0-9]h|*[0-9]hr) OCF_RESKEY_timeout=`expr $integer \* 60 \* 60`;;
*) OCF_RESKEY_timeout=$integer;;
esac
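# Worked examples (hypothetical values): "3000ms" -> 3000 / 1000 = 3 seconds,
# "2m" -> 2 * 60 = 120 seconds; a bare number of seconds passes through as-is.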
if [ -z ${OCF_RESKEY_timeout} ]; then
if [ x"$OCF_RESKEY_host_list" != x ]; then
host_count=`echo $OCF_RESKEY_host_list | awk '{print NF}'`
OCF_RESKEY_timeout=`expr $OCF_RESKEY_CRM_meta_timeout / $host_count / $OCF_RESKEY_attempts`
OCF_RESKEY_timeout=`expr $OCF_RESKEY_timeout / 1100` # Convert to seconds and finish 10% early
else
OCF_RESKEY_timeout=5
fi
fi
if [ ${OCF_RESKEY_timeout} -lt 1 ]; then
OCF_RESKEY_timeout=5
elif [ ${OCF_RESKEY_timeout} -gt 1000 ]; then
# ping actually complains if this value is too high; 5 minutes is plenty
OCF_RESKEY_timeout=300
fi
if [ ${OCF_RESKEY_CRM_meta_globally_unique} = "false" ]; then
: ${OCF_RESKEY_pidfile:="${HA_VARRUN%%/}/ping-${OCF_RESKEY_name}"}
else
: ${OCF_RESKEY_pidfile:="${HA_VARRUN%%/}/ping-${OCF_RESOURCE_INSTANCE}"}
fi
# Check the debug option
case "${OCF_RESKEY_debug}" in
true|True|TRUE|1) OCF_RESKEY_debug=true;;
false|False|FALSE|0) OCF_RESKEY_debug=false;;
*)
ocf_log warn "Value for 'debug' is incorrect. Please specify 'true' or 'false' not: ${OCF_RESKEY_debug}"
OCF_RESKEY_debug=false
;;
esac
attrd_options='-q'
if [ ${OCF_RESKEY_debug} = "true" ]; then
attrd_options=''
fi
case $__OCF_ACTION in
meta-data) meta_data
exit $OCF_SUCCESS
;;
start) ping_start;;
stop) ping_stop;;
monitor) ping_monitor;;
validate-all) ping_validate;;
usage|help) ping_usage
exit $OCF_SUCCESS
;;
*) ping_usage
exit $OCF_ERR_UNIMPLEMENTED
;;
esac
exit $?
diff --git a/tools/crm_failcount.in b/tools/crm_failcount.in
index c95028a7a8..c3050c75aa 100755
--- a/tools/crm_failcount.in
+++ b/tools/crm_failcount.in
@@ -1,287 +1,292 @@
#!@BASH_PATH@
+#
+# Copyright 2009-2018 Andrew Beekhof <andrew@beekhof.net>
+#
+# This source code is licensed under the GNU General Public License version 2
+# or later (GPLv2+) WITHOUT ANY WARRANTY.
+#
USAGE_TEXT="Usage: crm_failcount <command> [<options>]
Common options:
--help Display this text, then exit
--version Display version information, then exit
-V, --verbose Specify multiple times to increase debug output
-q, --quiet Print only the value (if querying)
Commands:
-G, --query Query the current value of the resource's fail count
-D, --delete Delete resource's recorded failures
Additional Options:
-r, --resource=value Name of the resource to use (required)
-n, --operation=value Name of operation to use (instead of all operations)
-I, --interval=value If operation is specified, its interval
-N, --node=value Use failcount on named node (instead of local node)"
HELP_TEXT="crm_failcount - Query or delete resource fail counts
$USAGE_TEXT"
# These constants must track crm_exit_t values
CRM_EX_OK=0
-CRM_EX_ERROR=1
CRM_EX_USAGE=64
CRM_EX_NOSUCH=105
exit_usage() {
if [ $# -gt 0 ]; then
- echo "error: $@" >&2
+ echo "error:" "$@" >&2
fi
echo
echo "$USAGE_TEXT"
exit $CRM_EX_USAGE
}
warn() {
- echo "warning: $@" >&2
+ echo "warning:" "$@" >&2
}
interval_re() {
echo "^[[:blank:]]*([0-9]+)[[:blank:]]*(${1})[[:blank:]]*$"
}
# This function should follow crm_get_interval() as closely as possible
parse_interval() {
INT_S="$1"
INT_8601RE="^P(([0-9]+)Y)?(([0-9]+)M)?(([0-9]+)D)?T?(([0-9]+)H)?(([0-9]+)M)?(([0-9]+)S)?$"
if [[ $INT_S =~ $(interval_re "") ]]; then
echo $(( ${BASH_REMATCH[1]} * 1000 ))
elif [[ $INT_S =~ $(interval_re "s|sec") ]]; then
echo $(( ${BASH_REMATCH[1]} * 1000 ))
elif [[ $INT_S =~ $(interval_re "ms|msec") ]]; then
echo "${BASH_REMATCH[1]}"
elif [[ $INT_S =~ $(interval_re "m|min") ]]; then
echo $(( ${BASH_REMATCH[1]} * 60000 ))
elif [[ $INT_S =~ $(interval_re "h|hr") ]]; then
echo $(( ${BASH_REMATCH[1]} * 3600000 ))
elif [[ $INT_S =~ $(interval_re "us|usec") ]]; then
echo $(( ${BASH_REMATCH[1]} / 1000 ))
elif [[ $INT_S =~ ^P([0-9]+)W$ ]]; then
echo $(( ${BASH_REMATCH[1]} * 604800000 ))
elif [[ $INT_S =~ $INT_8601RE ]]; then
echo $(( ( ${BASH_REMATCH[2]:-0} * 31536000000 ) \
+ ( ${BASH_REMATCH[4]:-0} * 2592000000 ) \
+ ( ${BASH_REMATCH[6]:-0} * 86400000 ) \
+ ( ${BASH_REMATCH[8]:-0} * 3600000 ) \
+ ( ${BASH_REMATCH[10]:-0} * 60000 ) \
+ ( ${BASH_REMATCH[12]:-0} * 1000 ) ))
else
warn "Unrecognized interval, using 0"
echo "0"
fi
}
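# Worked examples (hypothetical inputs): "30" and "30s" both yield 30000ms,
# "P1Y" yields 31536000000ms, and "PT1M30S" yields 60000 + 30000 = 90000ms.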
query_single_attr() {
QSR_TARGET="$1"
QSR_ATTR="$2"
crm_attribute $VERBOSE --quiet --query -t status -d 0 \
-N "$QSR_TARGET" -n "$QSR_ATTR"
}
query_attr_sum() {
QAS_TARGET="$1"
QAS_PREFIX="$2"
# Build xpath to match all transient node attributes with prefix
QAS_XPATH="/cib/status/node_state[@uname='${QAS_TARGET}']"
QAS_XPATH="${QAS_XPATH}/transient_attributes/instance_attributes"
QAS_XPATH="${QAS_XPATH}/nvpair[starts-with(@name,'$QAS_PREFIX')]"
# Query attributes that match xpath
# @TODO We ignore stderr because we don't want "no results" to look
# like an error, but that also makes $VERBOSE pointless.
QAS_ALL=$(cibadmin --query --sync-call --local \
--xpath="$QAS_XPATH" 2>/dev/null)
QAS_EX=$?
# "No results" is not an error
if [ $QAS_EX -ne $CRM_EX_OK ] && [ $QAS_EX -ne $CRM_EX_NOSUCH ]; then
echo "error: could not query CIB for fail counts" >&2
exit $QAS_EX
fi
# Extract the attribute values (one per line) from the output
QAS_VALUE=$(echo "$QAS_ALL" | sed -n -e \
's/.*<nvpair.*value="\([0-9][0-9]*\|INFINITY\)".*>.*/\1/p')
# Sum the values
QAS_SUM=0
for i in 0 $QAS_VALUE; do
if [ "$i" = "INFINITY" ]; then
QAS_SUM="INFINITY"
break
else
QAS_SUM=$(($QAS_SUM + $i))
fi
done
if [ "$QAS_SUM" = "INFINITY" ]; then
echo $QAS_SUM
elif [ "$QAS_SUM" -ge 1000000 ]; then
echo "INFINITY"
else
echo $QAS_SUM
fi
}
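# Worked example (hypothetical attributes): fail-count-rsc1#start_0=2 plus
# fail-count-rsc1#monitor_10000=3 sums to 5; any INFINITY value, or any
# finite sum of 1000000 or more, is reported as INFINITY.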
query_failcount() {
QF_TARGET="$1"
QF_RESOURCE="$2"
QF_OPERATION="$3"
QF_INTERVAL="$4"
QF_ATTR_RSC="fail-count-${QF_RESOURCE}"
if [ -n "$QF_OPERATION" ]; then
QF_ATTR_DISPLAY="${QF_ATTR_RSC}#${QF_OPERATION}_${QF_INTERVAL}"
QF_COUNT=$(query_single_attr "$QF_TARGET" "$QF_ATTR_DISPLAY")
else
QF_ATTR_DISPLAY="$QF_ATTR_RSC"
QF_COUNT=$(query_attr_sum "$QF_TARGET" "${QF_ATTR_RSC}#")
fi
# @COMPAT attributes set < 1.1.17:
# If we didn't find any per-operation failcount,
# check whether there is a legacy per-resource failcount.
if [ "$QF_COUNT" = "0" ]; then
QF_COUNT=$(query_single_attr "$QF_TARGET" "$QF_ATTR_RSC")
if [ "$QF_COUNT" != "0" ]; then
QF_ATTR_DISPLAY="$QF_ATTR_RSC"
fi
fi
# Echo result (comparable to crm_attribute, for backward compatibility)
if [ -n "$QUIET" ]; then
echo $QF_COUNT
else
echo "scope=status name=$QF_ATTR_DISPLAY value=$QF_COUNT"
fi
}
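# Example (hypothetical names): query_failcount node1 myrsc monitor 10000
# prints "scope=status name=fail-count-myrsc#monitor_10000 value=N", or just
# the value when --quiet was given.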
clear_failcount() {
CF_TARGET="$1"
CF_RESOURCE="$2"
CF_OPERATION="$3"
CF_INTERVAL="$4"
if [ -n "$CF_OPERATION" ]; then
CF_OPERATION="-n $CF_OPERATION -I ${CF_INTERVAL}ms"
fi
crm_resource $QUIET $VERBOSE --cleanup \
-N "$CF_TARGET" -r "$CF_RESOURCE" $CF_OPERATION
}
QUIET=""
VERBOSE=""
command=""
resource=""
operation=""
interval="0"
target=$(crm_node -n 2>/dev/null)
SHORTOPTS="qDGQVN:U:v:i:l:r:n:I:"
LONGOPTS_COMMON="help,version,verbose,quiet"
LONGOPTS_COMMANDS="query,delete"
LONGOPTS_OTHER="resource:,node:,operation:,interval:"
LONGOPTS_COMPAT="delete-attr,get-value,resource-id:,uname:,lifetime:,attr-value:,attr-id:"
LONGOPTS="$LONGOPTS_COMMON,$LONGOPTS_COMMANDS,$LONGOPTS_OTHER,$LONGOPTS_COMPAT"
TEMP=$(@GETOPT_PATH@ -o $SHORTOPTS --long $LONGOPTS -n crm_failcount -- "$@")
if [ $? -ne 0 ]; then
exit_usage
fi
eval set -- "$TEMP" # Quotes around $TEMP are essential
while true ; do
case "$1" in
--help)
echo "$HELP_TEXT"
exit $CRM_EX_OK
;;
--version)
crm_attribute --version
exit $?
;;
-q|-Q|--quiet)
QUIET="--quiet"
shift
;;
-V|--verbose)
VERBOSE="$VERBOSE $1"
shift
;;
-G|--query|--get-value)
command="--query"
shift
;;
-D|--delete|--delete-attr)
command="--delete"
shift
;;
-r|--resource|--resource-id)
resource="$2"
shift 2
;;
-n|--operation)
operation="$2"
shift 2
;;
-I|--interval)
interval="$2"
shift 2
;;
-N|--node|-U|--uname)
target="$2"
shift 2
;;
-v|--attr-value)
if [ "$2" = "0" ]; then
command="--delete"
else
warn "ignoring deprecated option '$1' with nonzero value"
fi
shift 2
;;
-i|--attr-id|-l|--lifetime)
warn "ignoring deprecated option '$1'"
shift 2
;;
--)
shift
break
;;
*)
exit_usage "unknown option '$1'"
;;
esac
done
[ -n "$command" ] || exit_usage "must specify a command"
[ -n "$resource" ] || exit_usage "resource name required"
[ -n "$target" ] || exit_usage "node name required"
interval=$(parse_interval "$interval")
if [ "$command" = "--query" ]; then
query_failcount "$target" "$resource" "$operation" "$interval"
else
clear_failcount "$target" "$resource" "$operation" "$interval"
fi
diff --git a/tools/crm_master.in b/tools/crm_master.in
index 5177c4f26c..896239c1ee 100755
--- a/tools/crm_master.in
+++ b/tools/crm_master.in
@@ -1,103 +1,109 @@
#!@BASH_PATH@
+#
+# Copyright 2009-2018 Andrew Beekhof <andrew@beekhof.net>
+#
+# This source code is licensed under the GNU General Public License version 2
+# or later (GPLv2+) WITHOUT ANY WARRANTY.
+#
USAGE_TEXT="Usage: crm_master <command> [<options>]
Common options:
--help Display this text, then exit
--version Display version information, then exit
-V, --verbose Specify multiple times to increase debug output
-q, --quiet Print only the value (if querying)
Commands:
-G, --query Query the current value of the promotion score
-v, --update=VALUE Update the value of the promotion score
-D, --delete Delete the promotion score
Additional Options:
-N, --node=NODE Use promotion score on named node (instead of local node)
-l, --lifetime=VALUE Until when should the setting take effect
(valid values: reboot, forever)
-i, --id=VALUE (Advanced) XML ID used to identify promotion score attribute"
HELP_TEXT="crm_master - Query, update, or delete a resource's promotion score
This program should normally be invoked only from inside an OCF resource agent.
$USAGE_TEXT"
exit_usage() {
if [ $# -gt 0 ]; then
- echo "error: $@" >&2
+ echo "error:" "$@" >&2
fi
echo
echo "$USAGE_TEXT"
exit 1
}
SHORTOPTS_DEPRECATED="U:Q"
LONGOPTS_DEPRECATED="uname:,get-value,delete-attr,attr-value:,attr-id:"
SHORTOPTS="VqGv:DN:l:i:r:"
LONGOPTS="help,version,verbose,quiet,query,update:,delete,node:,lifetime:,id:,resource:"
TEMP=$(@GETOPT_PATH@ -o ${SHORTOPTS}${SHORTOPTS_DEPRECATED} \
--long ${LONGOPTS},${LONGOPTS_DEPRECATED} \
-n crm_master -- "$@")
if [ $? -ne 0 ]; then
exit_usage
fi
eval set -- "$TEMP" # Quotes around $TEMP are essential
# Explicitly set the (usual default) lifetime, so the attribute gets set as a
# node attribute and not a cluster property.
options="--lifetime forever"
while true ; do
case "$1" in
--help)
echo "$HELP_TEXT"
exit 0
;;
--version)
crm_attribute --version
exit 0
;;
--verbose|-V|--quiet|-q|--query|-G|--delete|-D)
options="$options $1"
shift
;;
--update|-v|--node|-N|--lifetime|-l|--id|-i)
options="$options $1 $2"
shift
shift
;;
-r|--resource)
OCF_RESOURCE_INSTANCE=$2;
shift
shift
;;
--get-value|--delete-attr|-Q) # deprecated
options="$options $1"
shift
;;
--uname|-U|--attr-value|--attr-id) # deprecated
options="$options $1 $2"
shift
shift
;;
--)
shift
break
;;
*)
exit_usage "unknown option '$1'"
;;
esac
done
if [ -z "$OCF_RESOURCE_INSTANCE" ]; then
echo "This program should normally only be invoked from inside an OCF resource agent."
echo "To set a promotion score from the command line, please specify resource with -r."
exit 1
fi
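# The promotion score is stored as the node attribute "master-<resource>".
# For example (hypothetical resource name), "crm_master -r myclone -G"
# expands below to: crm_attribute -n master-myclone --lifetime forever --query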
crm_attribute -n master-$OCF_RESOURCE_INSTANCE $options
diff --git a/tools/crm_report.in b/tools/crm_report.in
index ffd8137ee6..541893f822 100644
--- a/tools/crm_report.in
+++ b/tools/crm_report.in
@@ -1,465 +1,471 @@
#!/bin/sh
#
# Copyright 2010-2018 Andrew Beekhof <andrew@beekhof.net>
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
TEMP=`@GETOPT_PATH@ \
-o hv?xl:f:t:n:T:L:p:c:dSCu:D:MVse: \
--long help,cts:,cts-log:,dest:,node:,nodes:,from:,to:,sos-mode,logfile:,as-directory,single-node,cluster:,user:,max-depth:,version,features,rsh: \
-n 'crm_report' -- "$@"`
# The quotes around $TEMP are essential
eval set -- "$TEMP"
progname=$(basename "$0")
rsh="ssh -T"
-times=""
tests=""
nodes=""
compress=1
cluster="any"
ssh_user="root"
search_logs=1
report_data=`dirname $0`
maxdepth=5
extra_logs=""
sanitize_patterns="passw.*"
log_patterns="CRIT: ERROR:"
usage() {
cat<<EOF
$progname - Create archive of everything needed when reporting cluster problems
Usage: $progname [options] [DEST]
Required option:
-f, --from TIME time prior to problems beginning
(as "YYYY-M-D H:M:S" including the quotes)
Options:
-V increase verbosity (may be specified multiple times)
-v, --version display software version
--features display software features
-t, --to TIME time at which all problems were resolved
(as "YYYY-M-D H:M:S" including the quotes; default "now")
-T, --cts TEST CTS test or set of tests to extract
--cts-log CTS master logfile
-n, --nodes NODES node names for this cluster (only needed if cluster is
not active on this host; accepts -n "a b" or -n a -n b)
-M do not search for cluster logs
-l, --logfile FILE log file to collect (in addition to detected logs if -M
is not specified; may be specified multiple times)
-p PATT additional regular expression to match variables to be
masked in output (default: "passw.*")
-L PATT additional regular expression to match in log files for
analysis (default: $log_patterns)
-S, --single-node don't attempt to collect data from other nodes
-c, --cluster TYPE force the cluster type instead of detecting
(currently only corosync is supported)
-C, --corosync force the cluster type to be corosync
-u, --user USER username to use when collecting data from other nodes
(default root)
-D, --depth search depth to use when attempting to locate files
-e, --rsh command to use to run commands on other nodes
(default ssh -T)
--sos-mode use defaults suitable for being called by sosreport tool
(behavior subject to change and not useful to end users)
DEST, --dest DEST custom destination directory or file name
$progname works best when run from a cluster node on a running cluster,
but can be run from a stopped cluster node or a Pacemaker Remote node.
If neither --nodes nor --single-node is given, $progname will guess the
node list, but may have trouble detecting Pacemaker Remote nodes.
Unless --single-node is given, the node names (whether specified by --nodes
or detected automatically) must be resolvable and reachable via the command
specified by -e/--rsh using the user specified by -u/--user.
Examples:
$progname -f "2011-12-14 13:05:00" unexplained-apache-failure
$progname -f 2011-12-14 -t 2011-12-15 something-that-took-multiple-days
$progname -f 13:05:00 -t 13:12:00 brief-outage
EOF
}
case "$1" in
-v|--version) echo "$progname @VERSION@-@BUILD_VERSION@"; exit 0;;
--features) echo "@VERSION@-@BUILD_VERSION@: @PCMK_FEATURES@"; exit 0;;
--|-h|--help) usage; exit 0;;
esac
# Prefer helpers in the same directory if they exist, to simplify development
if [ ! -f $report_data/report.common ]; then
report_data=@datadir@/@PACKAGE@
else
echo "Using local helpers"
fi
. $report_data/report.common
while true; do
case "$1" in
-x) set -x; shift;;
-V) verbose=`expr $verbose + 1`; shift;;
-T|--cts) tests="$tests $2"; shift; shift;;
--cts-log) ctslog="$2"; shift; shift;;
-f|--from) start_time=`get_time "$2"`; shift; shift;;
-t|--to) end_time=`get_time "$2"`; shift; shift;;
-n|--node|--nodes) nodes="$nodes $2"; shift; shift;;
-S|--single-node) nodes="$host"; shift;;
-E|-l|--logfile) extra_logs="$extra_logs $2"; shift; shift;;
-p) sanitize_patterns="$sanitize_patterns $2"; shift; shift;;
-L) log_patterns="$log_patterns `echo $2 | sed 's/ /\\\W/g'`"; shift; shift;;
-d|--as-directory) compress=0; shift;;
-C|--corosync) cluster="corosync"; shift;;
-c|--cluster) cluster="$2"; shift; shift;;
-e|--rsh) rsh="$2"; shift; shift;;
-u|--user) ssh_user="$2"; shift; shift;;
-D|--max-depth) maxdepth="$2"; shift; shift;;
-M) search_logs=0; shift;;
--sos-mode) search_logs=0; nodes="$host"; shift;;
--dest) DESTDIR=$2; shift; shift;;
--) if [ -n "$2" ]; then DESTDIR="$2"; fi; break;;
-h|--help) usage; exit 0;;
# Options for compatibility with hb_report
-s) shift;;
*) echo "Unknown argument: $1"; usage; exit 1;;
esac
done
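# collect_data LABEL START END [MASTERLOG]
# Create the scratch directory, write a per-node .env describing what to
# collect, run the report.collector helper locally or over $rsh on each node,
# then finish with analysis/events summaries (and a tarball unless -d given).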
collect_data() {
label="$1"
start=`expr $2 - 10`
end=`expr $3 + 10`
masterlog=$4
if [ "x$DESTDIR" != x ]; then
echo $DESTDIR | grep -e "^/" -qs
if [ $? = 0 ]; then
l_base=$DESTDIR
else
l_base="`pwd`/$DESTDIR"
fi
debug "Using custom scratch dir: $l_base"
r_base=`basename $l_base`
else
l_base=$HOME/$label
r_base=$label
fi
if [ -e $l_base ]; then
fatal "Output directory $l_base already exists, specify an alternate name with --dest"
fi
mkdir -p $l_base
if [ "x$masterlog" != "x" ]; then
dumplogset "$masterlog" $start $end > "$l_base/$HALOG_F"
fi
for node in $nodes; do
cat <<EOF >$l_base/.env
LABEL="$label"
REPORT_HOME="$r_base"
REPORT_MASTER="$host"
REPORT_TARGET="$node"
LOG_START=$start
LOG_END=$end
REMOVE=1
SANITIZE="$sanitize_patterns"
CLUSTER=$cluster
LOG_PATTERNS="$log_patterns"
EXTRA_LOGS="$extra_logs"
SEARCH_LOGS=$search_logs
verbose=$verbose
maxdepth=$maxdepth
EOF
if [ $host = $node ]; then
cat <<EOF >>$l_base/.env
REPORT_HOME="$l_base"
EOF
cat $l_base/.env $report_data/report.common $report_data/report.collector > $l_base/collector
bash $l_base/collector
else
cat $l_base/.env $report_data/report.common $report_data/report.collector \
| $rsh -l $ssh_user $node -- "mkdir -p $r_base; cat > $r_base/collector; bash $r_base/collector" | (cd $l_base && tar mxf -)
fi
done
analyze $l_base > $l_base/$ANALYSIS_F
if [ -f $l_base/$HALOG_F ]; then
node_events $l_base/$HALOG_F > $l_base/$EVENTS_F
fi
for node in $nodes; do
cat $l_base/$node/$ANALYSIS_F >> $l_base/$ANALYSIS_F
if [ -s $l_base/$node/$EVENTS_F ]; then
cat $l_base/$node/$EVENTS_F >> $l_base/$EVENTS_F
elif [ -s $l_base/$HALOG_F ]; then
awk "\$4==\"$nodes\"" $l_base/$EVENTS_F >> $l_base/$n/$EVENTS_F
fi
done
log " "
if [ $compress = 1 ]; then
fname=`shrink $l_base`
rm -rf $l_base
log "Collected results are available in $fname"
log " "
log "Please create a bug entry at"
log " http://bugs.clusterlabs.org/enter_bug.cgi?product=Pacemaker"
log "Include a description of your problem and attach this tarball"
log " "
log "Thank you for taking time to create this report."
else
log "Collected results are available in $l_base"
fi
log " "
}
#
# check if files have same content in the cluster
#
cibdiff() {
- d1=`dirname $1`
- d2=`dirname $2`
- if [ -f $d1/RUNNING -a -f $d2/RUNNING ] ||
- [ -f $d1/STOPPED -a -f $d2/STOPPED ]; then
+ d1=$(dirname $1)
+ d2=$(dirname $2)
+
+ if [ -f "$d1/RUNNING" ] && [ ! -f "$d2/RUNNING" ]; then
+ DIFF_OK=0
+ elif [ -f "$d1/STOPPED" ] && [ ! -f "$d2/STOPPED" ]; then
+ DIFF_OK=0
+ else
+ DIFF_OK=1
+ fi
+
+ if [ $DIFF_OK -eq 1 ]; then
if which crm_diff > /dev/null 2>&1; then
crm_diff -c -n $1 -o $2
else
info "crm_diff(8) not found, cannot diff CIBs"
fi
else
echo "can't compare cibs from running and stopped systems"
fi
}
diffcheck() {
[ -f "$1" ] || {
echo "$1 does not exist"
return 1
}
[ -f "$2" ] || {
echo "$2 does not exist"
return 1
}
- case `basename $1` in
- $CIB_F) cibdiff $1 $2;;
- $B_CONF) diff -u $1 $2;; # confdiff?
- *) diff -u $1 $2;;
-esac
+ case $(basename "$1") in
+ $CIB_F) cibdiff $1 $2 ;;
+ *) diff -u $1 $2 ;;
+ esac
}
#
# remove duplicates if files are same, make links instead
#
consolidate() {
- for n in $NODES; do
+ for n in $nodes; do
if [ -f $1/$2 ]; then
rm $1/$n/$2
else
mv $1/$n/$2 $1
fi
ln -s ../$2 $1/$n
done
}
analyze_one() {
rc=0
node0=""
- for n in $NODES; do
+ for n in $nodes; do
if [ "$node0" ]; then
diffcheck $1/$node0/$2 $1/$n/$2
rc=$(($rc+$?))
else
node0=$n
fi
done
return $rc
}
analyze() {
- flist="$MEMBERSHIP_F $CIB_F $CRM_MON_F $B_CONF $SYSINFO_F"
+ flist="$MEMBERSHIP_F $CIB_F $CRM_MON_F $SYSINFO_F"
for f in $flist; do
printf "Diff $f... "
ls $1/*/$f >/dev/null 2>&1 || {
echo "no $1/*/$f :/"
continue
}
if analyze_one $1 $f; then
echo "OK"
[ "$f" != $CIB_F ] && consolidate $1 $f
else
echo ""
fi
done
}
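# do_cts: extract data for the CTS tests named in $tests; each entry is a
# single test number or a range, e.g. "-T 4" or "-T 4-7" (comma-separated
# sets are also accepted).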
do_cts() {
test_sets=`echo $tests | tr ',' ' '`
for test_set in $test_sets; do
start_time=0
start_test=`echo $test_set | tr '-' ' ' | awk '{print $1}'`
end_time=0
end_test=`echo $test_set | tr '-' ' ' | awk '{print $2}'`
if [ x$end_test = x ]; then
msg="Extracting test $start_test"
label="CTS-$start_test-`date +"%b-%d-%Y"`"
end_test=`expr $start_test + 1`
else
msg="Extracting tests $start_test to $end_test"
label="CTS-$start_test-$end_test-`date +"%b-%d-%Y"`"
end_test=`expr $end_test + 1`
fi
if [ $start_test = 0 ]; then
start_pat="BEGINNING [0-9].* TESTS"
else
start_pat="Running test.*\[ *$start_test\]"
fi
if [ x$ctslog = x ]; then
ctslog=`findmsg 1 "$start_pat"`
if [ x$ctslog = x ]; then
fatal "No CTS control file detected"
else
log "Using CTS control file: $ctslog"
fi
fi
line=`grep -n "$start_pat" $ctslog | tail -1 | sed 's/:.*//'`
if [ ! -z "$line" ]; then
start_time=`linetime $ctslog $line`
fi
line=`grep -n "Running test.*\[ *$end_test\]" $ctslog | tail -1 | sed 's/:.*//'`
if [ ! -z "$line" ]; then
end_time=`linetime $ctslog $line`
fi
if [ -z "$nodes" ]; then
nodes=`grep CTS: $ctslog | grep -v debug: | grep " \* " | sed s:.*\\\*::g | sort -u | tr '\\n' ' '`
log "Calculated node list: $nodes"
fi
if [ $end_time -lt $start_time ]; then
debug "Test didn't complete, grabbing everything up to now"
end_time=`date +%s`
fi
if [ $start_time != 0 ]; then
log "$msg (`time2str $start_time` to `time2str $end_time`)"
collect_data $label $start_time $end_time $ctslog
else
fatal "$msg failed: not found"
fi
done
}
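# node_names_from_xml: read CIB XML on stdin and print the value of every
# uname="..." attribute, space-separated.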
node_names_from_xml() {
awk '
/uname/ {
for( i=1; i<=NF; i++ )
if( $i~/^uname=/ ) {
sub("uname=.","",$i);
sub("\".*","",$i);
print $i;
next;
}
}
' | tr '\n' ' '
}
getnodes() {
cluster="$1"
# 1. Live (cluster nodes or Pacemaker Remote nodes)
# TODO: This will not detect Pacemaker Remote nodes unless they
# have ever had a permanent node attribute set, because it only
# searches the nodes section. It should also search the config
# for resources that create Pacemaker Remote nodes.
cib_nodes=$(cibadmin -Ql -o nodes 2>/dev/null)
if [ $? -eq 0 ]; then
debug "Querying CIB for nodes"
echo "$cib_nodes" | node_names_from_xml
return
fi
# 2. Saved
if [ -f "@CRM_CONFIG_DIR@/cib.xml" ]; then
debug "Querying on-disk CIB for nodes"
grep "node " "@CRM_CONFIG_DIR@/cib.xml" | node_names_from_xml
return
fi
# 3. logs
# TODO: Look for something like crm_update_peer
}
if [ "x$tests" != "x" ]; then
do_cts
elif [ "x$start_time" != "x" ]; then
masterlog=""
if [ -z "$sanitize_patterns" ]; then
log "WARNING: The tarball produced by this program may contain"
log " sensitive information such as passwords."
log ""
log "We will attempt to remove such information if you use the"
log "-p option. For example: -p \"pass.*\" -p \"user.*\""
log ""
log "However, doing this may reduce the ability for the recipients"
log "to diagnose issues and generally provide assistance."
log ""
log "IT IS YOUR RESPONSIBILITY TO PROTECT SENSITIVE DATA FROM EXPOSURE"
log ""
fi
# If user didn't specify a cluster stack, make a best guess if possible.
if [ -z "$cluster" ] || [ "$cluster" = "any" ]; then
cluster=$(get_cluster_type)
fi
# If user didn't specify node(s), make a best guess if possible.
if [ -z "$nodes" ]; then
nodes=`getnodes $cluster`
if [ -n "$nodes" ]; then
log "Calculated node list: $nodes"
else
fatal "Cannot determine nodes; specify --nodes or --single-node"
fi
fi
if echo $nodes | grep -qs $host; then
debug "We are a cluster node"
else
debug "We are a log master"
masterlog=`findmsg 1 "pacemaker-controld\\|CTS"`
fi
if [ -z "$end_time" ]; then
end_time=`perl -e 'print time()'`
fi
label="pcmk-`date +"%a-%d-%b-%Y"`"
log "Collecting data from $nodes (`time2str $start_time` to `time2str $end_time`)"
collect_data $label $start_time $end_time $masterlog
else
fatal "Not sure what to do, no tests or time ranges to extract"
fi
# vim: set expandtab tabstop=8 softtabstop=4 shiftwidth=4 textwidth=80:
diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c
index 7f79926e03..42de6a0ea3 100644
--- a/tools/crm_resource_runtime.c
+++ b/tools/crm_resource_runtime.c
@@ -1,1977 +1,1980 @@
/*
* Copyright 2004-2018 Andrew Beekhof <andrew@beekhof.net>
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
*/
#include <crm_resource.h>
int resource_verbose = 0;
bool do_force = FALSE;
int crmd_replies_needed = 1; /* The welcome message */
const char *attr_set_type = XML_TAG_ATTR_SETS;
static int
do_find_resource(const char *rsc, resource_t * the_rsc, pe_working_set_t * data_set)
{
int found = 0;
GListPtr lpc = NULL;
for (lpc = the_rsc->running_on; lpc != NULL; lpc = lpc->next) {
node_t *node = (node_t *) lpc->data;
if (BE_QUIET) {
fprintf(stdout, "%s\n", node->details->uname);
} else {
const char *state = "";
if (!pe_rsc_is_clone(the_rsc) && the_rsc->fns->state(the_rsc, TRUE) == RSC_ROLE_MASTER) {
state = "Master";
}
fprintf(stdout, "resource %s is running on: %s %s\n", rsc, node->details->uname, state);
}
found++;
}
if (BE_QUIET == FALSE && found == 0) {
fprintf(stderr, "resource %s is NOT running\n", rsc);
}
return found;
}
int
cli_resource_search(resource_t *rsc, const char *requested_name,
pe_working_set_t *data_set)
{
int found = 0;
resource_t *parent = uber_parent(rsc);
if (pe_rsc_is_clone(rsc)) {
for (GListPtr iter = rsc->children; iter != NULL; iter = iter->next) {
found += do_find_resource(requested_name, iter->data, data_set);
}
/* The anonymous clone children's common ID is supplied */
} else if (pe_rsc_is_clone(parent)
&& is_not_set(rsc->flags, pe_rsc_unique)
&& rsc->clone_name
&& safe_str_eq(requested_name, rsc->clone_name)
&& safe_str_neq(requested_name, rsc->id)) {
for (GListPtr iter = parent->children; iter; iter = iter->next) {
found += do_find_resource(requested_name, iter->data, data_set);
}
} else {
found += do_find_resource(requested_name, rsc, data_set);
}
return found;
}
#define XPATH_MAX 1024
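/*!
 * \internal
 * \brief Find a resource attribute's value and/or XML ID via a CIB query
 *
 * Builds an XPath roughly of the form (a sketch; optional arguments are
 * simply omitted from the expression):
 * //resources//*[@id="RSC"]/SET_TYPE[@id="SET"]//nvpair[@id="X" and @name="Y"]
 * and copies the requested element value of the single match into *value;
 * if more than one nvpair matches, all candidates are printed and -EINVAL
 * is returned.
 */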
static int
find_resource_attr(cib_t * the_cib, const char *attr, const char *rsc, const char *set_type,
const char *set_name, const char *attr_id, const char *attr_name, char **value)
{
int offset = 0;
int rc = pcmk_ok;
xmlNode *xml_search = NULL;
char *xpath_string = NULL;
if(value) {
*value = NULL;
}
if(the_cib == NULL) {
return -ENOTCONN;
}
xpath_string = calloc(1, XPATH_MAX);
offset +=
snprintf(xpath_string + offset, XPATH_MAX - offset, "%s", get_object_path("resources"));
offset += snprintf(xpath_string + offset, XPATH_MAX - offset, "//*[@id=\"%s\"]", rsc);
if (set_type) {
offset += snprintf(xpath_string + offset, XPATH_MAX - offset, "/%s", set_type);
if (set_name) {
offset += snprintf(xpath_string + offset, XPATH_MAX - offset, "[@id=\"%s\"]", set_name);
}
}
offset += snprintf(xpath_string + offset, XPATH_MAX - offset, "//nvpair[");
if (attr_id) {
offset += snprintf(xpath_string + offset, XPATH_MAX - offset, "@id=\"%s\"", attr_id);
}
if (attr_name) {
if (attr_id) {
offset += snprintf(xpath_string + offset, XPATH_MAX - offset, " and ");
}
offset += snprintf(xpath_string + offset, XPATH_MAX - offset, "@name=\"%s\"", attr_name);
}
offset += snprintf(xpath_string + offset, XPATH_MAX - offset, "]");
CRM_LOG_ASSERT(offset > 0);
rc = the_cib->cmds->query(the_cib, xpath_string, &xml_search,
cib_sync_call | cib_scope_local | cib_xpath);
if (rc != pcmk_ok) {
goto bail;
}
crm_log_xml_debug(xml_search, "Match");
if (xml_has_children(xml_search)) {
xmlNode *child = NULL;
rc = -EINVAL;
printf("Multiple attributes match name=%s\n", attr_name);
for (child = __xml_first_child(xml_search); child != NULL; child = __xml_next(child)) {
printf(" Value: %s \t(id=%s)\n",
crm_element_value(child, XML_NVPAIR_ATTR_VALUE), ID(child));
}
} else if(value) {
const char *tmp = crm_element_value(xml_search, attr);
if (tmp) {
*value = strdup(tmp);
}
}
bail:
free(xpath_string);
free_xml(xml_search);
return rc;
}
static resource_t *
find_matching_attr_resource(resource_t * rsc, const char * rsc_id, const char * attr_set, const char * attr_id,
const char * attr_name, cib_t * cib, const char * cmd)
{
int rc = pcmk_ok;
char *lookup_id = NULL;
char *local_attr_id = NULL;
if(do_force == TRUE) {
return rsc;
} else if(rsc->parent) {
switch(rsc->parent->variant) {
case pe_group:
if (BE_QUIET == FALSE) {
printf("Performing %s of '%s' for '%s' will not apply to its peers in '%s'\n", cmd, attr_name, rsc_id, rsc->parent->id);
}
break;
case pe_clone:
rc = find_resource_attr(cib, XML_ATTR_ID, rsc_id, attr_set_type, attr_set, attr_id, attr_name, &local_attr_id);
free(local_attr_id);
if(rc != pcmk_ok) {
rsc = rsc->parent;
if (BE_QUIET == FALSE) {
printf("Performing %s of '%s' on '%s', the parent of '%s'\n", cmd, attr_name, rsc->id, rsc_id);
}
}
break;
default:
break;
}
} else if (rsc->parent && BE_QUIET == FALSE) {
printf("Forcing %s of '%s' for '%s' instead of '%s'\n", cmd, attr_name, rsc_id, rsc->parent->id);
} else if(rsc->parent == NULL && rsc->children) {
resource_t *child = rsc->children->data;
if(child->variant == pe_native) {
lookup_id = clone_strip(child->id); /* Could be a cloned group! */
rc = find_resource_attr(cib, XML_ATTR_ID, lookup_id, attr_set_type, attr_set, attr_id, attr_name, &local_attr_id);
if(rc == pcmk_ok) {
rsc = child;
if (BE_QUIET == FALSE) {
printf("A value for '%s' already exists in child '%s', performing %s on that instead of '%s'\n", attr_name, lookup_id, cmd, rsc_id);
}
}
free(local_attr_id);
free(lookup_id);
}
}
return rsc;
}
int
cli_resource_update_attribute(resource_t *rsc, const char *requested_name,
const char *attr_set, const char *attr_id,
const char *attr_name, const char *attr_value,
bool recursive, cib_t *cib,
pe_working_set_t *data_set)
{
int rc = pcmk_ok;
static bool need_init = TRUE;
char *lookup_id = NULL;
char *local_attr_id = NULL;
char *local_attr_set = NULL;
xmlNode *xml_top = NULL;
xmlNode *xml_obj = NULL;
if(attr_id == NULL
&& do_force == FALSE
&& pcmk_ok != find_resource_attr(
cib, XML_ATTR_ID, uber_parent(rsc)->id, NULL, NULL, NULL, attr_name, NULL)) {
printf("\n");
}
if (safe_str_eq(attr_set_type, XML_TAG_ATTR_SETS)) {
if (do_force == FALSE) {
rc = find_resource_attr(cib, XML_ATTR_ID, uber_parent(rsc)->id,
XML_TAG_META_SETS, attr_set, attr_id,
attr_name, &local_attr_id);
if (rc == pcmk_ok && BE_QUIET == FALSE) {
printf("WARNING: There is already a meta attribute for '%s' called '%s' (id=%s)\n",
uber_parent(rsc)->id, attr_name, local_attr_id);
printf(" Delete '%s' first or use --force to override\n", local_attr_id);
}
free(local_attr_id);
if (rc == pcmk_ok) {
return -ENOTUNIQ;
}
}
} else {
rsc = find_matching_attr_resource(rsc, requested_name, attr_set,
attr_id, attr_name, cib, "update");
}
lookup_id = clone_strip(rsc->id); /* Could be a cloned group! */
rc = find_resource_attr(cib, XML_ATTR_ID, lookup_id, attr_set_type, attr_set, attr_id, attr_name,
&local_attr_id);
if (rc == pcmk_ok) {
crm_debug("Found a match for name=%s: id=%s", attr_name, local_attr_id);
attr_id = local_attr_id;
} else if (rc != -ENXIO) {
free(lookup_id);
free(local_attr_id);
return rc;
} else {
const char *tag = crm_element_name(rsc->xml);
if (attr_set == NULL) {
local_attr_set = crm_concat(lookup_id, attr_set_type, '-');
attr_set = local_attr_set;
}
if (attr_id == NULL) {
local_attr_id = crm_concat(attr_set, attr_name, '-');
attr_id = local_attr_id;
}
xml_top = create_xml_node(NULL, tag);
crm_xml_add(xml_top, XML_ATTR_ID, lookup_id);
xml_obj = create_xml_node(xml_top, attr_set_type);
crm_xml_add(xml_obj, XML_ATTR_ID, attr_set);
}
xml_obj = crm_create_nvpair_xml(xml_obj, attr_id, attr_name, attr_value);
if (xml_top == NULL) {
xml_top = xml_obj;
}
crm_log_xml_debug(xml_top, "Update");
rc = cib->cmds->modify(cib, XML_CIB_TAG_RESOURCES, xml_top, cib_options);
if (rc == pcmk_ok && BE_QUIET == FALSE) {
printf("Set '%s' option: id=%s%s%s%s%s=%s\n", lookup_id, local_attr_id,
attr_set ? " set=" : "", attr_set ? attr_set : "",
attr_name ? " name=" : "", attr_name ? attr_name : "", attr_value);
}
free_xml(xml_top);
free(lookup_id);
free(local_attr_id);
free(local_attr_set);
if(recursive && safe_str_eq(attr_set_type, XML_TAG_META_SETS)) {
GListPtr lpc = NULL;
if(need_init) {
xmlNode *cib_constraints = get_object_root(XML_CIB_TAG_CONSTRAINTS, data_set->input);
need_init = FALSE;
unpack_constraints(cib_constraints, data_set);
for (lpc = data_set->resources; lpc != NULL; lpc = lpc->next) {
resource_t *r = (resource_t *) lpc->data;
clear_bit(r->flags, pe_rsc_allocating);
}
}
crm_debug("Looking for dependencies %p", rsc->rsc_cons_lhs);
set_bit(rsc->flags, pe_rsc_allocating);
for (lpc = rsc->rsc_cons_lhs; lpc != NULL; lpc = lpc->next) {
rsc_colocation_t *cons = (rsc_colocation_t *) lpc->data;
resource_t *peer = cons->rsc_lh;
crm_debug("Checking %s %d", cons->id, cons->score);
if (cons->score > 0 && is_not_set(peer->flags, pe_rsc_allocating)) {
/* Don't get into colocation loops */
crm_debug("Setting %s=%s for dependent resource %s", attr_name, attr_value, peer->id);
cli_resource_update_attribute(peer, peer->id, NULL, NULL,
attr_name, attr_value, recursive,
cib, data_set);
}
}
}
return rc;
}
int
cli_resource_delete_attribute(resource_t *rsc, const char *requested_name,
const char *attr_set, const char *attr_id,
const char *attr_name, cib_t *cib,
pe_working_set_t *data_set)
{
xmlNode *xml_obj = NULL;
int rc = pcmk_ok;
char *lookup_id = NULL;
char *local_attr_id = NULL;
if(attr_id == NULL
&& do_force == FALSE
&& find_resource_attr(
cib, XML_ATTR_ID, uber_parent(rsc)->id, NULL, NULL, NULL, attr_name, NULL) != pcmk_ok) {
printf("\n");
}
if(safe_str_eq(attr_set_type, XML_TAG_META_SETS)) {
rsc = find_matching_attr_resource(rsc, requested_name, attr_set,
attr_id, attr_name, cib, "delete");
}
lookup_id = clone_strip(rsc->id);
rc = find_resource_attr(cib, XML_ATTR_ID, lookup_id, attr_set_type, attr_set, attr_id, attr_name,
&local_attr_id);
if (rc == -ENXIO) {
free(lookup_id);
return pcmk_ok;
} else if (rc != pcmk_ok) {
free(lookup_id);
return rc;
}
if (attr_id == NULL) {
attr_id = local_attr_id;
}
xml_obj = crm_create_nvpair_xml(NULL, attr_id, attr_name, NULL);
crm_log_xml_debug(xml_obj, "Delete");
CRM_ASSERT(cib);
rc = cib->cmds->remove(cib, XML_CIB_TAG_RESOURCES, xml_obj, cib_options);
if (rc == pcmk_ok && BE_QUIET == FALSE) {
printf("Deleted '%s' option: id=%s%s%s%s%s\n", lookup_id, local_attr_id,
attr_set ? " set=" : "", attr_set ? attr_set : "",
attr_name ? " name=" : "", attr_name ? attr_name : "");
}
free(lookup_id);
free_xml(xml_obj);
free(local_attr_id);
return rc;
}
static int
send_lrm_rsc_op(crm_ipc_t * crmd_channel, const char *op,
const char *host_uname, const char *rsc_id,
bool only_failed, pe_working_set_t * data_set)
{
char *our_pid = NULL;
char *key = NULL;
int rc = -ECOMM;
xmlNode *cmd = NULL;
xmlNode *xml_rsc = NULL;
const char *value = NULL;
const char *router_node = host_uname;
xmlNode *params = NULL;
xmlNode *msg_data = NULL;
resource_t *rsc = pe_find_resource(data_set->resources, rsc_id);
if (rsc == NULL) {
CMD_ERR("Resource %s not found", rsc_id);
return -ENXIO;
} else if (rsc->variant != pe_native) {
CMD_ERR("We can only process primitive resources, not %s", rsc_id);
return -EINVAL;
} else if (host_uname == NULL) {
CMD_ERR("Please supply a node name with --node");
return -EINVAL;
} else {
node_t *node = pe_find_node(data_set->nodes, host_uname);
if (node && is_remote_node(node)) {
node = pe__current_node(node->details->remote_rsc);
if (node == NULL) {
CMD_ERR("No cluster connection to Pacemaker Remote node %s detected",
host_uname);
return -ENXIO;
}
router_node = node->details->uname;
}
}
key = generate_transition_key(0, getpid(), 0, "xxxxxxxx-xrsc-opxx-xcrm-resourcexxxx");
msg_data = create_xml_node(NULL, XML_GRAPH_TAG_RSC_OP);
crm_xml_add(msg_data, XML_ATTR_TRANSITION_KEY, key);
free(key);
crm_xml_add(msg_data, XML_LRM_ATTR_TARGET, host_uname);
if (safe_str_neq(router_node, host_uname)) {
crm_xml_add(msg_data, XML_LRM_ATTR_ROUTER_NODE, router_node);
}
xml_rsc = create_xml_node(msg_data, XML_CIB_TAG_RESOURCE);
if (rsc->clone_name) {
crm_xml_add(xml_rsc, XML_ATTR_ID, rsc->clone_name);
crm_xml_add(xml_rsc, XML_ATTR_ID_LONG, rsc->id);
} else {
crm_xml_add(xml_rsc, XML_ATTR_ID, rsc->id);
}
value = crm_copy_xml_element(rsc->xml, xml_rsc, XML_ATTR_TYPE);
if (value == NULL) {
CMD_ERR("%s has no type! Aborting...", rsc_id);
return -ENXIO;
}
value = crm_copy_xml_element(rsc->xml, xml_rsc, XML_AGENT_ATTR_CLASS);
if (value == NULL) {
CMD_ERR("%s has no class! Aborting...", rsc_id);
return -ENXIO;
}
crm_copy_xml_element(rsc->xml, xml_rsc, XML_AGENT_ATTR_PROVIDER);
params = create_xml_node(msg_data, XML_TAG_ATTRS);
crm_xml_add(params, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET);
key = crm_meta_name(XML_LRM_ATTR_INTERVAL_MS);
crm_xml_add(params, key, "60000"); /* 1 minute */
free(key);
our_pid = crm_getpid_s();
cmd = create_request(op, msg_data, router_node, CRM_SYSTEM_CRMD, crm_system_name, our_pid);
/* crm_log_xml_warn(cmd, "send_lrm_rsc_op"); */
free_xml(msg_data);
if (crm_ipc_send(crmd_channel, cmd, 0, 0, NULL) > 0) {
rc = 0;
} else {
crm_debug("Could not send %s op to the controller", op);
rc = -ENOTCONN;
}
free_xml(cmd);
return rc;
}
/*!
* \internal
* \brief Get resource name as used in failure-related node attributes
*
* \param[in] rsc Resource to check
*
* \return Newly allocated string containing resource's fail name
* \note The caller is responsible for freeing the result.
*/
static inline char *
rsc_fail_name(resource_t *rsc)
{
const char *name = (rsc->clone_name? rsc->clone_name : rsc->id);
return is_set(rsc->flags, pe_rsc_unique)? strdup(name) : clone_strip(name);
}
static int
clear_rsc_history(crm_ipc_t *crmd_channel, const char *host_uname,
const char *rsc_id, pe_working_set_t *data_set)
{
int rc = pcmk_ok;
/* Erase the resource's entire LRM history in the CIB, even if we're only
* clearing a single operation's fail count. If we erased only entries for a
* single operation, we might wind up with a wrong idea of the current
* resource state, and we might not re-probe the resource.
*/
rc = send_lrm_rsc_op(crmd_channel, CRM_OP_LRM_DELETE, host_uname, rsc_id,
TRUE, data_set);
if (rc != pcmk_ok) {
return rc;
}
crmd_replies_needed++;
crm_trace("Processing %d mainloop inputs", crmd_replies_needed);
while (g_main_context_iteration(NULL, FALSE)) {
crm_trace("Processed mainloop input, %d still remaining",
crmd_replies_needed);
}
if (crmd_replies_needed < 0) {
crmd_replies_needed = 0;
}
return rc;
}
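/*!
 * \internal
 * \brief Erase failed-operation history for matching resources on a node
 *
 * Scan the failed-operations list for entries matching the given resource,
 * operation, and interval (a NULL resource or operation matches all), and
 * wipe the full resource history for each match via clear_rsc_history().
 */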
static int
clear_rsc_failures(crm_ipc_t *crmd_channel, const char *node_name,
const char *rsc_id, const char *operation,
const char *interval_spec, pe_working_set_t *data_set)
{
int rc = pcmk_ok;
const char *failed_value = NULL;
const char *failed_id = NULL;
const char *interval_ms_s = NULL;
GHashTable *rscs = NULL;
GHashTableIter iter;
/* Create a hash table to use as a set of resources to clean. This lets us
* clean each resource only once (per node) regardless of how many failed
* operations it has.
*/
rscs = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, NULL);
// Normalize interval to milliseconds for comparison to history entry
if (operation) {
interval_ms_s = crm_strdup_printf("%u",
crm_parse_interval_spec(interval_spec));
}
for (xmlNode *xml_op = __xml_first_child(data_set->failed); xml_op != NULL;
xml_op = __xml_next(xml_op)) {
failed_id = crm_element_value(xml_op, XML_LRM_ATTR_RSCID);
if (failed_id == NULL) {
// Malformed history entry, should never happen
continue;
}
// No resource specified means all resources match
if (rsc_id) {
resource_t *fail_rsc = pe_find_resource_with_flags(data_set->resources,
failed_id,
pe_find_renamed|pe_find_anon);
if (!fail_rsc || safe_str_neq(rsc_id, fail_rsc->id)) {
continue;
}
}
// Host name should always have been provided by this point
failed_value = crm_element_value(xml_op, XML_ATTR_UNAME);
if (safe_str_neq(node_name, failed_value)) {
continue;
}
// No operation specified means all operations match
if (operation) {
failed_value = crm_element_value(xml_op, XML_LRM_ATTR_TASK);
if (safe_str_neq(operation, failed_value)) {
continue;
}
// Interval (if operation was specified) defaults to 0 (not all)
failed_value = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL_MS);
if (safe_str_neq(interval_ms_s, failed_value)) {
continue;
}
}
g_hash_table_add(rscs, (gpointer) failed_id);
}
g_hash_table_iter_init(&iter, rscs);
while (g_hash_table_iter_next(&iter, (gpointer *) &failed_id, NULL)) {
crm_debug("Erasing failures of %s on %s", failed_id, node_name);
rc = clear_rsc_history(crmd_channel, node_name, failed_id, data_set);
if (rc != pcmk_ok) {
return rc;
}
}
g_hash_table_destroy(rscs);
return rc;
}
static int
clear_rsc_fail_attrs(resource_t *rsc, const char *operation,
const char *interval_spec, node_t *node)
{
int rc = pcmk_ok;
int attr_options = attrd_opt_none;
char *rsc_name = rsc_fail_name(rsc);
if (is_remote_node(node)) {
attr_options |= attrd_opt_remote;
}
rc = attrd_clear_delegate(NULL, node->details->uname, rsc_name, operation,
interval_spec, NULL, attr_options);
free(rsc_name);
return rc;
}
int
cli_resource_delete(crm_ipc_t *crmd_channel, const char *host_uname,
resource_t *rsc, const char *operation,
const char *interval_spec, bool just_failures,
pe_working_set_t *data_set)
{
int rc = pcmk_ok;
node_t *node = NULL;
if (rsc == NULL) {
return -ENXIO;
} else if (rsc->children) {
GListPtr lpc = NULL;
for (lpc = rsc->children; lpc != NULL; lpc = lpc->next) {
resource_t *child = (resource_t *) lpc->data;
rc = cli_resource_delete(crmd_channel, host_uname, child, operation,
interval_spec, just_failures, data_set);
if (rc != pcmk_ok) {
return rc;
}
}
return pcmk_ok;
} else if (host_uname == NULL) {
GListPtr lpc = NULL;
GListPtr nodes = g_hash_table_get_values(rsc->known_on);
if(nodes == NULL && do_force) {
nodes = node_list_dup(data_set->nodes, FALSE, FALSE);
} else if(nodes == NULL && rsc->exclusive_discover) {
GHashTableIter iter;
pe_node_t *node = NULL;
g_hash_table_iter_init(&iter, rsc->allowed_nodes);
while (g_hash_table_iter_next(&iter, NULL, (void**)&node)) {
if(node->weight >= 0) {
nodes = g_list_prepend(nodes, node);
}
}
} else if(nodes == NULL) {
nodes = g_hash_table_get_values(rsc->allowed_nodes);
}
for (lpc = nodes; lpc != NULL; lpc = lpc->next) {
node = (node_t *) lpc->data;
if (node->details->online) {
rc = cli_resource_delete(crmd_channel, node->details->uname,
rsc, operation, interval_spec,
just_failures, data_set);
}
if (rc != pcmk_ok) {
g_list_free(nodes);
return rc;
}
}
g_list_free(nodes);
return pcmk_ok;
}
node = pe_find_node(data_set->nodes, host_uname);
if (node == NULL) {
printf("Unable to clean up %s because node %s not found\n",
rsc->id, host_uname);
return -ENODEV;
}
if (!node->details->rsc_discovery_enabled) {
printf("Unable to clean up %s because resource discovery disabled on %s\n",
rsc->id, host_uname);
return -EOPNOTSUPP;
}
if (crmd_channel == NULL) {
printf("Dry run: skipping clean-up of %s on %s due to CIB_file\n",
rsc->id, host_uname);
return pcmk_ok;
}
rc = clear_rsc_fail_attrs(rsc, operation, interval_spec, node);
if (rc != pcmk_ok) {
printf("Unable to clean up %s failures on %s: %s\n",
rsc->id, host_uname, pcmk_strerror(rc));
return rc;
}
if (just_failures) {
rc = clear_rsc_failures(crmd_channel, host_uname, rsc->id, operation,
interval_spec, data_set);
} else {
rc = clear_rsc_history(crmd_channel, host_uname, rsc->id, data_set);
}
if (rc != pcmk_ok) {
printf("Cleaned %s failures on %s, but unable to clean history: %s\n",
rsc->id, host_uname, pcmk_strerror(rc));
} else {
printf("Cleaned up %s on %s\n", rsc->id, host_uname);
}
return rc;
}
int
cli_cleanup_all(crm_ipc_t *crmd_channel, const char *node_name,
const char *operation, const char *interval_spec,
pe_working_set_t *data_set)
{
int rc = pcmk_ok;
int attr_options = attrd_opt_none;
const char *display_name = node_name? node_name : "all nodes";
if (crmd_channel == NULL) {
printf("Dry run: skipping clean-up of %s due to CIB_file\n",
display_name);
return pcmk_ok;
}
crmd_replies_needed = 0;
if (node_name) {
node_t *node = pe_find_node(data_set->nodes, node_name);
if (node == NULL) {
CMD_ERR("Unknown node: %s", node_name);
return -ENXIO;
}
if (is_remote_node(node)) {
attr_options |= attrd_opt_remote;
}
}
rc = attrd_clear_delegate(NULL, node_name, NULL, operation, interval_spec,
NULL, attr_options);
if (rc != pcmk_ok) {
printf("Unable to clean up all failures on %s: %s\n",
display_name, pcmk_strerror(rc));
return rc;
}
if (node_name) {
rc = clear_rsc_failures(crmd_channel, node_name, NULL,
operation, interval_spec, data_set);
if (rc != pcmk_ok) {
printf("Cleaned all resource failures on %s, but unable to clean history: %s\n",
node_name, pcmk_strerror(rc));
return rc;
}
} else {
for (GList *iter = data_set->nodes; iter; iter = iter->next) {
pe_node_t *node = (pe_node_t *) iter->data;
rc = clear_rsc_failures(crmd_channel, node->details->uname, NULL,
operation, interval_spec, data_set);
if (rc != pcmk_ok) {
printf("Cleaned all resource failures on all nodes, but unable to clean history: %s\n",
pcmk_strerror(rc));
return rc;
}
}
}
printf("Cleaned up all resources on %s\n", display_name);
return pcmk_ok;
}
void
cli_resource_check(cib_t * cib_conn, resource_t *rsc)
{
int need_nl = 0;
char *role_s = NULL;
char *managed = NULL;
resource_t *parent = uber_parent(rsc);
find_resource_attr(cib_conn, XML_NVPAIR_ATTR_VALUE, parent->id,
NULL, NULL, NULL, XML_RSC_ATTR_MANAGED, &managed);
find_resource_attr(cib_conn, XML_NVPAIR_ATTR_VALUE, parent->id,
NULL, NULL, NULL, XML_RSC_ATTR_TARGET_ROLE, &role_s);
if(role_s) {
enum rsc_role_e role = text2role(role_s);
+
+ free(role_s);
if(role == RSC_ROLE_UNKNOWN) {
// Treated as if unset
} else if(role == RSC_ROLE_STOPPED) {
printf("\n * The configuration specifies that '%s' should remain stopped\n", parent->id);
need_nl++;
} else if (is_set(parent->flags, pe_rsc_promotable)
&& (role == RSC_ROLE_SLAVE)) {
printf("\n * The configuration specifies that '%s' should not be promoted\n", parent->id);
need_nl++;
}
}
if(managed && crm_is_true(managed) == FALSE) {
printf("%s * The configuration prevents the cluster from stopping or starting '%s' (unmanaged)\n", need_nl == 0?"\n":"", parent->id);
need_nl++;
}
+ free(managed);
if(need_nl) {
printf("\n");
}
}
int
cli_resource_fail(crm_ipc_t * crmd_channel, const char *host_uname,
const char *rsc_id, pe_working_set_t * data_set)
{
crm_warn("Failing: %s", rsc_id);
return send_lrm_rsc_op(crmd_channel, CRM_OP_LRM_FAIL, host_uname, rsc_id, FALSE, data_set);
}
static GHashTable *
generate_resource_params(resource_t * rsc, pe_working_set_t * data_set)
{
GHashTable *params = NULL;
GHashTable *meta = NULL;
GHashTable *combined = NULL;
GHashTableIter iter;
if (!rsc) {
crm_err("Resource does not exist in config");
return NULL;
}
params = crm_str_table_new();
meta = crm_str_table_new();
combined = crm_str_table_new();
get_rsc_attributes(params, rsc, NULL /* TODO: Pass in local node */ , data_set);
get_meta_attributes(meta, rsc, NULL /* TODO: Pass in local node */ , data_set);
if (params) {
char *key = NULL;
char *value = NULL;
g_hash_table_iter_init(&iter, params);
while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
g_hash_table_insert(combined, strdup(key), strdup(value));
}
g_hash_table_destroy(params);
}
if (meta) {
char *key = NULL;
char *value = NULL;
g_hash_table_iter_init(&iter, meta);
while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
char *crm_name = crm_meta_name(key);
g_hash_table_insert(combined, crm_name, strdup(value));
}
g_hash_table_destroy(meta);
}
return combined;
}
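/* Check whether a resource is active on a given host (matched by node name
 * or node ID); with host == NULL, check whether it is active anywhere at all.
 */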
static bool resource_is_running_on(resource_t *rsc, const char *host)
{
bool found = TRUE;
GListPtr hIter = NULL;
GListPtr hosts = NULL;
if(rsc == NULL) {
return FALSE;
}
rsc->fns->location(rsc, &hosts, TRUE);
for (hIter = hosts; host != NULL && hIter != NULL; hIter = hIter->next) {
pe_node_t *node = (pe_node_t *) hIter->data;
if(strcmp(host, node->details->uname) == 0) {
crm_trace("Resource %s is running on %s\n", rsc->id, host);
goto done;
} else if(strcmp(host, node->details->id) == 0) {
crm_trace("Resource %s is running on %s\n", rsc->id, host);
goto done;
}
}
if(host != NULL) {
crm_trace("Resource %s is not running on: %s\n", rsc->id, host);
found = FALSE;
} else if(host == NULL && hosts == NULL) {
crm_trace("Resource %s is not running\n", rsc->id);
found = FALSE;
}
done:
g_list_free(hosts);
return found;
}
/*!
* \internal
* \brief Create a list of all resources active on host from a given list
*
* \param[in] host Name of host to check whether resources are active
* \param[in] rsc_list List of resources to check
*
* \return New list of resources from list that are active on host
*/
static GList *
get_active_resources(const char *host, GList *rsc_list)
{
GList *rIter = NULL;
GList *active = NULL;
for (rIter = rsc_list; rIter != NULL; rIter = rIter->next) {
resource_t *rsc = (resource_t *) rIter->data;
/* Expand groups to their members, because if we're restarting a member
* other than the first, we can't otherwise tell which resources are
* stopping and starting.
*/
if (rsc->variant == pe_group) {
active = g_list_concat(active,
get_active_resources(host, rsc->children));
} else if (resource_is_running_on(rsc, host)) {
active = g_list_append(active, strdup(rsc->id));
}
}
return active;
}
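/* Return a shallow copy of 'from' minus any strings that also appear (by
 * strcmp) in 'items'; the caller frees the returned list but not its data.
 */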
static GList*
subtract_lists(GList *from, GList *items)
{
GList *item = NULL;
GList *result = g_list_copy(from);
for (item = items; item != NULL; item = item->next) {
GList *candidate = NULL;
for (candidate = from; candidate != NULL; candidate = candidate->next) {
crm_info("Comparing %s with %s", (const char *) candidate->data,
(const char *) item->data);
if(strcmp(candidate->data, item->data) == 0) {
result = g_list_remove(result, candidate->data);
break;
}
}
}
return result;
}
static void dump_list(GList *items, const char *tag)
{
int lpc = 0;
GList *item = NULL;
for (item = items; item != NULL; item = item->next) {
crm_trace("%s[%d]: %s", tag, lpc, (char*)item->data);
lpc++;
}
}
static void display_list(GList *items, const char *tag)
{
GList *item = NULL;
for (item = items; item != NULL; item = item->next) {
fprintf(stdout, "%s%s\n", tag, (const char *)item->data);
}
}
/*!
* \internal
* \brief Upgrade XML to latest schema version and use it as working set input
*
* This also updates the working set timestamp to the current time.
*
* \param[in] data_set Working set instance to update
* \param[in] xml XML to use as input
*
* \return pcmk_ok on success, -ENOKEY if unable to upgrade XML
* \note On success, caller is responsible for freeing memory allocated for
* data_set->now.
* \todo This follows the example of other callers of cli_config_update()
* and returns -ENOKEY ("Required key not available") if that fails,
* but perhaps -pcmk_err_schema_validation would be better in that case.
*/
int
update_working_set_xml(pe_working_set_t *data_set, xmlNode **xml)
{
if (cli_config_update(xml, NULL, FALSE) == FALSE) {
return -ENOKEY;
}
data_set->input = *xml;
data_set->now = crm_time_new(NULL);
return pcmk_ok;
}
/*!
* \internal
* \brief Update a working set's XML input based on a CIB query
*
* \param[in] data_set Data set instance to initialize
* \param[in] cib Connection to the CIB manager
*
* \return pcmk_ok on success, -errno on failure
* \note On success, caller is responsible for freeing memory allocated for
* data_set->input and data_set->now.
*/
static int
update_working_set_from_cib(pe_working_set_t * data_set, cib_t *cib)
{
xmlNode *cib_xml_copy = NULL;
int rc;
rc = cib->cmds->query(cib, NULL, &cib_xml_copy, cib_scope_local | cib_sync_call);
if (rc != pcmk_ok) {
fprintf(stderr, "Could not obtain the current CIB: %s (%d)\n", pcmk_strerror(rc), rc);
return rc;
}
rc = update_working_set_xml(data_set, &cib_xml_copy);
if (rc != pcmk_ok) {
fprintf(stderr, "Could not upgrade the current CIB XML\n");
free_xml(cib_xml_copy);
return rc;
}
return pcmk_ok;
}
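/* Refresh the working set from the CIB; if 'simulate' is true, replay the
 * pending transition in a temporary shadow CIB first, so the data set
 * reflects the expected post-transition state rather than the current one.
 */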
static int
update_dataset(cib_t *cib, pe_working_set_t * data_set, bool simulate)
{
char *pid = NULL;
char *shadow_file = NULL;
cib_t *shadow_cib = NULL;
int rc;
cleanup_alloc_calculations(data_set);
rc = update_working_set_from_cib(data_set, cib);
if (rc != pcmk_ok) {
return rc;
}
if(simulate) {
pid = crm_getpid_s();
shadow_cib = cib_shadow_new(pid);
shadow_file = get_shadow_file(pid);
if (shadow_cib == NULL) {
fprintf(stderr, "Could not create shadow cib: '%s'\n", pid);
rc = -ENXIO;
goto cleanup;
}
rc = write_xml_file(data_set->input, shadow_file, FALSE);
if (rc < 0) {
fprintf(stderr, "Could not populate shadow cib: %s (%d)\n", pcmk_strerror(rc), rc);
goto cleanup;
}
rc = shadow_cib->cmds->signon(shadow_cib, crm_system_name, cib_command);
if(rc != pcmk_ok) {
fprintf(stderr, "Could not connect to shadow cib: %s (%d)\n", pcmk_strerror(rc), rc);
goto cleanup;
}
do_calculations(data_set, data_set->input, NULL);
run_simulation(data_set, shadow_cib, NULL, TRUE);
rc = update_dataset(shadow_cib, data_set, FALSE);
} else {
cluster_status(data_set);
}
cleanup:
/* Do not free data_set->input here, we need rsc->xml to be valid later on */
cib_delete(shadow_cib);
free(pid);
if(shadow_file) {
unlink(shadow_file);
free(shadow_file);
}
return rc;
}
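/* Return the largest configured stop-operation timeout (in milliseconds)
 * found anywhere in the given resource's tree of children.
 */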
static int
max_delay_for_resource(pe_working_set_t * data_set, resource_t *rsc)
{
int delay = 0;
int max_delay = 0;
if(rsc && rsc->children) {
GList *iter = NULL;
for(iter = rsc->children; iter; iter = iter->next) {
resource_t *child = (resource_t *)iter->data;
delay = max_delay_for_resource(data_set, child);
if(delay > max_delay) {
double seconds = delay / 1000.0;
crm_trace("Calculated new delay of %.1fs due to %s", seconds, child->id);
max_delay = delay;
}
}
} else if(rsc) {
char *key = crm_strdup_printf("%s_%s_0", rsc->id, RSC_STOP);
action_t *stop = custom_action(rsc, key, RSC_STOP, NULL, TRUE, FALSE, data_set);
const char *value = g_hash_table_lookup(stop->meta, XML_ATTR_TIMEOUT);
max_delay = crm_int_helper(value, NULL);
pe_free_action(stop);
}
return max_delay;
}
static int
max_delay_in(pe_working_set_t * data_set, GList *resources)
{
int max_delay = 0;
GList *item = NULL;
for (item = resources; item != NULL; item = item->next) {
int delay = 0;
resource_t *rsc = pe_find_resource(data_set->resources, (const char *)item->data);
delay = max_delay_for_resource(data_set, rsc);
if(delay > max_delay) {
double seconds = delay / 1000.0;
crm_trace("Calculated new delay of %.1fs due to %s", seconds, rsc->id);
max_delay = delay;
}
}
return 5 + (max_delay / 1000);
}
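/* True while the delta list is non-empty or the restarted resource is not
 * yet running (on the given host, if one was specified).
 */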
#define waiting_for_starts(d, r, h) ((g_list_length(d) > 0) || \
(resource_is_running_on((r), (h)) == FALSE))
/*!
* \internal
* \brief Restart a resource (on a particular host if requested).
*
* \param[in] rsc The resource to restart
* \param[in] host The host to restart the resource on (or NULL for all)
* \param[in] timeout_ms Consider failed if actions do not complete in this time
* (specified in milliseconds, but a two-second
* granularity is actually used; if 0, a timeout will be
* calculated based on the resource timeout)
* \param[in] cib Connection to the CIB manager
*
* \return pcmk_ok on success, -errno on failure (exits on certain failures)
*/
int
cli_resource_restart(resource_t * rsc, const char *host, int timeout_ms, cib_t * cib)
{
int rc = 0;
int lpc = 0;
int before = 0;
int step_timeout_s = 0;
int sleep_interval = 2;
int timeout = timeout_ms / 1000;
bool stop_via_ban = FALSE;
char *rsc_id = NULL;
char *orig_target_role = NULL;
GList *list_delta = NULL;
GList *target_active = NULL;
GList *current_active = NULL;
GList *restart_target_active = NULL;
pe_working_set_t data_set;
if(resource_is_running_on(rsc, host) == FALSE) {
const char *id = rsc->clone_name?rsc->clone_name:rsc->id;
if(host) {
printf("%s is not running on %s and so cannot be restarted\n", id, host);
} else {
printf("%s is not running anywhere and so cannot be restarted\n", id);
}
return -ENXIO;
}
/* We might set the target-role meta-attribute */
attr_set_type = XML_TAG_META_SETS;
rsc_id = strdup(rsc->id);
if ((pe_rsc_is_clone(rsc) || pe_bundle_replicas(rsc)) && host) {
stop_via_ban = TRUE;
}
/*
grab full cib
determine originally active resources
disable or ban
poll cib and watch for affected resources to get stopped
without --timeout, calculate the stop timeout for each step and wait for that
if we hit --timeout or the service timeout, re-enable or un-ban, report failure and indicate which resources we couldn't take down
if everything stopped, re-enable or un-ban
poll cib and watch for affected resources to get started
without --timeout, calculate the start timeout for each step and wait for that
if we hit --timeout or the service timeout, report (different) failure and indicate which resources we couldn't bring back up
report success
Optimizations:
- use constraints to determine ordered list of affected resources
- Allow a --no-deps option (aka. --force-restart)
*/
set_working_set_defaults(&data_set);
rc = update_dataset(cib, &data_set, FALSE);
if(rc != pcmk_ok) {
fprintf(stdout, "Could not get new resource list: %s (%d)\n", pcmk_strerror(rc), rc);
free(rsc_id);
return rc;
}
restart_target_active = get_active_resources(host, data_set.resources);
current_active = get_active_resources(host, data_set.resources);
dump_list(current_active, "Origin");
if (stop_via_ban) {
/* Stop the clone or bundle instance by banning it from the host */
BE_QUIET = TRUE;
rc = cli_resource_ban(rsc_id, host, NULL, cib);
} else {
/* Stop the resource by setting target-role to Stopped.
* Remember any existing target-role so we can restore it later
* (though it only makes any difference if it's Slave).
*/
char *lookup_id = clone_strip(rsc->id);
find_resource_attr(cib, XML_NVPAIR_ATTR_VALUE, lookup_id, NULL, NULL,
NULL, XML_RSC_ATTR_TARGET_ROLE, &orig_target_role);
free(lookup_id);
rc = cli_resource_update_attribute(rsc, rsc_id, NULL, NULL,
XML_RSC_ATTR_TARGET_ROLE,
RSC_STOPPED, FALSE, cib, &data_set);
}
if(rc != pcmk_ok) {
fprintf(stderr, "Could not set target-role for %s: %s (%d)\n", rsc_id, pcmk_strerror(rc), rc);
if (current_active) {
g_list_free_full(current_active, free);
}
if (restart_target_active) {
g_list_free_full(restart_target_active, free);
}
free(rsc_id);
return crm_exit(crm_errno2exit(rc));
}
rc = update_dataset(cib, &data_set, TRUE);
if(rc != pcmk_ok) {
fprintf(stderr, "Could not determine which resources would be stopped\n");
goto failure;
}
target_active = get_active_resources(host, data_set.resources);
dump_list(target_active, "Target");
list_delta = subtract_lists(current_active, target_active);
fprintf(stdout, "Waiting for %d resources to stop:\n", g_list_length(list_delta));
display_list(list_delta, " * ");
step_timeout_s = timeout / sleep_interval;
while(g_list_length(list_delta) > 0) {
before = g_list_length(list_delta);
if(timeout_ms == 0) {
step_timeout_s = max_delay_in(&data_set, list_delta) / sleep_interval;
}
/* We probably don't need the entire step timeout */
for(lpc = 0; lpc < step_timeout_s && g_list_length(list_delta) > 0; lpc++) {
sleep(sleep_interval);
if(timeout) {
timeout -= sleep_interval;
crm_trace("%ds remaining", timeout);
}
rc = update_dataset(cib, &data_set, FALSE);
if(rc != pcmk_ok) {
fprintf(stderr, "Could not determine which resources were stopped\n");
goto failure;
}
if (current_active) {
g_list_free_full(current_active, free);
}
current_active = get_active_resources(host, data_set.resources);
g_list_free(list_delta);
list_delta = subtract_lists(current_active, target_active);
dump_list(current_active, "Current");
dump_list(list_delta, "Delta");
}
crm_trace("%d (was %d) resources remaining", g_list_length(list_delta), before);
if(before == g_list_length(list_delta)) {
/* aborted during stop phase, print the contents of list_delta */
fprintf(stderr, "Could not complete shutdown of %s, %d resources remaining\n", rsc_id, g_list_length(list_delta));
display_list(list_delta, " * ");
rc = -ETIME;
goto failure;
}
}
if (stop_via_ban) {
rc = cli_resource_clear(rsc_id, host, NULL, cib);
} else if (orig_target_role) {
rc = cli_resource_update_attribute(rsc, rsc_id, NULL, NULL,
XML_RSC_ATTR_TARGET_ROLE,
orig_target_role, FALSE, cib,
&data_set);
free(orig_target_role);
orig_target_role = NULL;
} else {
rc = cli_resource_delete_attribute(rsc, rsc_id, NULL, NULL,
XML_RSC_ATTR_TARGET_ROLE, cib,
&data_set);
}
if(rc != pcmk_ok) {
fprintf(stderr, "Could not unset target-role for %s: %s (%d)\n", rsc_id, pcmk_strerror(rc), rc);
free(rsc_id);
return crm_exit(crm_errno2exit(rc));
}
if (target_active) {
g_list_free_full(target_active, free);
}
target_active = restart_target_active;
if (list_delta) {
g_list_free(list_delta);
}
list_delta = subtract_lists(target_active, current_active);
fprintf(stdout, "Waiting for %d resources to start again:\n", g_list_length(list_delta));
display_list(list_delta, " * ");
step_timeout_s = timeout / sleep_interval;
while (waiting_for_starts(list_delta, rsc, host)) {
before = g_list_length(list_delta);
if(timeout_ms == 0) {
step_timeout_s = max_delay_in(&data_set, list_delta) / sleep_interval;
}
/* We probably don't need the entire step timeout */
for (lpc = 0; (lpc < step_timeout_s) && waiting_for_starts(list_delta, rsc, host); lpc++) {
sleep(sleep_interval);
if(timeout) {
timeout -= sleep_interval;
crm_trace("%ds remaining", timeout);
}
rc = update_dataset(cib, &data_set, FALSE);
if(rc != pcmk_ok) {
fprintf(stderr, "Could not determine which resources were started\n");
goto failure;
}
if (current_active) {
g_list_free_full(current_active, free);
}
/* It's OK if dependent resources moved to a different node,
* so we check active resources on all nodes.
*/
current_active = get_active_resources(NULL, data_set.resources);
g_list_free(list_delta);
list_delta = subtract_lists(target_active, current_active);
dump_list(current_active, "Current");
dump_list(list_delta, "Delta");
}
if(before == g_list_length(list_delta)) {
/* aborted during start phase, print the contents of list_delta */
fprintf(stdout, "Could not complete restart of %s, %d resources remaining\n", rsc_id, g_list_length(list_delta));
display_list(list_delta, " * ");
rc = -ETIME;
goto failure;
}
}
rc = pcmk_ok;
goto done;
failure:
if (stop_via_ban) {
cli_resource_clear(rsc_id, host, NULL, cib);
} else if (orig_target_role) {
cli_resource_update_attribute(rsc, rsc_id, NULL, NULL,
XML_RSC_ATTR_TARGET_ROLE,
orig_target_role, FALSE, cib, &data_set);
free(orig_target_role);
} else {
cli_resource_delete_attribute(rsc, rsc_id, NULL, NULL,
XML_RSC_ATTR_TARGET_ROLE, cib, &data_set);
}
done:
if (list_delta) {
g_list_free(list_delta);
}
if (current_active) {
g_list_free_full(current_active, free);
}
if (target_active && (target_active != restart_target_active)) {
g_list_free_full(target_active, free);
}
if (restart_target_active) {
g_list_free_full(restart_target_active, free);
}
cleanup_alloc_calculations(&data_set);
free(rsc_id);
return rc;
}
static inline int action_is_pending(action_t *action)
{
if(is_set(action->flags, pe_action_optional)) {
return FALSE;
} else if(is_set(action->flags, pe_action_runnable) == FALSE) {
return FALSE;
} else if(is_set(action->flags, pe_action_pseudo)) {
return FALSE;
} else if(safe_str_eq("notify", action->task)) {
return FALSE;
}
return TRUE;
}
/*!
* \internal
* \brief Return TRUE if any actions in a list are pending
*
* \param[in] actions List of actions to check
*
* \return TRUE if any actions in the list are pending, FALSE otherwise
*/
static bool
actions_are_pending(GListPtr actions)
{
GListPtr action;
for (action = actions; action != NULL; action = action->next) {
action_t *a = (action_t *)action->data;
if (action_is_pending(a)) {
crm_notice("Waiting for %s (flags=0x%.8x)", a->uuid, a->flags);
return TRUE;
}
}
return FALSE;
}
/*!
* \internal
* \brief Print pending actions to stderr
*
* \param[in] actions List of actions to check
*
* \return void
*/
static void
print_pending_actions(GListPtr actions)
{
GListPtr action;
fprintf(stderr, "Pending actions:\n");
for (action = actions; action != NULL; action = action->next) {
action_t *a = (action_t *) action->data;
if (action_is_pending(a)) {
fprintf(stderr, "\tAction %d: %s", a->id, a->uuid);
if (a->node) {
fprintf(stderr, "\ton %s", a->node->details->uname);
}
fprintf(stderr, "\n");
}
}
}
/* For --wait, timeout (in seconds) to use if caller doesn't specify one */
#define WAIT_DEFAULT_TIMEOUT_S (60 * 60)
/* For --wait, how long to sleep between cluster state checks */
#define WAIT_SLEEP_S (2)
/*!
* \internal
* \brief Wait until all pending cluster actions are complete
*
* This waits until either the CIB's transition graph is idle or a timeout is
* reached.
*
* \param[in] timeout_ms Consider failed if actions do not complete in this time
* (specified in milliseconds, but one-second granularity
* is actually used; if 0, a default will be used)
* \param[in] cib Connection to the CIB manager
*
* \return pcmk_ok on success, -errno on failure
*/
int
wait_till_stable(int timeout_ms, cib_t * cib)
{
pe_working_set_t data_set;
int rc = -1;
int timeout_s = timeout_ms? ((timeout_ms + 999) / 1000) : WAIT_DEFAULT_TIMEOUT_S;
time_t expire_time = time(NULL) + timeout_s;
time_t time_diff;
bool printed_version_warning = BE_QUIET; // i.e. don't print if quiet
set_working_set_defaults(&data_set);
do {
/* Abort if timeout is reached */
time_diff = expire_time - time(NULL);
if (time_diff > 0) {
crm_info("Waiting up to %ld seconds for cluster actions to complete", time_diff);
} else {
print_pending_actions(data_set.actions);
cleanup_alloc_calculations(&data_set);
return -ETIME;
}
if (rc == pcmk_ok) { /* this avoids sleep on first loop iteration */
sleep(WAIT_SLEEP_S);
}
/* Get latest transition graph */
cleanup_alloc_calculations(&data_set);
rc = update_working_set_from_cib(&data_set, cib);
if (rc != pcmk_ok) {
cleanup_alloc_calculations(&data_set);
return rc;
}
do_calculations(&data_set, data_set.input, NULL);
if (!printed_version_warning) {
/* If the DC has a different version than the local node, the two
* could come to different conclusions about what actions need to be
* done. Warn the user in this case.
*
* @TODO A possible long-term solution would be to reimplement the
* wait as a new controller operation that would be forwarded to the
* DC. However, that would have potential problems of its own.
*/
const char *dc_version = g_hash_table_lookup(data_set.config_hash,
"dc-version");
if (safe_str_neq(dc_version, PACEMAKER_VERSION "-" BUILD_VERSION)) {
printf("warning: --wait command may not work properly in mixed-version cluster\n");
printed_version_warning = TRUE;
}
}
} while (actions_are_pending(data_set.actions));
return pcmk_ok;
}
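/* A minimal caller sketch (hypothetical timeout value): wait up to 30
 * seconds for the cluster to settle, treating -ETIME as "actions were
 * still pending when the timeout expired":
 *
 *     rc = wait_till_stable(30000, cib);
 *     if (rc == -ETIME) {
 *         // pending actions have already been printed to stderr
 *     }
 */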
int
cli_resource_execute(resource_t *rsc, const char *requested_name,
const char *rsc_action, GHashTable *override_hash,
int timeout_ms, cib_t * cib, pe_working_set_t *data_set)
{
int rc = pcmk_ok;
svc_action_t *op = NULL;
const char *rid = NULL;
const char *rtype = NULL;
const char *rprov = NULL;
const char *rclass = NULL;
const char *action = NULL;
GHashTable *params = NULL;
if (safe_str_eq(rsc_action, "validate")) {
action = "validate-all";
} else if (safe_str_eq(rsc_action, "force-check")) {
action = "monitor";
} else if (safe_str_eq(rsc_action, "force-stop")) {
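/* Skip the "force-" prefix: rsc_action+6 turns "force-stop" into "stop" */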
action = rsc_action+6;
} else if (safe_str_eq(rsc_action, "force-start")
|| safe_str_eq(rsc_action, "force-demote")
|| safe_str_eq(rsc_action, "force-promote")) {
action = rsc_action+6;
if(pe_rsc_is_clone(rsc)) {
rc = cli_resource_search(rsc, requested_name, data_set);
if(rc > 0 && do_force == FALSE) {
CMD_ERR("It is not safe to %s %s here: the cluster claims it is already active",
action, rsc->id);
CMD_ERR("Try setting target-role=stopped first or specifying --force");
crm_exit(CRM_EX_UNSAFE);
}
}
}
if(pe_rsc_is_clone(rsc)) {
/* Grab the first child resource in the hope it's not a group */
rsc = rsc->children->data;
}
if(rsc->variant == pe_group) {
CMD_ERR("Sorry, --%s doesn't support group resources", rsc_action);
crm_exit(CRM_EX_UNIMPLEMENT_FEATURE);
}
rclass = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS);
rprov = crm_element_value(rsc->xml, XML_AGENT_ATTR_PROVIDER);
rtype = crm_element_value(rsc->xml, XML_ATTR_TYPE);
if (safe_str_eq(rclass, PCMK_RESOURCE_CLASS_STONITH)) {
CMD_ERR("Sorry, --%s doesn't support %s resources yet", rsc_action, rclass);
crm_exit(CRM_EX_UNIMPLEMENT_FEATURE);
}
params = generate_resource_params(rsc, data_set);
/* Add the CRM_meta_timeout environment variable needed by some resource agents */
if (timeout_ms == 0) {
timeout_ms = pe_get_configured_timeout(rsc, action, data_set);
}
g_hash_table_insert(params, strdup("CRM_meta_timeout"),
crm_strdup_printf("%d", timeout_ms));
/* add crm_feature_set env needed by some resource agents */
g_hash_table_insert(params, strdup(XML_ATTR_CRM_VERSION), strdup(CRM_FEATURE_SET));
rid = pe_rsc_is_anon_clone(rsc->parent)? requested_name : rsc->id;
op = resources_action_create(rid, rclass, rprov, rtype, action, 0,
timeout_ms, params, 0);
if (op == NULL) {
/* Re-run with stderr enabled so we can display a sane error message */
crm_enable_stderr(TRUE);
op = resources_action_create(rid, rclass, rprov, rtype, action, 0,
timeout_ms, params, 0);
/* We know op will be NULL, but this makes static analysis happy */
services_action_free(op);
return crm_exit(CRM_EX_DATAERR);
}
setenv("HA_debug", resource_verbose > 0 ? "1" : "0", 1);
if(resource_verbose > 1) {
setenv("OCF_TRACE_RA", "1", 1);
}
if (override_hash) {
GHashTableIter iter;
char *name = NULL;
char *value = NULL;
g_hash_table_iter_init(&iter, override_hash);
while (g_hash_table_iter_next(&iter, (gpointer *) & name, (gpointer *) & value)) {
printf("Overriding the cluster configuration for '%s' with '%s' = '%s'\n",
rsc->id, name, value);
g_hash_table_replace(op->params, strdup(name), strdup(value));
}
}
if (services_action_sync(op)) {
int more, lpc, last;
char *local_copy = NULL;
if (op->status == PCMK_LRM_OP_DONE) {
printf("Operation %s for %s (%s:%s:%s) returned: '%s' (%d)\n",
action, rsc->id, rclass, rprov ? rprov : "", rtype,
services_ocf_exitcode_str(op->rc), op->rc);
} else {
printf("Operation %s for %s (%s:%s:%s) failed: '%s' (%d)\n",
action, rsc->id, rclass, rprov ? rprov : "", rtype,
services_lrm_status_str(op->status), op->status);
}
/* hide output for validate-all if not in verbose */
if (resource_verbose == 0 && safe_str_eq(action, "validate-all"))
goto done;
if (op->stdout_data) {
local_copy = strdup(op->stdout_data);
more = strlen(local_copy);
last = 0;
for (lpc = 0; lpc < more; lpc++) {
if (local_copy[lpc] == '\n' || local_copy[lpc] == 0) {
local_copy[lpc] = 0;
printf(" > stdout: %s\n", local_copy + last);
last = lpc + 1;
}
}
free(local_copy);
}
if (op->stderr_data) {
local_copy = strdup(op->stderr_data);
more = strlen(local_copy);
last = 0;
for (lpc = 0; lpc < more; lpc++) {
if (local_copy[lpc] == '\n' || local_copy[lpc] == 0) {
local_copy[lpc] = 0;
printf(" > stderr: %s\n", local_copy + last);
last = lpc + 1;
}
}
free(local_copy);
}
}
done:
rc = op->rc;
services_action_free(op);
return rc;
}
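/* Example (hypothetical resource name): a command line such as
 *
 *     crm_resource --resource myrsc --force-start
 *
 * reaches this function, which runs the agent's start action directly on
 * the local node (bypassing the cluster) and prints the agent's output as
 * the "> stdout:" / "> stderr:" lines above.
 */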
int
cli_resource_move(resource_t *rsc, const char *rsc_id, const char *host_name,
cib_t *cib, pe_working_set_t *data_set)
{
int rc = pcmk_ok;
unsigned int count = 0;
node_t *current = NULL;
node_t *dest = pe_find_node(data_set->nodes, host_name);
bool cur_is_dest = FALSE;
if (dest == NULL) {
return -pcmk_err_node_unknown;
}
if (scope_master && is_not_set(rsc->flags, pe_rsc_promotable)) {
resource_t *p = uber_parent(rsc);
if (is_set(p->flags, pe_rsc_promotable)) {
CMD_ERR("Using parent '%s' for --move command instead of '%s'.", p->id, rsc_id);
rsc_id = p->id;
rsc = p;
} else {
CMD_ERR("Ignoring '--master' option: %s is not a promotable resource",
rsc_id);
scope_master = FALSE;
}
}
current = pe__find_active_requires(rsc, &count);
if (is_set(rsc->flags, pe_rsc_promotable)) {
GListPtr iter = NULL;
unsigned int master_count = 0;
pe_node_t *master_node = NULL;
for(iter = rsc->children; iter; iter = iter->next) {
resource_t *child = (resource_t *)iter->data;
enum rsc_role_e child_role = child->fns->state(child, TRUE);
if(child_role == RSC_ROLE_MASTER) {
rsc = child;
master_node = pe__current_node(child);
master_count++;
}
}
if (scope_master || master_count) {
count = master_count;
current = master_node;
}
}
if (count > 1) {
if (pe_rsc_is_clone(rsc)) {
current = NULL;
} else {
return -pcmk_err_multiple;
}
}
if (current && (current->details == dest->details)) {
cur_is_dest = TRUE;
if (do_force) {
crm_info("%s is already %s on %s, reinforcing placement with location constraint.",
rsc_id, scope_master?"promoted":"active", dest->details->uname);
} else {
return -pcmk_err_already;
}
}
/* Clear any previous constraints for 'dest' */
cli_resource_clear(rsc_id, dest->details->uname, data_set->nodes, cib);
/* Record an explicit preference for 'dest' */
rc = cli_resource_prefer(rsc_id, dest->details->uname, cib);
crm_trace("%s%s now prefers node %s%s",
rsc->id, scope_master?" (master)":"", dest->details->uname, do_force?"(forced)":"");
/* Only ban the previous location if the current location differs from the
 * destination. It is possible to use -M to enforce a location without
 * regard to where the resource is currently running. */
if(do_force && (cur_is_dest == FALSE)) {
/* Ban the original location if possible */
if(current) {
(void)cli_resource_ban(rsc_id, current->details->uname, NULL, cib);
} else if(count > 1) {
CMD_ERR("Resource '%s' is currently %s in %d locations. One may now move to %s",
rsc_id, scope_master?"promoted":"active", count, dest->details->uname);
CMD_ERR("You can prevent '%s' from being %s at a specific location with:"
" --ban %s--host <name>", rsc_id, scope_master?"promoted":"active", scope_master?"--master ":"");
} else {
crm_trace("Not banning %s from its current location: not active", rsc_id);
}
}
return rc;
}
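/* Note: the move above is implemented purely with location constraints:
 * any previous constraint for the destination is cleared, an explicit
 * preference for it is recorded, and (with --force, when the current node
 * is known and differs from the destination) the current node is banned
 * so the resource must leave it.
 */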
static void
cli_resource_why_without_rsc_and_host(cib_t *cib_conn, GListPtr resources)
{
GListPtr lpc = NULL;
GListPtr hosts = NULL;
for (lpc = resources; lpc != NULL; lpc = lpc->next) {
resource_t *rsc = (resource_t *) lpc->data;
rsc->fns->location(rsc, &hosts, TRUE);
if (hosts == NULL) {
printf("Resource %s is not running\n", rsc->id);
} else {
printf("Resource %s is running\n", rsc->id);
}
cli_resource_check(cib_conn, rsc);
g_list_free(hosts);
hosts = NULL;
}
}
static void
cli_resource_why_with_rsc_and_host(cib_t *cib_conn, GListPtr resources,
resource_t *rsc, const char *host_uname)
{
if (resource_is_running_on(rsc, host_uname)) {
printf("Resource %s is running on host %s\n", rsc->id, host_uname);
} else {
printf("Resource %s is not running on host %s\n", rsc->id, host_uname);
}
cli_resource_check(cib_conn, rsc);
}
static void
cli_resource_why_without_rsc_with_host(cib_t *cib_conn, GListPtr resources, node_t *node)
{
const char* host_uname = node->details->uname;
GListPtr allResources = node->details->allocated_rsc;
GListPtr activeResources = node->details->running_rsc;
GListPtr inactiveResources = subtract_lists(allResources, activeResources);
GListPtr lpc = NULL;
for (lpc = activeResources; lpc != NULL; lpc = lpc->next) {
resource_t *rsc = (resource_t *) lpc->data;
printf("Resource %s is running on host %s\n", rsc->id, host_uname);
cli_resource_check(cib_conn, rsc);
}
for (lpc = inactiveResources; lpc != NULL; lpc = lpc->next) {
resource_t *rsc = (resource_t *) lpc->data;
printf("Resource %s is assigned to host %s but not running\n",
rsc->id, host_uname);
cli_resource_check(cib_conn, rsc);
}
g_list_free(allResources);
g_list_free(activeResources);
g_list_free(inactiveResources);
}
static void
cli_resource_why_with_rsc_without_host(cib_t *cib_conn, GListPtr resources,
resource_t *rsc)
{
GListPtr hosts = NULL;
rsc->fns->location(rsc, &hosts, TRUE);
printf("Resource %s is %srunning\n", rsc->id, (hosts? "" : "not "));
cli_resource_check(cib_conn, rsc);
g_list_free(hosts);
}
void cli_resource_why(cib_t *cib_conn, GListPtr resources, resource_t *rsc,
node_t *node)
{
const char *host_uname = (node == NULL)? NULL : node->details->uname;
if ((rsc == NULL) && (host_uname == NULL)) {
cli_resource_why_without_rsc_and_host(cib_conn, resources);
} else if ((rsc != NULL) && (host_uname != NULL)) {
cli_resource_why_with_rsc_and_host(cib_conn, resources, rsc,
host_uname);
} else if ((rsc == NULL) && (host_uname != NULL)) {
cli_resource_why_without_rsc_with_host(cib_conn, resources, node);
} else if ((rsc != NULL) && (host_uname == NULL)) {
cli_resource_why_with_rsc_without_host(cib_conn, resources, rsc);
}
}
diff --git a/tools/crm_standby.in b/tools/crm_standby.in
index 220c1e7f94..1c6dea4655 100755
--- a/tools/crm_standby.in
+++ b/tools/crm_standby.in
@@ -1,150 +1,156 @@
#!@BASH_PATH@
+#
+# Copyright 2009-2018 Andrew Beekhof <andrew@beekhof.net>
+#
+# This source code is licensed under the GNU General Public License version 2
+# or later (GPLv2+) WITHOUT ANY WARRANTY.
+#
USAGE_TEXT="Usage: crm_standby <command> [options]
Common options:
--help Display this text, then exit
--version Display version information, then exit
-V, --verbose Specify multiple times to increase debug output
-q, --quiet Print only the standby status (if querying)
Commands:
-G, --query Query the current value of standby mode (on/off)
-v, --update=VALUE Update the value of standby mode (on/off)
-D, --delete Let standby mode use default value
Additional Options:
-N, --node=NODE Operate on the named node instead of the current one
-l, --lifetime=VALUE Until when should the setting take effect
(valid values: reboot, forever)
-i, --id=VALUE (Advanced) XML ID used to identify standby attribute"
HELP_TEXT="crm_standby - Query, enable, or disable standby mode for a node
Nodes in standby mode may not host cluster resources.
$USAGE_TEXT
"
exit_usage() {
if [ $# -gt 0 ]; then
- echo "error: $@" >&2
+ echo "error:" "$@" >&2
fi
echo
echo "$USAGE_TEXT"
exit 1
}
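# Example invocations (hypothetical node name):
#   crm_standby -N node1 -v on   # put node1 into standby mode
#   crm_standby -N node1 -G      # query node1's standby status
#   crm_standby -N node1 -D      # revert standby mode to its default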
op=""
options=""
lifetime=0
target=""
SHORTOPTS_DEPRECATED="U:Q"
LONGOPTS_DEPRECATED="uname:,get-value,delete-attr,attr-value:,attr-id:"
SHORTOPTS="VqGv:DN:l:i:"
LONGOPTS="help,version,verbose,quiet,query,update:,delete,node:,lifetime:,id:"
TEMP=$(@GETOPT_PATH@ -o ${SHORTOPTS}${SHORTOPTS_DEPRECATED} \
--long ${LONGOPTS},${LONGOPTS_DEPRECATED} \
-n crm_standby -- "$@")
if [ $? -ne 0 ]; then
exit_usage
fi
eval set -- "$TEMP" # Quotes around $TEMP are essential
while true ; do
case "$1" in
--help)
echo "$HELP_TEXT"
exit 0
;;
--version)
crm_attribute --version
exit 0
;;
-q|--quiet|-V|--verbose|-Q)
options="$options $1"
shift
;;
-N|--node|-U|--uname)
target="$2"
shift
shift
;;
-G|--query|--get-value)
options="$options --query"
op=g
shift
;;
-v|--update|--attr-value)
options="$options --update $2"
op=u
shift
shift
;;
-D|--delete|--delete-attr)
options="$options --delete"
op=d
shift
;;
-l|--lifetime)
options="$options --lifetime $2"
lifetime=1
shift
shift
;;
-i|--id|--attr-id)
options="$options --id $2"
shift
shift
;;
--)
shift
break
;;
*)
exit_usage "unknown option '$1'"
;;
esac
done
# It's important to call cluster commands only after arguments are processed,
# so --version and --help work without problems even if those commands don't.
if [ "$target" = "" ]; then
target=$(crm_node -n)
fi
options="-N $target -n standby $options"
if [ x$op = x ]; then
options="$options -G"; op=g
fi
# If the user didn't explicitly specify a lifetime ...
if [ $lifetime -eq 0 ]; then
case $op in
g)
# For query, report the forever entry if one exists, otherwise
# report the reboot entry if one exists, otherwise report off.
crm_attribute $options -l forever >/dev/null 2>&1
if [ $? -eq 0 ]; then
options="$options -l forever"
else
options="$options -l reboot -d off"
fi
;;
u)
# For update, default to updating the forever entry.
options="$options -l forever"
;;
d)
# For delete, default to deleting both forever and reboot entries.
crm_attribute $options -l forever
crm_attribute $options -l reboot
exit 0
;;
esac
fi
crm_attribute $options
diff --git a/tools/report.collector.in b/tools/report.collector.in
index 2540fc7769..25e3c6a7b2 100644
--- a/tools/report.collector.in
+++ b/tools/report.collector.in
@@ -1,823 +1,820 @@
#
# Originally based on hb_report
# Copyright 2007 Dejan Muhamedagic <dmuhamedagic@suse.de>
#
# Later changes copyright 2010-2018 Andrew Beekhof <andrew@beekhof.net>
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
if
echo $REPORT_HOME | grep -qs '^/'
then
debug "Using full path to working directory: $REPORT_HOME"
else
REPORT_HOME="$HOME/$REPORT_HOME"
debug "Canonicalizing working directory path: $REPORT_HOME"
fi
detect_host
#
# find files newer than a and older than b
#
isnumber() {
echo "$*" | grep -qs '^[0-9][0-9]*$'
}
touchfile() {
t=`mktemp` &&
perl -e "\$file=\"$t\"; \$tm=$1;" -e 'utime $tm, $tm, $file;' &&
echo $t
}
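# touchfile creates a temporary file whose mtime is set to the given epoch
# time, so find_files can hand it to find(1) as a "-newer" reference point.
# Example (hypothetical timestamp): stamp=`touchfile 1514764800`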
find_files_clean() {
[ -z "$from_stamp" ] || rm -f "$from_stamp"
[ -z "$to_stamp" ] || rm -f "$to_stamp"
from_stamp=""
to_stamp=""
}
find_files() {
dirs=
from_time=$2
to_time=$3
for d in $1; do
if [ -d $d ]; then
dirs="$dirs $d"
fi
done
if [ x"$dirs" = x ]; then
return
fi
isnumber "$from_time" && [ "$from_time" -gt 0 ] || {
warning "sorry, can't find files in [ $1 ] based on time without a valid start time"
return
}
trap find_files_clean 0
if ! from_stamp=`touchfile $from_time`; then
warning "sorry, can't create temporary file for find_files"
return
fi
findexp="-newer $from_stamp"
if isnumber "$to_time" && [ "$to_time" -gt 0 ]; then
if ! to_stamp=`touchfile $to_time`; then
warning "sorry, can't create temporary file for find_files"
find_files_clean
return
fi
findexp="$findexp ! -newer $to_stamp"
fi
find $dirs -type f $findexp
find_files_clean
trap "" 0
}
#
# check permissions of files/dirs
#
pl_checkperms() {
perl -e '
# check permissions and ownership
# uid and gid are numeric
# everything must match exactly
# no error checking! (file should exist, etc)
($filename, $perms, $in_uid, $in_gid) = @ARGV;
($mode,$uid,$gid) = (stat($filename))[2,4,5];
$p=sprintf("%04o", $mode & 07777);
$p ne $perms and exit(1);
$uid ne $in_uid and exit(1);
$gid ne $in_gid and exit(1);
' $*
}
num_id() {
getent $1 $2 | awk -F: '{print $3}'
}
chk_id() {
[ "$2" ] && return 0
echo "$1: id not found"
return 1
}
check_perms() {
while read type f p uid gid; do
if [ ! -e "$f" ]; then
echo "$f doesn't exist"
continue
elif [ ! -$type "$f" ]; then
echo "$f has wrong type"
continue
fi
n_uid=`num_id passwd $uid`
chk_id "$uid" "$n_uid" || continue
n_gid=`num_id group $gid`
chk_id "$gid" "$n_gid" || continue
pl_checkperms $f $p $n_uid $n_gid || {
echo "wrong permissions or ownership for $f:"
ls -ld $f
}
done
}
#
# coredumps
#
findbinary() {
random_binary=`which cat 2>/dev/null` # suppose we are lucky
binary=`gdb $random_binary $1 < /dev/null 2>/dev/null |
grep 'Core was generated' | awk '{print $5}' |
sed "s/^.//;s/[.':]*$//"`
if [ x = x"$binary" ]; then
debug "Could not detect the program name for core $1 from the gdb output; will try with file(1)"
binary=$(file $1 | awk '/from/{
for( i=1; i<=NF; i++ )
if( $i == "from" ) {
print $(i+1)
break
}
}')
binary=`echo $binary | tr -d "'"`
binary=$(echo $binary | tr -d '`')
if [ "$binary" ]; then
binary=`which $binary 2>/dev/null`
fi
fi
if [ x = x"$binary" ]; then
warning "Could not find the program path for core $1"
return
fi
fullpath=`which $binary 2>/dev/null`
if [ x = x"$fullpath" ]; then
if [ -x $CRM_DAEMON_DIR/$binary ]; then
echo $CRM_DAEMON_DIR/$binary
debug "Found the program at $CRM_DAEMON_DIR/$binary for core $1"
else
warning "Could not find the program path for core $1"
fi
else
echo $fullpath
debug "Found the program at $fullpath for core $1"
fi
}
getbt() {
which gdb > /dev/null 2>&1 || {
warning "Please install gdb to get backtraces"
return
}
for corefile; do
absbinpath=`findbinary $corefile`
[ x = x"$absbinpath" ] && continue
echo "====================== start backtrace ======================"
ls -l $corefile
# Summary first...
gdb -batch -n -quiet -ex ${BT_OPTS:-"thread apply all bt"} -ex quit \
$absbinpath $corefile 2>/dev/null
echo "====================== start detail ======================"
# Now the unreadable details...
gdb -batch -n -quiet -ex ${BT_OPTS:-"thread apply all bt full"} -ex quit \
$absbinpath $corefile 2>/dev/null
echo "======================= end backtrace ======================="
done
}
dump_status_and_config() {
crm_mon -1 2>&1 | grep -v '^Last upd' > $target/$CRM_MON_F
cibadmin -Ql 2>/dev/null > $target/${CIB_F}.live
}
getconfig() {
cluster=$1; shift;
target=$1; shift;
for cf in $*; do
if [ -e "$cf" ]; then
cp -a "$cf" $target/
fi
done
if is_running pacemaker-controld; then
dump_status_and_config
- case $cluster in
- corosync) crm_node -p --corosync > $target/$MEMBERSHIP_F 2>&1;;
- *) crm_node -p > $target/$MEMBERSHIP_F 2>&1;;
- esac
+ crm_node -p > "$target/$MEMBERSHIP_F" 2>&1
echo "$host" > $target/RUNNING
elif is_running pacemaker-remoted; then
dump_status_and_config
echo "$host" > $target/RUNNING
# Pre-2.0.0 daemon name in case we're collecting on a mixed-version cluster
elif is_running pacemaker_remoted; then
dump_status_and_config
echo "$host" > $target/RUNNING
else
echo "$host" > $target/STOPPED
fi
}
get_readable_cib() {
target="$1"; shift;
if [ -f "$target/$CIB_F" ]; then
crm_verify -V -x "$target/$CIB_F" >"$target/$CRM_VERIFY_F" 2>&1
if which crm >/dev/null 2>&1 ; then
CIB_file="$target/$CIB_F" crm configure show >"$target/$CIB_TXT_F" 2>&1
elif which pcs >/dev/null 2>&1 ; then
pcs config -f "$target/$CIB_F" >"$target/$CIB_TXT_F" 2>&1
fi
fi
}
#
# remove values of sensitive attributes
#
# this is not proper xml parsing, but it will work under the
# circumstances
sanitize_xml_attrs() {
sed $(
for patt in $SANITIZE; do
echo "-e /name=\"$patt\"/s/value=\"[^\"]*\"/value=\"****\"/"
done
)
}
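# For example, a pattern such as "passw.*" in $SANITIZE generates the sed
# expression:
#   -e /name="passw.*"/s/value="[^"]*"/value="****"/
# which masks the value of any matching nvpair while leaving its name intact.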
sanitize_hacf() {
awk '
$1=="stonith_host"{ for( i=5; i<=NF; i++ ) $i="****"; }
{print}
'
}
sanitize_one_clean() {
[ -z "$tmp" ] || rm -f "$tmp"
tmp=""
[ -z "$ref" ] || rm -f "$ref"
ref=""
}
sanitize() {
file=$1
compress=""
if [ -z "$SANITIZE" ]; then
return
fi
echo $file | grep -qs 'gz$' && compress=gzip
echo $file | grep -qs 'bz2$' && compress=bzip2
if [ "$compress" ]; then
decompress="$compress -dc"
else
compress=cat
decompress=cat
fi
trap sanitize_one_clean 0
tmp=`mktemp`
ref=`mktemp`
if [ -z "$tmp" -o -z "$ref" ]; then
sanitize_one_clean
fatal "cannot create temporary files"
fi
touch -r $file $ref # save the mtime
if [ "`basename $file`" = ha.cf ]; then
sanitize_hacf
else
$decompress | sanitize_xml_attrs | $compress
fi < $file > $tmp
mv $tmp $file
# note: cleaning $tmp up is still needed even after it's renamed
# because its temp directory is still there.
touch -r $ref $file
sanitize_one_clean
trap "" 0
}
#
# get some system info
#
distro() {
if
which lsb_release >/dev/null 2>&1
then
lsb_release -d | sed -e 's/^Description:\s*//'
debug "Using lsb_release for distribution info"
return
fi
relf=`ls /etc/debian_version 2>/dev/null` ||
relf=`ls /etc/slackware-version 2>/dev/null` ||
relf=`ls -d /etc/*-release 2>/dev/null` && {
for f in $relf; do
test -f $f && {
echo "`ls $f` `cat $f`"
debug "Found `echo $relf | tr '\n' ' '` distribution release file(s)"
return
}
done
}
warning "No lsb_release, no /etc/*-release, no /etc/debian_version: no distro information"
}
pkg_ver() {
if which dpkg >/dev/null 2>&1 ; then
pkg_mgr="deb"
elif which rpm >/dev/null 2>&1 ; then
pkg_mgr="rpm"
elif which pkg_info >/dev/null 2>&1 ; then
pkg_mgr="pkg_info"
elif which pkginfo >/dev/null 2>&1 ; then
pkg_mgr="pkginfo"
else
warning "Unknown package manager"
return
fi
debug "The package manager is: $pkg_mgr"
echo "The package manager is: $pkg_mgr"
echo "Installed packages:"
case $pkg_mgr in
deb)
dpkg-query -f '${Package} ${Version} ${Architecture}\n' -W | sort
echo
for pkg in $*; do
if dpkg-query -W $pkg 2>/dev/null ; then
debug "Verifying installation of: $pkg"
echo "Verifying installation of: $pkg"
debsums -s $pkg 2>/dev/null
fi
done
;;
rpm)
rpm -qa --qf '%{name} %{version}-%{release} - %{distribution} %{arch}\n' | sort
echo
for pkg in $*; do
if rpm -q $pkg >/dev/null 2>&1 ; then
debug "Verifying installation of: $pkg"
echo "Verifying installation of: $pkg"
rpm --verify $pkg 2>&1
fi
done
;;
pkg_info)
pkg_info
;;
pkginfo)
pkginfo | awk '{print $3}' # format?
;;
esac
}
getbacktraces() {
debug "Looking for backtraces: $*"
flist=$(
for f in `find_files "$CRM_CORE_DIRS" $1 $2`; do
bf=`basename $f`
test `expr match $bf core` -gt 0 &&
echo $f
done)
if [ "$flist" ]; then
for core in $flist; do
log "Found core file: `ls -al $core`"
done
# Make a copy of them in case we need more data later
# Luckily they compress well
mkdir cores >/dev/null 2>&1
cp -a $flist cores/
shrink cores
rm -rf cores
# Now get as much as we can from them automagically
for f in $flist; do
getbt $f
done
fi
}
getpeinputs() {
if [ -n "$PE_STATE_DIR" ]; then
flist=$(
find_files "$PE_STATE_DIR" "$1" "$2" | sed "s,`dirname $PE_STATE_DIR`/,,g"
)
if [ "$flist" ]; then
(cd $(dirname "$PE_STATE_DIR") && tar cf - $flist) | (cd "$3" && tar xf -)
debug "found `echo $flist | wc -w` scheduler input files in $PE_STATE_DIR"
fi
fi
}
getblackboxes() {
flist=$(
find_files $BLACKBOX_DIR $1 $2
)
for bb in $flist; do
bb_short=`basename $bb`
qb-blackbox $bb > $3/${bb_short}.blackbox 2>&1
info "Extracting contents of blackbox: $bb_short"
done
}
#
# some basic system info and stats
#
sys_info() {
cluster=$1; shift
echo "Platform: `uname`"
echo "Kernel release: `uname -r`"
echo "Architecture: `uname -m`"
if [ `uname` = Linux ]; then
echo "Distribution: `distro`"
fi
echo
cibadmin --version 2>&1 | head -1
cibadmin -! 2>&1
case $cluster in
corosync)
/usr/sbin/corosync -v 2>&1 | head -1
;;
esac
# Cluster glue version hash (if available)
stonith -V 2>/dev/null
# Resource agents version hash
echo "resource-agents: `grep 'Build version:' /usr/lib/ocf/resource.d/heartbeat/.ocf-shellfuncs`"
echo
pkg_ver $*
}
sys_stats() {
set -x
uname -n
uptime
ps axf
ps auxw
top -b -n 1
ifconfig -a
ip addr list
netstat -i
arp -an
test -d /proc && {
cat /proc/cpuinfo
}
lsscsi
lspci
mount
df
set +x
}
dlm_dump() {
if which dlm_tool >/dev/null 2>&1 ; then
if is_running dlm_controld; then
echo "--- Lockspace overview:"
dlm_tool ls -n
echo "--- Lockspace history:"
dlm_tool dump
echo "--- Lockspace status:"
dlm_tool status
dlm_tool status -v
echo "--- Lockspace config:"
dlm_tool dump_config
dlm_tool log_plock
dlm_tool ls | grep name |
while read X N ; do
echo "--- Lockspace $N:"
dlm_tool lockdump "$N"
dlm_tool lockdebug -svw "$N"
done
fi
fi
}
drbd_info() {
test -f /proc/drbd && {
echo "--- /proc/drbd:"
cat /proc/drbd 2>&1
echo
}
if which drbd-overview >/dev/null 2>&1; then
echo "--- drbd-overview:"
drbd-overview 2>&1
echo
fi
if which drbdsetup >/dev/null 2>&1; then
echo "--- drbdsetup status:"
drbdsetup status --verbose --statistics 2>&1
echo
echo "--- drbdsetup events2:"
drbdsetup events2 --timestamps --statistics --now 2>&1
echo
fi
if which drbdadm >/dev/null 2>&1; then
echo "--- drbdadm show-gi:"
for res in $(drbdsetup status | grep -e ^\\S | awk '{ print $1 }'); do
echo "$res:"
drbdadm show-gi $res 2>&1
echo
done
fi
}
iscfvarset() {
test "`getcfvar $1 $2`"
}
iscfvartrue() {
getcfvar $1 $2 $3 | egrep -qsi "^(true|y|yes|on|1)"
}
get_logfiles() {
cf_type=$1
cf_file="$2"
facility_var="logfacility"
case $cf_type in
corosync)
if [ -f "$cf_file" ]; then
debug "Reading $cf_type log settings from $cf_file"
if iscfvartrue $cf_type to_syslog "$cf_file"; then
facility_var=syslog_facility
fi
if iscfvartrue $cf_type to_logfile "$cf_file"; then
logfile=$(getcfvar $cf_type logfile "$cf_file")
fi
fi
;;
esac
if [ -z "$logfile" ]; then
# @TODO Use PCMK_logfile if set
logfile="@CRM_LOG_DIR@/pacemaker.log"
debug "Log settings not found for cluster type $cf_type, assuming $logfile"
fi
if [ -f "$logfile" ]; then
echo $logfile
fi
if [ "x$facility" = x ]; then
facility=`getcfvar $cf_type $facility_var $cf_file`
[ "" = "$facility" ] && facility="daemon"
fi
# Always include system logs (if we can find them)
msg="Mark:pcmk:`perl -e 'print time()'`"
logger -p $facility.info $msg >/dev/null 2>&1
sleep 2 # Give syslog time to catch up in case it's busy
findmsg 1 "$msg"
# Look for detail logs:
# - initial pacemakerd logs and tracing might go to a different file
pattern="Starting Pacemaker"
# - make sure we get something from the scheduler
pattern="$pattern\\|Calculated Transition"
# - cib and pacemaker-execd updates
# (helpful on non-DC nodes and when cluster has been up for a long time)
pattern="$pattern\\|cib_perform_op\\|process_lrm_event"
# - pacemaker_remote might use a different file
pattern="$pattern\\|pacemaker[-_]remoted:"
findmsg 3 "$pattern"
}
essential_files() {
cat<<EOF
d $PE_STATE_DIR 0750 hacluster haclient
d $CRM_CONFIG_DIR 0750 hacluster haclient
d $CRM_STATE_DIR 0750 hacluster haclient
EOF
}
# Trim leading and ending whitespace (using only POSIX expressions)
trim() {
TRIM_S="$1"
TRIM_S="${TRIM_S#"${TRIM_S%%[![:space:]]*}"}"
TRIM_S="${TRIM_S%"${TRIM_S##*[![:space:]]}"}"
echo -n "$TRIM_S"
}
collect_logs() {
CL_START="$1"
shift
CL_END="$1"
shift
CL_LOGFILES="$@"
which journalctl > /dev/null 2>&1
if [ $? -eq 0 ]; then
cl_have_journald=1
else
cl_have_journald=0
fi
cl_lognames="$CL_LOGFILES"
if [ $cl_have_journald -eq 1 ]; then
cl_lognames="$cl_lognames journalctl"
fi
cl_lognames=$(trim "$cl_lognames")
if [ -z "$cl_lognames" ]; then
return
fi
# YYYY-MM-DD HH:MM:SS
cl_start_ymd=$(date -d @${CL_START} +"%F %T")
cl_end_ymd=$(date -d @${CL_END} +"%F %T")
debug "Gathering logs from $cl_start_ymd to $cl_end_ymd:"
debug " $cl_lognames"
# Remove our temporary file if we get interrupted here
trap '[ -z "$cl_pattfile" ] || rm -f "$cl_pattfile"' 0
# Create a temporary file with patterns to grep for
cl_pattfile=$(mktemp) || fatal "cannot create temporary files"
for cl_pattern in $LOG_PATTERNS; do
echo "$cl_pattern"
done > $cl_pattfile
echo "Log pattern matches from $REPORT_TARGET:" > $ANALYSIS_F
if [ -n "$CL_LOGFILES" ]; then
for cl_logfile in $CL_LOGFILES; do
cl_extract="$(basename $cl_logfile).extract.txt"
if [ ! -f "$cl_logfile" ]; then
# Not a file
continue
elif [ -f "$cl_extract" ]; then
# We already have it
continue
fi
dumplogset "$cl_logfile" $LOG_START $LOG_END > "$cl_extract"
sanitize "$cl_extract"
grep -f "$cl_pattfile" "$cl_extract" >> $ANALYSIS_F
done
fi
# Collect systemd logs if present
if [ $cl_have_journald -eq 1 ]; then
journalctl --since "$cl_start_ymd" --until "$cl_end_ymd" > journal.log
grep -f "$cl_pattfile" journal.log >> $ANALYSIS_F
fi
rm -f $cl_pattfile
trap "" 0
}
debug "Initializing $REPORT_TARGET subdir"
if [ "$REPORT_MASTER" != "$REPORT_TARGET" ]; then
if [ -e $REPORT_HOME/$REPORT_TARGET ]; then
warning "Directory $REPORT_HOME/$REPORT_TARGET already exists, using /tmp/$$/$REPORT_TARGET instead"
REPORT_HOME=/tmp/$$
fi
fi
mkdir -p $REPORT_HOME/$REPORT_TARGET
cd $REPORT_HOME/$REPORT_TARGET
case $CLUSTER in
any) cluster=`get_cluster_type`;;
*) cluster=$CLUSTER;;
esac
cluster_cf=`find_cluster_cf $cluster`
# If cluster stack is still "any", this might be a Pacemaker Remote node,
# so don't complain in that case.
if [ -z "$cluster_cf" ] && [ $cluster != "any" ]; then
warning "Could not determine the location of your cluster configuration"
fi
if [ "$SEARCH_LOGS" = "1" ]; then
logfiles=$(get_logfiles "$cluster" "$cluster_cf" | sort -u)
fi
logfiles="$(trim "$logfiles $EXTRA_LOGS")"
if [ -z "$logfiles" ]; then
which journalctl > /dev/null 2>&1
if [ $? -eq 0 ]; then
info "Systemd journal will be the only log collected"
else
info "No logs will be collected"
fi
info "No log files found or specified with --logfile /some/path"
fi
debug "Config: $cluster ($cluster_cf) $logfiles"
sys_info $cluster $PACKAGES > $SYSINFO_F
essential_files $cluster | check_perms > $PERMISSIONS_F 2>&1
getconfig $cluster "$REPORT_HOME/$REPORT_TARGET" "$cluster_cf" "$CRM_CONFIG_DIR/$CIB_F" "/etc/drbd.conf" "/etc/drbd.d" "/etc/booth"
getpeinputs $LOG_START $LOG_END $REPORT_HOME/$REPORT_TARGET
getbacktraces $LOG_START $LOG_END > $REPORT_HOME/$REPORT_TARGET/$BT_F
getblackboxes $LOG_START $LOG_END $REPORT_HOME/$REPORT_TARGET
case $cluster in
corosync)
if is_running corosync; then
corosync-blackbox >corosync-blackbox-live.txt 2>&1
# corosync-fplay > corosync-blackbox.txt
tool=`pickfirst corosync-objctl corosync-cmapctl`
case $tool in
*objctl) $tool -a > corosync.dump 2>/dev/null;;
*cmapctl) $tool > corosync.dump 2>/dev/null;;
esac
corosync-quorumtool -s -i > corosync.quorum 2>&1
fi
;;
esac
dc=`crm_mon -1 2>/dev/null | awk '/Current DC/ {print $3}'`
if [ "$REPORT_TARGET" = "$dc" ]; then
echo "$REPORT_TARGET" > DC
fi
dlm_dump > $DLM_DUMP_F 2>&1
sys_stats > $SYSSTATS_F 2>&1
drbd_info > $DRBD_INFO_F 2>&1
debug "Sanitizing files: $SANITIZE"
#
# replace sensitive info with '****'
#
cf=""
if [ ! -z "$cluster_cf" ]; then
cf=`basename $cluster_cf`
fi
for f in "$cf" "$CIB_F" "$CIB_F.live" pengine/*; do
if [ -f "$f" ]; then
sanitize "$f"
fi
done
# For convenience, generate human-readable version of CIB and any XML errors
# in it (AFTER sanitizing, so we don't need to sanitize this output)
get_readable_cib "$REPORT_HOME/$REPORT_TARGET"
collect_logs "$LOG_START" "$LOG_END" $logfiles
# Purge files containing no information
for f in `ls -1`; do
if [ -d "$f" ]; then
continue
elif [ ! -s "$f" ]; then
case $f in
*core*) log "Detected empty core file: $f";;
*) debug "Removing empty file: `ls -al $f`"
rm -f $f
;;
esac
fi
done
# Parse for events
for l in $logfiles; do
b="$(basename $l).extract.txt"
node_events "$b" > $EVENTS_F
# Link the first logfile to a standard name if it doesn't yet exist
if [ -e "$b" -a ! -e "$HALOG_F" ]; then
ln -s "$b" "$HALOG_F"
fi
done
if [ -e $REPORT_HOME/.env ]; then
debug "Localhost: $REPORT_MASTER $REPORT_TARGET"
elif [ "$REPORT_MASTER" != "$REPORT_TARGET" ]; then
debug "Streaming report back to $REPORT_MASTER"
(cd $REPORT_HOME && tar cf - $REPORT_TARGET)
if [ "$REMOVE" = "1" ]; then
cd
rm -rf $REPORT_HOME
fi
fi
# vim: set expandtab tabstop=8 softtabstop=4 shiftwidth=4 textwidth=80:
diff --git a/tools/report.common.in b/tools/report.common.in
index 9c4113fc30..39e59360af 100644
--- a/tools/report.common.in
+++ b/tools/report.common.in
@@ -1,866 +1,866 @@
#
# Originally based on hb_report
# Copyright 2007 Dejan Muhamedagic <dmuhamedagic@suse.de>
#
# Later changes copyright 2010-2018 Andrew Beekhof <andrew@beekhof.net>
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
host=`uname -n`
shorthost=`echo $host | sed s:\\\\..*::`
if [ -z "$verbose" ]; then
verbose=0
fi
# Target Files
EVENTS_F=events.txt
ANALYSIS_F=analysis.txt
HALOG_F=cluster-log.txt
BT_F=backtraces.txt
SYSINFO_F=sysinfo.txt
SYSSTATS_F=sysstats.txt
DLM_DUMP_F=dlm_dump.txt
CRM_MON_F=crm_mon.txt
MEMBERSHIP_F=members.txt
CRM_VERIFY_F=crm_verify.txt
PERMISSIONS_F=permissions.txt
CIB_F=cib.xml
CIB_TXT_F=cib.txt
DRBD_INFO_F=drbd_info.txt
EVENT_PATTERNS="
state do_state_transition
membership pcmk_peer_update.*(lost|memb):
quorum (crmd|pacemaker-controld).*crm_update_quorum
pause Process.pause.detected
resources (lrmd|pacemaker-execd).*rsc:(start|stop)
stonith te_fence_node|fenced.*(requests|(Succeeded|Failed).to.|result=)
start_stop shutdown.decision|Corosync.Cluster.Engine|corosync.*Initializing.transport|Executive.Service.RELEASE|crm_shutdown:.Requesting.shutdown|pcmk_shutdown:.Shutdown.complete
"
# superset of all packages of interest on all distros
# (the package manager will be used to validate the installation
# of any of these packages that are installed)
PACKAGES="pacemaker pacemaker-libs pacemaker-cluster-libs libpacemaker3
pacemaker-remote pacemaker-pygui pacemaker-pymgmt pymgmt-client
corosync corosynclib libcorosync4
resource-agents cluster-glue-libs cluster-glue libglue2 ldirectord
ocfs2-tools ocfs2-tools-o2cb ocfs2console
ocfs2-kmp-default ocfs2-kmp-pae ocfs2-kmp-xen ocfs2-kmp-debug ocfs2-kmp-trace
drbd drbd-kmp-xen drbd-kmp-pae drbd-kmp-default drbd-kmp-debug drbd-kmp-trace
drbd-pacemaker drbd-utils drbd-bash-completion drbd-xen
lvm2 lvm2-clvm cmirrord
libdlm libdlm2 libdlm3
hawk ruby lighttpd
kernel-default kernel-pae kernel-xen
glibc
"
# Potential locations of system log files
SYSLOGS="
/var/log/*
/var/logs/*
/var/syslog/*
/var/adm/*
/var/log/ha/*
/var/log/cluster/*
"
# Whether pacemaker-remoted was found (0 = yes, 1 = no, -1 = haven't looked yet)
REMOTED_STATUS=-1
#
# keep the user posted
#
record() {
if [ x != x"$REPORT_HOME" -a -d "${REPORT_HOME}/$shorthost" ]; then
rec="${REPORT_HOME}/$shorthost/report.out"
elif [ x != x"${l_base}" -a -d "${l_base}" ]; then
rec="${l_base}/report.summary"
else
rec="/dev/null"
fi
printf "%-10s $*\n" "$shorthost:" >> "${rec}" 2>&1
}
log() {
printf "%-10s $*\n" "$shorthost:" 1>&2
record "$*"
}
debug() {
if [ $verbose -gt 0 ]; then
log "Debug: $*"
else
record "Debug: $*"
fi
}
info() {
log "$*"
}
warning() {
log "WARN: $*"
}
fatal() {
log "ERROR: $*"
exit 1
}
# Check whether a process whose name contains the given substring exists.
# Only look at processes owned by user 0 (by UID), "@CRM_DAEMON_USER@",
# or the effective user running this script, and/or by group 0 (by GID),
# "@CRM_DAEMON_GROUP@", or one of the groups the effective user belongs to
# (there is no reason to probe any other processes).
is_running() {
- ps -G "0 $(getent group '@CRM_DAEMON_GROUP@' 2>/dev/null | cut -d: -f3) \
- $(id -G)" \
- -u "0 @CRM_DAEMON_USER@ $(id -u)" \
+ ps -G "0 $(getent group '@CRM_DAEMON_GROUP@' 2>/dev/null | cut -d: -f3) $(id -G)" \
+ -u "0 @CRM_DAEMON_USER@ $(id -u)" -f \
| grep -Eqs $(echo "$1" | sed -e 's/^\(.\)/[\1]/')
}
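# Note: the sed above wraps the pattern's first character in brackets
# (e.g. "corosync" becomes "[c]orosync") so that grep never matches its
# own command line in the ps output.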
has_remoted() {
if [ $REMOTED_STATUS -eq -1 ]; then
REMOTED_STATUS=1
if which pacemaker-remoted >/dev/null 2>&1; then
REMOTED_STATUS=0
# Check for pre-2.0.0 daemon name in case we have mixed-version cluster
elif which pacemaker_remoted >/dev/null 2>&1; then
REMOTED_STATUS=0
elif [ -x "@sbindir@/pacemaker-remoted" ]; then
REMOTED_STATUS=0
elif [ -x "@sbindir@/pacemaker_remoted" ]; then
REMOTED_STATUS=0
else
# @TODO: the binary might be elsewhere,
# but a global search is too expensive
for d in /{usr,opt}/{local/,}{s,}bin; do
if [ -x "${d}/pacemaker-remoted" ]; then
REMOTED_STATUS=0
elif [ -x "${d}/pacemaker_remoted" ]; then
REMOTED_STATUS=0
fi
done
fi
fi
return $REMOTED_STATUS
}
# found_dir <description> <dirname>
found_dir() {
echo "$2"
info "Pacemaker $1 found in: $2"
}
detect_daemon_dir() {
info "Searching for where Pacemaker daemons live... this may take a while"
for d in \
{/usr,/usr/local,/opt/local,@exec_prefix@}/{libexec,lib64,lib}/pacemaker
do
# pacemaker and pacemaker-cts packages can install to daemon directory,
# so check for a file from each
if [ -e $d/pacemaker-schedulerd ] || [ -e $d/cts-exec-helper ]; then
found_dir "daemons" "$d"
return
fi
done
# Pacemaker Remote nodes don't need to install daemons
if has_remoted; then
info "Pacemaker daemons not found (this appears to be a Pacemaker Remote node)"
return
fi
for f in $(find / -maxdepth $maxdepth -type f -name pacemaker-schedulerd -o -name cts-exec-helper); do
d=$(dirname "$f")
found_dir "daemons" "$d"
return
done
fatal "Pacemaker daemons not found (nonstandard installation?)"
}
detect_cib_dir() {
- if [ "-f ${local_state_dir}/lib/pacemaker/cib/cib.xml" ]; then
+ d="${local_state_dir}/lib/pacemaker/cib"
+ if [ -f "$d/cib.xml" ]; then
found_dir "config files" "$d"
return
fi
# Pacemaker Remote nodes don't need a CIB
if has_remoted; then
info "Pacemaker config not found (this appears to be a Pacemaker Remote node)"
return
fi
info "Searching for where Pacemaker keeps config information... this may take a while"
# TODO: What about false positives where someone copied the CIB?
for f in $(find / -maxdepth $maxdepth -type f -name cib.xml); do
d=$(dirname $f)
found_dir "config files" "$d"
return
done
warning "Pacemaker config not found (nonstandard installation?)"
}
detect_state_dir() {
if [ -n "$CRM_CONFIG_DIR" ]; then
# Assume new layout
# $local_state_dir/lib/pacemaker/(cib,pengine,blackbox,cores)
dirname "$CRM_CONFIG_DIR"
# Pacemaker Remote nodes might not have a CRM_CONFIG_DIR
elif [ -d "$local_state_dir/lib/pacemaker" ]; then
echo $local_state_dir/lib/pacemaker
fi
}
detect_pe_dir() {
config_root="$1"
d="$config_root/pengine"
if [ -d "$d" ]; then
found_dir "scheduler inputs" "$d"
return
fi
if has_remoted; then
info "Pacemaker scheduler inputs not found (this appears to be a Pacemaker Remote node)"
return
fi
info "Searching for where Pacemaker keeps scheduler inputs... this may take a while"
for d in $(find / -maxdepth $maxdepth -type d -name pengine); do
found_dir "scheduler inputs" "$d"
return
done
fatal "Pacemaker scheduler inputs not found (nonstandard installation?)"
}
detect_host() {
local_state_dir=@localstatedir@
if [ -d $local_state_dir/run ]; then
CRM_STATE_DIR=$local_state_dir/run/crm
else
info "Searching for where Pacemaker keeps runtime data... this may take a while"
for d in `find / -maxdepth $maxdepth -type d -name run`; do
local_state_dir=`dirname $d`
CRM_STATE_DIR=$d/crm
break
done
info "Found: $CRM_STATE_DIR"
fi
debug "Machine runtime directory: $local_state_dir"
debug "Pacemaker runtime data located in: $CRM_STATE_DIR"
CRM_DAEMON_DIR=$(detect_daemon_dir)
CRM_CONFIG_DIR=$(detect_cib_dir)
config_root=$(detect_state_dir)
# Older versions had none
BLACKBOX_DIR=$config_root/blackbox
debug "Pacemaker blackboxes (if any) located in: $BLACKBOX_DIR"
PE_STATE_DIR=$(detect_pe_dir "$config_root")
CRM_CORE_DIRS=""
for d in $config_root/cores $local_state_dir/lib/corosync; do
if [ -d $d ]; then
CRM_CORE_DIRS="$CRM_CORE_DIRS $d"
fi
done
debug "Core files located under: $CRM_CORE_DIRS"
}
time2str() {
perl -e "use POSIX; print strftime('%x %X',localtime($1));"
}
get_time() {
perl -e "\$time=\"$*\";" -e '
$unix_tm = 0;
eval "use Date::Parse";
if (index($time, ":") < 0) {
} elsif (!$@) {
$unix_tm = str2time($time);
} else {
eval "use Date::Manip";
if (!$@) {
$unix_tm = UnixDate(ParseDateString($time), "%s");
}
}
if ($unix_tm != "") {
print int($unix_tm);
} else {
print "";
}
'
}
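# Example (hypothetical input): get_time "2018-01-01 12:00" prints the
# corresponding epoch seconds; strings that cannot be parsed (or that
# contain no ":") print nothing.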
get_time_() {
warning "Unknown time format used by: $*"
}
get_time_syslog() {
awk '{print $1,$2,$3}'
}
get_time_legacy() {
awk '{print $2}' | sed 's/_/ /'
}
get_time_iso8601() {
awk '{print $1}'
}
get_time_format_for_string() {
l="$*"
t=$(get_time `echo $l | get_time_syslog`)
if [ "x$t" != x ]; then
echo syslog
return
fi
t=$(get_time `echo $l | get_time_iso8601`)
if [ "x$t" != x ]; then
echo iso8601
return
fi
t=$(get_time `echo $l | get_time_legacy`)
if [ "x$t" != x ]; then
echo legacy
return
fi
}
get_time_format() {
t=0 l="" func=""
trycnt=10
while [ $trycnt -gt 0 ] && read l; do
func=$(get_time_format_for_string $l)
if [ "x$func" != x ]; then
break
fi
trycnt=$(($trycnt-1))
done
#debug "Logfile uses the $func time format"
echo $func
}
get_first_time() {
l=""
format=$1
while read l; do
t=$(echo $l | get_time_$format)
ts=$(get_time $t)
if [ "x$ts" != x ]; then
echo "$ts"
return
fi
done
}
get_last_time() {
l=""
best=`date +%s` # Now
format=$1
while read l; do
t=$(echo $l | get_time_$format)
ts=$(get_time $t)
if [ "x$ts" != x ]; then
best=$ts
fi
done
echo $best
}
linetime() {
l=`tail -n +$2 $1 | grep -a ":[0-5][0-9]:" | head -n 1`
format=`get_time_format_for_string $l`
t=`echo $l | get_time_$format`
get_time "$t"
}
#
# findmsg <max> <pattern>
#
# Print the names of up to <max> system logs that contain <pattern>,
# ordered by most recently modified.
#
findmsg() {
max=$1
pattern="$2"
found=0
# List all potential system logs ordered by most recently modified.
candidates=$(ls -1td $SYSLOGS 2>/dev/null)
if [ -z "$candidates" ]; then
debug "No system logs found to search for pattern '$pattern'"
return
fi
# Portable way to handle files with spaces in their names.
SAVE_IFS=$IFS
IFS="
"
# Check each log file for matches.
logfiles=""
for f in $candidates; do
local cat=""
# We only care about readable files with something in them.
if [ ! -f "$f" ] || [ ! -r "$f" ] || [ ! -s "$f" ] ; then
continue
fi
cat=$(find_decompressor "$f")
# We want to avoid grepping through potentially huge binary logs such
# as lastlog. However, control characters sometimes find their way into
# text logs, so we use a heuristic of more than 256 nonprintable
# characters in the file's first kilobyte.
if [ $($cat "$f" 2>/dev/null | head -c 1024 | tr -d '[:print:][:space:]' | wc -c) -gt 256 ]
then
continue
fi
# Our patterns are ASCII, so we can use LC_ALL="C" to speed up grep
$cat "$f" 2>/dev/null | LC_ALL="C" grep -q -e "$pattern"
if [ $? -eq 0 ]; then
# Add this file to the list of hits
# (using newline as separator to handle spaces in names).
if [ -z "$logfiles" ]; then
logfiles="$f"
else
logfiles="$logfiles
$f"
fi
# If we have enough hits, print them and return.
found=$(($found+1))
if [ $found -ge $max ]; then
debug "Pattern '$pattern' found in: [ $logfiles ]"
IFS=$SAVE_IFS
echo "$logfiles"
return
fi
fi
done 2>/dev/null
IFS=$SAVE_IFS
debug "Pattern '$pattern' not found in any system logs"
}
node_events() {
if [ -e $1 ]; then
Epatt=`echo "$EVENT_PATTERNS" |
while read title p; do [ -n "$p" ] && echo -n "|$p"; done |
sed 's/.//'
`
grep -E "$Epatt" $1
fi
}
pickfirst() {
for x; do
which $x >/dev/null 2>&1 && {
echo $x
return 0
}
done
return 1
}
shrink() {
olddir=$PWD
dir=`dirname $1`
base=`basename $1`
target=$1.tar
tar_options="cf"
variant=`pickfirst bzip2 gzip xz false`
case $variant in
bz*)
tar_options="jcf"
target="$target.bz2"
;;
gz*)
tar_options="zcf"
target="$target.gz"
;;
xz*)
tar_options="Jcf"
target="$target.xz"
;;
*)
warning "Could not find a compression program, the resulting tarball may be huge"
;;
esac
if [ -e $target ]; then
fatal "Destination $target already exists, specify an alternate name with --dest"
fi
cd $dir >/dev/null 2>&1
tar $tar_options $target $base >/dev/null 2>&1
cd $olddir >/dev/null 2>&1
echo $target
}
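# findln_by_time binary-searches $logf for the number of a line whose
# timestamp (extracted with linetime) matches epoch time $tm as closely
# as possible, so huge logs never need a linear scan.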
findln_by_time() {
local logf=$1
local tm=$2
local first=1
# Some logs can be massive (over 1,500,000,000 lines have been seen in the wild)
# Even just 'wc -l' on these files can take 10+ minutes
local fileSize=`ls -lh "$logf" | awk '{ print $5 }' | grep -ie G`
if [ x$fileSize != x ]; then
warning "$logf is ${fileSize} in size and could take many hours to process. Skipping."
return
fi
local last=`wc -l < $logf`
while [ $first -le $last ]; do
mid=$((($last+$first)/2))
trycnt=10
while [ $trycnt -gt 0 ]; do
tmid=`linetime $logf $mid`
[ "$tmid" ] && break
warning "cannot extract time: $logf:$mid; will try the next one"
trycnt=$(($trycnt-1))
# shift the whole first-last segment
first=$(($first-1))
last=$(($last-1))
mid=$((($last+$first)/2))
done
if [ -z "$tmid" ]; then
warning "giving up on log..."
return
fi
if [ $tmid -gt $tm ]; then
last=$(($mid-1))
elif [ $tmid -lt $tm ]; then
first=$(($mid+1))
else
break
fi
done
echo $mid
}
dumplog() {
local logf=$1
local from_line=$2
local to_line=$3
[ "$from_line" ] ||
return
tail -n +$from_line $logf |
if [ "$to_line" ]; then
head -$(($to_line-$from_line+1))
else
cat
fi
}
#
# find log/set of logs which are interesting for us
#
#
# find log slices
#
find_decompressor() {
case $1 in
*bz2) echo "bzip2 -dc" ;;
*gz) echo "gzip -dc" ;;
*xz) echo "xz -dc" ;;
*) echo "cat" ;;
esac
}
#
# check if the log contains a piece of our segment
#
is_our_log() {
local logf=$1
local from_time=$2
local to_time=$3
local cat=`find_decompressor $logf`
local format=`$cat $logf | get_time_format`
local first_time=`$cat $logf | head -10 | get_first_time $format`
local last_time=`$cat $logf | tail -10 | get_last_time $format`
if [ x = "x$first_time" -o x = "x$last_time" ]; then
warning "Skipping bad logfile '$1': Could not determine log dates"
return 0 # skip (empty log?)
fi
if [ $from_time -gt $last_time ]; then
# we shouldn't get here anyway if the logs are in order
return 2 # we're past good logs; exit
fi
if [ $from_time -ge $first_time ]; then
return 3 # this is the last good log
fi
# have to go further back
if [ x = "x$to_time" -o $to_time -ge $first_time ]; then
return 1 # include this log
else
return 0 # don't include this log
fi
}
#
# go through archived logs (timewise backwards) and see if there
# are lines belonging to us
# (we rely on untouched log files, i.e. that modify time
# hasn't been changed)
#
arch_logs() {
local logf=$1
local from_time=$2
local to_time=$3
# look for files such as: ha-log-20090308 or
# ha-log-20090308.gz (.bz2) or ha-log.0, etc
ls -t $logf $logf*[0-9z] 2>/dev/null |
while read next_log; do
is_our_log $next_log $from_time $to_time
case $? in
0) ;; # noop, continue
1) echo $next_log # include log and continue
debug "Found log $next_log"
;;
2) break;; # don't go through older logs!
3) echo $next_log # include log and continue
debug "Found log $next_log"
break
;; # don't go through older logs!
esac
done
}
#
# print part of the log
#
drop_tmp_file() {
[ -z "$tmp" ] || rm -f "$tmp"
}
print_logseg() {
local logf=$1
local from_time=$2
local to_time=$3
# uncompress to a temp file (if necessary)
local cat=`find_decompressor $logf`
if [ "$cat" != "cat" ]; then
tmp=`mktemp`
$cat $logf > $tmp
trap drop_tmp_file 0
sourcef=$tmp
else
sourcef=$logf
tmp=""
fi
if [ "$from_time" = 0 ]; then
FROM_LINE=1
else
FROM_LINE=`findln_by_time $sourcef $from_time`
fi
if [ -z "$FROM_LINE" ]; then
warning "couldn't find line for time $from_time; corrupt log file?"
return
fi
TO_LINE=""
if [ "$to_time" != 0 ]; then
TO_LINE=`findln_by_time $sourcef $to_time`
if [ -z "$TO_LINE" ]; then
warning "couldn't find line for time $to_time; corrupt log file?"
return
fi
if [ $FROM_LINE -lt $TO_LINE ]; then
dumplog $sourcef $FROM_LINE $TO_LINE
log "Including segment [$FROM_LINE-$TO_LINE] from $logf"
else
debug "Empty segment [$FROM_LINE-$TO_LINE] from $logf"
fi
else
dumplog $sourcef $FROM_LINE $TO_LINE
log "Including all logs after line $FROM_LINE from $logf"
fi
drop_tmp_file
trap "" 0
}
#
# find log/set of logs which are interesting for us
#
dumplogset() {
local logf=$1
local from_time=$2
local to_time=$3
local logf_set=`arch_logs $logf $from_time $to_time`
if [ x = "x$logf_set" ]; then
return
fi
local num_logs=`echo "$logf_set" | wc -l`
local oldest=`echo $logf_set | awk '{print $NF}'`
local newest=`echo $logf_set | awk '{print $1}'`
local mid_logfiles=`echo $logf_set | awk '{for(i=NF-1; i>1; i--) print $i}'`
# the first logfile: from $from_time to $to_time (or end)
# logfiles in the middle: all
# the last logfile: from beginning to $to_time (or end)
case $num_logs in
1) print_logseg $newest $from_time $to_time;;
*)
print_logseg $oldest $from_time 0
for f in $mid_logfiles; do
`find_decompressor $f` $f
debug "including complete $f logfile"
done
print_logseg $newest 0 $to_time
;;
esac
}
# cut out a stanza
getstanza() {
awk -v name="$1" '
!in_stanza && NF==2 && /^[a-z][a-z]*[[:space:]]*{/ { # stanza start
if ($1 == name)
in_stanza = 1
}
in_stanza { print }
in_stanza && NF==1 && $1 == "}" { exit }
'
}
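# Example: for a corosync.conf containing
#   logging {
#       to_syslog: yes
#   }
# `getstanza logging` prints just that block.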
# supply stanza in $1 and variable name in $2
# (stanza is optional)
getcfvar() {
cf_type=$1; shift;
cf_var=$1; shift;
cf_file=$*
[ -f "$cf_file" ] || return
case $cf_type in
corosync)
sed 's/#.*//' < $cf_file |
if [ $# -eq 2 ]; then
getstanza "$cf_var"
shift 1
else
cat
fi |
awk -v varname="$cf_var" '
NF==2 && match($1,varname":$")==1 { print $2; exit; }
'
;;
esac
}
pickfirst() {
for x; do
which $x >/dev/null 2>&1 && {
echo $x
return 0
}
done
return 1
}
#
# figure out the cluster type, depending on the process list
# and existence of configuration files
#
get_cluster_type() {
if is_running corosync; then
tool=`pickfirst corosync-objctl corosync-cmapctl`
case $tool in
*objctl) quorum=`$tool -a | grep quorum.provider | sed 's/.*=\s*//'`;;
*cmapctl) quorum=`$tool | grep quorum.provider | sed 's/.*=\s*//'`;;
esac
stack="corosync"
# Now we're guessing...
# TODO: Technically these could be anywhere :-/
elif [ -f /etc/corosync/corosync.conf ]; then
stack="corosync"
else
# We still don't know. This might be a Pacemaker Remote node,
# or the configuration might be in a nonstandard location.
stack="any"
fi
debug "Detected the '$stack' cluster stack"
echo $stack
}
find_cluster_cf() {
case $1 in
corosync)
best_size=0
best_file=""
# TODO: Technically these could be anywhere :-/
for cf in /etc/corosync/corosync.conf; do
if [ -f $cf ]; then
size=`wc -l $cf | awk '{print $1}'`
if [ $size -gt $best_size ]; then
best_size=$size
best_file=$cf
fi
fi
done
if [ -z "$best_file" ]; then
debug "Looking for corosync configuration file. This may take a while..."
for f in `find / -maxdepth $maxdepth -type f -name corosync.conf`; do
best_file=$f
break
done
fi
debug "Located corosync config file: $best_file"
echo "$best_file"
;;
any)
# Cluster type is undetermined. Don't complain, because this
# might be a Pacemaker Remote node.
;;
*)
warning "Unknown cluster type: $1"
;;
esac
}
#
# check for the major prereq for a) parameter parsing and b)
# parsing logs
#
t=`get_time "12:00"`
if [ "$t" = "" ]; then
fatal "please install the perl Date::Parse module (perl-DateTime-Format-DateParse on Fedora/Red Hat)"
fi
# vim: set expandtab tabstop=8 softtabstop=4 shiftwidth=4 textwidth=80:
