diff --git a/cts/CTStests.py b/cts/CTStests.py
index a57a5da805..f8f2cb626f 100644
--- a/cts/CTStests.py
+++ b/cts/CTStests.py
@@ -1,3130 +1,3129 @@
""" Test-specific classes for Pacemaker's Cluster Test Suite (CTS)
"""
# Pacemaker targets compatibility with Python 2.7 and 3.2+
from __future__ import print_function, unicode_literals, absolute_import, division
__copyright__ = "Copyright 2000-2019 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
#
# SPECIAL NOTE:
#
# Tests may NOT implement any cluster-manager-specific code in them.
# EXTEND the ClusterManager object to provide the base capabilities
# the test needs if you need to do something that the current CM classes
# do not. Otherwise you screw up the whole point of the object structure
# in CTS.
#
# Thank you.
#
import os
import re
import time
import subprocess
import tempfile
from stat import *
from cts import CTS
from cts.CTSaudits import *
from cts.CTSvars import *
from cts.patterns import PatternSelector
from cts.logging import LogFactory
from cts.remote import RemoteFactory, input_wrapper
from cts.watcher import LogWatcher
from cts.environment import EnvFactory
AllTestClasses = [ ]
class CTSTest(object):
'''
A Cluster test.
We implement the basic set of properties and behaviors for a generic
cluster test.
Cluster tests track their own statistics.
We keep each of the kinds of counts we track as separate {name,value}
pairs.
'''
def __init__(self, cm):
#self.name="the unnamed test"
self.Stats = {"calls":0
, "success":0
, "failure":0
, "skipped":0
, "auditfail":0}
# if not issubclass(cm.__class__, ClusterManager):
# raise ValueError("Must be a ClusterManager object")
self.CM = cm
self.Env = EnvFactory().getInstance()
self.rsh = RemoteFactory().getInstance()
self.logger = LogFactory()
self.templates = PatternSelector(cm["Name"])
self.Audits = []
self.timeout = 120
self.passed = 1
self.is_loop = 0
self.is_unsafe = 0
self.is_docker_unsafe = 0
self.is_experimental = 0
self.is_container = 0
self.is_valgrind = 0
self.benchmark = 0 # which tests to benchmark
self.timer = {} # timers
def log(self, args):
self.logger.log(args)
def debug(self, args):
self.logger.debug(args)
def has_key(self, key):
return key in self.Stats
def __setitem__(self, key, value):
self.Stats[key] = value
def __getitem__(self, key):
if str(key) == "0":
raise ValueError("Bad call to 'foo in X', should reference 'foo in X.Stats' instead")
if key in self.Stats:
return self.Stats[key]
return None
def log_mark(self, msg):
self.debug("MARK: test %s %s %d" % (self.name,msg,time.time()))
return
def get_timer(self,key = "test"):
try: return self.timer[key]
except: return 0
def set_timer(self,key = "test"):
self.timer[key] = time.time()
return self.timer[key]
def log_timer(self,key = "test"):
elapsed = 0
if key in self.timer:
elapsed = time.time() - self.timer[key]
s = key == "test" and self.name or "%s:%s" % (self.name,key)
self.debug("%s runtime: %.2f" % (s, elapsed))
del self.timer[key]
return elapsed
def incr(self, name):
'''Increment (or initialize) the value associated with the given name'''
if not name in self.Stats:
self.Stats[name] = 0
self.Stats[name] = self.Stats[name]+1
# Reset the test passed boolean
if name == "calls":
self.passed = 1
def failure(self, reason="none"):
'''Increment the failure count'''
self.passed = 0
self.incr("failure")
self.logger.log(("Test %s" % self.name).ljust(35) + " FAILED: %s" % reason)
return None
def success(self):
'''Increment the success count'''
self.incr("success")
return 1
def skipped(self):
'''Increment the skipped count'''
self.incr("skipped")
return 1
def __call__(self, node):
'''Perform the given test'''
raise ValueError("Abstract Class member (__call__)")
self.incr("calls")
return self.failure()
def audit(self):
passed = 1
if len(self.Audits) > 0:
for audit in self.Audits:
if not audit():
self.logger.log("Internal %s Audit %s FAILED." % (self.name, audit.name()))
self.incr("auditfail")
passed = 0
return passed
def setup(self, node):
'''Setup the given test'''
return self.success()
def teardown(self, node):
'''Tear down the given test'''
return self.success()
def create_watch(self, patterns, timeout, name=None):
if not name:
name = self.name
return LogWatcher(self.Env["LogFileName"], patterns, name, timeout, kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
def local_badnews(self, prefix, watch, local_ignore=[]):
errcount = 0
if not prefix:
prefix = "LocalBadNews:"
ignorelist = []
ignorelist.append(" CTS: ")
ignorelist.append(prefix)
ignorelist.extend(local_ignore)
while errcount < 100:
match = watch.look(0)
if match:
add_err = 1
for ignore in ignorelist:
if add_err == 1 and re.search(ignore, match):
add_err = 0
if add_err == 1:
self.logger.log(prefix + " " + match)
errcount = errcount + 1
else:
break
else:
self.logger.log("Too many errors!")
watch.end()
return errcount
def is_applicable(self):
return self.is_applicable_common()
def is_applicable_common(self):
'''Return TRUE if we are applicable in the current test configuration'''
#raise ValueError("Abstract Class member (is_applicable)")
if self.is_loop and not self.Env["loop-tests"]:
return 0
elif self.is_unsafe and not self.Env["unsafe-tests"]:
return 0
elif self.is_valgrind and not self.Env["valgrind-tests"]:
return 0
elif self.is_experimental and not self.Env["experimental-tests"]:
return 0
elif self.is_docker_unsafe and self.Env["docker"]:
return 0
elif self.is_container and not self.Env["container-tests"]:
return 0
elif self.Env["benchmark"] and self.benchmark == 0:
return 0
return 1
def find_ocfs2_resources(self, node):
self.r_o2cb = None
self.r_ocfs2 = []
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rtype == "o2cb" and r.parent != "NA":
self.debug("Found o2cb: %s" % self.r_o2cb)
self.r_o2cb = r.parent
if re.search("^Constraint", line):
c = AuditConstraint(self.CM, line)
if c.type == "rsc_colocation" and c.target == self.r_o2cb:
self.r_ocfs2.append(c.rsc)
self.debug("Found ocfs2 filesystems: %s" % repr(self.r_ocfs2))
return len(self.r_ocfs2)
def canrunnow(self, node):
'''Return TRUE if we can meaningfully run right now'''
return 1
def errorstoignore(self):
'''Return list of errors which are 'normal' and should be ignored'''
return []
class StopTest(CTSTest):
'''Stop (deactivate) the cluster manager on a node'''
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "Stop"
def __call__(self, node):
'''Perform the 'stop' test. '''
self.incr("calls")
if self.CM.ShouldBeStatus[node] != "up":
return self.skipped()
patterns = []
# Technically we should always be able to notice ourselves stopping
patterns.append(self.templates["Pat:We_stopped"] % node)
# Any active node needs to notice this one left
# (note that this won't work if we have multiple partitions)
for other in self.Env["nodes"]:
if self.CM.ShouldBeStatus[other] == "up" and other != node:
patterns.append(self.templates["Pat:They_stopped"] %(other, self.CM.key_for_node(node)))
#self.debug("Checking %s will notice %s left"%(other, node))
watch = self.create_watch(patterns, self.Env["DeadTime"])
watch.setwatch()
if node == self.CM.OurNode:
self.incr("us")
else:
if self.CM.upcount() <= 1:
self.incr("all")
else:
self.incr("them")
self.CM.StopaCM(node)
watch_result = watch.lookforall()
failreason = None
UnmatchedList = "||"
if watch.unmatched:
(rc, output) = self.rsh(node, "/bin/ps axf", None)
for line in output:
self.debug(line)
(rc, output) = self.rsh(node, "/usr/sbin/dlm_tool dump", None)
for line in output:
self.debug(line)
for regex in watch.unmatched:
self.logger.log ("ERROR: Shutdown pattern not found: %s" % (regex))
UnmatchedList += regex + "||";
failreason = "Missing shutdown pattern"
self.CM.cluster_stable(self.Env["DeadTime"])
if not watch.unmatched or self.CM.upcount() == 0:
return self.success()
if len(watch.unmatched) >= self.CM.upcount():
return self.failure("no match against (%s)" % UnmatchedList)
if failreason == None:
return self.success()
else:
return self.failure(failreason)
#
# We don't register StopTest because it's better when called by
# another test...
#
class StartTest(CTSTest):
'''Start (activate) the cluster manager on a node'''
def __init__(self, cm, debug=None):
CTSTest.__init__(self,cm)
self.name = "start"
self.debug = debug
def __call__(self, node):
'''Perform the 'start' test. '''
self.incr("calls")
if self.CM.upcount() == 0:
self.incr("us")
else:
self.incr("them")
if self.CM.ShouldBeStatus[node] != "down":
return self.skipped()
elif self.CM.StartaCM(node):
return self.success()
else:
return self.failure("Startup %s on node %s failed"
% (self.Env["Name"], node))
#
# We don't register StartTest because it's better when called by
# another test...
#
class FlipTest(CTSTest):
'''If it's running, stop it. If it's stopped start it.
Overthrow the status quo...
'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Flip"
self.start = StartTest(cm)
self.stop = StopTest(cm)
def __call__(self, node):
'''Perform the 'Flip' test. '''
self.incr("calls")
if self.CM.ShouldBeStatus[node] == "up":
self.incr("stopped")
ret = self.stop(node)
type = "up->down"
# Give the cluster time to recognize it's gone...
time.sleep(self.Env["StableTime"])
elif self.CM.ShouldBeStatus[node] == "down":
self.incr("started")
ret = self.start(node)
type = "down->up"
else:
return self.skipped()
self.incr(type)
if ret:
return self.success()
else:
return self.failure("%s failure" % type)
# Register FlipTest as a good test to run
AllTestClasses.append(FlipTest)
class RestartTest(CTSTest):
'''Stop and restart a node'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Restart"
self.start = StartTest(cm)
self.stop = StopTest(cm)
self.benchmark = 1
def __call__(self, node):
'''Perform the 'restart' test. '''
self.incr("calls")
self.incr("node:" + node)
ret1 = 1
if self.CM.StataCM(node):
self.incr("WasStopped")
if not self.start(node):
return self.failure("start (setup) failure: "+node)
self.set_timer()
if not self.stop(node):
return self.failure("stop failure: "+node)
if not self.start(node):
return self.failure("start failure: "+node)
return self.success()
# Register RestartTest as a good test to run
AllTestClasses.append(RestartTest)
class StonithdTest(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "Stonithd"
self.startall = SimulStartLite(cm)
self.benchmark = 1
def __call__(self, node):
self.incr("calls")
if len(self.Env["nodes"]) < 2:
return self.skipped()
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
is_dc = self.CM.is_node_dc(node)
watchpats = []
watchpats.append(self.templates["Pat:FenceOpOK"] % node)
watchpats.append(self.templates["Pat:NodeFenced"] % node)
if self.Env["at-boot"] == 0:
self.debug("Expecting %s to stay down" % node)
self.CM.ShouldBeStatus[node] = "down"
else:
self.debug("Expecting %s to come up again %d" % (node, self.Env["at-boot"]))
watchpats.append("%s.* S_STARTING -> S_PENDING" % node)
watchpats.append("%s.* S_PENDING -> S_NOT_DC" % node)
watch = self.create_watch(watchpats, 30 + self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"])
watch.setwatch()
origin = self.Env.RandomGen.choice(self.Env["nodes"])
rc = self.rsh(origin, "stonith_admin --reboot %s -VVVVVV" % node)
if rc == 194:
# 194 - 256 = -62 = Timer expired
#
# Look for the patterns, usually this means the required
# device was running on the node to be fenced - or that
# the required devices were in the process of being loaded
# and/or moved
#
# Effectively the node committed suicide so there will be
# no confirmation, but pacemaker should be watching and
# fence the node again
self.logger.log("Fencing command on %s to fence %s timed out" % (origin, node))
elif origin != node and rc != 0:
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
self.debug("Waiting for fenced node to come back up")
self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
self.logger.log("Fencing command on %s failed to fence %s (rc=%d)" % (origin, node, rc))
elif origin == node and rc != 255:
# 255 == broken pipe, ie. the node was fenced as expected
self.logger.log("Locally originated fencing returned %d" % rc)
self.set_timer("fence")
matched = watch.lookforall()
self.log_timer("fence")
self.set_timer("reform")
if watch.unmatched:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
self.debug("Waiting for fenced node to come back up")
self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
self.debug("Waiting for the cluster to re-stabilize with all nodes")
is_stable = self.CM.cluster_stable(self.Env["StartTime"])
if not matched:
return self.failure("Didn't find all expected patterns")
elif not is_stable:
return self.failure("Cluster did not become stable")
self.log_timer("reform")
return self.success()
def errorstoignore(self):
return [
self.templates["Pat:Fencing_start"] % ".*",
self.templates["Pat:Fencing_ok"] % ".*",
r"error.*: Resource .*stonith::.* is active on 2 nodes attempting recovery",
r"error.*: Operation 'reboot' targeting .* on .* for stonith_admin.*: Timer expired",
]
def is_applicable(self):
if not self.is_applicable_common():
return 0
if "DoFencing" in list(self.Env.keys()):
return self.Env["DoFencing"]
return 1
AllTestClasses.append(StonithdTest)
class StartOnebyOne(CTSTest):
'''Start all the nodes ~ one by one'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "StartOnebyOne"
self.stopall = SimulStopLite(cm)
self.start = StartTest(cm)
self.ns = CTS.NodeStatus(cm.Env)
def __call__(self, dummy):
'''Perform the 'StartOnebyOne' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Shut down all the nodes...
ret = self.stopall(None)
if not ret:
return self.failure("Test setup failed")
failed = []
self.set_timer()
for node in self.Env["nodes"]:
if not self.start(node):
failed.append(node)
if len(failed) > 0:
return self.failure("Some node failed to start: " + repr(failed))
return self.success()
# Register StartOnebyOne as a good test to run
AllTestClasses.append(StartOnebyOne)
class SimulStart(CTSTest):
'''Start all the nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStart"
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
def __call__(self, dummy):
'''Perform the 'SimulStart' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Shut down all the nodes...
ret = self.stopall(None)
if not ret:
return self.failure("Setup failed")
if not self.startall(None):
return self.failure("Startall failed")
return self.success()
# Register SimulStart as a good test to run
AllTestClasses.append(SimulStart)
class SimulStop(CTSTest):
'''Stop all the nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStop"
self.startall = SimulStartLite(cm)
self.stopall = SimulStopLite(cm)
def __call__(self, dummy):
'''Perform the 'SimulStop' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Start up all the nodes...
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
if not self.stopall(None):
return self.failure("Stopall failed")
return self.success()
# Register SimulStop as a good test to run
AllTestClasses.append(SimulStop)
class StopOnebyOne(CTSTest):
'''Stop all the nodes in order'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "StopOnebyOne"
self.startall = SimulStartLite(cm)
self.stop = StopTest(cm)
def __call__(self, dummy):
'''Perform the 'StopOnebyOne' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Start up all the nodes...
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
failed = []
self.set_timer()
for node in self.Env["nodes"]:
if not self.stop(node):
failed.append(node)
if len(failed) > 0:
return self.failure("Some node failed to stop: " + repr(failed))
return self.success()
# Register StopOnebyOne as a good test to run
AllTestClasses.append(StopOnebyOne)
class RestartOnebyOne(CTSTest):
'''Restart all the nodes in order'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "RestartOnebyOne"
self.startall = SimulStartLite(cm)
def __call__(self, dummy):
'''Perform the 'RestartOnebyOne' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Start up all the nodes...
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
did_fail = []
self.set_timer()
self.restart = RestartTest(self.CM)
for node in self.Env["nodes"]:
if not self.restart(node):
did_fail.append(node)
if did_fail:
return self.failure("Could not restart %d nodes: %s"
% (len(did_fail), repr(did_fail)))
return self.success()
# Register StopOnebyOne as a good test to run
AllTestClasses.append(RestartOnebyOne)
class PartialStart(CTSTest):
'''Start a node - but tell it to stop before it finishes starting up'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "PartialStart"
self.startall = SimulStartLite(cm)
self.stopall = SimulStopLite(cm)
self.stop = StopTest(cm)
#self.is_unsafe = 1
def __call__(self, node):
'''Perform the 'PartialStart' test. '''
self.incr("calls")
ret = self.stopall(None)
if not ret:
return self.failure("Setup failed")
# FIXME! This should use the CM class to get the pattern
# then it would be applicable in general
watchpats = []
watchpats.append("pacemaker-controld.*Connecting to cluster infrastructure")
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
self.CM.StartaCMnoBlock(node)
ret = watch.lookforall()
if not ret:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
return self.failure("Setup of %s failed" % node)
ret = self.stop(node)
if not ret:
return self.failure("%s did not stop in time" % node)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
# We might do some fencing in the 2-node case if we make it up far enough
return [
r"Executing reboot fencing operation",
r"Requesting fencing \([^)]+\) of node ",
]
# Register StopOnebyOne as a good test to run
AllTestClasses.append(PartialStart)
class StandbyTest(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Standby"
self.benchmark = 1
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
# make sure the node is active
# set the node to standby mode
# check resources, none resource should be running on the node
# set the node to active mode
# check resouces, resources should have been migrated back (SHOULD THEY?)
def __call__(self, node):
self.incr("calls")
ret = self.startall(None)
if not ret:
return self.failure("Start all nodes failed")
self.debug("Make sure node %s is active" % node)
if self.CM.StandbyStatus(node) != "off":
if not self.CM.SetStandbyMode(node, "off"):
return self.failure("can't set node %s to active mode" % node)
self.CM.cluster_stable()
status = self.CM.StandbyStatus(node)
if status != "off":
return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status))
self.debug("Getting resources running on node %s" % node)
rsc_on_node = self.CM.active_resources(node)
watchpats = []
watchpats.append(r"State transition .* -> S_POLICY_ENGINE")
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
self.debug("Setting node %s to standby mode" % node)
if not self.CM.SetStandbyMode(node, "on"):
return self.failure("can't set node %s to standby mode" % node)
self.set_timer("on")
ret = watch.lookforall()
if not ret:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
self.CM.SetStandbyMode(node, "off")
return self.failure("cluster didn't react to standby change on %s" % node)
self.CM.cluster_stable()
status = self.CM.StandbyStatus(node)
if status != "on":
return self.failure("standby status of %s is [%s] but we expect [on]" % (node, status))
self.log_timer("on")
self.debug("Checking resources")
bad_run = self.CM.active_resources(node)
if len(bad_run) > 0:
rc = self.failure("%s set to standby, %s is still running on it" % (node, repr(bad_run)))
self.debug("Setting node %s to active mode" % node)
self.CM.SetStandbyMode(node, "off")
return rc
self.debug("Setting node %s to active mode" % node)
if not self.CM.SetStandbyMode(node, "off"):
return self.failure("can't set node %s to active mode" % node)
self.set_timer("off")
self.CM.cluster_stable()
status = self.CM.StandbyStatus(node)
if status != "off":
return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status))
self.log_timer("off")
return self.success()
AllTestClasses.append(StandbyTest)
class ValgrindTest(CTSTest):
'''Check for memory leaks'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Valgrind"
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
self.is_valgrind = 1
self.is_loop = 1
def setup(self, node):
self.incr("calls")
ret = self.stopall(None)
if not ret:
return self.failure("Stop all nodes failed")
# @TODO Edit /etc/sysconfig/pacemaker on all nodes to enable valgrind,
# and clear any valgrind logs from previous runs. For now, we rely on
# the user to do this manually.
ret = self.startall(None)
if not ret:
return self.failure("Start all nodes failed")
return self.success()
def teardown(self, node):
# Return all nodes to normal
# @TODO Edit /etc/sysconfig/pacemaker on all nodes to disable valgrind
ret = self.stopall(None)
if not ret:
return self.failure("Stop all nodes failed")
return self.success()
def find_leaks(self):
# Check for leaks
# (no longer used but kept in case feature is restored)
leaked = []
self.stop = StopTest(self.CM)
for node in self.Env["nodes"]:
rc = self.stop(node)
if not rc:
self.failure("Couldn't shut down %s" % node)
rc = self.rsh(node, "grep -e indirectly.*lost:.*[1-9] -e definitely.*lost:.*[1-9] -e (ERROR|error).*SUMMARY:.*[1-9].*errors %s" % self.logger.logPat, 0)
if rc != 1:
leaked.append(node)
self.failure("Valgrind errors detected on %s" % node)
(rc, output) = self.rsh(node, "grep -e lost: -e SUMMARY: %s" % self.logger.logPat, None)
for line in output:
self.logger.log(line)
(rc, output) = self.rsh(node, "cat %s" % self.logger.logPat, None)
for line in output:
self.debug(line)
self.rsh(node, "rm -f %s" % self.logger.logPat, None)
return leaked
def __call__(self, node):
#leaked = self.find_leaks()
#if len(leaked) > 0:
# return self.failure("Nodes %s leaked" % repr(leaked))
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"pacemaker-based.*: \*\*\*\*\*\*\*\*\*\*\*\*\*",
r"pacemaker-based.*: .* avoid confusing Valgrind",
r"HA_VALGRIND_ENABLED",
]
class StandbyLoopTest(ValgrindTest):
'''Check for memory leaks by putting a node in and out of standby for an hour'''
# @TODO This is not a useful test for memory leaks
def __init__(self, cm):
ValgrindTest.__init__(self,cm)
self.name = "StandbyLoop"
def __call__(self, node):
lpc = 0
delay = 2
failed = 0
done = time.time() + self.Env["loop-minutes"] * 60
while time.time() <= done and not failed:
lpc = lpc + 1
time.sleep(delay)
if not self.CM.SetStandbyMode(node, "on"):
self.failure("can't set node %s to standby mode" % node)
failed = lpc
time.sleep(delay)
if not self.CM.SetStandbyMode(node, "off"):
self.failure("can't set node %s to active mode" % node)
failed = lpc
leaked = self.find_leaks()
if failed:
return self.failure("Iteration %d failed" % failed)
elif len(leaked) > 0:
return self.failure("Nodes %s leaked" % repr(leaked))
return self.success()
#AllTestClasses.append(StandbyLoopTest)
class BandwidthTest(CTSTest):
# Tests should not be cluster-manager-specific
# If you need to find out cluster manager configuration to do this, then
# it should be added to the generic cluster manager API.
'''Test the bandwidth which the cluster uses'''
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "Bandwidth"
self.start = StartTest(cm)
self.__setitem__("min",0)
self.__setitem__("max",0)
self.__setitem__("totalbandwidth",0)
(handle, self.tempfile) = tempfile.mkstemp(".cts")
os.close(handle)
self.startall = SimulStartLite(cm)
def __call__(self, node):
'''Perform the Bandwidth test'''
self.incr("calls")
if self.CM.upcount() < 1:
return self.skipped()
Path = self.CM.InternalCommConfig()
if "ip" not in Path["mediatype"]:
return self.skipped()
port = Path["port"][0]
port = int(port)
ret = self.startall(None)
if not ret:
return self.failure("Test setup failed")
time.sleep(5) # We get extra messages right after startup.
fstmpfile = "/var/run/band_estimate"
dumpcmd = "tcpdump -p -n -c 102 -i any udp port %d > %s 2>&1" \
% (port, fstmpfile)
rc = self.rsh(node, dumpcmd)
if rc == 0:
farfile = "root@%s:%s" % (node, fstmpfile)
self.rsh.cp(farfile, self.tempfile)
Bandwidth = self.countbandwidth(self.tempfile)
if not Bandwidth:
self.logger.log("Could not compute bandwidth.")
return self.success()
intband = int(Bandwidth + 0.5)
self.logger.log("...bandwidth: %d bits/sec" % intband)
self.Stats["totalbandwidth"] = self.Stats["totalbandwidth"] + Bandwidth
if self.Stats["min"] == 0:
self.Stats["min"] = Bandwidth
if Bandwidth > self.Stats["max"]:
self.Stats["max"] = Bandwidth
if Bandwidth < self.Stats["min"]:
self.Stats["min"] = Bandwidth
self.rsh(node, "rm -f %s" % fstmpfile)
os.unlink(self.tempfile)
return self.success()
else:
return self.failure("no response from tcpdump command [%d]!" % rc)
def countbandwidth(self, file):
fp = open(file, "r")
fp.seek(0)
count = 0
sum = 0
while 1:
line = fp.readline()
if not line:
return None
if re.search("udp",line) or re.search("UDP,", line):
count = count + 1
linesplit = line.split(" ")
for j in range(len(linesplit)-1):
if linesplit[j] == "udp": break
if linesplit[j] == "length:": break
try:
sum = sum + int(linesplit[j+1])
except ValueError:
self.logger.log("Invalid tcpdump line: %s" % line)
return None
T1 = linesplit[0]
timesplit = T1.split(":")
time2split = timesplit[2].split(".")
time1 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001
break
while count < 100:
line = fp.readline()
if not line:
return None
if re.search("udp",line) or re.search("UDP,", line):
count = count+1
linessplit = line.split(" ")
for j in range(len(linessplit)-1):
if linessplit[j] == "udp": break
if linessplit[j] == "length:": break
try:
sum = int(linessplit[j+1]) + sum
except ValueError:
self.logger.log("Invalid tcpdump line: %s" % line)
return None
T2 = linessplit[0]
timesplit = T2.split(":")
time2split = timesplit[2].split(".")
time2 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001
time = time2-time1
if (time <= 0):
return 0
return int((sum*8)/time)
def is_applicable(self):
'''BandwidthTest never applicable'''
return 0
AllTestClasses.append(BandwidthTest)
###################################################################
class MaintenanceMode(CTSTest):
###################################################################
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "MaintenanceMode"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.max = 30
#self.is_unsafe = 1
self.benchmark = 1
self.action = "asyncmon"
self.interval = 0
self.rid = "maintenanceDummy"
def toggleMaintenanceMode(self, node, action):
pats = []
pats.append(self.templates["Pat:DC_IDLE"])
# fail the resource right after turning Maintenance mode on
# verify it is not recovered until maintenance mode is turned off
if action == "On":
- pats.append(r"schedulerd.*:\s+warning:.*Processing failed %s of %s on" % (self.action, self.rid))
+ pats.append(self.templates["Pat:RscOpFail"] % (self.action, self.rid))
else:
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.debug("Turning maintenance mode %s" % action)
self.rsh(node, self.templates["MaintenanceMode%s" % (action)])
if (action == "On"):
self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))
self.set_timer("recover%s" % (action))
watch.lookforall()
self.log_timer("recover%s" % (action))
if watch.unmatched:
self.debug("Failed to find patterns when turning maintenance mode %s" % action)
return repr(watch.unmatched)
return ""
def insertMaintenanceDummy(self, node):
pats = []
pats.append(("%s.*" % node) + (self.templates["Pat:RscOpOK"] % ("start", self.rid)))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.CM.AddDummyRsc(node, self.rid)
self.set_timer("addDummy")
watch.lookforall()
self.log_timer("addDummy")
if watch.unmatched:
self.debug("Failed to find patterns when adding maintenance dummy resource")
return repr(watch.unmatched)
return ""
def removeMaintenanceDummy(self, node):
pats = []
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.CM.RemoveDummyRsc(node, self.rid)
self.set_timer("removeDummy")
watch.lookforall()
self.log_timer("removeDummy")
if watch.unmatched:
self.debug("Failed to find patterns when removing maintenance dummy resource")
return repr(watch.unmatched)
return ""
def managedRscList(self, node):
rscList = []
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
tmp = AuditResource(self.CM, line)
if tmp.managed():
rscList.append(tmp.id)
return rscList
def verifyResources(self, node, rscList, managed):
managedList = list(rscList)
managed_str = "managed"
if not managed:
managed_str = "unmanaged"
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
tmp = AuditResource(self.CM, line)
if managed and not tmp.managed():
continue
elif not managed and tmp.managed():
continue
elif managedList.count(tmp.id):
managedList.remove(tmp.id)
if len(managedList) == 0:
self.debug("Found all %s resources on %s" % (managed_str, node))
return True
self.logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, managedList))
return False
def __call__(self, node):
'''Perform the 'MaintenanceMode' test. '''
self.incr("calls")
verify_managed = False
verify_unmanaged = False
failPat = ""
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
# get a list of all the managed resources. We use this list
# after enabling maintenance mode to verify all managed resources
# become un-managed. After maintenance mode is turned off, we use
# this list to verify all the resources become managed again.
managedResources = self.managedRscList(node)
if len(managedResources) == 0:
self.logger.log("No managed resources on %s" % node)
return self.skipped()
# insert a fake resource we can fail during maintenance mode
# so we can verify recovery does not take place until after maintenance
# mode is disabled.
failPat = failPat + self.insertMaintenanceDummy(node)
# toggle maintenance mode ON, then fail dummy resource.
failPat = failPat + self.toggleMaintenanceMode(node, "On")
# verify all the resources are now unmanaged
if self.verifyResources(node, managedResources, False):
verify_unmanaged = True
# Toggle maintenance mode OFF, verify dummy is recovered.
failPat = failPat + self.toggleMaintenanceMode(node, "Off")
# verify all the resources are now managed again
if self.verifyResources(node, managedResources, True):
verify_managed = True
# Remove our maintenance dummy resource.
failPat = failPat + self.removeMaintenanceDummy(node)
self.CM.cluster_stable()
if failPat != "":
return self.failure("Unmatched patterns: %s" % (failPat))
elif verify_unmanaged is False:
return self.failure("Failed to verify resources became unmanaged during maintenance mode")
elif verify_managed is False:
return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode")
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"Updating failcount for %s" % self.rid,
r"schedulerd.*: Recover %s\s*\(.*\)" % self.rid,
r"Unknown operation: fail",
self.templates["Pat:RscOpOK"] % (self.action, self.rid),
r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
]
AllTestClasses.append(MaintenanceMode)
class ResourceRecover(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "ResourceRecover"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.max = 30
self.rid = None
self.rid_alt = None
#self.is_unsafe = 1
self.benchmark = 1
# these are the values used for the new LRM API call
self.action = "asyncmon"
self.interval = 0
def __call__(self, node):
'''Perform the 'ResourceRecover' test. '''
self.incr("calls")
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
resourcelist = self.CM.active_resources(node)
# if there are no resourcelist, return directly
if len(resourcelist) == 0:
self.logger.log("No active resources on %s" % node)
return self.skipped()
self.rid = self.Env.RandomGen.choice(resourcelist)
self.rid_alt = self.rid
rsc = None
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
tmp = AuditResource(self.CM, line)
if tmp.id == self.rid:
rsc = tmp
# Handle anonymous clones that get renamed
self.rid = rsc.clone_id
break
if not rsc:
return self.failure("Could not find %s in the resource list" % self.rid)
self.debug("Shooting %s aka. %s" % (rsc.clone_id, rsc.id))
pats = []
- pats.append(r"schedulerd.*:\s+warning:.*Processing failed %s of (%s|%s) on" % (self.action,
- rsc.id, rsc.clone_id))
+ pats.append(self.templates["Pat:CloneOpFail"] % (self.action, rsc.id, rsc.clone_id))
if rsc.managed():
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
if rsc.unique():
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid))
else:
# Anonymous clones may get restarted with a different clone number
pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))
self.set_timer("recover")
watch.lookforall()
self.log_timer("recover")
self.CM.cluster_stable()
recovered = self.CM.ResourceLocation(self.rid)
if watch.unmatched:
return self.failure("Patterns not found: %s" % repr(watch.unmatched))
elif rsc.unique() and len(recovered) > 1:
return self.failure("%s is now active on more than one node: %s"%(self.rid, repr(recovered)))
elif len(recovered) > 0:
self.debug("%s is running on: %s" % (self.rid, repr(recovered)))
elif rsc.managed():
return self.failure("%s was not recovered and is inactive" % self.rid)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"Updating failcount for %s" % self.rid,
r"schedulerd.*: Recover (%s|%s)\s*\(.*\)" % (self.rid, self.rid_alt),
r"Unknown operation: fail",
self.templates["Pat:RscOpOK"] % (self.action, self.rid),
r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
]
AllTestClasses.append(ResourceRecover)
class ComponentFail(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "ComponentFail"
# TODO make this work correctly in docker.
self.is_docker_unsafe = 1
self.startall = SimulStartLite(cm)
self.complist = cm.Components()
self.patterns = []
self.okerrpatterns = []
self.is_unsafe = 1
def __call__(self, node):
'''Perform the 'ComponentFail' test. '''
self.incr("calls")
self.patterns = []
self.okerrpatterns = []
# start all nodes
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
if not self.CM.cluster_stable(self.Env["StableTime"]):
return self.failure("Setup failed - unstable")
node_is_dc = self.CM.is_node_dc(node, None)
# select a component to kill
chosen = self.Env.RandomGen.choice(self.complist)
while chosen.dc_only == 1 and node_is_dc == 0:
chosen = self.Env.RandomGen.choice(self.complist)
self.debug("...component %s (dc=%d,boot=%d)" % (chosen.name, node_is_dc,chosen.triggersreboot))
self.incr(chosen.name)
if chosen.name != "corosync":
self.patterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name))
self.patterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name))
self.patterns.extend(chosen.pats)
if node_is_dc:
self.patterns.extend(chosen.dc_pats)
# @TODO this should be a flag in the Component
if chosen.name in [ "corosync", "pacemaker-based", "pacemaker-fenced" ]:
# Ignore actions for fence devices if fencer will respawn
# (their registration will be lost, and probes will fail)
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rclass == "stonith":
self.okerrpatterns.append(self.templates["Pat:Fencing_recover"] % r.id)
self.okerrpatterns.append(self.templates["Pat:Fencing_active"] % r.id)
self.okerrpatterns.append(self.templates["Pat:Fencing_probe"] % r.id)
# supply a copy so self.patterns doesn't end up empty
tmpPats = []
tmpPats.extend(self.patterns)
self.patterns.extend(chosen.badnews_ignore)
# Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status
stonithPats = []
stonithPats.append(self.templates["Pat:Fencing_ok"] % node)
stonith = self.create_watch(stonithPats, 0)
stonith.setwatch()
# set the watch for stable
watch = self.create_watch(
tmpPats, self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"])
watch.setwatch()
# kill the component
chosen.kill(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
self.debug("Waiting for any fenced node to come back up")
self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
self.debug("Waiting for the cluster to re-stabilize with all nodes")
self.CM.cluster_stable(self.Env["StartTime"])
self.debug("Checking if %s was shot" % node)
shot = stonith.look(60)
if shot:
self.debug("Found: " + repr(shot))
self.okerrpatterns.append(self.templates["Pat:Fencing_start"] % node)
if self.Env["at-boot"] == 0:
self.CM.ShouldBeStatus[node] = "down"
# If fencing occurred, chances are many (if not all) the expected logs
# will not be sent - or will be lost when the node reboots
return self.success()
# check for logs indicating a graceful recovery
matched = watch.lookforall(allow_multiple_matches=1)
if watch.unmatched:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
self.debug("Waiting for the cluster to re-stabilize with all nodes")
is_stable = self.CM.cluster_stable(self.Env["StartTime"])
if not matched:
return self.failure("Didn't find all expected %s patterns" % chosen.name)
elif not is_stable:
return self.failure("Cluster did not become stable after killing %s" % chosen.name)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
# Note that okerrpatterns refers to the last time we ran this test
# The good news is that this works fine for us...
self.okerrpatterns.extend(self.patterns)
return self.okerrpatterns
AllTestClasses.append(ComponentFail)
class SplitBrainTest(CTSTest):
'''It is used to test split-brain. when the path between the two nodes break
check the two nodes both take over the resource'''
def __init__(self,cm):
CTSTest.__init__(self,cm)
self.name = "SplitBrain"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.is_experimental = 1
def isolate_partition(self, partition):
other_nodes = []
other_nodes.extend(self.Env["nodes"])
for node in partition:
try:
other_nodes.remove(node)
except ValueError:
self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]) + " from " +repr(partition))
if len(other_nodes) == 0:
return 1
self.debug("Creating partition: " + repr(partition))
self.debug("Everyone else: " + repr(other_nodes))
for node in partition:
if not self.CM.isolate_node(node, other_nodes):
self.logger.log("Could not isolate %s" % node)
return 0
return 1
def heal_partition(self, partition):
other_nodes = []
other_nodes.extend(self.Env["nodes"])
for node in partition:
try:
other_nodes.remove(node)
except ValueError:
self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]))
if len(other_nodes) == 0:
return 1
self.debug("Healing partition: " + repr(partition))
self.debug("Everyone else: " + repr(other_nodes))
for node in partition:
self.CM.unisolate_node(node, other_nodes)
def __call__(self, node):
'''Perform split-brain test'''
self.incr("calls")
self.passed = 1
partitions = {}
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
while 1:
# Retry until we get multiple partitions
partitions = {}
p_max = len(self.Env["nodes"])
for node in self.Env["nodes"]:
p = self.Env.RandomGen.randint(1, p_max)
if not p in partitions:
partitions[p] = []
partitions[p].append(node)
p_max = len(list(partitions.keys()))
if p_max > 1:
break
# else, try again
self.debug("Created %d partitions" % p_max)
for key in list(partitions.keys()):
self.debug("Partition["+str(key)+"]:\t"+repr(partitions[key]))
# Disabling STONITH to reduce test complexity for now
self.rsh(node, "crm_attribute -V -n stonith-enabled -v false")
for key in list(partitions.keys()):
self.isolate_partition(partitions[key])
count = 30
while count > 0:
if len(self.CM.find_partitions()) != p_max:
time.sleep(10)
else:
break
else:
self.failure("Expected partitions were not created")
# Target number of partitions formed - wait for stability
if not self.CM.cluster_stable():
self.failure("Partitioned cluster not stable")
# Now audit the cluster state
self.CM.partitions_expected = p_max
if not self.audit():
self.failure("Audits failed")
self.CM.partitions_expected = 1
# And heal them again
for key in list(partitions.keys()):
self.heal_partition(partitions[key])
# Wait for a single partition to form
count = 30
while count > 0:
if len(self.CM.find_partitions()) != 1:
time.sleep(10)
count -= 1
else:
break
else:
self.failure("Cluster did not reform")
# Wait for it to have the right number of members
count = 30
while count > 0:
members = []
partitions = self.CM.find_partitions()
if len(partitions) > 0:
members = partitions[0].split()
if len(members) != len(self.Env["nodes"]):
time.sleep(10)
count -= 1
else:
break
else:
self.failure("Cluster did not completely reform")
# Wait up to 20 minutes - the delay is more preferable than
# trying to continue with in a messed up state
if not self.CM.cluster_stable(1200):
self.failure("Reformed cluster not stable")
if self.Env["continue"] == 1:
answer = "Y"
else:
try:
answer = input_wrapper('Continue? [nY]')
except EOFError as e:
answer = "n"
if answer and answer == "n":
raise ValueError("Reformed cluster not stable")
# Turn fencing back on
if self.Env["DoFencing"]:
self.rsh(node, "crm_attribute -V -D -n stonith-enabled")
self.CM.cluster_stable()
if self.passed:
return self.success()
return self.failure("See previous errors")
def errorstoignore(self):
'''Return list of errors which are 'normal' and should be ignored'''
return [
r"Another DC detected:",
r"(ERROR|error).*: .*Application of an update diff failed",
r"pacemaker-controld.*:.*not in our membership list",
r"CRIT:.*node.*returning after partition",
]
def is_applicable(self):
if not self.is_applicable_common():
return 0
return len(self.Env["nodes"]) > 2
AllTestClasses.append(SplitBrainTest)
class Reattach(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Reattach"
self.startall = SimulStartLite(cm)
self.restart1 = RestartTest(cm)
self.stopall = SimulStopLite(cm)
self.is_unsafe = 0 # Handled by canrunnow()
def _is_managed(self, node):
is_managed = self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", 1)
is_managed = is_managed[:-1] # Strip off the newline
return is_managed == "true"
def _set_unmanaged(self, node):
self.debug("Disable resource management")
self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false")
def _set_managed(self, node):
self.debug("Re-enable resource management")
self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D")
def setup(self, node):
attempt = 0
if not self.startall(None):
return None
# Make sure we are really _really_ stable and that all
# resources, including those that depend on transient node
# attributes, are started
while not self.CM.cluster_stable(double_check=True):
if attempt < 5:
attempt += 1
self.debug("Not stable yet, re-testing")
else:
self.logger.log("Cluster is not stable")
return None
return 1
def teardown(self, node):
# Make sure 'node' is up
start = StartTest(self.CM)
start(node)
if not self._is_managed(node):
self.logger.log("Attempting to re-enable resource management on %s" % node)
self._set_managed(node)
self.CM.cluster_stable()
if not self._is_managed(node):
self.logger.log("Could not re-enable resource management")
return 0
return 1
def canrunnow(self, node):
'''Return TRUE if we can meaningfully run right now'''
if self.find_ocfs2_resources(node):
self.logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present")
return 0
return 1
def __call__(self, node):
self.incr("calls")
pats = []
# Conveniently, the scheduler will display this message when disabling
# management, even if fencing is not enabled, so we can rely on it.
managed = self.create_watch(["Delaying fencing operations"], 60)
managed.setwatch()
self._set_unmanaged(node)
if not managed.lookforall():
self.logger.log("Patterns not found: " + repr(managed.unmatched))
return self.failure("Resource management not disabled")
pats = []
pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("stop", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("promote", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("demote", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("migrate", ".*"))
watch = self.create_watch(pats, 60, "ShutdownActivity")
watch.setwatch()
self.debug("Shutting down the cluster")
ret = self.stopall(None)
if not ret:
self._set_managed(node)
return self.failure("Couldn't shut down the cluster")
self.debug("Bringing the cluster back up")
ret = self.startall(None)
time.sleep(5) # allow ping to update the CIB
if not ret:
self._set_managed(node)
return self.failure("Couldn't restart the cluster")
if self.local_badnews("ResourceActivity:", watch):
self._set_managed(node)
return self.failure("Resources stopped or started during cluster restart")
watch = self.create_watch(pats, 60, "StartupActivity")
watch.setwatch()
# Re-enable resource management (and verify it happened).
self._set_managed(node)
self.CM.cluster_stable()
if not self._is_managed(node):
return self.failure("Could not re-enable resource management")
# Ignore actions for STONITH resources
ignore = []
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rclass == "stonith":
self.debug("Ignoring start actions for %s" % r.id)
ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id))
if self.local_badnews("ResourceActivity:", watch, ignore):
return self.failure("Resources stopped or started after resource management was re-enabled")
return ret
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"resource( was|s were) active at shutdown",
]
def is_applicable(self):
return 1
AllTestClasses.append(Reattach)
class SpecialTest1(CTSTest):
'''Set up a custom test to cause quorum failure issues for Andrew'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SpecialTest1"
self.startall = SimulStartLite(cm)
self.restart1 = RestartTest(cm)
self.stopall = SimulStopLite(cm)
def __call__(self, node):
'''Perform the 'SpecialTest1' test for Andrew. '''
self.incr("calls")
# Shut down all the nodes...
ret = self.stopall(None)
if not ret:
return self.failure("Could not stop all nodes")
# Test config recovery when the other nodes come up
self.rsh(node, "rm -f "+CTSvars.CRM_CONFIG_DIR+"/cib*")
# Start the selected node
ret = self.restart1(node)
if not ret:
return self.failure("Could not start "+node)
# Start all remaining nodes
ret = self.startall(None)
if not ret:
return self.failure("Could not start the remaining nodes")
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
# Errors that occur as a result of the CIB being wiped
return [
r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed",
r"error.*: Resource start-up disabled since no STONITH resources have been defined",
r"error.*: Either configure some or disable STONITH with the stonith-enabled option",
r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity",
]
AllTestClasses.append(SpecialTest1)
class HAETest(CTSTest):
'''Set up a custom test to cause quorum failure issues for Andrew'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "HAETest"
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
self.is_loop = 1
def setup(self, node):
# Start all remaining nodes
ret = self.startall(None)
if not ret:
return self.failure("Couldn't start all nodes")
return self.success()
def teardown(self, node):
# Stop everything
ret = self.stopall(None)
if not ret:
return self.failure("Couldn't stop all nodes")
return self.success()
def wait_on_state(self, node, resource, expected_clones, attempts=240):
while attempts > 0:
active = 0
(rc, lines) = self.rsh(node, "crm_resource -r %s -W -Q" % resource, stdout=None)
# Hack until crm_resource does the right thing
if rc == 0 and lines:
active = len(lines)
if len(lines) == expected_clones:
return 1
elif rc == 1:
self.debug("Resource %s is still inactive" % resource)
elif rc == 234:
self.logger.log("Unknown resource %s" % resource)
return 0
elif rc == 246:
self.logger.log("Cluster is inactive")
return 0
elif rc != 0:
self.logger.log("Call to crm_resource failed, rc=%d" % rc)
return 0
else:
self.debug("Resource %s is active on %d times instead of %d" % (resource, active, expected_clones))
attempts -= 1
time.sleep(1)
return 0
def find_dlm(self, node):
self.r_dlm = None
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rtype == "controld" and r.parent != "NA":
self.debug("Found dlm: %s" % self.r_dlm)
self.r_dlm = r.parent
return 1
return 0
def find_hae_resources(self, node):
self.r_dlm = None
self.r_o2cb = None
self.r_ocfs2 = []
if self.find_dlm(node):
self.find_ocfs2_resources(node)
def is_applicable(self):
if not self.is_applicable_common():
return 0
if self.Env["Schema"] == "hae":
return 1
return None
class HAERoleTest(HAETest):
def __init__(self, cm):
'''Lars' mount/unmount test for the HA extension. '''
HAETest.__init__(self,cm)
self.name = "HAERoleTest"
def change_state(self, node, resource, target):
rc = self.rsh(node, "crm_resource -V -r %s -p target-role -v %s --meta" % (resource, target))
return rc
def __call__(self, node):
self.incr("calls")
lpc = 0
failed = 0
delay = 2
done = time.time() + self.Env["loop-minutes"]*60
self.find_hae_resources(node)
clone_max = len(self.Env["nodes"])
while time.time() <= done and not failed:
lpc = lpc + 1
self.change_state(node, self.r_dlm, "Stopped")
if not self.wait_on_state(node, self.r_dlm, 0):
self.failure("%s did not go down correctly" % self.r_dlm)
failed = lpc
self.change_state(node, self.r_dlm, "Started")
if not self.wait_on_state(node, self.r_dlm, clone_max):
self.failure("%s did not come up correctly" % self.r_dlm)
failed = lpc
if not self.wait_on_state(node, self.r_o2cb, clone_max):
self.failure("%s did not come up correctly" % self.r_o2cb)
failed = lpc
for fs in self.r_ocfs2:
if not self.wait_on_state(node, fs, clone_max):
self.failure("%s did not come up correctly" % fs)
failed = lpc
if failed:
return self.failure("iteration %d failed" % failed)
return self.success()
AllTestClasses.append(HAERoleTest)
class HAEStandbyTest(HAETest):
'''Set up a custom test to cause quorum failure issues for Andrew'''
def __init__(self, cm):
HAETest.__init__(self,cm)
self.name = "HAEStandbyTest"
def change_state(self, node, resource, target):
rc = self.rsh(node, "crm_standby -V -l reboot -v %s" % (target))
return rc
def __call__(self, node):
self.incr("calls")
lpc = 0
failed = 0
done = time.time() + self.Env["loop-minutes"]*60
self.find_hae_resources(node)
clone_max = len(self.Env["nodes"])
while time.time() <= done and not failed:
lpc = lpc + 1
self.change_state(node, self.r_dlm, "true")
if not self.wait_on_state(node, self.r_dlm, clone_max-1):
self.failure("%s did not go down correctly" % self.r_dlm)
failed = lpc
self.change_state(node, self.r_dlm, "false")
if not self.wait_on_state(node, self.r_dlm, clone_max):
self.failure("%s did not come up correctly" % self.r_dlm)
failed = lpc
if not self.wait_on_state(node, self.r_o2cb, clone_max):
self.failure("%s did not come up correctly" % self.r_o2cb)
failed = lpc
for fs in self.r_ocfs2:
if not self.wait_on_state(node, fs, clone_max):
self.failure("%s did not come up correctly" % fs)
failed = lpc
if failed:
return self.failure("iteration %d failed" % failed)
return self.success()
AllTestClasses.append(HAEStandbyTest)
class NearQuorumPointTest(CTSTest):
'''
This test brings larger clusters near the quorum point (50%).
In addition, it will test doing starts and stops at the same time.
Here is how I think it should work:
- loop over the nodes and decide randomly which will be up and which
will be down Use a 50% probability for each of up/down.
- figure out what to do to get into that state from the current state
- in parallel, bring up those going up and bring those going down.
'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "NearQuorumPoint"
def __call__(self, dummy):
'''Perform the 'NearQuorumPoint' test. '''
self.incr("calls")
startset = []
stopset = []
stonith = self.CM.prepare_fencing_watcher("NearQuorumPoint")
#decide what to do with each node
for node in self.Env["nodes"]:
action = self.Env.RandomGen.choice(["start","stop"])
#action = self.Env.RandomGen.choice(["start","stop","no change"])
if action == "start" :
startset.append(node)
elif action == "stop" :
stopset.append(node)
self.debug("start nodes:" + repr(startset))
self.debug("stop nodes:" + repr(stopset))
#add search patterns
watchpats = [ ]
for node in stopset:
if self.CM.ShouldBeStatus[node] == "up":
watchpats.append(self.templates["Pat:We_stopped"] % node)
for node in startset:
if self.CM.ShouldBeStatus[node] == "down":
#watchpats.append(self.templates["Pat:NonDC_started"] % node)
watchpats.append(self.templates["Pat:Local_started"] % node)
else:
for stopping in stopset:
if self.CM.ShouldBeStatus[stopping] == "up":
watchpats.append(self.templates["Pat:They_stopped"] % (node, self.CM.key_for_node(stopping)))
if len(watchpats) == 0:
return self.skipped()
if len(startset) != 0:
watchpats.append(self.templates["Pat:DC_IDLE"])
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
#begin actions
for node in stopset:
if self.CM.ShouldBeStatus[node] == "up":
self.CM.StopaCMnoBlock(node)
for node in startset:
if self.CM.ShouldBeStatus[node] == "down":
self.CM.StartaCMnoBlock(node)
#get the result
if watch.lookforall():
self.CM.cluster_stable()
self.CM.fencing_cleanup("NearQuorumPoint", stonith)
return self.success()
self.logger.log("Warn: Patterns not found: " + repr(watch.unmatched))
#get the "bad" nodes
upnodes = []
for node in stopset:
if self.CM.StataCM(node) == 1:
upnodes.append(node)
downnodes = []
for node in startset:
if self.CM.StataCM(node) == 0:
downnodes.append(node)
self.CM.fencing_cleanup("NearQuorumPoint", stonith)
if upnodes == [] and downnodes == []:
self.CM.cluster_stable()
# Make sure they're completely down with no residule
for node in stopset:
self.rsh(node, self.templates["StopCmd"])
return self.success()
if len(upnodes) > 0:
self.logger.log("Warn: Unstoppable nodes: " + repr(upnodes))
if len(downnodes) > 0:
self.logger.log("Warn: Unstartable nodes: " + repr(downnodes))
return self.failure()
def is_applicable(self):
return 1
AllTestClasses.append(NearQuorumPointTest)
class RollingUpgradeTest(CTSTest):
'''Perform a rolling upgrade of the cluster'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "RollingUpgrade"
self.start = StartTest(cm)
self.stop = StopTest(cm)
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
def setup(self, node):
# Start all remaining nodes
ret = self.stopall(None)
if not ret:
return self.failure("Couldn't stop all nodes")
for node in self.Env["nodes"]:
if not self.downgrade(node, None):
return self.failure("Couldn't downgrade %s" % node)
ret = self.startall(None)
if not ret:
return self.failure("Couldn't start all nodes")
return self.success()
def teardown(self, node):
# Stop everything
ret = self.stopall(None)
if not ret:
return self.failure("Couldn't stop all nodes")
for node in self.Env["nodes"]:
if not self.upgrade(node, None):
return self.failure("Couldn't upgrade %s" % node)
return self.success()
def install(self, node, version, start=1, flags="--force"):
target_dir = "/tmp/rpm-%s" % version
src_dir = "%s/%s" % (self.Env["rpm-dir"], version)
self.logger.log("Installing %s on %s with %s" % (version, node, flags))
if not self.stop(node):
return self.failure("stop failure: "+node)
rc = self.rsh(node, "mkdir -p %s" % target_dir)
rc = self.rsh(node, "rm -f %s/*.rpm" % target_dir)
(rc, lines) = self.rsh(node, "ls -1 %s/*.rpm" % src_dir, None)
for line in lines:
line = line[:-1]
rc = self.rsh.cp("%s" % (line), "%s:%s/" % (node, target_dir))
rc = self.rsh(node, "rpm -Uvh %s %s/*.rpm" % (flags, target_dir))
if start and not self.start(node):
return self.failure("start failure: "+node)
return self.success()
def upgrade(self, node, start=1):
return self.install(node, self.Env["current-version"], start)
def downgrade(self, node, start=1):
return self.install(node, self.Env["previous-version"], start, "--force --nodeps")
def __call__(self, node):
'''Perform the 'Rolling Upgrade' test. '''
self.incr("calls")
for node in self.Env["nodes"]:
if self.upgrade(node):
return self.failure("Couldn't upgrade %s" % node)
self.CM.cluster_stable()
return self.success()
def is_applicable(self):
if not self.is_applicable_common():
return None
if not "rpm-dir" in list(self.Env.keys()):
return None
if not "current-version" in list(self.Env.keys()):
return None
if not "previous-version" in list(self.Env.keys()):
return None
return 1
# Register RestartTest as a good test to run
AllTestClasses.append(RollingUpgradeTest)
class BSC_AddResource(CTSTest):
'''Add a resource to the cluster'''
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "AddResource"
self.resource_offset = 0
self.cib_cmd = """cibadmin -C -o %s -X '%s' """
def __call__(self, node):
self.incr("calls")
self.resource_offset = self.resource_offset + 1
r_id = "bsc-rsc-%s-%d" % (node, self.resource_offset)
start_pat = "pacemaker-controld.*%s_start_0.*confirmed.*ok"
patterns = []
patterns.append(start_pat % r_id)
watch = self.create_watch(patterns, self.Env["DeadTime"])
watch.setwatch()
ip = self.NextIP()
if not self.make_ip_resource(node, r_id, "ocf", "IPaddr", ip):
return self.failure("Make resource %s failed" % r_id)
failed = 0
watch_result = watch.lookforall()
if watch.unmatched:
for regex in watch.unmatched:
self.logger.log ("Warn: Pattern not found: %s" % (regex))
failed = 1
if failed:
return self.failure("Resource pattern(s) not found")
if not self.CM.cluster_stable(self.Env["DeadTime"]):
return self.failure("Unstable cluster")
return self.success()
def NextIP(self):
ip = self.Env["IPBase"]
if ":" in ip:
fields = ip.rpartition(":")
fields[2] = str(hex(int(fields[2], 16)+1))
print(str(hex(int(f[2], 16)+1)))
else:
fields = ip.rpartition('.')
fields[2] = str(int(fields[2])+1)
ip = fields[0] + fields[1] + fields[3];
self.Env["IPBase"] = ip
return ip.strip()
def make_ip_resource(self, node, id, rclass, type, ip):
self.logger.log("Creating %s::%s:%s (%s) on %s" % (rclass,type,id,ip,node))
rsc_xml="""
""" % (id, rclass, type, id, id, ip)
node_constraint = """
""" % (id, id, id, id, node)
rc = 0
(rc, lines) = self.rsh(node, self.cib_cmd % ("constraints", node_constraint), None)
if rc != 0:
self.logger.log("Constraint creation failed: %d" % rc)
return None
(rc, lines) = self.rsh(node, self.cib_cmd % ("resources", rsc_xml), None)
if rc != 0:
self.logger.log("Resource creation failed: %d" % rc)
return None
return 1
def is_applicable(self):
if self.Env["DoBSC"]:
return 1
return None
AllTestClasses.append(BSC_AddResource)
class SimulStopLite(CTSTest):
'''Stop any active nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStopLite"
def __call__(self, dummy):
'''Perform the 'SimulStopLite' setup work. '''
self.incr("calls")
self.debug("Setup: " + self.name)
# We ignore the "node" parameter...
watchpats = [ ]
for node in self.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == "up":
self.incr("WasStarted")
watchpats.append(self.templates["Pat:We_stopped"] % node)
if len(watchpats) == 0:
return self.success()
# Stop all the nodes - at about the same time...
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
self.set_timer()
for node in self.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == "up":
self.CM.StopaCMnoBlock(node)
if watch.lookforall():
# Make sure they're completely down with no residule
for node in self.Env["nodes"]:
self.rsh(node, self.templates["StopCmd"])
return self.success()
did_fail = 0
up_nodes = []
for node in self.Env["nodes"]:
if self.CM.StataCM(node) == 1:
did_fail = 1
up_nodes.append(node)
if did_fail:
return self.failure("Active nodes exist: " + repr(up_nodes))
self.logger.log("Warn: All nodes stopped but CTS didnt detect: "
+ repr(watch.unmatched))
return self.failure("Missing log message: "+repr(watch.unmatched))
def is_applicable(self):
'''SimulStopLite is a setup test and never applicable'''
return 0
class SimulStartLite(CTSTest):
'''Start any stopped nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStartLite"
def __call__(self, dummy):
'''Perform the 'SimulStartList' setup work. '''
self.incr("calls")
self.debug("Setup: " + self.name)
# We ignore the "node" parameter...
node_list = []
for node in self.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == "down":
self.incr("WasStopped")
node_list.append(node)
self.set_timer()
while len(node_list) > 0:
# Repeat until all nodes come up
watchpats = [ ]
uppat = self.templates["Pat:NonDC_started"]
if self.CM.upcount() == 0:
uppat = self.templates["Pat:Local_started"]
watchpats.append(self.templates["Pat:DC_IDLE"])
for node in node_list:
watchpats.append(uppat % node)
watchpats.append(self.templates["Pat:InfraUp"] % node)
watchpats.append(self.templates["Pat:PacemakerUp"] % node)
# Start all the nodes - at about the same time...
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
stonith = self.CM.prepare_fencing_watcher(self.name)
for node in node_list:
self.CM.StartaCMnoBlock(node)
watch.lookforall()
node_list = self.CM.fencing_cleanup(self.name, stonith)
if node_list == None:
return self.failure("Cluster did not stabilize")
# Remove node_list messages from watch.unmatched
for node in node_list:
self.logger.debug("Dealing with stonith operations for %s" % repr(node_list))
if watch.unmatched:
try:
watch.unmatched.remove(uppat % node)
except:
self.debug("Already matched: %s" % (uppat % node))
try:
watch.unmatched.remove(self.templates["Pat:InfraUp"] % node)
except:
self.debug("Already matched: %s" % (self.templates["Pat:InfraUp"] % node))
try:
watch.unmatched.remove(self.templates["Pat:PacemakerUp"] % node)
except:
self.debug("Already matched: %s" % (self.templates["Pat:PacemakerUp"] % node))
if watch.unmatched:
for regex in watch.unmatched:
self.logger.log ("Warn: Startup pattern not found: %s" %(regex))
if not self.CM.cluster_stable():
return self.failure("Cluster did not stabilize")
did_fail = 0
unstable = []
for node in self.Env["nodes"]:
if self.CM.StataCM(node) == 0:
did_fail = 1
unstable.append(node)
if did_fail:
return self.failure("Unstarted nodes exist: " + repr(unstable))
unstable = []
for node in self.Env["nodes"]:
if not self.CM.node_stable(node):
did_fail = 1
unstable.append(node)
if did_fail:
return self.failure("Unstable cluster nodes exist: " + repr(unstable))
return self.success()
def is_applicable(self):
'''SimulStartLite is a setup test and never applicable'''
return 0
def TestList(cm, audits):
result = []
for testclass in AllTestClasses:
bound_test = testclass(cm)
if bound_test.is_applicable():
bound_test.Audits = audits
result.append(bound_test)
return result
class RemoteLXC(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "RemoteLXC"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.num_containers = 2
self.is_container = 1
self.is_docker_unsafe = 1
self.failed = 0
self.fail_string = ""
def start_lxc_simple(self, node):
# restore any artifacts laying around from a previous test.
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
# generate the containers, put them in the config, add some resources to them
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc1"))
pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc2"))
pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc-ms"))
pats.append(self.templates["Pat:RscOpOK"] % ("promote", "lxc-ms"))
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -g -a -m -s -c %d &>/dev/null" % self.num_containers)
self.set_timer("remoteSimpleInit")
watch.lookforall()
self.log_timer("remoteSimpleInit")
if watch.unmatched:
self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
self.failed = 1
def cleanup_lxc_simple(self, node):
pats = [ ]
# if the test failed, attempt to clean up the cib and libvirt environment
# as best as possible
if self.failed == 1:
# restore libvirt and cib
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
return
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container1"))
pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container2"))
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -p &>/dev/null")
self.set_timer("remoteSimpleCleanup")
watch.lookforall()
self.log_timer("remoteSimpleCleanup")
if watch.unmatched:
self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
self.failed = 1
# cleanup libvirt
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
def __call__(self, node):
'''Perform the 'RemoteLXC' test. '''
self.incr("calls")
ret = self.startall(None)
if not ret:
return self.failure("Setup failed, start all nodes failed.")
rc = self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v &>/dev/null")
if rc == 1:
self.log("Environment test for lxc support failed.")
return self.skipped()
self.start_lxc_simple(node)
self.cleanup_lxc_simple(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed == 1:
return self.failure(self.fail_string)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"Updating failcount for ping",
r"schedulerd.*: Recover (ping|lxc-ms|container)\s*\(.*\)",
# The orphaned lxc-ms resource causes an expected transition error
# that is a result of the scheduler not having knowledge that the
# promotable resource used to be a clone. As a result, it looks like that
# resource is running in multiple locations when it shouldn't... But in
# this instance we know why this error is occurring and that it is expected.
r"Calculated [Tt]ransition .*pe-error",
r"Resource lxc-ms .* is active on 2 nodes attempting recovery",
r"Unknown operation: fail",
r"VirtualDomain.*ERROR: Unable to determine emulator",
]
AllTestClasses.append(RemoteLXC)
class RemoteDriver(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = self.__class__.__name__
self.is_docker_unsafe = 1
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.stop = StopTest(cm)
self.remote_rsc = "remote-rsc"
self.cib_cmd = """cibadmin -C -o %s -X '%s' """
self.reset()
def reset(self):
self.pcmk_started = 0
self.failed = False
self.fail_string = ""
self.remote_node_added = 0
self.remote_rsc_added = 0
self.remote_use_reconnect_interval = self.Env.RandomGen.choice([True,False])
def fail(self, msg):
""" Mark test as failed. """
self.failed = True
# Always log the failure.
self.logger.log(msg)
# Use first failure as test status, as it's likely to be most useful.
if not self.fail_string:
self.fail_string = msg
def get_othernode(self, node):
for othernode in self.Env["nodes"]:
if othernode == node:
# we don't want to try and use the cib that we just shutdown.
# find a cluster node that is not our soon to be remote-node.
continue
else:
return othernode
def del_rsc(self, node, rsc):
othernode = self.get_othernode(node)
rc = self.rsh(othernode, "crm_resource -D -r %s -t primitive" % (rsc))
if rc != 0:
self.fail("Removal of resource '%s' failed" % rsc)
def add_rsc(self, node, rsc_xml):
othernode = self.get_othernode(node)
rc = self.rsh(othernode, self.cib_cmd % ("resources", rsc_xml))
if rc != 0:
self.fail("resource creation failed")
def add_primitive_rsc(self, node):
rsc_xml = """
""" % { "node": self.remote_rsc }
self.add_rsc(node, rsc_xml)
if not self.failed:
self.remote_rsc_added = 1
def add_connection_rsc(self, node):
rsc_xml = """
""" % { "node": self.remote_node, "server": node }
if self.remote_use_reconnect_interval:
# Set reconnect interval on resource
rsc_xml = rsc_xml + """
""" % (self.remote_node)
rsc_xml = rsc_xml + """
""" % { "node": self.remote_node }
self.add_rsc(node, rsc_xml)
if not self.failed:
self.remote_node_added = 1
def disable_services(self, node):
self.corosync_enabled = self.Env.service_is_enabled(node, "corosync")
if self.corosync_enabled:
self.Env.disable_service(node, "corosync")
self.pacemaker_enabled = self.Env.service_is_enabled(node, "pacemaker")
if self.pacemaker_enabled:
self.Env.disable_service(node, "pacemaker")
def restore_services(self, node):
if self.corosync_enabled:
self.Env.enable_service(node, "corosync")
if self.pacemaker_enabled:
self.Env.enable_service(node, "pacemaker")
def stop_pcmk_remote(self, node):
# disable pcmk remote
for i in range(10):
rc = self.rsh(node, "service pacemaker_remote stop")
if rc != 0:
time.sleep(6)
else:
break
def start_pcmk_remote(self, node):
for i in range(10):
rc = self.rsh(node, "service pacemaker_remote start")
if rc != 0:
time.sleep(6)
else:
self.pcmk_started = 1
break
def freeze_pcmk_remote(self, node):
""" Simulate a Pacemaker Remote daemon failure. """
# We freeze the process.
self.rsh(node, "killall -STOP pacemaker-remoted")
def resume_pcmk_remote(self, node):
# We resume the process.
self.rsh(node, "killall -CONT pacemaker-remoted")
def start_metal(self, node):
# Cluster nodes are reused as remote nodes in remote tests. If cluster
# services were enabled at boot, in case the remote node got fenced, the
# cluster node would join instead of the expected remote one. Meanwhile
# pacemaker_remote would not be able to start. Depending on the chances,
# the situations might not be able to be orchestrated gracefully any more.
#
# Temporarily disable any enabled cluster serivces.
self.disable_services(node)
pcmk_started = 0
# make sure the resource doesn't already exist for some reason
self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_rsc))
self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_node))
if not self.stop(node):
self.fail("Failed to shutdown cluster node %s" % node)
return
self.start_pcmk_remote(node)
if self.pcmk_started == 0:
self.fail("Failed to start pacemaker_remote on node %s" % node)
return
# Convert node to baremetal now that it has shutdown the cluster stack
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node))
pats.append(self.templates["Pat:DC_IDLE"])
self.add_connection_rsc(node)
self.set_timer("remoteMetalInit")
watch.lookforall()
self.log_timer("remoteMetalInit")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
def migrate_connection(self, node):
if self.failed:
return
pats = [ ]
pats.append(self.templates["Pat:RscOpOK"] % ("migrate_to", self.remote_node))
pats.append(self.templates["Pat:RscOpOK"] % ("migrate_from", self.remote_node))
pats.append(self.templates["Pat:DC_IDLE"])
watch = self.create_watch(pats, 120)
watch.setwatch()
(rc, lines) = self.rsh(node, "crm_resource -M -r %s" % (self.remote_node), None)
if rc != 0:
self.fail("failed to move remote node connection resource")
return
self.set_timer("remoteMetalMigrate")
watch.lookforall()
self.log_timer("remoteMetalMigrate")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
return
def fail_rsc(self, node):
if self.failed:
return
watchpats = [ ]
watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("stop", self.remote_rsc, self.remote_node))
watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
watchpats.append(self.templates["Pat:DC_IDLE"])
watch = self.create_watch(watchpats, 120)
watch.setwatch()
self.debug("causing dummy rsc to fail.")
rc = self.rsh(node, "rm -f /var/run/resource-agents/Dummy*")
self.set_timer("remoteRscFail")
watch.lookforall()
self.log_timer("remoteRscFail")
if watch.unmatched:
self.fail("Unmatched patterns during rsc fail: %s" % watch.unmatched)
def fail_connection(self, node):
if self.failed:
return
watchpats = [ ]
watchpats.append(self.templates["Pat:FenceOpOK"] % self.remote_node)
watchpats.append(self.templates["Pat:NodeFenced"] % self.remote_node)
watch = self.create_watch(watchpats, 120)
watch.setwatch()
# freeze the pcmk remote daemon. this will result in fencing
self.debug("Force stopped active remote node")
self.freeze_pcmk_remote(node)
self.debug("Waiting for remote node to be fenced.")
self.set_timer("remoteMetalFence")
watch.lookforall()
self.log_timer("remoteMetalFence")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
return
self.debug("Waiting for the remote node to come back up")
self.CM.ns.WaitForNodeToComeUp(node, 120);
pats = [ ]
watch = self.create_watch(pats, 240)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node))
if self.remote_rsc_added == 1:
pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
# start the remote node again watch it integrate back into cluster.
self.start_pcmk_remote(node)
if self.pcmk_started == 0:
self.fail("Failed to start pacemaker_remote on node %s" % node)
return
self.debug("Waiting for remote node to rejoin cluster after being fenced.")
self.set_timer("remoteMetalRestart")
watch.lookforall()
self.log_timer("remoteMetalRestart")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
return
def add_dummy_rsc(self, node):
if self.failed:
return
# verify we can put a resource on the remote node
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
pats.append(self.templates["Pat:DC_IDLE"])
# Add a resource that must live on remote-node
self.add_primitive_rsc(node)
# force that rsc to prefer the remote node.
(rc, line) = self.CM.rsh(node, "crm_resource -M -r %s -N %s -f" % (self.remote_rsc, self.remote_node), None)
if rc != 0:
self.fail("Failed to place remote resource on remote node.")
return
self.set_timer("remoteMetalRsc")
watch.lookforall()
self.log_timer("remoteMetalRsc")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
def test_attributes(self, node):
if self.failed:
return
# This verifies permanent attributes can be set on a remote-node. It also
# verifies the remote-node can edit its own cib node section remotely.
(rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -v testval -N %s" % (self.remote_node), None)
if rc != 0:
self.fail("Failed to set remote-node attribute. rc:%s output:%s" % (rc, line))
return
(rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -q -N %s" % (self.remote_node), None)
if rc != 0:
self.fail("Failed to get remote-node attribute")
return
(rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -D -N %s" % (self.remote_node), None)
if rc != 0:
self.fail("Failed to delete remote-node attribute")
return
def cleanup_metal(self, node):
self.restore_services(node)
if self.pcmk_started == 0:
return
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
if self.remote_rsc_added == 1:
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_rsc))
if self.remote_node_added == 1:
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_node))
self.set_timer("remoteMetalCleanup")
self.resume_pcmk_remote(node)
if self.remote_rsc_added == 1:
# Remove dummy resource added for remote node tests
self.debug("Cleaning up dummy rsc put on remote node")
self.rsh(self.get_othernode(node), "crm_resource -U -r %s" % self.remote_rsc)
self.del_rsc(node, self.remote_rsc)
if self.remote_node_added == 1:
# Remove remote node's connection resource
self.debug("Cleaning up remote node connection resource")
self.rsh(self.get_othernode(node), "crm_resource -U -r %s" % (self.remote_node))
self.del_rsc(node, self.remote_node)
watch.lookforall()
self.log_timer("remoteMetalCleanup")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
self.stop_pcmk_remote(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.remote_node_added == 1:
# Remove remote node itself
self.debug("Cleaning up node entry for remote node")
self.rsh(self.get_othernode(node), "crm_node --force --remove %s" % self.remote_node)
def setup_env(self, node):
self.remote_node = "remote-%s" % (node)
# we are assuming if all nodes have a key, that it is
# the right key... If any node doesn't have a remote
# key, we regenerate it everywhere.
if self.rsh.exists_on_all("/etc/pacemaker/authkey", self.Env["nodes"]):
return
# create key locally
(handle, keyfile) = tempfile.mkstemp(".cts")
os.close(handle)
devnull = open(os.devnull, 'wb')
subprocess.check_call(["dd", "if=/dev/urandom", "of=%s" % keyfile, "bs=4096", "count=1"],
stdout=devnull, stderr=devnull)
devnull.close()
# sync key throughout the cluster
for node in self.Env["nodes"]:
self.rsh(node, "mkdir -p --mode=0750 /etc/pacemaker")
self.rsh.cp(keyfile, "root@%s:/etc/pacemaker/authkey" % node)
self.rsh(node, "chgrp haclient /etc/pacemaker /etc/pacemaker/authkey")
self.rsh(node, "chmod 0640 /etc/pacemaker/authkey")
os.unlink(keyfile)
def is_applicable(self):
if not self.is_applicable_common():
return False
for node in self.Env["nodes"]:
rc = self.rsh(node, "which pacemaker-remoted >/dev/null 2>&1")
if rc != 0:
return False
return True
def start_new_test(self, node):
self.incr("calls")
self.reset()
ret = self.startall(None)
if not ret:
return self.failure("setup failed: could not start all nodes")
self.setup_env(node)
self.start_metal(node)
self.add_dummy_rsc(node)
return True
def __call__(self, node):
return self.failure("This base class is not meant to be called directly.")
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [ r"""is running on remote.*which isn't allowed""",
r"""Connection terminated""",
r"""Could not send remote""",
]
# RemoteDriver is just a base class for other tests, so it is not added to AllTestClasses
class RemoteBasic(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteBaremetal' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
self.test_attributes(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
AllTestClasses.append(RemoteBasic)
class RemoteStonithd(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteStonithd' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
self.fail_connection(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
def is_applicable(self):
if not RemoteDriver.is_applicable(self):
return False
if "DoFencing" in list(self.Env.keys()):
return self.Env["DoFencing"]
return True
def errorstoignore(self):
ignore_pats = [
r"Lost connection to Pacemaker Remote node",
r"Software caused connection abort",
r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor",
r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*",
r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)",
r"error: Result of monitor operation for .* on remote-.*: No executor connection",
]
ignore_pats.extend(RemoteDriver.errorstoignore(self))
return ignore_pats
AllTestClasses.append(RemoteStonithd)
class RemoteMigrate(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteMigrate' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
self.migrate_connection(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
AllTestClasses.append(RemoteMigrate)
class RemoteRscFailure(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteRscFailure' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
# This is an important step. We are migrating the connection
# before failing the resource. This verifies that the migration
# has properly maintained control over the remote-node.
self.migrate_connection(node)
self.fail_rsc(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
def errorstoignore(self):
ignore_pats = [
r"schedulerd.*: Recover remote-rsc\s*\(.*\)",
r"Dummy.*: No process state file found",
]
ignore_pats.extend(RemoteDriver.errorstoignore(self))
return ignore_pats
AllTestClasses.append(RemoteRscFailure)
# vim:ts=4:sw=4:et:
diff --git a/cts/cts-exec.in b/cts/cts-exec.in
index 8cc203fcb9..592d850b4e 100644
--- a/cts/cts-exec.in
+++ b/cts/cts-exec.in
@@ -1,1219 +1,1219 @@
#!@PYTHON@
""" Regression tests for Pacemaker's pacemaker-execd
"""
# Pacemaker targets compatibility with Python 2.7 and 3.2+
from __future__ import print_function, unicode_literals, absolute_import, division
__copyright__ = "Copyright 2012-2019 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
import io
import os
import stat
import sys
import subprocess
import shlex
import shutil
import time
# Where to find test binaries
# Prefer the source tree if available
BUILD_DIR = "@abs_top_builddir@"
TEST_DIR = sys.path[0]
SBIN_DIR = "@sbindir@"
# File permissions for executable scripts we create
EXECMODE = stat.S_IRUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH
# These values must be kept in sync with include/crm/crm.h
class CrmExit(object):
OK = 0
ERROR = 1
INVALID_PARAM = 2
UNIMPLEMENT_FEATURE = 3
INSUFFICIENT_PRIV = 4
NOT_INSTALLED = 5
NOT_CONFIGURED = 6
NOT_RUNNING = 7
USAGE = 64
DATAERR = 65
NOINPUT = 66
NOUSER = 67
NOHOST = 68
UNAVAILABLE = 69
SOFTWARE = 70
OSERR = 71
OSFILE = 72
CANTCREAT = 73
IOERR = 74
TEMPFAIL = 75
PROTOCOL = 76
NOPERM = 77
CONFIG = 78
FATAL = 100
PANIC = 101
DISCONNECT = 102
SOLO = 103
DIGEST = 104
NOSUCH = 105
QUORUM = 106
UNSAFE = 107
EXISTS = 108
MULTIPLE = 109
OLD = 110
TIMEOUT = 124
MAX = 255
def update_path():
""" Set the PATH environment variable appropriately for the tests """
new_path = os.environ['PATH']
if os.path.exists("%s/cts-exec.in" % TEST_DIR):
print("Running tests from the source tree: %s (%s)" % (BUILD_DIR, TEST_DIR))
# For pacemaker-execd, cts-exec-helper, and pacemaker-remoted
new_path = "%s/daemons/execd:%s" % (BUILD_DIR, new_path)
new_path = "%s/tools:%s" % (BUILD_DIR, new_path) # For crm_resource
# For pacemaker-fenced
new_path = "%s/daemons/fenced:%s" % (BUILD_DIR, new_path)
# For cts-support
new_path = "%s/cts:%s" % (BUILD_DIR, new_path)
else:
print("Running tests from the install tree: @CRM_DAEMON_DIR@ (not %s)" % TEST_DIR)
# For cts-exec-helper, cts-support, pacemaker-execd, pacemaker-fenced,
# and pacemaker-remoted
new_path = "@CRM_DAEMON_DIR@:%s" % (new_path)
print('Using PATH="{}"'.format(new_path))
os.environ['PATH'] = new_path
def pipe_output(pipes, stdout=True, stderr=False):
""" Wrapper to get text output from pipes regardless of Python version """
output = ""
pipe_outputs = pipes.communicate()
if sys.version_info < (3,):
if stdout:
output = output + pipe_outputs[0]
if stderr:
output = output + pipe_outputs[1]
else:
if stdout:
output = output + pipe_outputs[0].decode(sys.stdout.encoding)
if stderr:
output = output + pipe_outputs[1].decode(sys.stderr.encoding)
return output
def output_from_command(command):
""" Run a command, and return its standard output. """
test = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE)
test.wait()
return pipe_output(test).split("\n")
class TestError(Exception):
""" Base class for exceptions in this module """
pass
class ExitCodeError(TestError):
""" Exception raised when command exit status is unexpected """
def __init__(self, exit_code):
self.exit_code = exit_code
def __str__(self):
return repr(self.exit_code)
class OutputNotFoundError(TestError):
""" Exception raised when command output does not contain wanted string """
def __init__(self, output):
self.output = output
def __str__(self):
return repr(self.output)
class OutputFoundError(TestError):
""" Exception raised when command output contains unwanted string """
def __init__(self, output):
self.output = output
def __str__(self):
return repr(self.output)
class Test(object):
""" Executor for a single pacemaker-execd regression test """
def __init__(self, name, description, verbose=0, tls=0):
self.name = name
self.description = description
self.cmds = []
if tls:
self.daemon_location = "pacemaker-remoted"
else:
self.daemon_location = "pacemaker-execd"
self.test_tool_location = "cts-exec-helper"
self.verbose = verbose
self.tls = tls
self.result_txt = ""
self.cmd_tool_output = ""
self.result_exitcode = CrmExit.OK
self.execd_process = None
self.stonith_process = None
self.executed = 0
def __new_cmd(self, cmd, args, exitcode, stdout_match="", no_wait=0, stdout_negative_match="", kill=None):
""" Add a command to be executed as part of this test """
if self.verbose and cmd == self.test_tool_location:
args = args + " -V "
if (cmd == self.test_tool_location) and self.tls:
args = args + " -S "
self.cmds.append(
{
"cmd" : cmd,
"kill" : kill,
"args" : args,
"expected_exitcode" : exitcode,
"stdout_match" : stdout_match,
"stdout_negative_match" : stdout_negative_match,
"no_wait" : no_wait,
"cmd_output" : "",
}
)
def start_environment(self):
""" Prepare the host for running a test """
### make sure we are in full control here ###
cmd = shlex.split("killall -q -9 pacemaker-fenced lt-pacemaker-fenced pacemaker-execd lt-pacemaker-execd cts-exec-helper lt-cts-exec-helper pacemaker-remoted")
test = subprocess.Popen(cmd, stdout=subprocess.PIPE)
test.wait()
additional_args = ""
if self.tls == 0:
self.stonith_process = subprocess.Popen(shlex.split("pacemaker-fenced -s"))
if self.verbose:
additional_args = additional_args + " -V"
self.execd_process = subprocess.Popen(shlex.split("%s %s -l /tmp/pacemaker-execd-regression.log"
% (self.daemon_location, additional_args)))
time.sleep(1)
def clean_environment(self):
""" Clean up the host after running a test """
if self.execd_process:
self.execd_process.terminate()
self.execd_process.wait()
if self.verbose:
print("Daemon output")
logfile = io.open('/tmp/pacemaker-execd-regression.log', 'rt', errors='replace')
for line in logfile:
print(line.strip().encode('utf-8', 'replace'))
os.remove('/tmp/pacemaker-execd-regression.log')
if self.stonith_process:
self.stonith_process.terminate()
self.stonith_process.wait()
self.execd_process = None
self.stonith_process = None
def add_sys_cmd(self, cmd, args):
""" Add a simple command to be executed as part of this test """
self.__new_cmd(cmd, args, CrmExit.OK, "")
def add_cmd_check_stdout(self, args, match, no_match=""):
""" Add a command with expected output to be executed as part of this test """
self.__new_cmd(self.test_tool_location, args, CrmExit.OK, match, 0, no_match)
def add_cmd(self, args):
""" Add a cts-exec-helper command to be executed as part of this test """
self.__new_cmd(self.test_tool_location, args, CrmExit.OK, "")
def add_cmd_and_kill(self, kill_proc, args):
""" Add a cts-exec-helper command and system command to be executed as part of this test """
self.__new_cmd(self.test_tool_location, args, CrmExit.OK, "", kill=kill_proc)
def add_expected_fail_cmd(self, args, exitcode=CrmExit.ERROR):
""" Add a cts-exec-helper command to be executed as part of this test and expected to fail """
self.__new_cmd(self.test_tool_location, args, exitcode, "")
def get_exitcode(self):
""" Return the exit status of the last test execution """
return self.result_exitcode
def print_result(self, filler):
""" Print the result of the last test execution """
print("%s%s" % (filler, self.result_txt))
def run_cmd(self, args):
""" Execute a command as part of this test """
cmd = shlex.split(args['args'])
cmd.insert(0, args['cmd'])
if self.verbose:
print("\n\nRunning: "+" ".join(cmd))
test = subprocess.Popen(cmd, stdout=subprocess.PIPE)
if args['kill']:
if self.verbose:
print("Also running: "+args['kill'])
### Typically, the kill argument is used to detect some sort of
### failure. Without yielding for a few seconds here, the process
### launched earlier that is listening for the failure may not have
### time to connect to pacemaker-execd.
time.sleep(2)
subprocess.Popen(shlex.split(args['kill']))
if args['no_wait'] == 0:
test.wait()
else:
return CrmExit.OK
output = pipe_output(test)
args['cmd_output'] = output
if test.returncode != args['expected_exitcode']:
raise ExitCodeError(test.returncode)
if args['stdout_match'] != "" and output.count(args['stdout_match']) == 0:
raise OutputNotFoundError(output)
if args['stdout_negative_match'] != "" and output.count(args['stdout_negative_match']) != 0:
raise OutputFoundError(output)
def set_error(self, step, cmd):
""" Record failure of this test """
msg = "FAILURE - '%s' failed at step %d. Command: %s %s"
self.result_txt = msg % (self.name, step, cmd['cmd'], cmd['args'])
self.result_exitcode = CrmExit.ERROR
def run(self):
""" Execute this test. """
res = 0
i = 1
if self.tls and self.name.count("stonith") != 0:
self.result_txt = "SKIPPED - '%s' - disabled when testing pacemaker_remote" % (self.name)
print(self.result_txt)
return res
self.start_environment()
if self.verbose:
print("\n--- START TEST - %s" % self.name)
self.result_txt = "SUCCESS - '%s'" % (self.name)
self.result_exitcode = CrmExit.OK
for cmd in self.cmds:
try:
self.run_cmd(cmd)
except ExitCodeError as e:
print(cmd['cmd_output'])
print("Step %d FAILED - command returned %s, expected %d" % (i, e, cmd['expected_exitcode']))
self.set_error(i, cmd);
break
except OutputNotFoundError as e:
print("Step %d FAILED - '%s' was not found in command output: %s" % (i, cmd['stdout_match'], e))
self.set_error(i, cmd);
break
except OutputFoundError as e:
print("Step %d FAILED - '%s' was found in command output: %s" % (i, cmd['stdout_negative_match'], e))
self.set_error(i, cmd);
break
if self.verbose:
print(cmd['cmd_output'].strip())
print("Step %d SUCCESS" % (i))
i = i + 1
self.clean_environment()
print(self.result_txt)
if self.verbose:
print("--- END TEST - %s\n" % self.name)
self.executed = 1
return res
class Tests(object):
""" Collection of all pacemaker-execd regression tests """
def __init__(self, verbose=0, tls=0):
self.tests = []
self.verbose = verbose
self.tls = tls
self.rsc_classes = output_from_command("crm_resource --list-standards")
self.rsc_classes = self.rsc_classes[:-1] # Strip trailing empty line
self.installed_files = []
self.action_timeout = " -t 9000 "
if self.tls:
self.rsc_classes.remove("stonith")
if "systemd" in self.rsc_classes:
try:
# This code doesn't need this import, but pacemaker-cts-dummyd
# does, so ensure the dependency is available rather than cause
# all systemd tests to fail.
import systemd.daemon
except ImportError:
print("Python systemd bindings not found.")
print("The tests for systemd class are not going to be run.")
self.rsc_classes.remove("systemd")
print("Testing resource classes", repr(self.rsc_classes))
self.common_cmds = {
"ocf_reg_line" : "-c register_rsc -r ocf_test_rsc "+self.action_timeout+" -C ocf -P pacemaker -T Dummy",
"ocf_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:ocf_test_rsc action:none rc:ok op_status:complete\"",
"ocf_unreg_line" : "-c unregister_rsc -r \"ocf_test_rsc\" "+self.action_timeout,
"ocf_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:ocf_test_rsc action:none rc:ok op_status:complete\"",
"ocf_start_line" : "-c exec -r \"ocf_test_rsc\" -a \"start\" "+self.action_timeout,
"ocf_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:start rc:ok op_status:complete\" ",
"ocf_stop_line" : "-c exec -r \"ocf_test_rsc\" -a \"stop\" "+self.action_timeout,
"ocf_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:stop rc:ok op_status:complete\" ",
"ocf_monitor_line" : '-c exec -r ocf_test_rsc -a monitor -i 2s ' + self.action_timeout,
"ocf_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout,
"ocf_cancel_line" : '-c cancel -r ocf_test_rsc -a monitor -i 2s ' + self.action_timeout,
"ocf_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:monitor rc:ok op_status:Cancelled\" ",
"systemd_reg_line" : "-c register_rsc -r systemd_test_rsc " +
self.action_timeout +
" -C systemd -T pacemaker-cts-dummyd@3",
"systemd_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:systemd_test_rsc action:none rc:ok op_status:complete\"",
"systemd_unreg_line" : "-c unregister_rsc -r \"systemd_test_rsc\" "+self.action_timeout,
"systemd_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:systemd_test_rsc action:none rc:ok op_status:complete\"",
"systemd_start_line" : "-c exec -r \"systemd_test_rsc\" -a \"start\" "+self.action_timeout,
"systemd_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:start rc:ok op_status:complete\" ",
"systemd_stop_line" : "-c exec -r \"systemd_test_rsc\" -a \"stop\" "+self.action_timeout,
"systemd_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:stop rc:ok op_status:complete\" ",
"systemd_monitor_line" : '-c exec -r systemd_test_rsc -a monitor -i 2s ' + self.action_timeout,
"systemd_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:monitor rc:ok op_status:complete\" -t 15000 ",
"systemd_cancel_line" : '-c cancel -r systemd_test_rsc -a monitor -i 2s ' + self.action_timeout,
"systemd_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:monitor rc:ok op_status:Cancelled\" ",
"upstart_reg_line" : "-c register_rsc -r upstart_test_rsc "+self.action_timeout+" -C upstart -T pacemaker-cts-dummyd",
"upstart_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:upstart_test_rsc action:none rc:ok op_status:complete\"",
"upstart_unreg_line" : "-c unregister_rsc -r \"upstart_test_rsc\" "+self.action_timeout,
"upstart_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:upstart_test_rsc action:none rc:ok op_status:complete\"",
"upstart_start_line" : "-c exec -r \"upstart_test_rsc\" -a \"start\" "+self.action_timeout,
"upstart_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:upstart_test_rsc action:start rc:ok op_status:complete\" ",
"upstart_stop_line" : "-c exec -r \"upstart_test_rsc\" -a \"stop\" "+self.action_timeout,
"upstart_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:upstart_test_rsc action:stop rc:ok op_status:complete\" ",
"upstart_monitor_line" : '-c exec -r upstart_test_rsc -a monitor -i 2s ' + self.action_timeout,
"upstart_monitor_event" : '-l "NEW_EVENT event_type:exec_complete rsc_id:upstart_test_rsc action:monitor rc:ok op_status:complete" -t 15000',
"upstart_cancel_line" : '-c cancel -r upstart_test_rsc -a monitor -i 2s ' + self.action_timeout,
"upstart_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:upstart_test_rsc action:monitor rc:ok op_status:Cancelled\" ",
"service_reg_line" : "-c register_rsc -r service_test_rsc "+self.action_timeout+" -C service -T LSBDummy",
"service_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:service_test_rsc action:none rc:ok op_status:complete\"",
"service_unreg_line" : "-c unregister_rsc -r \"service_test_rsc\" "+self.action_timeout,
"service_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:service_test_rsc action:none rc:ok op_status:complete\"",
"service_start_line" : "-c exec -r \"service_test_rsc\" -a \"start\" "+self.action_timeout,
"service_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:start rc:ok op_status:complete\" ",
"service_stop_line" : "-c exec -r \"service_test_rsc\" -a \"stop\" "+self.action_timeout,
"service_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:stop rc:ok op_status:complete\" ",
"service_monitor_line" : '-c exec -r service_test_rsc -a monitor -i 2s ' + self.action_timeout,
"service_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout,
"service_cancel_line" : '-c cancel -r service_test_rsc -a monitor -i 2s ' + self.action_timeout,
"service_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:monitor rc:ok op_status:Cancelled\" ",
"lsb_reg_line" : "-c register_rsc -r lsb_test_rsc "+self.action_timeout+" -C lsb -T LSBDummy",
"lsb_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:lsb_test_rsc action:none rc:ok op_status:complete\" ",
"lsb_unreg_line" : "-c unregister_rsc -r \"lsb_test_rsc\" "+self.action_timeout,
"lsb_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:lsb_test_rsc action:none rc:ok op_status:complete\"",
"lsb_start_line" : "-c exec -r \"lsb_test_rsc\" -a \"start\" "+self.action_timeout,
"lsb_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:start rc:ok op_status:complete\" ",
"lsb_stop_line" : "-c exec -r \"lsb_test_rsc\" -a \"stop\" "+self.action_timeout,
"lsb_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:stop rc:ok op_status:complete\" ",
"lsb_monitor_line" : '-c exec -r lsb_test_rsc -a status -i 2s ' + self.action_timeout,
"lsb_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:status rc:ok op_status:complete\" "+self.action_timeout,
"lsb_cancel_line" : '-c cancel -r lsb_test_rsc -a status -i 2s ' + self.action_timeout,
"lsb_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:status rc:ok op_status:Cancelled\" ",
"stonith_reg_line" : "-c register_rsc -r stonith_test_rsc " + self.action_timeout +
" -C stonith -P pacemaker -T fence_dummy",
"stonith_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:stonith_test_rsc action:none rc:ok op_status:complete\" ",
"stonith_unreg_line" : "-c unregister_rsc -r \"stonith_test_rsc\" "+self.action_timeout,
"stonith_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:stonith_test_rsc action:none rc:ok op_status:complete\"",
"stonith_start_line" : '-c exec -r stonith_test_rsc -a start ' + self.action_timeout,
"stonith_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:start rc:ok op_status:complete\" ",
"stonith_stop_line" : "-c exec -r \"stonith_test_rsc\" -a \"stop\" "+self.action_timeout,
"stonith_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:stop rc:ok op_status:complete\" ",
"stonith_monitor_line" : '-c exec -r stonith_test_rsc -a monitor -i 2s ' + self.action_timeout,
"stonith_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout,
"stonith_cancel_line" : '-c cancel -r stonith_test_rsc -a monitor -i 2s ' + self.action_timeout,
"stonith_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:ok op_status:Cancelled\" ",
}
def new_test(self, name, description):
""" Create a named test """
test = Test(name, description, self.verbose, self.tls)
self.tests.append(test)
return test
def setup_test_environment(self):
""" Prepare the host before executing any tests """
os.system("service pacemaker_remote stop")
self.cleanup_test_environment()
if self.tls and not os.path.isfile("/etc/pacemaker/authkey"):
print("Installing /etc/pacemaker/authkey ...")
os.system("mkdir -p /etc/pacemaker")
os.system("dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1")
self.installed_files.append("/etc/pacemaker/authkey")
# If we're in build directory, install agents if not already installed
if os.path.exists("%s/cts/cts-exec.in" % BUILD_DIR):
if not os.path.exists("@OCF_RA_DIR@/pacemaker"):
# @TODO remember which components were created and remove them
os.makedirs("@OCF_RA_DIR@/pacemaker", 0o755)
for agent in ["Dummy", "Stateful", "ping"]:
agent_source = "%s/extra/resources/%s" % (BUILD_DIR, agent)
agent_dest = "@OCF_RA_DIR@/pacemaker/%s" % (agent)
if not os.path.exists(agent_dest):
print("Installing %s ..." % (agent_dest))
shutil.copyfile(agent_source, agent_dest)
os.chmod(agent_dest, EXECMODE)
self.installed_files.append(agent_dest)
subprocess.call(["cts-support", "install"])
def cleanup_test_environment(self):
""" Clean up the host after executing desired tests """
for installed_file in self.installed_files:
print("Removing %s ..." % (installed_file))
os.remove(installed_file)
subprocess.call(["cts-support", "uninstall"])
def build_generic_tests(self):
""" Register tests that apply to all resource classes """
common_cmds = self.common_cmds
### register/unregister tests ###
for rsc in self.rsc_classes:
test = self.new_test("generic_registration_%s" % (rsc),
"Simple resource registration test for %s standard" % (rsc))
test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)])
test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)])
### start/stop tests ###
for rsc in self.rsc_classes:
test = self.new_test("generic_start_stop_%s" % (rsc), "Simple start and stop test for %s standard" % (rsc))
test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)])
test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)])
test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)])
test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)])
### monitor cancel test ###
for rsc in self.rsc_classes:
test = self.new_test("generic_monitor_cancel_%s" % (rsc),
"Simple monitor cancel test for %s standard" % (rsc))
test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)])
test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)])
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
### If this fails, that means the monitor may not be getting rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
### If this fails, that means the monitor may not be getting rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
test.add_cmd(common_cmds["%s_cancel_line" % (rsc)] + " " + common_cmds["%s_cancel_event" % (rsc)])
### If this happens the monitor did not actually cancel correctly. ###
test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT)
### If this happens the monitor did not actually cancel correctly. ###
test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT)
test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)])
test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)])
### monitor duplicate test ###
for rsc in self.rsc_classes:
test = self.new_test("generic_monitor_duplicate_%s" % (rsc),
"Test creation and canceling of duplicate monitors for %s standard" % (rsc))
test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)])
test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)])
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
### If this fails, that means the monitor may not be getting rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
### If this fails, that means the monitor may not be getting rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
# Add the duplicate monitors
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
# verify we still get update events
### If this fails, that means the monitor may not be getting rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
# cancel the monitor, if the duplicate merged with the original, we should no longer see monitor updates
test.add_cmd(common_cmds["%s_cancel_line" % (rsc)] + " " + common_cmds["%s_cancel_event" % (rsc)])
### If this happens the monitor did not actually cancel correctly. ###
test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT)
### If this happens the monitor did not actually cancel correctly. ###
test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT)
test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)])
test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)])
### stop implies cancel test ###
for rsc in self.rsc_classes:
test = self.new_test("generic_stop_implies_cancel_%s" % (rsc),
"Verify stopping a resource implies cancel of recurring ops for %s standard" % (rsc))
test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)])
test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)])
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
### If this fails, that means the monitor may not be getting rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
### If this fails, that means the monitor may not be getting rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)])
### If this happens the monitor did not actually cancel correctly. ###
test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT)
### If this happens the monitor did not actually cancel correctly. ###
test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT)
test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)])
def build_multi_rsc_tests(self):
""" Register complex tests that involve managing multiple resouces of different types """
common_cmds = self.common_cmds
# do not use service and systemd at the same time, it is the same resource.
### register start monitor stop unregister resources of each type at the same time. ###
test = self.new_test("multi_rsc_start_stop_all",
"Start, monitor, and stop resources of multiple types and classes")
for rsc in self.rsc_classes:
test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)])
for rsc in self.rsc_classes:
test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)])
for rsc in self.rsc_classes:
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
for rsc in self.rsc_classes:
### If this fails, that means the monitor is not being rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
for rsc in self.rsc_classes:
test.add_cmd(common_cmds["%s_cancel_line" % (rsc)] + " " + common_cmds["%s_cancel_event" % (rsc)])
for rsc in self.rsc_classes:
test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)])
for rsc in self.rsc_classes:
test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)])
def build_negative_tests(self):
""" Register tests related to how pacemaker-execd handles failures """
### ocf start timeout test ###
test = self.new_test("ocf_start_timeout", "Force start timeout to occur, verify start failure.")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" "
+ self.action_timeout +
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
# -t must be less than self.action_timeout
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" -k \"op_sleep\" -v \"5\" -t 1000 -w")
- test.add_cmd('-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:unknown error op_status:Timed Out" '
+ test.add_cmd('-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:error op_status:Timed Out" '
+ self.action_timeout)
test.add_cmd("-c exec -r test_rsc -a stop " + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\" ")
test.add_cmd("-c unregister_rsc -r test_rsc " + self.action_timeout +
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### stonith start timeout test ###
test = self.new_test("stonith_start_timeout", "Force start timeout to occur, verify start failure.")
test.add_cmd('-c register_rsc -r test_rsc ' +
'-C stonith -P pacemaker -T fence_dummy ' +
self.action_timeout +
'-l "NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete"')
test.add_cmd('-c exec -r test_rsc -a start -k monitor_delay -v 30 ' +
'-t 1000 -w') # -t must be less than self.action_timeout
test.add_cmd('-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:OCF_TIMEOUT op_status:Timed Out" '
+ self.action_timeout)
test.add_cmd("-c exec -r test_rsc -a stop " + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\" ")
test.add_cmd("-c unregister_rsc -r test_rsc " + self.action_timeout +
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### stonith component fail ###
common_cmds = self.common_cmds
test = self.new_test("stonith_component_fail", "Kill stonith component after pacemaker-execd connects")
test.add_cmd(common_cmds["stonith_reg_line"] + " " + common_cmds["stonith_reg_event"])
test.add_cmd(common_cmds["stonith_start_line"] + " " + common_cmds["stonith_start_event"])
test.add_cmd('-c exec -r stonith_test_rsc -a monitor -i 600s '
'-l "NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:ok op_status:complete" '
+ self.action_timeout)
test.add_cmd_and_kill("killall -9 -q pacemaker-fenced lt-pacemaker-fenced",
- '-l "NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:unknown error op_status:error" -t 15000')
+ '-l "NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:error op_status:error" -t 15000')
test.add_cmd(common_cmds["stonith_unreg_line"] + " " + common_cmds["stonith_unreg_event"])
### monitor fail for ocf resources ###
test = self.new_test("monitor_fail_ocf", "Force ocf monitor to fail, verify failure is reported.")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" "
+ self.action_timeout +
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" " + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" " + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd('-c exec -r test_rsc -a monitor -i 1s '
+ self.action_timeout +
'-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"')
test.add_cmd('-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"'
+ self.action_timeout)
test.add_cmd('-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"'
+ self.action_timeout)
test.add_cmd_and_kill("rm -f @localstatedir@/run/Dummy-test_rsc.state",
'-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete" ' + self.action_timeout)
test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s ' + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ")
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" "
+ self.action_timeout, CrmExit.TIMEOUT)
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "
+ self.action_timeout, CrmExit.TIMEOUT)
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "
+ self.action_timeout +
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### verify notify changes only for monitor operation. ###
test = self.new_test("monitor_changes_only", "Verify when flag is set, only monitor changes are notified.")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+" -o "
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd('-c exec -r test_rsc -a monitor -i 1s '
+ self.action_timeout +
' -o -l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete" ')
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_cmd_and_kill('rm -f @localstatedir@/run/Dummy-test_rsc.state', '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete"' + self.action_timeout)
test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s'
+ self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ")
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_cmd('-c unregister_rsc -r "test_rsc" ' + self.action_timeout +
'-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete"')
### monitor fail for systemd resource ###
if "systemd" in self.rsc_classes:
test = self.new_test("monitor_fail_systemd", "Force systemd monitor to fail, verify failure is reported..")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C systemd -T pacemaker-cts-dummyd@3 " +
self.action_timeout +
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd('-c exec -r test_rsc -a monitor -i 1s '
+ self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" ")
test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd_and_kill("killall -9 -q pacemaker-cts-dummyd",
'-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete"' + self.action_timeout)
test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s' + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ")
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### monitor fail for upstart resource ###
if "upstart" in self.rsc_classes:
test = self.new_test("monitor_fail_upstart", "Force upstart monitor to fail, verify failure is reported..")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C upstart -T pacemaker-cts-dummyd "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd('-c exec -r test_rsc -a monitor -i 1s ' + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" ")
test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd_and_kill('killall -9 -q dd', '-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete"' + self.action_timeout)
test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s'
+ self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ")
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### Cancel non-existent operation on a resource ###
test = self.new_test("cancel_non_existent_op", "Attempt to cancel the wrong monitor operation, verify expected failure")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd('-c exec -r test_rsc -a monitor -i 1s '
+ self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" ")
test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout)
### interval is wrong, should fail
test.add_expected_fail_cmd('-c cancel -r test_rsc -a monitor -i 2s' + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ")
### action name is wrong, should fail
test.add_expected_fail_cmd('-c cancel -r test_rsc -a stop -i 1s' + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ")
test.add_cmd("-c unregister_rsc -r \"test_rsc\" " + self.action_timeout +
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### Attempt to invoke non-existent rsc id ###
test = self.new_test("invoke_non_existent_rsc", "Attempt to perform operations on a non-existent rsc id.")
test.add_expected_fail_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
- "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:unknown error op_status:complete\" ")
+ "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:error op_status:complete\" ")
test.add_expected_fail_cmd("-c exec -r test_rsc -a stop "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\" ")
test.add_expected_fail_cmd('-c exec -r test_rsc -a monitor -i 6s '
+ self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" ")
test.add_expected_fail_cmd("-c cancel -r test_rsc -a start "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Cancelled\" ")
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### Register and start a resource that doesn't exist, systemd ###
if "systemd" in self.rsc_classes:
test = self.new_test("start_uninstalled_systemd", "Register uninstalled systemd agent, try to start, verify expected failure")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C systemd -T this_is_fake1234 "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed\" ")
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
if "upstart" in self.rsc_classes:
test = self.new_test("start_uninstalled_upstart", "Register uninstalled upstart agent, try to start, verify expected failure")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C upstart -T this_is_fake1234 "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed\" ")
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### Register and start a resource that doesn't exist, ocf ###
test = self.new_test("start_uninstalled_ocf", "Register uninstalled ocf agent, try to start, verify expected failure.")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C ocf -P pacemaker -T this_is_fake1234 "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed\" ")
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### Register ocf with non-existent provider ###
test = self.new_test("start_ocf_bad_provider", "Register ocf agent with a non-existent provider, verify expected failure.")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C ocf -P pancakes -T Dummy "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed\" ")
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### Register ocf with empty provider field ###
test = self.new_test("start_ocf_no_provider", "Register ocf agent with a no provider, verify expected failure.")
test.add_expected_fail_cmd("-c register_rsc -r \"test_rsc\" -C ocf -T Dummy "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_expected_fail_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Error\" ")
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
def build_stress_tests(self):
""" Register stress tests """
timeout = "-t 20000"
iterations = 25
test = self.new_test("ocf_stress", "Verify OCF agent handling works under load")
for i in range(iterations):
test.add_cmd("-c register_rsc -r rsc_%s %s -C ocf -P heartbeat -T Dummy -l \"NEW_EVENT event_type:register rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i))
test.add_cmd("-c exec -r rsc_%s -a start %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:start rc:ok op_status:complete\"" % (i, timeout, i))
test.add_cmd('-c exec -r rsc_%s -a monitor %s -i 1s '
'-l "NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:monitor rc:ok op_status:complete"' % (i, timeout, i))
for i in range(iterations):
test.add_cmd("-c exec -r rsc_%s -a stop %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:stop rc:ok op_status:complete\"" % (i, timeout, i))
test.add_cmd("-c unregister_rsc -r rsc_%s %s -l \"NEW_EVENT event_type:unregister rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i))
if "systemd" in self.rsc_classes:
test = self.new_test("systemd_stress", "Verify systemd dbus connection works under load")
for i in range(iterations):
test.add_cmd("-c register_rsc -r rsc_%s %s -C systemd -T pacemaker-cts-dummyd@3 -l \"NEW_EVENT event_type:register rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i))
test.add_cmd("-c exec -r rsc_%s -a start %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:start rc:ok op_status:complete\"" % (i, timeout, i))
test.add_cmd('-c exec -r rsc_%s -a monitor %s -i 1s '
'-l "NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:monitor rc:ok op_status:complete"' % (i, timeout, i))
for i in range(iterations):
test.add_cmd("-c exec -r rsc_%s -a stop %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:stop rc:ok op_status:complete\"" % (i, timeout, i))
test.add_cmd("-c unregister_rsc -r rsc_%s %s -l \"NEW_EVENT event_type:unregister rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i))
iterations = 9
timeout = "-t 30000"
### Verify recurring op in-flight collision is handled in series properly
test = self.new_test("rsc_inflight_collision", "Verify recurring ops do not collide with other operations for the same rsc.")
test.add_cmd("-c register_rsc -r test_rsc -P pacemaker -C ocf -T Dummy "
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd("-c exec -r test_rsc -a start %s -k op_sleep -v 1 -l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\"" % (timeout))
for i in range(iterations):
test.add_cmd('-c exec -r test_rsc -a monitor %s -i 100%dms '
'-k op_sleep -v 2 '
'-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"' % (timeout, i))
test.add_cmd("-c exec -r test_rsc -a stop %s -l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\"" % (timeout))
test.add_cmd("-c unregister_rsc -r test_rsc %s -l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\"" % (timeout))
def build_custom_tests(self):
""" Register tests that target specific cases """
### verify resource temporary folder is created and used by OCF agents. ###
test = self.new_test("rsc_tmp_dir", "Verify creation and use of rsc temporary state directory")
test.add_sys_cmd("ls", "-al @CRM_RSCTMP_DIR@")
test.add_cmd("-c register_rsc -r test_rsc -P heartbeat -C ocf -T Dummy "
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd("-c exec -r test_rsc -a start -t 4000")
test.add_sys_cmd("ls", "-al @CRM_RSCTMP_DIR@")
test.add_sys_cmd("ls", "@CRM_RSCTMP_DIR@/Dummy-test_rsc.state")
test.add_cmd("-c exec -r test_rsc -a stop -t 4000")
test.add_cmd("-c unregister_rsc -r test_rsc "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### start delay then stop test ###
test = self.new_test("start_delay", "Verify start delay works as expected.")
test.add_cmd("-c register_rsc -r test_rsc -P pacemaker -C ocf -T Dummy "
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd("-c exec -r test_rsc -s 6000 -a start -w -t 6000")
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" -t 2000", CrmExit.TIMEOUT)
test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" -t 6000")
test.add_cmd("-c exec -r test_rsc -a stop " + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\" ")
test.add_cmd("-c unregister_rsc -r test_rsc " + self.action_timeout +
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### start delay, but cancel before it gets a chance to start. ###
test = self.new_test("start_delay_cancel", "Using start_delay, start a rsc, but cancel the start op before execution.")
test.add_cmd("-c register_rsc -r test_rsc -P pacemaker -C ocf -T Dummy "
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd("-c exec -r test_rsc -s 5000 -a start -w -t 4000")
test.add_cmd("-c cancel -r test_rsc -a start " + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Cancelled\" ")
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" -t 5000", CrmExit.TIMEOUT)
test.add_cmd("-c unregister_rsc -r test_rsc " + self.action_timeout +
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### Register a bunch of resources, verify we can get info on them ###
test = self.new_test("verify_get_rsc_info", "Register multiple resources, verify retrieval of rsc info.")
if "systemd" in self.rsc_classes:
test.add_cmd("-c register_rsc -r rsc1 -C systemd -T pacemaker-cts-dummyd@3 "+self.action_timeout)
test.add_cmd("-c get_rsc_info -r rsc1 ")
test.add_cmd("-c unregister_rsc -r rsc1 "+self.action_timeout)
test.add_expected_fail_cmd("-c get_rsc_info -r rsc1 ")
if "upstart" in self.rsc_classes:
test.add_cmd("-c register_rsc -r rsc1 -C upstart -T pacemaker-cts-dummyd "+self.action_timeout)
test.add_cmd("-c get_rsc_info -r rsc1 ")
test.add_cmd("-c unregister_rsc -r rsc1 "+self.action_timeout)
test.add_expected_fail_cmd("-c get_rsc_info -r rsc1 ")
test.add_cmd("-c register_rsc -r rsc2 -C ocf -T Dummy -P pacemaker "+self.action_timeout)
test.add_cmd("-c get_rsc_info -r rsc2 ")
test.add_cmd("-c unregister_rsc -r rsc2 "+self.action_timeout)
test.add_expected_fail_cmd("-c get_rsc_info -r rsc2 ")
### Register duplicate, verify only one entry exists and can still be removed.
test = self.new_test("duplicate_registration", "Register resource multiple times, verify only one entry exists and can be removed.")
test.add_cmd("-c register_rsc -r rsc2 -C ocf -T Dummy -P pacemaker "+self.action_timeout)
test.add_cmd_check_stdout("-c get_rsc_info -r rsc2 ", "id:rsc2 class:ocf provider:pacemaker type:Dummy")
test.add_cmd("-c register_rsc -r rsc2 -C ocf -T Dummy -P pacemaker "+self.action_timeout)
test.add_cmd_check_stdout("-c get_rsc_info -r rsc2 ", "id:rsc2 class:ocf provider:pacemaker type:Dummy")
test.add_cmd("-c register_rsc -r rsc2 -C ocf -T Stateful -P pacemaker "+self.action_timeout)
test.add_cmd_check_stdout("-c get_rsc_info -r rsc2 ", "id:rsc2 class:ocf provider:pacemaker type:Stateful")
test.add_cmd("-c unregister_rsc -r rsc2 "+self.action_timeout)
test.add_expected_fail_cmd("-c get_rsc_info -r rsc2 ")
### verify the option to only send notification to the original client. ###
test = self.new_test("notify_orig_client_only", "Verify option to only send notifications to the client originating the action.")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd('-c exec -r \"test_rsc\" -a \"monitor\" -i 1s '
+ self.action_timeout + ' -n '
'-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"')
# this will fail because the monitor notifications should only go to the original caller, which no longer exists.
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s -t 6000 ')
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### get metadata ###
test = self.new_test("get_ocf_metadata", "Retrieve metadata for a resource")
test.add_cmd_check_stdout("-c metadata -C \"ocf\" -P \"pacemaker\" -T \"Dummy\"",
"resource-agent name=\"Dummy\"")
test.add_cmd("-c metadata -C \"ocf\" -P \"pacemaker\" -T \"Stateful\"")
test.add_expected_fail_cmd("-c metadata -P \"pacemaker\" -T \"Stateful\"")
test.add_expected_fail_cmd("-c metadata -C \"ocf\" -P \"pacemaker\" -T \"fake_agent\"")
### get metadata ###
test = self.new_test("get_lsb_metadata", "Retrieve metadata for a resource")
test.add_cmd_check_stdout("-c metadata -C \"lsb\" -T \"LSBDummy\"",
"resource-agent name='LSBDummy'")
### get stonith metadata ###
test = self.new_test("get_stonith_metadata", "Retrieve stonith metadata for a resource")
test.add_cmd_check_stdout("-c metadata -C \"stonith\" -P \"pacemaker\" -T \"fence_dummy\"",
"resource-agent name=\"fence_dummy\"")
### get metadata ###
if "systemd" in self.rsc_classes:
test = self.new_test("get_systemd_metadata", "Retrieve metadata for a resource")
test.add_cmd_check_stdout("-c metadata -C \"systemd\" -T \"pacemaker-cts-dummyd@\"",
"resource-agent name=\"pacemaker-cts-dummyd@\"")
### get metadata ###
if "upstart" in self.rsc_classes:
test = self.new_test("get_upstart_metadata", "Retrieve metadata for a resource")
test.add_cmd_check_stdout("-c metadata -C \"upstart\" -T \"pacemaker-cts-dummyd\"",
"resource-agent name=\"pacemaker-cts-dummyd\"")
### get ocf providers ###
test = self.new_test("list_ocf_providers",
"Retrieve list of available resource providers, verifies pacemaker is a provider.")
test.add_cmd_check_stdout("-c list_ocf_providers ", "pacemaker")
test.add_cmd_check_stdout("-c list_ocf_providers -T ping", "pacemaker")
### Verify agents only exist in their lists ###
test = self.new_test("verify_agent_lists", "Verify the agent lists contain the right data.")
test.add_cmd_check_stdout("-c list_agents ", "Stateful") ### ocf ###
test.add_cmd_check_stdout("-c list_agents -C ocf", "Stateful")
test.add_cmd_check_stdout("-c list_agents -C lsb", "", "Stateful") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C service", "", "Stateful") ### should not exist
test.add_cmd_check_stdout("-c list_agents ", "LSBDummy") ### init.d ###
test.add_cmd_check_stdout("-c list_agents -C lsb", "LSBDummy")
test.add_cmd_check_stdout("-c list_agents -C service", "LSBDummy")
test.add_cmd_check_stdout("-c list_agents -C ocf", "", "pacemaker-cts-dummyd@") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C ocf", "", "pacemaker-cts-dummyd@") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C lsb", "", "fence_dummy") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C service", "", "fence_dummy") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C ocf", "", "fence_dummy") ### should not exist
if "systemd" in self.rsc_classes:
test.add_cmd_check_stdout("-c list_agents ", "pacemaker-cts-dummyd@") ### systemd ###
test.add_cmd_check_stdout("-c list_agents -C service", "LSBDummy")
test.add_cmd_check_stdout("-c list_agents -C systemd", "", "Stateful") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C systemd", "pacemaker-cts-dummyd@")
test.add_cmd_check_stdout("-c list_agents -C systemd", "", "fence_dummy") ### should not exist
if "upstart" in self.rsc_classes:
test.add_cmd_check_stdout("-c list_agents ", "pacemaker-cts-dummyd") ### upstart ###
test.add_cmd_check_stdout("-c list_agents -C service", "LSBDummy")
test.add_cmd_check_stdout("-c list_agents -C upstart", "", "Stateful") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C upstart", "pacemaker-cts-dummyd")
test.add_cmd_check_stdout("-c list_agents -C upstart", "", "fence_dummy") ### should not exist
if "stonith" in self.rsc_classes:
test.add_cmd_check_stdout("-c list_agents -C stonith", "fence_dummy") ### stonith ###
test.add_cmd_check_stdout("-c list_agents -C stonith", "", "pacemaker-cts-dummyd@") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C stonith", "", "Stateful") ### should not exist
test.add_cmd_check_stdout("-c list_agents ", "fence_dummy")
def print_list(self):
""" List all registered tests """
print("\n==== %d TESTS FOUND ====" % (len(self.tests)))
print("%35s - %s" % ("TEST NAME", "TEST DESCRIPTION"))
print("%35s - %s" % ("--------------------", "--------------------"))
for test in self.tests:
print("%35s - %s" % (test.name, test.description))
print("==== END OF LIST ====\n")
def run_single(self, name):
""" Run a single named test """
for test in self.tests:
if test.name == name:
test.run()
break
def run_tests_matching(self, pattern):
""" Run all tests whose name matches a pattern """
for test in self.tests:
if test.name.count(pattern) != 0:
test.run()
def run_tests(self):
""" Run all tests """
for test in self.tests:
test.run()
def exit(self):
""" Exit (with error status code if any test failed) """
for test in self.tests:
if test.executed == 0:
continue
if test.get_exitcode() != CrmExit.OK:
sys.exit(CrmExit.ERROR)
sys.exit(CrmExit.OK)
def print_results(self):
""" Print summary of results of executed tests """
failures = 0
success = 0
print("\n\n======= FINAL RESULTS ==========")
print("\n--- FAILURE RESULTS:")
for test in self.tests:
if test.executed == 0:
continue
if test.get_exitcode() != CrmExit.OK:
failures = failures + 1
test.print_result(" ")
else:
success = success + 1
if failures == 0:
print(" None")
print("\n--- TOTALS\n Pass:%d\n Fail:%d\n" % (success, failures))
class TestOptions(object):
""" Option handler """
def __init__(self):
self.options = {}
self.options['list-tests'] = 0
self.options['run-all'] = 1
self.options['run-only'] = ""
self.options['run-only-pattern'] = ""
self.options['verbose'] = 0
self.options['invalid-arg'] = ""
self.options['show-usage'] = 0
self.options['pacemaker-remote'] = 0
def build_options(self, argv):
""" Set options based on command-line arguments """
args = argv[1:]
skip = 0
for i in range(0, len(args)):
if skip:
skip = 0
continue
elif args[i] == "-h" or args[i] == "--help":
self.options['show-usage'] = 1
elif args[i] == "-l" or args[i] == "--list-tests":
self.options['list-tests'] = 1
elif args[i] == "-V" or args[i] == "--verbose":
self.options['verbose'] = 1
elif args[i] == "-R" or args[i] == "--pacemaker-remote":
self.options['pacemaker-remote'] = 1
elif args[i] == "-r" or args[i] == "--run-only":
self.options['run-only'] = args[i+1]
skip = 1
elif args[i] == "-p" or args[i] == "--run-only-pattern":
self.options['run-only-pattern'] = args[i+1]
skip = 1
def show_usage(self):
""" Show command usage """
print("usage: " + sys.argv[0] + " [options]")
print("If no options are provided, all tests will run")
print("Options:")
print("\t [--help | -h] Show usage")
print("\t [--list-tests | -l] Print out all registered tests.")
print("\t [--run-only | -r 'testname'] Run a specific test")
print("\t [--verbose | -V] Verbose output")
print("\t [--pacemaker-remote | -R Test pacemaker-remoted binary instead of pacemaker-execd")
print("\t [--run-only-pattern | -p 'string'] Run only tests containing the string value")
print("\n\tExample: Run only the test 'start_stop'")
print("\t\t " + sys.argv[0] + " --run-only start_stop")
print("\n\tExample: Run only the tests with the string 'systemd' present in them")
print("\t\t " + sys.argv[0] + " --run-only-pattern systemd")
def main(argv):
""" Run pacemaker-execd regression tests as specified by arguments """
update_path()
opts = TestOptions()
opts.build_options(argv)
tests = Tests(opts.options['verbose'], opts.options['pacemaker-remote'])
tests.build_generic_tests()
tests.build_multi_rsc_tests()
tests.build_negative_tests()
tests.build_custom_tests()
tests.build_stress_tests()
tests.setup_test_environment()
print("Starting ...")
if opts.options['list-tests']:
tests.print_list()
elif opts.options['show-usage']:
opts.show_usage()
elif opts.options['run-only-pattern'] != "":
tests.run_tests_matching(opts.options['run-only-pattern'])
tests.print_results()
elif opts.options['run-only'] != "":
tests.run_single(opts.options['run-only'])
tests.print_results()
else:
tests.run_tests()
tests.print_results()
tests.cleanup_test_environment()
tests.exit()
if __name__ == "__main__":
main(sys.argv)
diff --git a/cts/patterns.py b/cts/patterns.py
index b0b8784a02..96d6471c38 100644
--- a/cts/patterns.py
+++ b/cts/patterns.py
@@ -1,411 +1,413 @@
""" Pattern-holding classes for Pacemaker's Cluster Test Suite (CTS)
"""
# Pacemaker targets compatibility with Python 2.7 and 3.2+
from __future__ import print_function, unicode_literals, absolute_import, division
__copyright__ = "Copyright 2008-2019 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
import sys, os
from cts.CTSvars import *
patternvariants = {}
class BasePatterns(object):
def __init__(self, name):
self.name = name
patternvariants[name] = self
self.ignore = [
"avoid confusing Valgrind",
# Logging bug in some versions of libvirtd
r"libvirtd.*: internal error: Failed to parse PCI config address",
]
self.BadNews = []
self.components = {}
self.commands = {
"StatusCmd" : "crmadmin -t 60000 -S %s 2>/dev/null",
"CibQuery" : "cibadmin -Ql",
"CibAddXml" : "cibadmin --modify -c --xml-text %s",
"CibDelXpath" : "cibadmin --delete --xpath %s",
# 300,000 == 5 minutes
"RscRunning" : CTSvars.CRM_DAEMON_DIR + "/cts-exec-helper -R -r %s",
"CIBfile" : "%s:"+CTSvars.CRM_CONFIG_DIR+"/cib.xml",
"TmpDir" : "/tmp",
"BreakCommCmd" : "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1",
"FixCommCmd" : "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1",
# tc qdisc add dev lo root handle 1: cbq avpkt 1000 bandwidth 1000mbit
# tc class add dev lo parent 1: classid 1:1 cbq rate "$RATE"kbps allot 17000 prio 5 bounded isolated
# tc filter add dev lo parent 1: protocol ip prio 16 u32 match ip dst 127.0.0.1 match ip sport $PORT 0xFFFF flowid 1:1
# tc qdisc add dev lo parent 1: netem delay "$LATENCY"msec "$(($LATENCY/4))"msec 10% 2> /dev/null > /dev/null
"ReduceCommCmd" : "",
"RestoreCommCmd" : "tc qdisc del dev lo root",
"MaintenanceModeOn" : "cibadmin --modify -c --xml-text ''",
"MaintenanceModeOff" : "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"",
"StandbyCmd" : "crm_attribute -Vq -U %s -n standby -l forever -v %s 2>/dev/null",
"StandbyQueryCmd" : "crm_attribute -qG -U %s -n standby -l forever -d off 2>/dev/null",
}
self.search = {
"Pat:DC_IDLE" : "pacemaker-controld.*State transition.*-> S_IDLE",
# This won't work if we have multiple partitions
"Pat:Local_started" : "%s\W.*controller successfully started",
"Pat:NonDC_started" : r"%s\W.*State transition.*-> S_NOT_DC",
"Pat:DC_started" : r"%s\W.*State transition.*-> S_IDLE",
"Pat:We_stopped" : "%s\W.*OVERRIDE THIS PATTERN",
"Pat:They_stopped" : "%s\W.*LOST:.* %s ",
"Pat:They_dead" : "node %s.*: is dead",
"Pat:TransitionComplete" : "Transition status: Complete: complete",
"Pat:Fencing_start" : r"Requesting peer fencing .* targeting %s",
"Pat:Fencing_ok" : r"pacemaker-fenced.*:\s*Operation .* targeting %s on .* for .*@.*: OK",
"Pat:Fencing_recover" : r"pacemaker-schedulerd.*: Recover %s",
"Pat:Fencing_active" : r"pacemaker-schedulerd.*: Resource %s is active on .* nodes",
"Pat:Fencing_probe" : r"pacemaker-controld.* Result of probe operation for %s on .*: Error",
"Pat:RscOpOK" : r"pacemaker-controld.*:\s+Result of %s operation for %s.*: (0 \()?ok",
+ "Pat:RscOpFail" : r"pacemaker-schedulerd.*:.*Unexpected result .* recorded for %s of %s ",
+ "Pat:CloneOpFail" : r"pacemaker-schedulerd.*:.*Unexpected result .* recorded for %s of (%s|%s) ",
"Pat:RscRemoteOpOK" : r"pacemaker-controld.*:\s+Result of %s operation for %s on %s: (0 \()?ok",
"Pat:NodeFenced" : r"pacemaker-controld.*:\s* Peer %s was terminated \(.*\) by .* on behalf of .*: OK",
"Pat:FenceOpOK" : "Operation .* for host '%s' with device .* returned: 0",
}
def get_component(self, key):
if key in self.components:
return self.components[key]
print("Unknown component '%s' for %s" % (key, self.name))
return []
def get_patterns(self, key):
if key == "BadNews":
return self.BadNews
elif key == "BadNewsIgnore":
return self.ignore
elif key == "Commands":
return self.commands
elif key == "Search":
return self.search
elif key == "Components":
return self.components
def __getitem__(self, key):
if key == "Name":
return self.name
elif key in self.commands:
return self.commands[key]
elif key in self.search:
return self.search[key]
else:
print("Unknown template '%s' for %s" % (key, self.name))
return None
class crm_corosync(BasePatterns):
'''
Patterns for Corosync version 2 cluster manager class
'''
def __init__(self, name):
BasePatterns.__init__(self, name)
self.commands.update({
"StartCmd" : "service corosync start && service pacemaker start",
"StopCmd" : "service pacemaker stop; [ ! -e /usr/sbin/pacemaker-remoted ] || service pacemaker_remote stop; service corosync stop",
"EpochCmd" : "crm_node -e",
"QuorumCmd" : "crm_node -q",
"PartitionCmd" : "crm_node -p",
})
self.search.update({
# Close enough ... "Corosync Cluster Engine exiting normally" isn't
# printed reliably.
"Pat:We_stopped" : "%s\W.*Unloading all Corosync service engines",
"Pat:They_stopped" : "%s\W.*pacemaker-controld.*Node %s(\[|\s).*state is now lost",
"Pat:They_dead" : "pacemaker-controld.*Node %s(\[|\s).*state is now lost",
"Pat:ChildExit" : r"\[[0-9]+\] exited with status [0-9]+ \(",
# "with signal 9" == pcmk_child_exit(), "$" == check_active_before_startup_processes()
"Pat:ChildKilled" : r"%s\W.*pacemakerd.*%s\[[0-9]+\] terminated( with signal 9|$)",
"Pat:ChildRespawn" : "%s\W.*pacemakerd.*Respawning failed child process: %s",
"Pat:InfraUp" : "%s\W.*corosync.*Initializing transport",
"Pat:PacemakerUp" : "%s\W.*pacemakerd.*Starting Pacemaker",
})
self.ignore = self.ignore + [
r"crm_mon:",
r"crmadmin:",
r"update_trace_data",
r"async_notify:.*strange, client not found",
r"Parse error: Ignoring unknown option .*nodename",
r"error.*: Operation 'reboot' .* with device 'FencingFail' returned:",
r"getinfo response error: 1$",
r"sbd.* error: inquisitor_child: DEBUG MODE IS ACTIVE",
r"sbd.* pcmk:\s*error:.*Connection to cib_ro.* (failed|closed)",
]
self.BadNews = [
- r"error:",
+ r"[^(]error:",
r"crit:",
r"ERROR:",
r"CRIT:",
r"Shutting down...NOW",
r"Timer I_TERMINATE just popped",
r"input=I_ERROR",
r"input=I_FAIL",
r"input=I_INTEGRATED cause=C_TIMER_POPPED",
r"input=I_FINALIZED cause=C_TIMER_POPPED",
r"input=I_ERROR",
r"(pacemakerd|pacemaker-execd|pacemaker-controld):.*, exiting",
r"schedulerd.*Attempting recovery of resource",
r"is taking more than 2x its timeout",
r"Confirm not received from",
r"Welcome reply not received from",
r"Attempting to schedule .* after a stop",
r"Resource .* was active at shutdown",
r"duplicate entries for call_id",
r"Search terminated:",
r":global_timer_callback",
r"Faking parameter digest creation",
r"Parameters to .* action changed:",
r"Parameters to .* changed",
r"pacemakerd.*\[[0-9]+\] terminated( with signal| as IPC server|$)",
r"pacemaker-schedulerd.*Recover .*\(.* -\> .*\)",
r"rsyslogd.* imuxsock lost .* messages from pid .* due to rate-limiting",
r"Peer is not part of our cluster",
r"We appear to be in an election loop",
r"Unknown node -> we will not deliver message",
r"(Blackbox dump requested|Problem detected)",
r"pacemakerd.*Could not connect to Cluster Configuration Database API",
r"Receiving messages from a node we think is dead",
r"share the same cluster nodeid",
r"share the same name",
#r"crm_ipc_send:.*Request .* failed",
#r"crm_ipc_send:.*Sending to .* is disabled until pending reply is received",
# Not inherently bad, but worth tracking
#r"No need to invoke the TE",
#r"ping.*: DEBUG: Updated connected = 0",
#r"Digest mis-match:",
r"pacemaker-controld:.*Transition failed: terminated",
r"Local CIB .* differs from .*:",
r"warn.*:\s*Continuing but .* will NOT be used",
r"warn.*:\s*Cluster configuration file .* is corrupt",
#r"Executing .* fencing operation",
r"Election storm",
r"stalled the FSA with pending inputs",
]
self.components["common-ignore"] = [
r"Pending action:",
r"resource( was|s were) active at shutdown",
r"pending LRM operations at shutdown",
r"Lost connection to the CIB manager",
r"pacemaker-controld.*:\s*Action A_RECOVER .* not supported",
r"pacemaker-controld.*:\s*Performing A_EXIT_1 - forcefully exiting ",
r".*:\s*Executing .* fencing operation \(.*\) on ",
r".*:\s*Requesting fencing \([^)]+\) of node ",
r"(Blackbox dump requested|Problem detected)",
# "Resource .*stonith::.* is active on 2 nodes attempting recovery",
# "Transition .* ERRORs found during PE processing",
]
self.components["corosync-ignore"] = [
r"error:.*Connection to the CPG API failed: Library error",
r"\[[0-9]+\] exited with status [0-9]+ \(",
r"pacemaker-based.*error:.*Corosync connection lost",
r"pacemaker-fenced.*error:.*Corosync connection terminated",
r"pacemaker-controld.*State transition .* S_RECOVERY",
r"pacemaker-controld.*error:.*Input (I_ERROR|I_TERMINATE ) .*received in state",
r"pacemaker-controld.*error:.*Could not recover from internal error",
r"error:.*Connection to cib_(shm|rw).* (failed|closed)",
r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)",
r"crit: Fencing daemon connection failed",
# This is overbroad, but we don't have a way to say that only
# certain transition errors are acceptable (if the fencer respawns,
# fence devices may appear multiply active). We have to rely on
# other causes of a transition error logging their own error
# message, which is the usual practice.
r"pacemaker-schedulerd.* Calculated transition .*/pe-error",
]
self.components["corosync"] = [
# We expect each daemon to lose its cluster connection.
# However, if the CIB manager loses its connection first,
# it's possible for another daemon to lose that connection and
# exit before losing the cluster connection.
r"pacemakerd.*:\s*(crit|error):.*Lost connection to cluster layer",
r"pacemaker-attrd.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)",
r"pacemaker-based.*:\s*(crit|error):.*Lost connection to cluster layer",
r"pacemaker-controld.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)",
r"pacemaker-fenced.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)",
r"schedulerd.*Scheduling Node .* for STONITH",
r"pacemaker-controld.*:\s*Peer .* was terminated \(.*\) by .* on behalf of .*:\s*OK",
]
self.components["pacemaker-based"] = [
r"pacemakerd.* pacemaker-attrd\[[0-9]+\] exited with status 102",
r"pacemakerd.* pacemaker-controld\[[0-9]+\] exited with status 1",
r"pacemakerd.* Respawning failed child process: pacemaker-attrd",
r"pacemakerd.* Respawning failed child process: pacemaker-based",
r"pacemakerd.* Respawning failed child process: pacemaker-controld",
r"pacemakerd.* Respawning failed child process: pacemaker-fenced",
r"pacemaker-.* Connection to cib_.* (failed|closed)",
r"pacemaker-attrd.*:.*Lost connection to the CIB manager",
r"pacemaker-controld.*:.*Lost connection to the CIB manager",
r"pacemaker-controld.*I_ERROR.*crmd_cib_connection_destroy",
r"pacemaker-controld.* State transition .* S_RECOVERY",
r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover",
r"pacemaker-controld.*Could not recover from internal error",
]
self.components["pacemaker-based-ignore"] = [
r"pacemaker-execd.*Connection to (fencer|stonith-ng).* (closed|failed|lost)",
# This is overbroad, but we don't have a way to say that only
# certain transition errors are acceptable (if the fencer respawns,
# fence devices may appear multiply active). We have to rely on
# other causes of a transition error logging their own error
# message, which is the usual practice.
r"pacemaker-schedulerd.* Calculated transition .*/pe-error",
]
self.components["pacemaker-execd"] = [
r"pacemaker-controld.*Connection to (pacemaker-execd|lrmd|executor) (failed|closed)",
r"pacemaker-controld.*I_ERROR.*lrm_connection_destroy",
r"pacemaker-controld.*State transition .* S_RECOVERY",
r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover",
r"pacemaker-controld.*Could not recover from internal error",
r"pacemakerd.*pacemaker-controld\[[0-9]+\] exited with status 1",
r"pacemakerd.*Respawning failed child process: pacemaker-execd",
r"pacemakerd.*Respawning failed child process: pacemaker-controld",
]
self.components["pacemaker-execd-ignore"] = [
r"pacemaker-attrd.*Connection to lrmd (failed|closed)",
r"pacemaker-(attrd|controld).*Could not execute alert",
]
self.components["pacemaker-controld"] = [
# "WARN: determine_online_status: Node .* is unclean",
# "Scheduling Node .* for STONITH",
# "Executing .* fencing operation",
# Only if the node wasn't the DC: "State transition S_IDLE",
"State transition .* -> S_IDLE",
]
self.components["pacemaker-controld-ignore"] = []
self.components["pacemaker-attrd"] = []
self.components["pacemaker-attrd-ignore"] = []
self.components["pacemaker-schedulerd"] = [
"State transition .* S_RECOVERY",
r"Respawning failed child process: pacemaker-controld",
r"pacemaker-controld\[[0-9]+\] exited with status 1 \(",
"Connection to pengine failed",
"Connection to pengine.* closed",
r"Connection to the scheduler failed",
"pacemaker-controld.*I_ERROR.*save_cib_contents",
r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover",
"pacemaker-controld.*Could not recover from internal error",
]
self.components["pacemaker-schedulerd-ignore"] = []
self.components["pacemaker-fenced"] = [
r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)",
r"Fencing daemon connection failed",
r"pacemaker-controld.*Fencer successfully connected",
]
self.components["pacemaker-fenced-ignore"] = [
r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)",
r"crit:.*Fencing daemon connection failed",
r"error:.*Fencer connection failed \(will retry\)",
r"Connection to (fencer|stonith-ng) failed, finalizing .* pending operations",
r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error",
# This is overbroad, but we don't have a way to say that only
# certain transition errors are acceptable (if the fencer respawns,
# fence devices may appear multiply active). We have to rely on
# other causes of a transition error logging their own error
# message, which is the usual practice.
r"pacemaker-schedulerd.* Calculated transition .*/pe-error",
]
self.components["pacemaker-fenced-ignore"].extend(self.components["common-ignore"])
class crm_corosync_docker(crm_corosync):
'''
Patterns for Corosync version 2 cluster manager class
'''
def __init__(self, name):
crm_corosync.__init__(self, name)
self.commands.update({
"StartCmd" : "pcmk_start",
"StopCmd" : "pcmk_stop",
})
class PatternSelector(object):
def __init__(self, name=None):
self.name = name
self.base = BasePatterns("crm-base")
if not name:
crm_corosync("crm-corosync")
elif name == "crm-corosync":
crm_corosync(name)
elif name == "crm-corosync-docker":
crm_corosync_docker(name)
def get_variant(self, variant):
if variant in patternvariants:
return patternvariants[variant]
print("defaulting to crm-base for %s" % variant)
return self.base
def get_patterns(self, variant, kind):
return self.get_variant(variant).get_patterns(kind)
def get_template(self, variant, key):
v = self.get_variant(variant)
return v[key]
def get_component(self, variant, kind):
return self.get_variant(variant).get_component(kind)
def __getitem__(self, key):
return self.get_template(self.name, key)
# python cts/CTSpatt.py -k crm-corosync -t StartCmd
if __name__ == '__main__':
pdir=os.path.dirname(sys.path[0])
sys.path.insert(0, pdir) # So that things work from the source directory
kind=None
template=None
skipthis=None
args=sys.argv[1:]
for i in range(0, len(args)):
if skipthis:
skipthis=None
continue
elif args[i] == "-k" or args[i] == "--kind":
skipthis=1
kind = args[i+1]
elif args[i] == "-t" or args[i] == "--template":
skipthis=1
template = args[i+1]
else:
print("Illegal argument " + args[i])
print(PatternSelector(kind)[template])
diff --git a/doc/Clusters_from_Scratch/en-US/Author_Group.xml b/doc/Clusters_from_Scratch/en-US/Author_Group.xml
index 898da71261..ff907feee5 100644
--- a/doc/Clusters_from_Scratch/en-US/Author_Group.xml
+++ b/doc/Clusters_from_Scratch/en-US/Author_Group.xml
@@ -1,11 +1,10 @@
- AndrewBeekhof
- Red Hat
- Primary author
- andrew@beekhof.net
+
+ Written by the Pacemaker project contributors
+
diff --git a/doc/Clusters_from_Scratch/en-US/Book_Info.xml b/doc/Clusters_from_Scratch/en-US/Book_Info.xml
index 68be4424c1..9fcbdf0a10 100644
--- a/doc/Clusters_from_Scratch/en-US/Book_Info.xml
+++ b/doc/Clusters_from_Scratch/en-US/Book_Info.xml
@@ -1,69 +1,71 @@
%BOOK_ENTITIES;
]>
- Clusters from Scratch
- Step-by-Step Instructions for Building Your First High-Availability Cluster
- Pacemaker
- 2.0
-
- 10
- 2
-
-
- This document provides a step-by-step guide to building a simple high-availability cluster using Pacemaker.
-
-
- The example cluster will use:
-
-
-
- &DISTRO; &DISTRO_VERSION; as the host operating system
-
-
-
-
- Corosync to provide messaging and membership services,
-
-
-
-
- Pacemaker 1.1.18
- While this guide is part of the document set for
- Pacemaker 2.0, it demonstrates the version available in
- the standard &DISTRO; repositories.
-
-
-
-
- DRBD as a cost-effective alternative to shared storage,
-
-
-
-
- GFS2 as the cluster filesystem (in active/active mode)
-
-
-
-
-
- Given the graphical nature of the install process, a number of screenshots are included. However the guide is primarily composed of commands, the reasons for executing them and their expected outputs.
-
-
-
-
-
-
-
-
-
-
-
+ Clusters from Scratch
+ Step-by-Step Instructions for Building Your First High-Availability Cluster
+ Pacemaker
+ 2.0
+
+ 11
+ 0
+
+
+ This document provides a step-by-step guide to building a simple high-availability cluster using Pacemaker.
+
+
+ The example cluster will use:
+
+
+
+ &DISTRO; &DISTRO_VERSION; as the host operating system
+
+
+
+
+ Corosync to provide messaging and membership services,
+
+
+
+
+ Pacemaker 1.1.18
+ While this guide is part of the document set for
+ Pacemaker 2.0, it demonstrates the version available in
+ the standard &DISTRO; repositories.
+
+
+
+
+ DRBD as a cost-effective alternative to shared storage,
+
+
+
+
+ GFS2 as the cluster filesystem (in active/active mode)
+
+
+
+
+
+ Given the graphical nature of the install process, a number of
+ screenshots are included. However the guide is primarily composed of
+ commands, the reasons for executing them and their expected outputs.
+
+
+
+
+
+
+
+
+
+
+
diff --git a/doc/Clusters_from_Scratch/en-US/Revision_History.xml b/doc/Clusters_from_Scratch/en-US/Revision_History.xml
index 05ef1be65a..77a3b2cdde 100644
--- a/doc/Clusters_from_Scratch/en-US/Revision_History.xml
+++ b/doc/Clusters_from_Scratch/en-US/Revision_History.xml
@@ -1,93 +1,205 @@
%BOOK_ENTITIES;
]>
-
- Revision History
-
-
-
- 1-0
- Mon May 17 2010
- AndrewBeekhofandrew@beekhof.net
- Import from Pages.app
-
-
- 2-0
- Wed Sep 22 2010
- RaoulScarazzinirasca@miamammausalinux.org
- Italian translation
-
-
- 3-0
- Wed Feb 9 2011
- AndrewBeekhofandrew@beekhof.net
- Updated for Fedora 13
-
-
- 4-0
- Wed Oct 5 2011
- AndrewBeekhofandrew@beekhof.net
- Update the GFS2 section to use CMAN
-
-
- 5-0
- Fri Feb 10 2012
- AndrewBeekhofandrew@beekhof.net
- Generate docbook content from asciidoc sources
-
-
- 6-0
- Tues July 3 2012
- AndrewBeekhofandrew@beekhof.net
- Updated for Fedora 17
-
-
- 7-0
- Fri Sept 14 2012
- DavidVosseldavidvossel@gmail.com
- Updated for pcs
-
-
- 8-0
- Mon Jan 05 2015
- KenGaillotkgaillot@redhat.com
- Updated for Fedora 21
-
-
- 8-1
- Thu Jan 08 2015
- KenGaillotkgaillot@redhat.com
- Minor corrections, plus use include file for intro
-
-
- 9-0
- Fri Aug 14 2015
- KenGaillotkgaillot@redhat.com
- Update for CentOS 7.1 and leaving firewalld/SELinux enabled
-
-
- 10-0
- Fri Jan 12 2018
- KenGaillotkgaillot@redhat.com
- Update banner for Pacemaker 2.0 and content for CentOS 7.4 with Pacemaker 1.1.16
-
-
- 10-1
- Wed Sep 5 2018
- KenGaillotkgaillot@redhat.com
- Update for CentOS 7.5 with Pacemaker 1.1.18
-
-
- 10-2
- Fri Dec 7 2018
- KenGaillotkgaillot@redhat.com
- JanPokornýjpokorny@redhat.com
- ChrisLumensclumens@redhat.com
- Minor clarifications and formatting changes
-
-
-
+
+ Revision History
+
+
+
+ 1-0
+ Mon May 17 2010
+
+ AndrewBeekhof
+ andrew@beekhof.net
+
+
+
+ Import from Pages.app
+
+
+
+
+ 2-0
+ Wed Sep 22 2010
+
+ RaoulScarazzini
+ rasca@miamammausalinux.org
+
+
+
+ Italian translation
+
+
+
+
+ 3-0
+ Wed Feb 9 2011
+
+ Andrew
+ Beekhof
+ andrew@beekhof.net
+
+
+
+ Updated for Fedora 13
+
+
+
+
+ 4-0
+ Wed Oct 5 2011
+
+ Andrew
+ Beekhof
+ andrew@beekhof.net
+
+
+
+ Update the GFS2 section to use CMAN
+
+
+
+
+ 5-0
+ Fri Feb 10 2012
+
+ Andrew
+ Beekhof
+ andrew@beekhof.net
+
+
+
+ Generate docbook content from asciidoc sources
+
+
+
+
+ 6-0
+ Tues July 3 2012
+
+ AndrewBeekhof
+ andrew@beekhof.net
+
+
+
+ Updated for Fedora 17
+
+
+
+
+ 7-0
+ Fri Sept 14 2012
+
+ DavidVossel
+ davidvossel@gmail.com
+
+
+
+ Updated for pcs
+
+
+
+
+ 8-0
+ Mon Jan 05 2015
+
+ KenGaillot
+ kgaillot@redhat.com
+
+
+
+ Updated for Fedora 21
+
+
+
+ 8-1
+ Thu Jan 08 2015
+
+ KenGaillot
+ kgaillot@redhat.com
+
+
+
+ Minor corrections, plus use include file for intro
+
+
+
+
+ 9-0
+ Fri Aug 14 2015
+
+ KenGaillot
+ kgaillot@redhat.com
+
+
+
+ Update for CentOS 7.1 and leaving firewalld/SELinux enabled
+
+
+
+
+ 10-0
+ Fri Jan 12 2018
+
+ KenGaillot
+ kgaillot@redhat.com
+
+
+
+ Update banner for Pacemaker 2.0 and content for CentOS 7.4 with Pacemaker 1.1.16
+
+
+
+
+ 10-1
+ Wed Sep 5 2018
+
+ KenGaillot
+ kgaillot@redhat.com
+
+
+
+ Update for CentOS 7.5 with Pacemaker 1.1.18
+
+
+
+
+ 10-2
+ Fri Dec 7 2018
+
+ KenGaillot
+ kgaillot@redhat.com
+
+
+ JanPokorný
+ jpokorny@redhat.com
+
+
+ ChrisLumens
+ clumens@redhat.com
+
+
+
+ Minor clarifications and formatting changes
+
+
+
+
+ 11-0
+ Thu Jul 18 2019
+
+ TomasJelinek
+ tojeline@redhat.com
+
+
+
+ Note differences in pcs 0.10
+
+
+
+
+
diff --git a/doc/Pacemaker_Administration/en-US/Author_Group.xml b/doc/Pacemaker_Administration/en-US/Author_Group.xml
index a40b3adcc6..ff907feee5 100644
--- a/doc/Pacemaker_Administration/en-US/Author_Group.xml
+++ b/doc/Pacemaker_Administration/en-US/Author_Group.xml
@@ -1,17 +1,10 @@
- AndrewBeekhof
- Red Hat
- Co-author
- andrew@beekhof.net
-
-
- KenGaillot
- Red Hat
- Co-author
- kgaillot@redhat.com
+
+ Written by the Pacemaker project contributors
+
diff --git a/doc/Pacemaker_Administration/en-US/Book_Info.xml b/doc/Pacemaker_Administration/en-US/Book_Info.xml
index 5dcd86bb2b..5acc7868e7 100644
--- a/doc/Pacemaker_Administration/en-US/Book_Info.xml
+++ b/doc/Pacemaker_Administration/en-US/Book_Info.xml
@@ -1,36 +1,34 @@
%BOOK_ENTITIES;
]>
- Pacemaker Administration
- Managing Pacemaker Clusters
-
- 1
- 3
-
-
- This document has instructions and tips for system
- administrators who need to manage high-availability
- clusters using Pacemaker.
-
-
-
-
-
-
-
-
-
-
-
+ Pacemaker Administration
+ Managing Pacemaker Clusters
+
+ 2
+ 0
+
+
+ This document has instructions and tips for system administrators who
+ need to manage high-availability clusters using Pacemaker.
+
+
+
+
+
+
+
+
+
+
+
-
diff --git a/doc/Pacemaker_Administration/en-US/Revision_History.xml b/doc/Pacemaker_Administration/en-US/Revision_History.xml
index 6276ccd69b..3d8a310735 100644
--- a/doc/Pacemaker_Administration/en-US/Revision_History.xml
+++ b/doc/Pacemaker_Administration/en-US/Revision_History.xml
@@ -1,69 +1,87 @@
%BOOK_ENTITIES;
]>
Revision History
1-0
Tue Jan 23 2018
+
+ AndrewBeekhof
+ andrew@beekhof.net
+
KenGaillot
kgaillot@redhat.com
Move administration-oriented information from
Pacemaker Explained into its own
book
1-1
Mon Jan 28 2019
KenGaillot
kgaillot@redhat.com
JanPokorný
jpokorny@redhat.com
Add "Troubleshooting" chapter, minor
clarifications and reformatting
1-2
Mon May 13 2019
KenGaillot
kgaillot@redhat.com
Overhaul configuration how-to
1-3
Tue Oct 15 2019
KenGaillot
kgaillot@redhat.com
Add Tools chapter
+
+ 2-0
+ Tue Nov 5 2019
+
+ ChrisLumens
+ clumens@redhat.com
+
+
+
+ Update for crm_mon changes in 2.0.3
+
+
+
+
diff --git a/doc/Pacemaker_Development/en-US/Author_Group.xml b/doc/Pacemaker_Development/en-US/Author_Group.xml
index 4f315d6ad0..ff907feee5 100644
--- a/doc/Pacemaker_Development/en-US/Author_Group.xml
+++ b/doc/Pacemaker_Development/en-US/Author_Group.xml
@@ -1,23 +1,10 @@
- AndrewBeekhof
- Red Hat
- Co-author
- andrew@beekhof.net
-
-
- KenGaillot
- Red Hat
- Co-author
- kgaillot@redhat.com
-
-
- JanPokorný
- Red Hat
- Co-author
- poki@redhat.com
+
+ Written by the Pacemaker project contributors
+
diff --git a/doc/Pacemaker_Development/en-US/Revision_History.xml b/doc/Pacemaker_Development/en-US/Revision_History.xml
index 32b37341ce..163867f785 100644
--- a/doc/Pacemaker_Development/en-US/Revision_History.xml
+++ b/doc/Pacemaker_Development/en-US/Revision_History.xml
@@ -1,108 +1,112 @@
%BOOK_ENTITIES;
]>
Revision History
1-0
Tue Jul 26 2016
+
+ AndrewBeekhof
+ andrew@beekhof.net
+
KenGaillot
kgaillot@redhat.com
Convert coding guidelines and developer FAQ
to Publican document
1-1
Mon Aug 29 2016
KenGaillot
kgaillot@redhat.com
Add Python coding guidelines, and
more about licensing
2-0
Fri Jan 12 2018
KenGaillot
kgaillot@redhat.com
Drop support for Python 2.6
2-1
Tue Sep 18 2018
JanPokorný
poki@redhat.com
Start documenting notable evolutionary
points
2-2
Fri Dec 7 2018
KenGaillot
kgaillot@redhat.com
Update FAQ and C guidelines
2-3
Mon May 13 2019
KenGaillot
kgaillot@redhat.com
JanPokorný
poki@redhat.com
Update copyright notice policy, and some external references
2-4
Fri 17 May 2019
JanPokorný
poki@redhat.com
Start capturing hacking howto
for advanced contributors
diff --git a/doc/Pacemaker_Explained/en-US/Author_Group.xml b/doc/Pacemaker_Explained/en-US/Author_Group.xml
index 08fc1f72ed..ff907feee5 100644
--- a/doc/Pacemaker_Explained/en-US/Author_Group.xml
+++ b/doc/Pacemaker_Explained/en-US/Author_Group.xml
@@ -1,59 +1,10 @@
- AndrewBeekhof
- Red Hat
- Primary author
- andrew@beekhof.net
+
+ Written by the Pacemaker project contributors
+
-
- KenGaillot
- Red Hat
- Co-author
- kgaillot@redhat.com
-
-
- PhilippMarek
- LINBit
- Style and formatting updates. Indexing.
- philipp.marek@linbit.com
-
-
- TanjaRoth
- SUSE
- Utilization chapter
- Resource Templates chapter
- Multi-Site Clusters chapter
- taroth@suse.com
-
-
- LarsMarowsky-Bree
- SUSE
- Multi-Site Clusters chapter
- lmb@suse.com
-
-
- YanGao
- SUSE
- Utilization chapter
- Resource Templates chapter
- Multi-Site Clusters chapter
- ygao@suse.com
-
-
- ThomasSchraitle
- SUSE
- Utilization chapter
- Resource Templates chapter
- Multi-Site Clusters chapter
- toms@suse.com
-
-
- DejanMuhamedagic
- SUSE
- Resource Templates chapter
- dmuhamedagic@suse.com
-
diff --git a/doc/Pacemaker_Explained/en-US/Ch-Fencing.txt b/doc/Pacemaker_Explained/en-US/Ch-Fencing.txt
index 53d8793024..31b5044274 100644
--- a/doc/Pacemaker_Explained/en-US/Ch-Fencing.txt
+++ b/doc/Pacemaker_Explained/en-US/Ch-Fencing.txt
@@ -1,1021 +1,1028 @@
:compat-mode: legacy
= Fencing =
////
We prefer [[ch-fencing]], but older versions of asciidoc don't deal well
with that construct for chapter headings
////
anchor:ch-fencing[Chapter 6, Fencing]
indexterm:[Fencing, Configuration]
indexterm:[STONITH, Configuration]
== What Is Fencing? ==
'Fencing' is the ability to make a node unable to run resources, even when that
node is unresponsive to cluster commands.
Fencing is also known as 'STONITH', an acronym for "Shoot The Other Node In The
Head", since the most common fencing method is cutting power to the node.
Another method is "fabric fencing", cutting the node's access to some
capability required to run resources (such as network access or a shared disk).
== Why Is Fencing Necessary? ==
Fencing protects your data from being corrupted by malfunctioning nodes or
unintentional concurrent access to shared resources.
Fencing protects against the "split brain" failure scenario, where cluster
nodes have lost the ability to reliably communicate with each other but are
still able to run resources. If the cluster just assumed that uncommunicative
nodes were down, then multiple instances of a resource could be started on
different nodes.
The effect of split brain depends on the resource type. For example, an IP
address brought up on two hosts on a network will cause packets to randomly be
sent to one or the other host, rendering the IP useless. For a database or
clustered file system, the effect could be much more severe, causing data
corruption or divergence.
Fencing also is used when a resource cannot otherwise be stopped. If a failed
resource fails to stop, it cannot be recovered elsewhere. Fencing the
resource's node is the only way to ensure the resource is recoverable.
Users may also configure the +on-fail+ property of any resource operation to
+fencing+, in which case the cluster will fence the resource's node if the
operation fails.
== Fence Devices ==
A 'fence device' (or 'fencing device') is a special type of resource that
provides the means to fence a node.
Examples of fencing devices include intelligent power switches and IPMI devices
that accept SNMP commands to cut power to a node, and iSCSI controllers that
allow SCSI reservations to be used to cut a node's access to a shared disk.
Since fencing devices will be used to recover from loss of networking
connectivity to other nodes, it is essential that they do not rely on the same
network as the cluster itself, otherwise that network becomes a single point of
failure.
Since loss of a node due to power outage is indistinguishable from loss of
network connectivity to that node, it is also essential that at least one fence
device for a node does not share power with that node. For example, an on-board
IPMI controller that shares power with its host should not be used as the sole
fencing device for that host.
Since fencing is used to isolate malfunctioning nodes, no fence device should
rely on its target functioning properly. This includes, for example, devices
that ssh into a node and issue a shutdown command (such devices might be
suitable for testing, but never for production).
== Fence Agents ==
A 'fence agent' (or 'fencing agent') is a +stonith+-class resource agent.
The fence agent standard provides commands (such as +off+ and +reboot+) that
the cluster can use to fence nodes. As with other resource agent classes,
this allows a layer of abstraction so that Pacemaker doesn't need any knowledge
about specific fencing technologies -- that knowledge is isolated in the agent.
== When a Fence Device Can Be Used ==
Fencing devices do not actually "run" like most services. Typically, they just
provide an interface for sending commands to an external device.
Additionally, fencing may be initiated by Pacemaker, by other cluster-aware software
such as DRBD or DLM, or manually by an administrator, at any point in the
cluster life cycle, including before any resources have been started.
To accommodate this, Pacemaker does not require the fence device resource to be
"started" in order to be used. Whether a fence device is started or not
determines whether a node runs any recurring monitor for the device, and gives
the node a slight preference for being chosen to execute fencing using that
device.
By default, any node can execute any fencing device. If a fence device is
disabled by setting its +target-role+ to Stopped, then no node can use that
device. If mandatory location constraints prevent a specific node from
"running" a fence device, then that node will never be chosen to execute
fencing using the device. A node may fence itself, but the cluster will choose
that only if no other nodes can do the fencing.
A common configuration scenario is to have one fence device per target node.
In such a case, users often configure anti-location constraints so that
the target node does not monitor its own device. The best practice is to make
the constraint optional (i.e. a finite negative score rather than +-INFINITY+),
so that the node can fence itself if no other nodes can.
== Limitations of Fencing Resources ==
Fencing resources have certain limitations that other resource classes don't:
* They may have only one set of meta-attributes and one set of instance
attributes.
* If <> are used to determine fencing resource options, these
may only be evaluated when first read, meaning that later changes to the
rules will have no effect. Therefore, it is better to avoid confusion and not
use rules at all with fencing resources.
These limitations could be revisited if there is significant user demand.
== Special Options for Fencing Resources ==
The table below lists special instance attributes that may be set for any
fencing resource ('not' meta-attributes, even though they are interpreted by
pacemaker rather than the fence agent). These are also listed in the man page
for +pacemaker-fenced+.
.Additional Properties of Fencing Resources
[width="95%",cols="8m,3,6,<12",options="header",align="center"]
|=========================================================
|Field
|Type
|Default
|Description
|stonith-timeout
|NA
|NA
a|Older versions used this to override the default period to wait for a STONITH (reboot, on, off) action to complete for this device.
It has been replaced by the +pcmk_reboot_timeout+ and +pcmk_off_timeout+ properties.
indexterm:[stonith-timeout,Fencing]
indexterm:[Fencing,Property,stonith-timeout]
////
(not yet implemented)
priority
integer
0
The priority of the STONITH resource. Devices are tried in order of highest priority to lowest.
indexterm priority,Fencing
indexterm Fencing,Property,priority
////
|provides
|string
|
|Any special capability provided by the fence device. Currently, only one such
capability is meaningful: +unfencing+ (see <>).
indexterm:[provides,Fencing]
indexterm:[Fencing,Property,provides]
|pcmk_host_map
|string
|
|A mapping of host names to ports numbers for devices that do not support host names.
Example: +node1:1;node2:2,3+ tells the cluster to use port 1 for
*node1* and ports 2 and 3 for *node2*. If +pcmk_host_check+ is explicitly set
to +static-list+, either this or +pcmk_host_list+ must be set.
indexterm:[pcmk_host_map,Fencing]
indexterm:[Fencing,Property,pcmk_host_map]
|pcmk_host_list
|string
|
|A list of machines controlled by this device. If +pcmk_host_check+ is
explicitly set to +static-list+, either this or +pcmk_host_map+ must be set.
indexterm:[pcmk_host_list,Fencing]
indexterm:[Fencing,Property,pcmk_host_list]
|pcmk_host_check
|string
-|static-list if either +pcmk_host_list+ or +pcmk_host_map+ is set,
- otherwise dynamic-list if the fence device supports the list action,
- otherwise status if the fence device supports the status action,
- otherwise none
+|A value appropriate to other configuration options and
+ device capabilities (see note below)
a|How to determine which machines are controlled by the device.
Allowed values:
* +dynamic-list:+ query the device via the "list" command
* +static-list:+ check the +pcmk_host_list+ or +pcmk_host_map+ attribute
* +status:+ query the device via the "status" command
* +none:+ assume every device can fence every machine
indexterm:[pcmk_host_check,Fencing]
indexterm:[Fencing,Property,pcmk_host_check]
|pcmk_delay_max
|time
|0s
|Enable a random delay of up to the time specified before executing fencing
actions. This is sometimes used in two-node clusters to ensure that the
nodes don't fence each other at the same time. The overall delay introduced
by pacemaker is derived from this random delay value adding a static delay so
that the sum is kept below the maximum delay.
indexterm:[pcmk_delay_max,Fencing]
indexterm:[Fencing,Property,pcmk_delay_max]
|pcmk_delay_base
|time
|0s
|Enable a static delay before executing fencing actions. This can be used
e.g. in two-node clusters to ensure that the nodes don't fence each other,
by having separate fencing resources with different values. The node that is
fenced with the shorter delay will lose a fencing race. The overall delay
introduced by pacemaker is derived from this value plus a random delay such
that the sum is kept below the maximum delay.
indexterm:[pcmk_delay_base,Fencing]
indexterm:[Fencing,Property,pcmk_delay_base]
|pcmk_action_limit
|integer
|1
|The maximum number of actions that can be performed in parallel on this
device, if the cluster option +concurrent-fencing+ is +true+. -1 is unlimited.
indexterm:[pcmk_action_limit,Fencing]
indexterm:[Fencing,Property,pcmk_action_limit]
|pcmk_host_argument
|string
|port
|'Advanced use only.' Which parameter should be supplied to the resource agent
to identify the node to be fenced. Some devices do not support the standard
+port+ parameter or may provide additional ones. Use this to specify an
alternate, device-specific parameter. A value of +none+ tells the
cluster not to supply any additional parameters.
indexterm:[pcmk_host_argument,Fencing]
indexterm:[Fencing,Property,pcmk_host_argument]
|pcmk_reboot_action
|string
|reboot
|'Advanced use only.' The command to send to the resource agent in order to
reboot a node. Some devices do not support the standard commands or may provide
additional ones. Use this to specify an alternate, device-specific command.
indexterm:[pcmk_reboot_action,Fencing]
indexterm:[Fencing,Property,pcmk_reboot_action]
|pcmk_reboot_timeout
|time
|60s
|'Advanced use only.' Specify an alternate timeout to use for `reboot` actions
instead of the value of +stonith-timeout+. Some devices need much more or less
time to complete than normal. Use this to specify an alternate, device-specific
timeout.
indexterm:[pcmk_reboot_timeout,Fencing]
indexterm:[Fencing,Property,pcmk_reboot_timeout]
indexterm:[stonith-timeout,Fencing]
indexterm:[Fencing,Property,stonith-timeout]
|pcmk_reboot_retries
|integer
|2
|'Advanced use only.' The maximum number of times to retry the `reboot` command
within the timeout period. Some devices do not support multiple connections, and
operations may fail if the device is busy with another task, so Pacemaker will
automatically retry the operation, if there is time remaining. Use this option
to alter the number of times Pacemaker retries before giving up.
indexterm:[pcmk_reboot_retries,Fencing]
indexterm:[Fencing,Property,pcmk_reboot_retries]
|pcmk_off_action
|string
|off
|'Advanced use only.' The command to send to the resource agent in order to
shut down a node. Some devices do not support the standard commands or may provide
additional ones. Use this to specify an alternate, device-specific command.
indexterm:[pcmk_off_action,Fencing]
indexterm:[Fencing,Property,pcmk_off_action]
|pcmk_off_timeout
|time
|60s
|'Advanced use only.' Specify an alternate timeout to use for `off` actions
instead of the value of +stonith-timeout+. Some devices need much more or less
time to complete than normal. Use this to specify an alternate, device-specific
timeout.
indexterm:[pcmk_off_timeout,Fencing]
indexterm:[Fencing,Property,pcmk_off_timeout]
indexterm:[stonith-timeout,Fencing]
indexterm:[Fencing,Property,stonith-timeout]
|pcmk_off_retries
|integer
|2
|'Advanced use only.' The maximum number of times to retry the `off` command
within the timeout period. Some devices do not support multiple connections, and
operations may fail if the device is busy with another task, so Pacemaker will
automatically retry the operation, if there is time remaining. Use this option
to alter the number of times Pacemaker retries before giving up.
indexterm:[pcmk_off_retries,Fencing]
indexterm:[Fencing,Property,pcmk_off_retries]
|pcmk_list_action
|string
|list
|'Advanced use only.' The command to send to the resource agent in order to
list nodes. Some devices do not support the standard commands or may provide
additional ones. Use this to specify an alternate, device-specific command.
indexterm:[pcmk_list_action,Fencing]
indexterm:[Fencing,Property,pcmk_list_action]
|pcmk_list_timeout
|time
|60s
|'Advanced use only.' Specify an alternate timeout to use for `list` actions
instead of the value of +stonith-timeout+. Some devices need much more or less
time to complete than normal. Use this to specify an alternate, device-specific
timeout.
indexterm:[pcmk_list_timeout,Fencing]
indexterm:[Fencing,Property,pcmk_list_timeout]
|pcmk_list_retries
|integer
|2
|'Advanced use only.' The maximum number of times to retry the `list` command
within the timeout period. Some devices do not support multiple connections, and
operations may fail if the device is busy with another task, so Pacemaker will
automatically retry the operation, if there is time remaining. Use this option
to alter the number of times Pacemaker retries before giving up.
indexterm:[pcmk_list_retries,Fencing]
indexterm:[Fencing,Property,pcmk_list_retries]
|pcmk_monitor_action
|string
|monitor
|'Advanced use only.' The command to send to the resource agent in order to
report extended status. Some devices do not support the standard commands or may provide
additional ones. Use this to specify an alternate, device-specific command.
indexterm:[pcmk_monitor_action,Fencing]
indexterm:[Fencing,Property,pcmk_monitor_action]
|pcmk_monitor_timeout
|time
|60s
|'Advanced use only.' Specify an alternate timeout to use for `monitor` actions
instead of the value of +stonith-timeout+. Some devices need much more or less
time to complete than normal. Use this to specify an alternate, device-specific
timeout.
indexterm:[pcmk_monitor_timeout,Fencing]
indexterm:[Fencing,Property,pcmk_monitor_timeout]
|pcmk_monitor_retries
|integer
|2
|'Advanced use only.' The maximum number of times to retry the `monitor` command
within the timeout period. Some devices do not support multiple connections, and
operations may fail if the device is busy with another task, so Pacemaker will
automatically retry the operation, if there is time remaining. Use this option
to alter the number of times Pacemaker retries before giving up.
indexterm:[pcmk_monitor_retries,Fencing]
indexterm:[Fencing,Property,pcmk_monitor_retries]
|pcmk_status_action
|string
|status
|'Advanced use only.' The command to send to the resource agent in order to
report status. Some devices do not support the standard commands or may provide
additional ones. Use this to specify an alternate, device-specific command.
indexterm:[pcmk_status_action,Fencing]
indexterm:[Fencing,Property,pcmk_status_action]
|pcmk_status_timeout
|time
|60s
|'Advanced use only.' Specify an alternate timeout to use for `status` actions
instead of the value of +stonith-timeout+. Some devices need much more or less
time to complete than normal. Use this to specify an alternate, device-specific
timeout.
indexterm:[pcmk_status_timeout,Fencing]
indexterm:[Fencing,Property,pcmk_status_timeout]
|pcmk_status_retries
|integer
|2
|'Advanced use only.' The maximum number of times to retry the `status` command
within the timeout period. Some devices do not support multiple connections, and
operations may fail if the device is busy with another task, so Pacemaker will
automatically retry the operation, if there is time remaining. Use this option
to alter the number of times Pacemaker retries before giving up.
indexterm:[pcmk_status_retries,Fencing]
indexterm:[Fencing,Property,pcmk_status_retries]
|=========================================================
+[NOTE]
+====
+The default value for +pcmk_host_check+ is +static-list+ if either
++pcmk_host_list+ or +pcmk_host_map+ is configured. If neither of those are
+configured, the default is +dynamic-list+ if the fence device supports the list
+action, or +status+ if the fence device supports the status action but not the
+list action. If none of those conditions apply, the default is +none+.
+====
+
[[s-unfencing]]
== Unfencing ==
With fabric fencing (such as cutting network or shared disk access rather than
power), it is expected that the cluster will fence the node, and
then a system administrator must manually investigate what went wrong, correct
any issues found, then reboot (or restart the cluster services on) the node.
Once the node reboots and rejoins the cluster, some fabric fencing devices
require an explicit command to restore the node's access. This capability is
called 'unfencing' and is typically implemented as the fence agent's +on+
command.
If any cluster resource has +requires+ set to +unfencing+, then that resource
will not be probed or started on a node until that node has been unfenced.
== Fence Devices Dependent on Other Resources ==
In some cases, a fence device may require some other cluster resource (such as
an IP address) to be active in order to function properly.
This is obviously undesirable in general: fencing may be required when the
depended-on resource is not active, or fencing may be required because the node
running the depended-on resource is no longer responding.
However, this may be acceptable under certain conditions:
* The dependent fence device should not be able to target any node that is
allowed to run the depended-on resource.
* The depended-on resource should not be disabled during production operation.
* The +concurrent-fencing+ cluster property should be set to +true+. Otherwise,
if both the node running the depended-on resource and some node targeted by
the dependent fence device need to be fenced, the fencing of the node
running the depended-on resource might be ordered first, making the second
fencing impossible and blocking further recovery. With concurrent fencing,
the dependent fence device might fail at first due to the depended-on
resource being unavailable, but it will be retried and eventually succeed
once the resource is brought back up.
Even under those conditions, there is one unlikely problem scenario. The DC
always schedules fencing of itself after any other fencing needed, to avoid
unnecessary repeated DC elections. If the dependent fence device targets the
DC, and both the DC and a different node running the depended-on resource need
to be fenced, the DC fencing will always fail and block further recovery. Note,
however, that losing a DC node entirely causes some other node to become DC and
schedule the fencing, so this is only a risk when a stop or other operation
with +on-fail+ set to +fencing+ fails on the DC.
== Configuring Fencing ==
. Find the correct driver:
+
----
# stonith_admin --list-installed
----
. Find the required parameters associated with the device
(replacing $AGENT_NAME with the name obtained from the previous step):
+
----
# stonith_admin --metadata --agent $AGENT_NAME
----
. Create a file called +stonith.xml+ containing a primitive resource
with a class of +stonith+, a type equal to the agent name obtained earlier,
and a parameter for each of the values returned in the previous step.
. If the device does not know how to fence nodes based on their uname,
you may also need to set the special +pcmk_host_map+ parameter. See
`man pacemaker-fenced` for details.
. If the device does not support the `list` command, you may also need
to set the special +pcmk_host_list+ and/or +pcmk_host_check+
parameters. See `man pacemaker-fenced` for details.
. If the device does not expect the victim to be specified with the
`port` parameter, you may also need to set the special
+pcmk_host_argument+ parameter. See `man pacemaker-fenced` for details.
. Upload it into the CIB using cibadmin:
+
----
# cibadmin -C -o resources --xml-file stonith.xml
----
. Set +stonith-enabled+ to true:
+
----
# crm_attribute -t crm_config -n stonith-enabled -v true
----
. Once the stonith resource is running, you can test it by executing the
following (although you might want to stop the cluster on that machine
first):
+
----
# stonith_admin --reboot nodename
----
=== Example Fencing Configuration ===
Assume we have a chassis containing four nodes and an IPMI device
active on 192.0.2.1. We would choose the `fence_ipmilan` driver,
and obtain the following list of parameters:
.Obtaining a list of Fence Agent Parameters
====
----
# stonith_admin --metadata -a fence_ipmilan
----
[source,XML]
----
----
====
Based on that, we would create a fencing resource fragment that might look
like this:
.An IPMI-based Fencing Resource
====
[source,XML]
----
----
====
Finally, we need to enable fencing:
----
# crm_attribute -t crm_config -n stonith-enabled -v true
----
== Fencing Topologies ==
Pacemaker supports fencing nodes with multiple devices through a feature called
'fencing topologies'. Fencing topologies may be used to provide alternative
devices in case one fails, or to require multiple devices to all be executed
successfully in order to consider the node successfully fenced, or even a
combination of the two.
Create the individual devices as you normally would, then define one or more
+fencing-level+ entries in the +fencing-topology+ section of the configuration.
* Each fencing level is attempted in order of ascending +index+. Allowed
values are 1 through 9.
* If a device fails, processing terminates for the current level.
No further devices in that level are exercised, and the next level is attempted instead.
* If the operation succeeds for all the listed devices in a level, the level is deemed to have passed.
* The operation is finished when a level has passed (success), or all levels have been attempted (failed).
* If the operation failed, the next step is determined by the scheduler
and/or the controller.
Some possible uses of topologies include:
* Try on-board IPMI, then an intelligent power switch if that fails
* Try fabric fencing of both disk and network, then fall back to power fencing
if either fails
* Wait up to a certain time for a kernel dump to complete, then cut power to
the node
.Properties of Fencing Levels
[width="95%",cols="1m,<3",options="header",align="center"]
|=========================================================
|Field
|Description
|id
|A unique name for the level
indexterm:[id,fencing-level]
indexterm:[Fencing,fencing-level,id]
|target
|The name of a single node to which this level applies
indexterm:[target,fencing-level]
indexterm:[Fencing,fencing-level,target]
|target-pattern
|An extended regular expression (as defined in
http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04[POSIX])
matching the names of nodes to which this level applies
indexterm:[target-pattern,fencing-level]
indexterm:[Fencing,fencing-level,target-pattern]
|target-attribute
|The name of a node attribute that is set (to +target-value+) for nodes to
which this level applies
indexterm:[target-attribute,fencing-level]
indexterm:[Fencing,fencing-level,target-attribute]
|target-value
|The node attribute value (of +target-attribute+) that is set for nodes to
which this level applies
indexterm:[target-attribute,fencing-level]
indexterm:[Fencing,fencing-level,target-attribute]
|index
|The order in which to attempt the levels.
Levels are attempted in ascending order 'until one succeeds'.
Valid values are 1 through 9.
indexterm:[index,fencing-level]
indexterm:[Fencing,fencing-level,index]
|devices
|A comma-separated list of devices that must all be tried for this level
indexterm:[devices,fencing-level]
indexterm:[Fencing,fencing-level,devices]
|=========================================================
.Fencing topology with different devices for different nodes
====
[source,XML]
----
...
...
----
====
=== Example Dual-Layer, Dual-Device Fencing Topologies ===
The following example illustrates an advanced use of +fencing-topology+ in a cluster with the following properties:
* 3 nodes (2 active prod-mysql nodes, 1 prod_mysql-rep in standby for quorum purposes)
* the active nodes have an IPMI-controlled power board reached at 192.0.2.1 and 192.0.2.2
* the active nodes also have two independent PSUs (Power Supply Units)
connected to two independent PDUs (Power Distribution Units) reached at
198.51.100.1 (port 10 and port 11) and 203.0.113.1 (port 10 and port 11)
* the first fencing method uses the `fence_ipmi` agent
* the second fencing method uses the `fence_apc_snmp` agent targetting 2 fencing devices (one per PSU, either port 10 or 11)
* fencing is only implemented for the active nodes and has location constraints
* fencing topology is set to try IPMI fencing first then default to a "sure-kill" dual PDU fencing
In a normal failure scenario, STONITH will first select +fence_ipmi+ to try to kill the faulty node.
Using a fencing topology, if that first method fails, STONITH will then move on to selecting +fence_apc_snmp+ twice:
* once for the first PDU
* again for the second PDU
The fence action is considered successful only if both PDUs report the required status. If any of them fails, STONITH loops back to the first fencing method, +fence_ipmi+, and so on until the node is fenced or fencing action is cancelled.
.First fencing method: single IPMI device
Each cluster node has it own dedicated IPMI channel that can be called for fencing using the following primitives:
[source,XML]
----
----
.Second fencing method: dual PDU devices
Each cluster node also has two distinct power channels controlled by two
distinct PDUs. That means a total of 4 fencing devices configured as follows:
- Node 1, PDU 1, PSU 1 @ port 10
- Node 1, PDU 2, PSU 2 @ port 10
- Node 2, PDU 1, PSU 1 @ port 11
- Node 2, PDU 2, PSU 2 @ port 11
The matching fencing agents are configured as follows:
[source,XML]
----
----
.Location Constraints
To prevent STONITH from trying to run a fencing agent on the same node it is
supposed to fence, constraints are placed on all the fencing primitives:
[source,XML]
----
----
.Fencing topology
Now that all the fencing resources are defined, it's time to create the right topology.
We want to first fence using IPMI and if that does not work, fence both PDUs to effectively and surely kill the node.
[source,XML]
----
----
Please note, in +fencing-topology+, the lowest +index+ value determines the priority of the first fencing method.
.Final configuration
Put together, the configuration looks like this:
[source,XML]
----
...
...
----
== Remapping Reboots ==
When the cluster needs to reboot a node, whether because +stonith-action+ is +reboot+ or because
a reboot was manually requested (such as by `stonith_admin --reboot`), it will remap that to
other commands in two cases:
. If the chosen fencing device does not support the +reboot+ command, the cluster
will ask it to perform +off+ instead.
. If a fencing topology level with multiple devices must be executed, the cluster
will ask all the devices to perform +off+, then ask the devices to perform +on+.
To understand the second case, consider the example of a node with redundant
power supplies connected to intelligent power switches. Rebooting one switch
and then the other would have no effect on the node. Turning both switches off,
and then on, actually reboots the node.
In such a case, the fencing operation will be treated as successful as long as
the +off+ commands succeed, because then it is safe for the cluster to recover
any resources that were on the node. Timeouts and errors in the +on+ phase will
be logged but ignored.
When a reboot operation is remapped, any action-specific timeout for the
remapped action will be used (for example, +pcmk_off_timeout+ will be used when
executing the +off+ command, not +pcmk_reboot_timeout+).
diff --git a/doc/Pacemaker_Explained/en-US/Revision_History.xml b/doc/Pacemaker_Explained/en-US/Revision_History.xml
index 93fa71334f..4a80cb4fb6 100644
--- a/doc/Pacemaker_Explained/en-US/Revision_History.xml
+++ b/doc/Pacemaker_Explained/en-US/Revision_History.xml
@@ -1,199 +1,249 @@
Revision History
1-0
19 Oct 2009
AndrewBeekhofandrew@beekhof.net
Import from Pages.app
2-0
26 Oct 2009
AndrewBeekhofandrew@beekhof.net
Cleanup and reformatting of docbook xml complete
3-0
Tue Nov 12 2009
AndrewBeekhofandrew@beekhof.net
Split book into chapters and pass validation
Re-organize book for use with Publican
+
+ 3-1
+ Tue Nov 12 2009
+
+ TanjaRoth
+ SUSE
+ taroth@suse.com
+
+
+ LarsMarowsky-Bree
+ SUSE
+ lmb@suse.com
+
+
+ YanGao
+ SUSE
+ ygao@suse.com
+
+
+ ThomasSchraitle
+ SUSE
+ toms@suse.com
+
+
+ DejanMuhamedagic
+ SUSE
+ dmuhamedagic@suse.com
+
+
+
+ Utilization chapter
+ Resource Templates chapter
+ Multi-Site Clusters chapter
+
+
+
+
+ 3-2
+ Fri Nov 4 2011
+
+ PhilippMarek
+ LINBit
+ philipp.marek@linbit.com
+
+
+
+ Extensive style, formatting, and indexing updates
+
+
+
4-0
Mon Oct 8 2012
AndrewBeekhofandrew@beekhof.net
Converted to asciidoc
(which is converted to docbook for use with
Publican)
5-0
Mon Feb 23 2015
KenGaillotkgaillot@redhat.com
Update for clarity, stylistic consistency and current command-line syntax
6-0
Tue Dec 8 2015
KenGaillotkgaillot@redhat.com
Update for Pacemaker 1.1.14
7-0
Tue May 3 2016
KenGaillotkgaillot@redhat.com
Update for Pacemaker 1.1.15
7-1
Fri Oct 28 2016
KenGaillotkgaillot@redhat.com
Overhaul upgrade documentation, and document node health strategies
8-0
Tue Oct 25 2016
KenGaillotkgaillot@redhat.com
Update for Pacemaker 1.1.16
9-0
Tue Jul 11 2017
KenGaillotkgaillot@redhat.com
Update for Pacemaker 1.1.17
10-0
Fri Oct 6 2017
KenGaillotkgaillot@redhat.com
Update for Pacemaker 1.1.18
11-0
Fri Jan 12 2018
KenGaillotkgaillot@redhat.com
Update for Pacemaker 2.0.0
12-0
Mon Jan 28 2019
KenGaillotkgaillot@redhat.com
ReidWahlnwahl@redhat.com
JanPokornýjpokorny@redhat.com
Update for Pacemaker 2.0.1, remove "Further Reading" and "FAQ" sections,
and add minor clarifications and reformatting
12-1
Mon May 13 2019
KenGaillotkgaillot@redhat.com
MaciejSobkowiakmsobkowiak@egnyte.com
Document podman support, cluster-name cluster option, and new
HealthIOWait agent, with other minor clarifications and
corrections
12-2
Wed Jun 19 2019
KenGaillotkgaillot@redhat.com
Add chapter for ACLs
13-0
Tue Oct 15 2019
KenGaillot
kgaillot@redhat.com
Overhaul fencing, rules, and constraints chapters,
elaborate on various options, and update for Pacemaker 2.0.3
diff --git a/doc/Pacemaker_Remote/en-US/Author_Group.xml b/doc/Pacemaker_Remote/en-US/Author_Group.xml
index 1de3082be1..ff907feee5 100644
--- a/doc/Pacemaker_Remote/en-US/Author_Group.xml
+++ b/doc/Pacemaker_Remote/en-US/Author_Group.xml
@@ -1,11 +1,10 @@
- DavidVossel
- Red Hat
- Primary author
- davidvossel@gmail.com
+
+ Written by the Pacemaker project contributors
+
diff --git a/doc/README.md b/doc/README.md
new file mode 100644
index 0000000000..594d9e5717
--- /dev/null
+++ b/doc/README.md
@@ -0,0 +1,80 @@
+# Pacemaker Documentation
+
+Pacemaker has multiple forms of documentation:
+
+* The primary end-user documentation is a series of "books":
+
+ * Clusters From Scratch: Simplified walk-through of setting up a
+ cluster for the first time
+ * Pacemaker Administration: Tips for managing a cluster
+ * Pacemaker Development: How to work on the Pacemaker code base
+ * Pacemaker Explained: Configuration reference guide
+ * Pacemaker Remote: Configuration and walk-throughs for extended
+ clusters
+
+ The source for these is kept beneath this directory. Generated versions
+ are available online in epub, PDF, and HTML format at:
+
+ https://clusterlabs.org/pacemaker/doc/
+
+* Annotated source code in HTML format can be generated by running
+ "make global" in this directory, which requires the gtags and htags tools.
+ This is generated for releases and can be viewed online at:
+
+ https://clusterlabs.org/pacemaker/global/
+
+* Pacemaker manual pages are generated and installed automatically when
+ building the software. HTML versions can be generated by running
+ "make manhtml" in this directory, which requires the groff tool.
+ This is generated for releases and can be viewed online at:
+
+ https://clusterlabs.org/pacemaker/man/
+
+* For developers, documentation for Pacemaker's public C API is generated
+ by running "make doxygen" in this directory, which requires the doxygen tool.
+ This is generated for releases and can be viewed online at:
+
+ https://clusterlabs.org/pacemaker/doxygen/
+
+* Also for developers, a report of Pacemaker ABI compatibility between any two
+ commits can be generated by running in this directory:
+
+ make LAST_RELEASE=$EARLIER_COMMIT TAG=$NEWER_COMMIT abi-www
+
+ which requires the abi-compliance-checker tool. This is generated for each
+ release compared to the previous release and can be viewed online at:
+
+ https://clusterlabs.org/pacemaker/abi/
+
+* In addition, there are a few old text files in this directory focusing on
+ particular characteristics of Pacemaker clusters. These are mostly outdated
+ but do still have some useful information. The plan is to incorporate an
+ updated version of them into the books.
+
+## Editing the Books
+
+Each book's sources are kept in a subdirectory by title. Each book subdirectory
+has an en-US subdirectory with the master sources, and potentially other
+subdirectories for translations. However, the translations are not currently
+actively maintained and so are disabled.
+
+Each book's en-US subdirectory has text files with the chapter sources in
+asciidoc format. The file asciidoc.reference in this directory has a quick
+guide to asciidoc; search online for more detailed help.
+
+Once you have edited the asciidoc as desired, run "make" in this directory
+to generate all the books locally. You view the results by pointing your
+web browser to (replacing BOOK\_TITLE appropriately):
+
+ file:///path/to/checkout/doc/BOOK\_TITLE/publish/desktop/en-US/index.html
+
+Each en-US subdirectory also contains some raw XML files of lesser interest:
+
+* Author\_Group.xml: This contains the author listing at the top of the
+ generated book. To avoid clutter, we just put "Pacemaker project
+ contributors" here and list individual authors in the revision history, so
+ this typically does not need to be edited.
+* Book\_Info.xml: Revision numbers, etc. This is generally updated only for
+ official releases.
+* Revision\_History.xml: This can be updated for any change, listing the
+ individual authors.
diff --git a/include/crm/services.h b/include/crm/services.h
index 8fe9bc918c..c35aa2454d 100644
--- a/include/crm/services.h
+++ b/include/crm/services.h
@@ -1,431 +1,431 @@
/*
* Copyright 2010-2019 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#ifndef __PCMK_SERVICES__
# define __PCMK_SERVICES__
#ifdef __cplusplus
extern "C" {
#endif
/**
* \file
* \brief Services API
* \ingroup core
*/
# include
# include
# include
# include
# include
# ifndef OCF_ROOT_DIR
# define OCF_ROOT_DIR "/usr/lib/ocf"
# endif
# ifndef LSB_ROOT_DIR
# define LSB_ROOT_DIR "/etc/init.d"
# endif
/* TODO: Autodetect these two ?*/
# ifndef SYSTEMCTL
# define SYSTEMCTL "/bin/systemctl"
# endif
/* Known resource classes */
#define PCMK_RESOURCE_CLASS_OCF "ocf"
#define PCMK_RESOURCE_CLASS_SERVICE "service"
#define PCMK_RESOURCE_CLASS_LSB "lsb"
#define PCMK_RESOURCE_CLASS_SYSTEMD "systemd"
#define PCMK_RESOURCE_CLASS_UPSTART "upstart"
#define PCMK_RESOURCE_CLASS_NAGIOS "nagios"
#define PCMK_RESOURCE_CLASS_STONITH "stonith"
/* This is the string passed in the OCF_EXIT_REASON_PREFIX environment variable.
* The stderr output that occurs after this prefix is encountered is considered
* the exit reason for a completed operation.
*/
#define PCMK_OCF_REASON_PREFIX "ocf-exit-reason:"
// Agent version to use if agent doesn't specify one
#define PCMK_DEFAULT_AGENT_VERSION "0.1"
enum lsb_exitcode {
PCMK_LSB_OK = 0,
PCMK_LSB_UNKNOWN_ERROR = 1,
PCMK_LSB_INVALID_PARAM = 2,
PCMK_LSB_UNIMPLEMENT_FEATURE = 3,
PCMK_LSB_INSUFFICIENT_PRIV = 4,
PCMK_LSB_NOT_INSTALLED = 5,
PCMK_LSB_NOT_CONFIGURED = 6,
PCMK_LSB_NOT_RUNNING = 7,
};
/* The return codes for the status operation are not the same for other
* operatios - go figure
*/
enum lsb_status_exitcode {
PCMK_LSB_STATUS_OK = 0,
PCMK_LSB_STATUS_VAR_PID = 1,
PCMK_LSB_STATUS_VAR_LOCK = 2,
PCMK_LSB_STATUS_NOT_RUNNING = 3,
PCMK_LSB_STATUS_UNKNOWN = 4,
/* custom codes should be in the 150-199 range reserved for application use */
PCMK_LSB_STATUS_NOT_INSTALLED = 150,
PCMK_LSB_STATUS_INSUFFICIENT_PRIV = 151,
};
/* Uniform exit codes
* Everything is mapped to its OCF equivalent so that Pacemaker only deals with one set of codes
*/
enum ocf_exitcode {
PCMK_OCF_OK = 0,
PCMK_OCF_UNKNOWN_ERROR = 1,
PCMK_OCF_INVALID_PARAM = 2,
PCMK_OCF_UNIMPLEMENT_FEATURE = 3,
PCMK_OCF_INSUFFICIENT_PRIV = 4,
PCMK_OCF_NOT_INSTALLED = 5,
PCMK_OCF_NOT_CONFIGURED = 6,
PCMK_OCF_NOT_RUNNING = 7, /* End of overlap with LSB */
PCMK_OCF_RUNNING_MASTER = 8,
PCMK_OCF_FAILED_MASTER = 9,
/* 150-199 reserved for application use */
PCMK_OCF_CONNECTION_DIED = 189, // Deprecated (see PCMK_LRM_OP_NOT_CONNECTED)
PCMK_OCF_DEGRADED = 190, /* Active resource that is no longer 100% functional */
PCMK_OCF_DEGRADED_MASTER = 191, /* Promoted resource that is no longer 100% functional */
PCMK_OCF_EXEC_ERROR = 192, /* Generic problem invoking the agent */
PCMK_OCF_UNKNOWN = 193, /* State of the service is unknown - used for recording in-flight operations */
PCMK_OCF_SIGNAL = 194,
PCMK_OCF_NOT_SUPPORTED = 195,
PCMK_OCF_PENDING = 196,
PCMK_OCF_CANCELLED = 197,
PCMK_OCF_TIMEOUT = 198,
PCMK_OCF_OTHER_ERROR = 199, /* Keep the same codes as PCMK_LSB */
};
enum op_status {
PCMK_LRM_OP_UNKNOWN = -2,
PCMK_LRM_OP_PENDING = -1,
PCMK_LRM_OP_DONE,
PCMK_LRM_OP_CANCELLED,
PCMK_LRM_OP_TIMEOUT,
PCMK_LRM_OP_NOTSUPPORTED,
PCMK_LRM_OP_ERROR,
PCMK_LRM_OP_ERROR_HARD,
PCMK_LRM_OP_ERROR_FATAL,
PCMK_LRM_OP_NOT_INSTALLED,
PCMK_LRM_OP_NOT_CONNECTED,
PCMK_LRM_OP_INVALID,
};
enum nagios_exitcode {
NAGIOS_STATE_OK = 0,
NAGIOS_STATE_WARNING = 1,
NAGIOS_STATE_CRITICAL = 2,
NAGIOS_STATE_UNKNOWN = 3,
NAGIOS_STATE_DEPENDENT = 4,
NAGIOS_INSUFFICIENT_PRIV = 100,
NAGIOS_NOT_INSTALLED = 101,
};
enum svc_action_flags {
/* On timeout, only kill pid, do not kill entire pid group */
SVC_ACTION_LEAVE_GROUP = 0x01,
SVC_ACTION_NON_BLOCKED = 0x02,
};
typedef struct svc_action_private_s svc_action_private_t;
typedef struct svc_action_s {
char *id;
char *rsc;
char *action;
guint interval_ms;
char *standard;
char *provider;
char *agent;
int timeout;
GHashTable *params; /* used for setting up environment for ocf-ra &
alert agents
and to be sent via stdin for fence-agents
*/
int rc;
int pid;
int cancel;
int status;
int sequence;
int expected_rc;
int synchronous;
enum svc_action_flags flags;
char *stderr_data;
char *stdout_data;
/*!
* Data stored by the creator of the action.
*
* This may be used to hold data that is needed later on by a callback,
* for example.
*/
void *cb_data;
svc_action_private_t *opaque;
} svc_action_t;
/**
* \brief Get a list of files or directories in a given path
*
* \param[in] root full path to a directory to read
* \param[in] files return list of files if TRUE or directories if FALSE
* \param[in] executable if TRUE and files is TRUE, only return executable files
*
* \return a list of what was found. The list items are char *.
* \note It is the caller's responsibility to free the result with g_list_free_full(list, free).
*/
GList *get_directory_list(const char *root, gboolean files, gboolean executable);
/**
* Get a list of services
*
* \return a list of services. The list items are gchar *. This list _must_
* be destroyed using g_list_free_full(list, free).
*/
GList *services_list(void);
/**
* \brief Get a list of providers
*
* \param[in] standard list providers of this standard (e.g. ocf, lsb, etc.)
*
* \return a list of providers as char * list items (or NULL if standard does not support providers)
* \note The caller is responsible for freeing the result using g_list_free_full(list, free).
*/
GList *resources_list_providers(const char *standard);
/**
* \brief Get a list of resource agents
*
* \param[in] standard list agents using this standard (e.g. ocf, lsb, etc.) (or NULL for all)
* \param[in] provider list agents from this provider (or NULL for all)
*
* \return a list of resource agents. The list items are char *.
* \note The caller is responsible for freeing the result using g_list_free_full(list, free).
*/
GList *resources_list_agents(const char *standard, const char *provider);
/**
* Get list of available standards
*
* \return a list of resource standards. The list items are char *. This list _must_
* be destroyed using g_list_free_full(list, free).
*/
GList *resources_list_standards(void);
/**
* Does the given standard, provider, and agent describe a resource that can exist?
*
* \param[in] standard Which class of agent does the resource belong to?
* \param[in] provider What provides the agent (NULL for most standards)?
* \param[in] agent What is the name of the agent?
*
* \return A boolean
*/
gboolean resources_agent_exists(const char *standard, const char *provider, const char *agent);
svc_action_t *services_action_create(const char *name, const char *action,
guint interval_ms, int timeout /* ms */);
/**
* \brief Create a new resource action
*
* \param[in] name Name of resource
* \param[in] standard Resource agent standard (ocf, lsb, etc.)
* \param[in] provider Resource agent provider
* \param[in] agent Resource agent name
* \param[in] action action (start, stop, monitor, etc.)
* \param[in] interval_ms How often to repeat this action (if 0, execute once)
* \param[in] timeout Consider action failed if it does not complete in this many milliseconds
* \param[in] params Action parameters
*
* \return newly allocated action instance
*
* \post After the call, 'params' is owned, and later free'd by the svc_action_t result
* \note The caller is responsible for freeing the return value using
* services_action_free().
*/
svc_action_t *resources_action_create(const char *name, const char *standard,
const char *provider, const char *agent,
const char *action, guint interval_ms,
int timeout /* ms */, GHashTable *params,
enum svc_action_flags flags);
/**
* Kick a recurring action so it is scheduled immediately for re-execution
*/
gboolean services_action_kick(const char *name, const char *action,
guint interval_ms);
const char *resources_find_service_class(const char *agent);
/**
* Utilize services API to execute an arbitrary command.
*
* This API has useful infrastructure in place to be able to run a command
* in the background and get notified via a callback when the command finishes.
*
* \param[in] exec command to execute
* \param[in] args arguments to the command, NULL terminated
*
* \return a svc_action_t object, used to pass to the execute function
* (services_action_sync() or services_action_async()) and is
* provided to the callback.
*/
svc_action_t *services_action_create_generic(const char *exec, const char *args[]);
void services_action_cleanup(svc_action_t * op);
void services_action_free(svc_action_t * op);
int services_action_user(svc_action_t *op, const char *user);
gboolean services_action_sync(svc_action_t * op);
/**
* Run an action asynchronously.
*
* \param[in] op services action data
* \param[in] action_callback callback for when the action completes
* \param[in] action_fork_callback callback for when action forked successfully
*
* \retval TRUE succesfully started execution
* \retval FALSE failed to start execution, no callback will be received
*/
gboolean services_action_async_fork_notify(svc_action_t * op,
void (*action_callback) (svc_action_t *),
void (*action_fork_callback) (svc_action_t *));
gboolean services_action_async(svc_action_t * op,
void (*action_callback) (svc_action_t *));
gboolean services_action_cancel(const char *name, const char *action,
guint interval_ms);
/* functions for alert agents */
svc_action_t *services_alert_create(const char *id, const char *exec,
int timeout, GHashTable *params,
int sequence, void *cb_data);
gboolean services_alert_async(svc_action_t *action,
void (*cb)(svc_action_t *op));
static inline const char *services_lrm_status_str(enum op_status status) {
switch (status) {
case PCMK_LRM_OP_PENDING:
return "pending";
case PCMK_LRM_OP_DONE:return "complete";
case PCMK_LRM_OP_CANCELLED:return "Cancelled";
case PCMK_LRM_OP_TIMEOUT:return "Timed Out";
case PCMK_LRM_OP_NOTSUPPORTED:return "NOT SUPPORTED";
case PCMK_LRM_OP_ERROR:return "Error";
case PCMK_LRM_OP_NOT_INSTALLED:return "Not installed";
case PCMK_LRM_OP_NOT_CONNECTED:return "No executor connection";
case PCMK_LRM_OP_INVALID:return "Cannot execute now";
default:return "UNKNOWN!";
}
}
static inline const char *services_ocf_exitcode_str(enum ocf_exitcode code) {
switch (code) {
case PCMK_OCF_OK:
return "ok";
case PCMK_OCF_UNKNOWN_ERROR:
- return "unknown error";
+ return "error";
case PCMK_OCF_INVALID_PARAM:
return "invalid parameter";
case PCMK_OCF_UNIMPLEMENT_FEATURE:
return "unimplemented feature";
case PCMK_OCF_INSUFFICIENT_PRIV:
return "insufficient privileges";
case PCMK_OCF_NOT_INSTALLED:
return "not installed";
case PCMK_OCF_NOT_CONFIGURED:
return "not configured";
case PCMK_OCF_NOT_RUNNING:
return "not running";
case PCMK_OCF_RUNNING_MASTER:
return "master";
case PCMK_OCF_FAILED_MASTER:
return "master (failed)";
case PCMK_OCF_SIGNAL:
return "OCF_SIGNAL";
case PCMK_OCF_NOT_SUPPORTED:
return "OCF_NOT_SUPPORTED";
case PCMK_OCF_PENDING:
return "OCF_PENDING";
case PCMK_OCF_CANCELLED:
return "OCF_CANCELLED";
case PCMK_OCF_TIMEOUT:
return "OCF_TIMEOUT";
case PCMK_OCF_OTHER_ERROR:
return "OCF_OTHER_ERROR";
case PCMK_OCF_DEGRADED:
return "OCF_DEGRADED";
case PCMK_OCF_DEGRADED_MASTER:
return "OCF_DEGRADED_MASTER";
default:
return "unknown";
}
}
/**
* \brief Get OCF equivalent of LSB exit code
*
* \param[in] action LSB action that produced exit code
* \param[in] lsb_exitcode Exit code of LSB action
*
* \return PCMK_OCF_* constant that corresponds to LSB exit code
*/
static inline enum ocf_exitcode
services_get_ocf_exitcode(const char *action, int lsb_exitcode)
{
/* For non-status actions, LSB and OCF share error code meaning <= 7 */
if (action && strcmp(action, "status") && strcmp(action, "monitor")) {
if ((lsb_exitcode < 0) || (lsb_exitcode > PCMK_LSB_NOT_RUNNING)) {
return PCMK_OCF_UNKNOWN_ERROR;
}
return (enum ocf_exitcode)lsb_exitcode;
}
/* status has different return codes */
switch (lsb_exitcode) {
case PCMK_LSB_STATUS_OK:
return PCMK_OCF_OK;
case PCMK_LSB_STATUS_NOT_INSTALLED:
return PCMK_OCF_NOT_INSTALLED;
case PCMK_LSB_STATUS_INSUFFICIENT_PRIV:
return PCMK_OCF_INSUFFICIENT_PRIV;
case PCMK_LSB_STATUS_VAR_PID:
case PCMK_LSB_STATUS_VAR_LOCK:
case PCMK_LSB_STATUS_NOT_RUNNING:
return PCMK_OCF_NOT_RUNNING;
}
return PCMK_OCF_UNKNOWN_ERROR;
}
# ifdef __cplusplus
}
# endif
#endif /* __PCMK_SERVICES__ */
diff --git a/lib/common/iso8601.c b/lib/common/iso8601.c
index 7f64edd020..9ed0027520 100644
--- a/lib/common/iso8601.c
+++ b/lib/common/iso8601.c
@@ -1,1718 +1,1720 @@
/*
* Copyright 2005-2019 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
/*
* References:
* https://en.wikipedia.org/wiki/ISO_8601
* http://www.staff.science.uu.nl/~gent0113/calendar/isocalendar.htm
*/
#include
#include
#include
#include
#include
#include
#include
#include
/*
* Andrew's code was originally written for OSes whose "struct tm" contains:
* long tm_gmtoff; :: Seconds east of UTC
* const char *tm_zone; :: Timezone abbreviation
* Some OSes lack these, instead having:
* time_t (or long) timezone;
:: "difference between UTC and local standard time"
* char *tzname[2] = { "...", "..." };
* I (David Lee) confess to not understanding the details. So my attempted
* generalisations for where their use is necessary may be flawed.
*
* 1. Does "difference between ..." subtract the same or opposite way?
* 2. Should it use "altzone" instead of "timezone"?
* 3. Should it use tzname[0] or tzname[1]? Interaction with timezone/altzone?
*/
#if defined(HAVE_STRUCT_TM_TM_GMTOFF)
# define GMTOFF(tm) ((tm)->tm_gmtoff)
#else
/* Note: extern variable; macro argument not actually used. */
# define GMTOFF(tm) (-timezone+daylight)
#endif
#define HOUR_SECONDS (60 * 60)
#define DAY_SECONDS (HOUR_SECONDS * 24)
// A date/time or duration
struct crm_time_s {
int years; // Calendar year (date/time) or number of years (duration)
int months; // Number of months (duration only)
int days; // Ordinal day of year (date/time) or number of days (duration)
int seconds; // Seconds of day (date/time) or number of seconds (duration)
int offset; // Seconds offset from UTC (date/time only)
bool duration; // True if duration
};
char *crm_time_as_string(crm_time_t * date_time, int flags);
static crm_time_t *parse_date(const char *date_str);
gboolean check_for_ordinal(const char *str);
static crm_time_t *
crm_get_utc_time(crm_time_t *dt)
{
crm_time_t *utc = NULL;
if (dt == NULL) {
errno = EINVAL;
return NULL;
}
utc = crm_time_new_undefined();
utc->years = dt->years;
utc->days = dt->days;
utc->seconds = dt->seconds;
utc->offset = 0;
if (dt->offset) {
crm_time_add_seconds(utc, -dt->offset);
} else {
/* Durations (which are the only things that can include months, never have a timezone */
utc->months = dt->months;
}
crm_time_log(LOG_TRACE, "utc-source", dt,
crm_time_log_date | crm_time_log_timeofday | crm_time_log_with_timezone);
crm_time_log(LOG_TRACE, "utc-target", utc,
crm_time_log_date | crm_time_log_timeofday | crm_time_log_with_timezone);
return utc;
}
crm_time_t *
crm_time_new(const char *date_time)
{
time_t tm_now;
crm_time_t *dt = NULL;
tzset();
if (date_time == NULL) {
tm_now = time(NULL);
dt = crm_time_new_undefined();
crm_time_set_timet(dt, &tm_now);
} else {
dt = parse_date(date_time);
}
return dt;
}
/*!
* \brief Allocate memory for an uninitialized time object
*
* \return Newly allocated time object
* \note The caller is responsible for freeing the return value using
* crm_time_free().
*/
crm_time_t *
crm_time_new_undefined()
{
crm_time_t *result = calloc(1, sizeof(crm_time_t));
CRM_ASSERT(result != NULL);
return result;
}
/*!
* \brief Check whether a time object has been initialized yet
*
* \param[in] t Time object to check
*
* \return TRUE if time object has been initialized, FALSE otherwise
*/
bool
crm_time_is_defined(const crm_time_t *t)
{
// Any nonzero member indicates something has been done to t
return (t != NULL) && (t->years || t->months || t->days || t->seconds
|| t->offset || t->duration);
}
void
crm_time_free(crm_time_t * dt)
{
if (dt == NULL) {
return;
}
free(dt);
}
static int
year_days(int year)
{
int d = 365;
if (crm_time_leapyear(year)) {
d++;
}
return d;
}
/* From http://myweb.ecu.edu/mccartyr/ISOwdALG.txt :
*
* 5. Find the Jan1Weekday for Y (Monday=1, Sunday=7)
* YY = (Y-1) % 100
* C = (Y-1) - YY
* G = YY + YY/4
* Jan1Weekday = 1 + (((((C / 100) % 4) x 5) + G) % 7)
*/
int
crm_time_january1_weekday(int year)
{
int YY = (year - 1) % 100;
int C = (year - 1) - YY;
int G = YY + YY / 4;
int jan1 = 1 + (((((C / 100) % 4) * 5) + G) % 7);
crm_trace("YY=%d, C=%d, G=%d", YY, C, G);
crm_trace("January 1 %.4d: %d", year, jan1);
return jan1;
}
int
crm_time_weeks_in_year(int year)
{
int weeks = 52;
int jan1 = crm_time_january1_weekday(year);
/* if jan1 == thursday */
if (jan1 == 4) {
weeks++;
} else {
jan1 = crm_time_january1_weekday(year + 1);
/* if dec31 == thursday aka. jan1 of next year is a friday */
if (jan1 == 5) {
weeks++;
}
}
return weeks;
}
// Jan-Dec plus Feb of leap years
static int month_days[13] = {
31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31, 29
};
/*!
* \brief Return number of days in given month of given year
*
* \param[in] Ordinal month (1-12)
* \param[in] Gregorian year
*
* \return Number of days in given month (0 if given month is invalid)
*/
int
crm_time_days_in_month(int month, int year)
{
if ((month < 1) || (month > 12)) {
return 0;
}
if ((month == 2) && crm_time_leapyear(year)) {
month = 13;
}
return month_days[month - 1];
}
bool
crm_time_leapyear(int year)
{
gboolean is_leap = FALSE;
if (year % 4 == 0) {
is_leap = TRUE;
}
if (year % 100 == 0 && year % 400 != 0) {
is_leap = FALSE;
}
return is_leap;
}
static uint32_t
get_ordinal_days(uint32_t y, uint32_t m, uint32_t d)
{
int lpc;
for (lpc = 1; lpc < m; lpc++) {
d += crm_time_days_in_month(lpc, y);
}
return d;
}
void
crm_time_log_alias(int log_level, const char *file, const char *function, int line,
const char *prefix, crm_time_t * date_time, int flags)
{
char *date_s = crm_time_as_string(date_time, flags);
if (log_level < LOG_CRIT) {
printf("%s%s%s\n",
(prefix? prefix : ""), (prefix? ": " : ""), date_s);
} else {
do_crm_log_alias(log_level, file, function, line, "%s%s%s",
(prefix? prefix : ""), (prefix? ": " : ""), date_s);
}
free(date_s);
}
static int
crm_time_get_sec(int sec, uint * h, uint * m, uint * s)
{
uint hours, minutes, seconds;
if (sec < 0) {
seconds = 0 - sec;
} else {
seconds = sec;
}
hours = seconds / HOUR_SECONDS;
seconds -= HOUR_SECONDS * hours;
minutes = seconds / 60;
seconds -= 60 * minutes;
crm_trace("%d == %.2d:%.2d:%.2d", sec, hours, minutes, seconds);
*h = hours;
*m = minutes;
*s = seconds;
return TRUE;
}
int
crm_time_get_timeofday(crm_time_t * dt, uint * h, uint * m, uint * s)
{
return crm_time_get_sec(dt->seconds, h, m, s);
}
int
crm_time_get_timezone(crm_time_t * dt, uint * h, uint * m)
{
uint s;
return crm_time_get_sec(dt->seconds, h, m, &s);
}
long long
crm_time_get_seconds(crm_time_t * dt)
{
int lpc;
crm_time_t *utc = NULL;
long long in_seconds = 0;
if (dt == NULL) {
return 0;
}
utc = crm_get_utc_time(dt);
if (utc == NULL) {
return 0;
}
for (lpc = 1; lpc < utc->years; lpc++) {
int dmax = year_days(lpc);
in_seconds += DAY_SECONDS * dmax;
}
/* utc->months is an offset that can only be set for a duration.
* By definition, the value is variable depending on the date to
* which it is applied.
*
* Force 30-day months so that something vaguely sane happens
* for anyone that tries to use a month in this way.
*/
if (utc->months > 0) {
in_seconds += DAY_SECONDS * 30 * utc->months;
}
if (utc->days > 0) {
in_seconds += DAY_SECONDS * (utc->days - 1);
}
in_seconds += utc->seconds;
crm_time_free(utc);
return in_seconds;
}
#define EPOCH_SECONDS 62135596800ULL /* Calculated using crm_time_get_seconds() */
long long
crm_time_get_seconds_since_epoch(crm_time_t * dt)
{
return (dt == NULL)? 0 : (crm_time_get_seconds(dt) - EPOCH_SECONDS);
}
int
crm_time_get_gregorian(crm_time_t * dt, uint * y, uint * m, uint * d)
{
int months = 0;
int days = dt->days;
if(dt->years != 0) {
for (months = 1; months <= 12 && days > 0; months++) {
int mdays = crm_time_days_in_month(months, dt->years);
if (mdays >= days) {
break;
} else {
days -= mdays;
}
}
} else if (dt->months) {
/* This is a duration including months, don't convert the days field */
months = dt->months;
} else {
/* This is a duration not including months, still don't convert the days field */
}
*y = dt->years;
*m = months;
*d = days;
crm_trace("%.4d-%.3d -> %.4d-%.2d-%.2d", dt->years, dt->days, dt->years, months, days);
return TRUE;
}
int
crm_time_get_ordinal(crm_time_t * dt, uint * y, uint * d)
{
*y = dt->years;
*d = dt->days;
return TRUE;
}
int
crm_time_get_isoweek(crm_time_t * dt, uint * y, uint * w, uint * d)
{
/*
* Monday 29 December 2008 is written "2009-W01-1"
* Sunday 3 January 2010 is written "2009-W53-7"
*/
int year_num = 0;
int jan1 = crm_time_january1_weekday(dt->years);
int h = -1;
CRM_CHECK(dt->days > 0, return FALSE);
/* 6. Find the Weekday for Y M D */
h = dt->days + jan1 - 1;
*d = 1 + ((h - 1) % 7);
/* 7. Find if Y M D falls in YearNumber Y-1, WeekNumber 52 or 53 */
if (dt->days <= (8 - jan1) && jan1 > 4) {
crm_trace("year--, jan1=%d", jan1);
year_num = dt->years - 1;
*w = crm_time_weeks_in_year(year_num);
} else {
year_num = dt->years;
}
/* 8. Find if Y M D falls in YearNumber Y+1, WeekNumber 1 */
if (year_num == dt->years) {
int dmax = year_days(year_num);
int correction = 4 - *d;
if ((dmax - dt->days) < correction) {
crm_trace("year++, jan1=%d, i=%d vs. %d", jan1, dmax - dt->days, correction);
year_num = dt->years + 1;
*w = 1;
}
}
/* 9. Find if Y M D falls in YearNumber Y, WeekNumber 1 through 53 */
if (year_num == dt->years) {
int j = dt->days + (7 - *d) + (jan1 - 1);
*w = j / 7;
if (jan1 > 4) {
*w -= 1;
}
}
*y = year_num;
crm_trace("Converted %.4d-%.3d to %.4d-W%.2d-%d", dt->years, dt->days, *y, *w, *d);
return TRUE;
}
#define DATE_MAX 128
#define s_if_plural(i) (((i) == 1)? "" : "s")
static void
crm_duration_as_string(crm_time_t *dt, char *result)
{
size_t offset = 0;
if (dt->years) {
offset += snprintf(result + offset, DATE_MAX - offset, "%4d year%s ",
dt->years, s_if_plural(dt->years));
}
if (dt->months) {
offset += snprintf(result + offset, DATE_MAX - offset, "%2d month%s ",
dt->months, s_if_plural(dt->months));
}
if (dt->days) {
offset += snprintf(result + offset, DATE_MAX - offset, "%2d day%s ",
dt->days, s_if_plural(dt->days));
}
if (((offset == 0) || (dt->seconds != 0))
&& (dt->seconds > -60) && (dt->seconds < 60)) {
offset += snprintf(result + offset, DATE_MAX - offset, "%d second%s",
dt->seconds, s_if_plural(dt->seconds));
} else if (dt->seconds) {
uint h = 0, m = 0, s = 0;
offset += snprintf(result + offset, DATE_MAX - offset, "%d seconds (",
dt->seconds);
crm_time_get_sec(dt->seconds, &h, &m, &s);
if (h) {
offset += snprintf(result + offset, DATE_MAX - offset, "%u hour%s%s",
h, s_if_plural(h), ((m || s)? " " : ""));
}
if (m) {
offset += snprintf(result + offset, DATE_MAX - offset, "%u minute%s%s",
m, s_if_plural(m), (s? " " : ""));
}
if (s) {
offset += snprintf(result + offset, DATE_MAX - offset, "%u second%s",
s, s_if_plural(s));
}
offset += snprintf(result + offset, DATE_MAX - offset, ")");
}
}
char *
crm_time_as_string(crm_time_t * date_time, int flags)
{
crm_time_t *dt = NULL;
crm_time_t *utc = NULL;
char result[DATE_MAX] = { '\0', };
char *result_copy = NULL;
size_t offset = 0;
// Convert to UTC if local timezone was not requested
if (date_time && date_time->offset
&& is_not_set(flags, crm_time_log_with_timezone)) {
crm_trace("UTC conversion");
utc = crm_get_utc_time(date_time);
dt = utc;
} else {
dt = date_time;
}
if (!crm_time_is_defined(dt)) {
strcpy(result, "");
goto done;
}
// Simple cases: as duration, seconds, or seconds since epoch
if (flags & crm_time_log_duration) {
crm_duration_as_string(date_time, result);
goto done;
}
if (flags & crm_time_seconds) {
snprintf(result, DATE_MAX, "%lld", crm_time_get_seconds(date_time));
goto done;
}
if (flags & crm_time_epoch) {
snprintf(result, DATE_MAX, "%lld",
crm_time_get_seconds_since_epoch(date_time));
goto done;
}
// As readable string
if (flags & crm_time_log_date) {
if (flags & crm_time_weeks) { // YYYY-WW-D
uint y, w, d;
if (crm_time_get_isoweek(dt, &y, &w, &d)) {
offset += snprintf(result + offset, DATE_MAX - offset,
"%u-W%.2u-%u", y, w, d);
}
} else if (flags & crm_time_ordinal) { // YYYY-DDD
uint y, d;
if (crm_time_get_ordinal(dt, &y, &d)) {
offset += snprintf(result + offset, DATE_MAX - offset,
"%u-%.3u", y, d);
}
} else { // YYYY-MM-DD
uint y, m, d;
if (crm_time_get_gregorian(dt, &y, &m, &d)) {
offset += snprintf(result + offset, DATE_MAX - offset,
"%.4u-%.2u-%.2u", y, m, d);
}
}
}
if (flags & crm_time_log_timeofday) {
uint h = 0, m = 0, s = 0;
if (offset > 0) {
offset += snprintf(result + offset, DATE_MAX - offset, " ");
}
if (crm_time_get_timeofday(dt, &h, &m, &s)) {
offset += snprintf(result + offset, DATE_MAX - offset,
"%.2u:%.2u:%.2u", h, m, s);
}
if ((flags & crm_time_log_with_timezone) && (dt->offset != 0)) {
crm_time_get_sec(dt->offset, &h, &m, &s);
offset += snprintf(result + offset, DATE_MAX - offset,
" %c%.2u:%.2u",
((dt->offset < 0)? '-' : '+'), h, m);
} else {
offset += snprintf(result + offset, DATE_MAX - offset, "Z");
}
}
done:
crm_time_free(utc);
result_copy = strdup(result);
CRM_ASSERT(result_copy != NULL);
return result_copy;
}
/*!
* \internal
* \brief Determine number of seconds from an hour:minute:second string
*
* \param[in] time_str Time specification string
* \param[out] result Number of seconds equivalent to time_str
*
* \return TRUE if specification was valid, FALSE (and set errno) otherwise
* \note This may return the number of seconds in a day (which is out of bounds
* for a time object) if given 24:00:00.
*/
static bool
crm_time_parse_sec(const char *time_str, int *result)
{
int rc;
uint hour = 0;
uint minute = 0;
uint second = 0;
*result = 0;
// Must have at least hour, but minutes and seconds are optional
rc = sscanf(time_str, "%d:%d:%d", &hour, &minute, &second);
if (rc == 1) {
rc = sscanf(time_str, "%2d%2d%2d", &hour, &minute, &second);
}
if (rc == 0) {
crm_err("%s is not a valid ISO 8601 time specification", time_str);
errno = EINVAL;
return FALSE;
}
crm_trace("Got valid time: %.2d:%.2d:%.2d", hour, minute, second);
if ((hour == 24) && (minute == 0) && (second == 0)) {
// Equivalent to 00:00:00 of next day, return number of seconds in day
} else if (hour >= 24) {
crm_err("%s is not a valid ISO 8601 time specification "
"because %d is not a valid hour", time_str, hour);
errno = EINVAL;
return FALSE;
}
if (minute >= 60) {
crm_err("%s is not a valid ISO 8601 time specification "
"because %d is not a valid minute", time_str, minute);
errno = EINVAL;
return FALSE;
}
if (second >= 60) {
crm_err("%s is not a valid ISO 8601 time specification "
"because %d is not a valid second", time_str, second);
errno = EINVAL;
return FALSE;
}
*result = (hour * HOUR_SECONDS) + (minute * 60) + second;
return TRUE;
}
static bool
crm_time_parse_offset(const char *offset_str, int *offset)
{
tzset();
if (offset_str == NULL) {
// Use local offset
#if defined(HAVE_STRUCT_TM_TM_GMTOFF)
time_t now = time(NULL);
struct tm *now_tm = localtime(&now);
#endif
int h_offset = GMTOFF(now_tm) / HOUR_SECONDS;
int m_offset = (GMTOFF(now_tm) - (HOUR_SECONDS * h_offset)) / 60;
if (h_offset < 0 && m_offset < 0) {
m_offset = 0 - m_offset;
}
*offset = (HOUR_SECONDS * h_offset) + (60 * m_offset);
return TRUE;
}
if (offset_str[0] == 'Z') { // @TODO invalid if anything after?
*offset = 0;
return TRUE;
}
*offset = 0;
if ((offset_str[0] == '+') || (offset_str[0] == '-')
|| isdigit((int)offset_str[0])) {
gboolean negate = FALSE;
if (offset_str[0] == '-') {
negate = TRUE;
offset_str++;
}
if (crm_time_parse_sec(offset_str, offset) == FALSE) {
return FALSE;
}
if (negate) {
*offset = 0 - *offset;
}
} // @TODO else invalid?
return TRUE;
}
/*!
* \internal
* \brief Parse the time portion of an ISO 8601 date/time string
*
* \param[in] time_str Time portion of specification (after any 'T')
* \param[in,out] a_time Time object to parse into
*
* \return TRUE if valid time was parsed, FALSE (and set errno) otherwise
* \note This may add a day to a_time (if the time is 24:00:00).
*/
static bool
crm_time_parse(const char *time_str, crm_time_t *a_time)
{
uint h, m, s;
char *offset_s = NULL;
tzset();
if (time_str) {
if (crm_time_parse_sec(time_str, &(a_time->seconds)) == FALSE) {
return FALSE;
}
offset_s = strstr(time_str, "Z");
if (offset_s == NULL) {
offset_s = strstr(time_str, " ");
if (offset_s) {
while (isspace(offset_s[0])) {
offset_s++;
}
}
}
}
if (crm_time_parse_offset(offset_s, &(a_time->offset)) == FALSE) {
return FALSE;
}
crm_time_get_sec(a_time->offset, &h, &m, &s);
crm_trace("Got tz: %c%2.d:%.2d", ((a_time->offset < 0)? '-' : '+'), h, m);
if (a_time->seconds == DAY_SECONDS) {
// 24:00:00 == 00:00:00 of next day
a_time->seconds = 0;
crm_time_add_days(a_time, 1);
}
return TRUE;
}
/*
* \internal
* \brief Parse a time object from an ISO 8601 date/time specification
*
* \param[in] date_str ISO 8601 date/time specification (or "epoch")
*
* \return New time object on success, NULL (and set errno) otherwise
*/
static crm_time_t *
parse_date(const char *date_str)
{
const char *time_s = NULL;
crm_time_t *dt = NULL;
int year = 0;
int month = 0;
int week = 0;
int day = 0;
int rc = 0;
if ((date_str == NULL) || (date_str[0] == '\0')) {
crm_err("No ISO 8601 date/time specification given");
goto invalid;
}
if ((date_str[0] == 'T') || (date_str[2] == ':')) {
/* Just a time supplied - Infer current date */
dt = crm_time_new(NULL);
if (date_str[0] == 'T') {
time_s = date_str + 1;
} else {
time_s = date_str;
}
goto parse_time;
}
dt = crm_time_new_undefined();
if (!strncasecmp("epoch", date_str, 5)
&& ((date_str[5] == '\0') || (date_str[5] == '/') || isspace(date_str[5]))) {
dt->days = 1;
dt->years = 1970;
crm_time_log(LOG_TRACE, "Unpacked", dt, crm_time_log_date | crm_time_log_timeofday);
return dt;
}
/* YYYY-MM-DD */
rc = sscanf(date_str, "%d-%d-%d", &year, &month, &day);
if (rc == 1) {
/* YYYYMMDD */
rc = sscanf(date_str, "%4d%2d%2d", &year, &month, &day);
}
if (rc == 3) {
if (month > 12) {
crm_err("'%s' is not a valid ISO 8601 date/time specification "
"because '%d' is not a valid month", date_str, month);
goto invalid;
} else if (day > crm_time_days_in_month(month, year)) {
crm_err("'%s' is not a valid ISO 8601 date/time specification "
"because '%d' is not a valid day of the month",
date_str, day);
goto invalid;
} else {
dt->years = year;
dt->days = get_ordinal_days(year, month, day);
crm_trace("Parsed Gregorian date '%.4d-%.3d' from date string '%s'",
year, dt->days, date_str);
}
goto parse_time;
}
/* YYYY-DDD */
rc = sscanf(date_str, "%d-%d", &year, &day);
if (rc == 2) {
if (day > year_days(year)) {
crm_err("'%s' is not a valid ISO 8601 date/time specification "
"because '%d' is not a valid day of the year (max %d)",
date_str, day, year_days(year));
goto invalid;
}
crm_trace("Parsed ordinal year %d and days %d from date string '%s'",
year, day, date_str);
dt->days = day;
dt->years = year;
goto parse_time;
}
/* YYYY-Www-D */
rc = sscanf(date_str, "%d-W%d-%d", &year, &week, &day);
if (rc == 3) {
if (week > crm_time_weeks_in_year(year)) {
crm_err("'%s' is not a valid ISO 8601 date/time specification "
"because '%d' is not a valid week of the year (max %d)",
date_str, week, crm_time_weeks_in_year(year));
goto invalid;
} else if (day < 1 || day > 7) {
crm_err("'%s' is not a valid ISO 8601 date/time specification "
"because '%d' is not a valid day of the week",
date_str, day);
goto invalid;
} else {
/*
* See https://en.wikipedia.org/wiki/ISO_week_date
*
* Monday 29 December 2008 is written "2009-W01-1"
* Sunday 3 January 2010 is written "2009-W53-7"
* Saturday 27 September 2008 is written "2008-W37-6"
*
* If 1 January is on a Monday, Tuesday, Wednesday or Thursday, it is in week 01.
* If 1 January is on a Friday, Saturday or Sunday, it is in week 52 or 53 of the previous year.
*/
int jan1 = crm_time_january1_weekday(year);
crm_trace("Got year %d (Jan 1 = %d), week %d, and day %d from date string '%s'",
year, jan1, week, day, date_str);
dt->years = year;
crm_time_add_days(dt, (week - 1) * 7);
if (jan1 <= 4) {
crm_time_add_days(dt, 1 - jan1);
} else {
crm_time_add_days(dt, 8 - jan1);
}
crm_time_add_days(dt, day);
}
goto parse_time;
}
crm_err("'%s' is not a valid ISO 8601 date/time specification", date_str);
goto invalid;
parse_time:
if (time_s == NULL) {
time_s = date_str + strspn(date_str, "0123456789-W");
if ((time_s[0] == ' ') || (time_s[0] == 'T')) {
++time_s;
} else {
time_s = NULL;
}
}
if ((time_s != NULL) && (crm_time_parse(time_s, dt) == FALSE)) {
goto invalid;
}
crm_time_log(LOG_TRACE, "Unpacked", dt, crm_time_log_date | crm_time_log_timeofday);
if (crm_time_check(dt) == FALSE) {
crm_err("'%s' is not a valid ISO 8601 date/time specification",
date_str);
goto invalid;
}
return dt;
invalid:
crm_time_free(dt);
errno = EINVAL;
return NULL;
}
// Parse an ISO 8601 numeric value and return number of characters consumed
// @TODO This cannot handle >INT_MAX int values
// @TODO Fractions appear to be not working
// @TODO Error out on invalid specifications
static int
parse_int(const char *str, int field_width, int upper_bound, int *result)
{
int lpc = 0;
int offset = 0;
int intermediate = 0;
gboolean fraction = FALSE;
gboolean negate = FALSE;
*result = 0;
if (*str == '\0') {
return 0;
}
if (str[offset] == 'T') {
offset++;
}
if (str[offset] == '.' || str[offset] == ',') {
fraction = TRUE;
field_width = -1;
offset++;
} else if (str[offset] == '-') {
negate = TRUE;
offset++;
} else if (str[offset] == '+' || str[offset] == ':') {
offset++;
}
for (; (fraction || lpc < field_width) && isdigit((int)str[offset]); lpc++) {
if (fraction) {
intermediate = (str[offset] - '0') / (10 ^ lpc);
} else {
*result *= 10;
intermediate = str[offset] - '0';
}
*result += intermediate;
offset++;
}
if (fraction) {
*result = (int)(*result * upper_bound);
} else if (upper_bound > 0 && *result > upper_bound) {
*result = upper_bound;
}
if (negate) {
*result = 0 - *result;
}
if (lpc > 0) {
crm_trace("Found int: %d. Stopped at str[%d]='%c'", *result, lpc, str[lpc]);
return offset;
}
return 0;
}
/*!
* \brief Parse a time duration from an ISO 8601 duration specification
*
* \param[in] period_s ISO 8601 duration specification (optionally followed by
* whitespace, after which the rest of the string will be
* ignored)
*
* \return New time object on success, NULL (and set errno) otherwise
* \note It is the caller's responsibility to return the result using
* crm_time_free().
*/
crm_time_t *
crm_time_parse_duration(const char *period_s)
{
gboolean is_time = FALSE;
crm_time_t *diff = NULL;
if ((period_s == NULL) || (period_s[0] == '\0')) {
crm_err("No ISO 8601 time duration given");
goto invalid;
}
if (period_s[0] != 'P') {
crm_err("'%s' is not a valid ISO 8601 time duration "
"because it does not start with a 'P'", period_s);
goto invalid;
}
if ((period_s[1] == '\0') || isspace(period_s[1])) {
crm_err("'%s' is not a valid ISO 8601 time duration "
"because nothing follows 'P'", period_s);
goto invalid;
}
diff = crm_time_new_undefined();
diff->duration = TRUE;
for (const char *current = period_s + 1;
current[0] && (current[0] != '/') && !isspace(current[0]);
++current) {
int an_int = 0, rc;
if (current[0] == 'T') {
/* A 'T' separates year/month/day from hour/minute/seconds. We don't
* require it strictly, but just use it to differentiate month from
* minutes.
*/
is_time = TRUE;
continue;
}
// An integer must be next
rc = parse_int(current, 10, 0, &an_int);
if (rc == 0) {
crm_err("'%s' is not a valid ISO 8601 time duration "
"because no integer at '%s'", period_s, current);
goto invalid;
}
current += rc;
// A time unit must be next (we're not strict about the order)
switch (current[0]) {
case 'Y':
diff->years = an_int;
break;
case 'M':
if (is_time) {
/* Minutes */
diff->seconds += an_int * 60;
} else {
diff->months = an_int;
}
break;
case 'W':
diff->days += an_int * 7;
break;
case 'D':
diff->days += an_int;
break;
case 'H':
diff->seconds += an_int * HOUR_SECONDS;
break;
case 'S':
diff->seconds += an_int;
break;
case '\0':
crm_err("'%s' is not a valid ISO 8601 time duration "
"because no units after %d", period_s, an_int);
goto invalid;
default:
crm_err("'%s' is not a valid ISO 8601 time duration "
"because '%c' is not a valid time unit",
period_s, current[0]);
goto invalid;
}
}
if (!crm_time_is_defined(diff)) {
crm_err("'%s' is not a valid ISO 8601 time duration "
"because no amounts and units given", period_s);
goto invalid;
}
return diff;
invalid:
crm_time_free(diff);
errno = EINVAL;
return NULL;
}
/*!
* \brief Parse a time period from an ISO 8601 interval specification
*
* \param[in] period_str ISO 8601 interval specification (start/end,
* start/duration, or duration/end)
*
* \return New time period object on success, NULL (and set errno) otherwise
* \note The caller is responsible for freeing the result using
* crm_time_free_period().
*/
crm_time_period_t *
crm_time_parse_period(const char *period_str)
{
const char *original = period_str;
crm_time_period_t *period = NULL;
if ((period_str == NULL) || (period_str[0] == '\0')) {
crm_err("No ISO 8601 time period given");
goto invalid;
}
tzset();
period = calloc(1, sizeof(crm_time_period_t));
CRM_ASSERT(period != NULL);
if (period_str[0] == 'P') {
period->diff = crm_time_parse_duration(period_str);
if (period->diff == NULL) {
goto error;
}
} else {
period->start = parse_date(period_str);
if (period->start == NULL) {
goto error;
}
}
period_str = strstr(original, "/");
if (period_str) {
++period_str;
if (period_str[0] == 'P') {
if (period->diff != NULL) {
crm_err("'%s' is not a valid ISO 8601 time period "
"because it has two durations",
original);
goto invalid;
}
period->diff = crm_time_parse_duration(period_str);
if (period->diff == NULL) {
goto error;
}
} else {
period->end = parse_date(period_str);
if (period->end == NULL) {
goto error;
}
}
} else if (period->diff != NULL) {
// Only duration given, assume start is now
period->start = crm_time_new(NULL);
} else {
// Only start given
crm_err("'%s' is not a valid ISO 8601 time period "
"because it has no duration or ending time",
original);
goto invalid;
}
if (period->start == NULL) {
period->start = crm_time_subtract(period->end, period->diff);
} else if (period->end == NULL) {
period->end = crm_time_add(period->start, period->diff);
}
if (crm_time_check(period->start) == FALSE) {
crm_err("'%s' is not a valid ISO 8601 time period "
"because the start is invalid", period_str);
goto invalid;
}
if (crm_time_check(period->end) == FALSE) {
crm_err("'%s' is not a valid ISO 8601 time period "
"because the end is invalid", period_str);
goto invalid;
}
return period;
invalid:
errno = EINVAL;
error:
crm_time_free_period(period);
return NULL;
}
/*!
* \brief Free a dynamically allocated time period object
*
* \param[in] period Time period to free
*/
void
crm_time_free_period(crm_time_period_t *period)
{
if (period) {
crm_time_free(period->start);
crm_time_free(period->end);
crm_time_free(period->diff);
free(period);
}
}
void
crm_time_set(crm_time_t * target, crm_time_t * source)
{
crm_trace("target=%p, source=%p", target, source);
CRM_CHECK(target != NULL && source != NULL, return);
target->years = source->years;
target->days = source->days;
target->months = source->months; /* Only for durations */
target->seconds = source->seconds;
target->offset = source->offset;
crm_time_log(LOG_TRACE, "source", source,
crm_time_log_date | crm_time_log_timeofday | crm_time_log_with_timezone);
crm_time_log(LOG_TRACE, "target", target,
crm_time_log_date | crm_time_log_timeofday | crm_time_log_with_timezone);
}
static void
ha_set_tm_time(crm_time_t * target, struct tm *source)
{
int h_offset = 0;
int m_offset = 0;
/* Ensure target is fully initialized */
target->years = 0;
target->months = 0;
target->days = 0;
target->seconds = 0;
target->offset = 0;
target->duration = FALSE;
if (source->tm_year > 0) {
/* years since 1900 */
target->years = 1900 + source->tm_year;
}
if (source->tm_yday >= 0) {
/* days since January 1 [0-365] */
target->days = 1 + source->tm_yday;
}
if (source->tm_hour >= 0) {
target->seconds += HOUR_SECONDS * source->tm_hour;
}
if (source->tm_min >= 0) {
target->seconds += 60 * source->tm_min;
}
if (source->tm_sec >= 0) {
target->seconds += source->tm_sec;
}
/* tm_gmtoff == offset from UTC in seconds */
h_offset = GMTOFF(source) / HOUR_SECONDS;
m_offset = (GMTOFF(source) - (HOUR_SECONDS * h_offset)) / 60;
crm_trace("Offset (s): %ld, offset (hh:mm): %.2d:%.2d", GMTOFF(source), h_offset, m_offset);
target->offset += HOUR_SECONDS * h_offset;
target->offset += 60 * m_offset;
}
void
crm_time_set_timet(crm_time_t * target, time_t * source)
{
ha_set_tm_time(target, localtime(source));
}
crm_time_t *
crm_time_add(crm_time_t * dt, crm_time_t * value)
{
crm_time_t *utc = NULL;
crm_time_t *answer = NULL;
if ((dt == NULL) || (value == NULL)) {
errno = EINVAL;
return NULL;
}
answer = crm_time_new_undefined();
crm_time_set(answer, dt);
utc = crm_get_utc_time(value);
if (utc == NULL) {
crm_time_free(answer);
return NULL;
}
answer->years += utc->years;
crm_time_add_months(answer, utc->months);
crm_time_add_days(answer, utc->days);
crm_time_add_seconds(answer, utc->seconds);
crm_time_free(utc);
return answer;
}
crm_time_t *
crm_time_calculate_duration(crm_time_t * dt, crm_time_t * value)
{
crm_time_t *utc = NULL;
crm_time_t *answer = NULL;
if ((dt == NULL) || (value == NULL)) {
errno = EINVAL;
return NULL;
}
utc = crm_get_utc_time(value);
if (utc == NULL) {
return NULL;
}
answer = crm_get_utc_time(dt);
if (answer == NULL) {
crm_time_free(utc);
return NULL;
}
answer->duration = TRUE;
answer->years -= utc->years;
if(utc->months != 0) {
crm_time_add_months(answer, -utc->months);
}
crm_time_add_days(answer, -utc->days);
crm_time_add_seconds(answer, -utc->seconds);
crm_time_free(utc);
return answer;
}
crm_time_t *
crm_time_subtract(crm_time_t * dt, crm_time_t * value)
{
crm_time_t *utc = NULL;
crm_time_t *answer = NULL;
if ((dt == NULL) || (value == NULL)) {
errno = EINVAL;
return NULL;
}
utc = crm_get_utc_time(value);
if (utc == NULL) {
return NULL;
}
answer = crm_time_new_undefined();
crm_time_set(answer, dt);
answer->years -= utc->years;
if(utc->months != 0) {
crm_time_add_months(answer, -utc->months);
}
crm_time_add_days(answer, -utc->days);
crm_time_add_seconds(answer, -utc->seconds);
return answer;
}
/*!
* \brief Check whether a time object represents a sensible date/time
*
* \param[in] dt Date/time object to check
*
* \return TRUE if years, days, and seconds are sensible, FALSE otherwise
*/
bool
crm_time_check(crm_time_t * dt)
{
return (dt != NULL)
&& (dt->days > 0) && (dt->days <= year_days(dt->years))
&& (dt->seconds >= 0) && (dt->seconds < DAY_SECONDS);
}
#define do_cmp_field(l, r, field) \
if(rc == 0) { \
if(l->field > r->field) { \
crm_trace("%s: %d > %d", \
#field, l->field, r->field); \
rc = 1; \
} else if(l->field < r->field) { \
crm_trace("%s: %d < %d", \
#field, l->field, r->field); \
rc = -1; \
} \
}
int
crm_time_compare(crm_time_t *a, crm_time_t *b)
{
int rc = 0;
crm_time_t *t1 = crm_get_utc_time(a);
crm_time_t *t2 = crm_get_utc_time(b);
if ((t1 == NULL) && (t2 == NULL)) {
return 0;
} else if (t1 == NULL) {
return -1;
} else if (t2 == NULL) {
return 1;
}
do_cmp_field(t1, t2, years);
do_cmp_field(t1, t2, days);
do_cmp_field(t1, t2, seconds);
crm_time_free(t1);
crm_time_free(t2);
return rc;
}
/*!
* \brief Add a given number of seconds to a date/time or duration
*
* \param[in] a_time Date/time or duration to add seconds to
* \param[in] extra Number of seconds to add
*/
void
crm_time_add_seconds(crm_time_t *a_time, int extra)
{
int days = 0;
crm_trace("Adding %d seconds to %d (max=%d)",
extra, a_time->seconds, DAY_SECONDS);
a_time->seconds += extra;
days = a_time->seconds / DAY_SECONDS;
a_time->seconds %= DAY_SECONDS;
// Don't have negative seconds
if (a_time->seconds < 0) {
a_time->seconds += DAY_SECONDS;
--days;
}
crm_time_add_days(a_time, days);
}
void
crm_time_add_days(crm_time_t * a_time, int extra)
{
int lower_bound = 1;
int ydays = crm_time_leapyear(a_time->years) ? 366 : 365;
crm_trace("Adding %d days to %.4d-%.3d", extra, a_time->years, a_time->days);
a_time->days += extra;
while (a_time->days > ydays) {
a_time->years++;
a_time->days -= ydays;
ydays = crm_time_leapyear(a_time->years) ? 366 : 365;
}
if(a_time->duration) {
lower_bound = 0;
}
while (a_time->days < lower_bound) {
a_time->years--;
a_time->days += crm_time_leapyear(a_time->years) ? 366 : 365;
}
}
void
crm_time_add_months(crm_time_t * a_time, int extra)
{
int lpc;
uint32_t y, m, d, dmax;
crm_time_get_gregorian(a_time, &y, &m, &d);
crm_trace("Adding %d months to %.4d-%.2d-%.2d", extra, y, m, d);
if (extra > 0) {
for (lpc = extra; lpc > 0; lpc--) {
m++;
if (m == 13) {
m = 1;
y++;
}
}
} else {
for (lpc = -extra; lpc > 0; lpc--) {
m--;
if (m == 0) {
m = 12;
y--;
}
}
}
dmax = crm_time_days_in_month(m, y);
if (dmax < d) {
/* Preserve day-of-month unless the month doesn't have enough days */
d = dmax;
}
crm_trace("Calculated %.4d-%.2d-%.2d", y, m, d);
a_time->years = y;
a_time->days = get_ordinal_days(y, m, d);
crm_time_get_gregorian(a_time, &y, &m, &d);
crm_trace("Got %.4d-%.2d-%.2d", y, m, d);
}
void
crm_time_add_minutes(crm_time_t * a_time, int extra)
{
crm_time_add_seconds(a_time, extra * 60);
}
void
crm_time_add_hours(crm_time_t * a_time, int extra)
{
crm_time_add_seconds(a_time, extra * HOUR_SECONDS);
}
void
crm_time_add_weeks(crm_time_t * a_time, int extra)
{
crm_time_add_days(a_time, extra * 7);
}
void
crm_time_add_years(crm_time_t * a_time, int extra)
{
a_time->years += extra;
}
static void
ha_get_tm_time( struct tm *target, crm_time_t *source)
{
*target = (struct tm) {
.tm_year = source->years - 1900,
.tm_mday = source->days,
.tm_sec = source->seconds % 60,
.tm_min = ( source->seconds / 60 ) % 60,
.tm_hour = source->seconds / HOUR_SECONDS,
.tm_isdst = -1, /* don't adjust */
#if defined(HAVE_STRUCT_TM_TM_GMTOFF)
.tm_gmtoff = source->offset
#endif
};
mktime(target);
}
crm_time_hr_t *
crm_time_hr_convert(crm_time_hr_t *target, crm_time_t *dt)
{
crm_time_hr_t *hr_dt = NULL;
if (dt) {
hr_dt = target?target:calloc(1, sizeof(crm_time_hr_t));
CRM_ASSERT(hr_dt != NULL);
*hr_dt = (crm_time_hr_t) {
.years = dt->years,
.months = dt->months,
.days = dt->days,
.seconds = dt->seconds,
.offset = dt->offset,
.duration = dt->duration
};
}
return hr_dt;
}
void
crm_time_set_hr_dt(crm_time_t *target, crm_time_hr_t *hr_dt)
{
CRM_ASSERT((hr_dt) && (target));
*target = (crm_time_t) {
.years = hr_dt->years,
.months = hr_dt->months,
.days = hr_dt->days,
.seconds = hr_dt->seconds,
.offset = hr_dt->offset,
.duration = hr_dt->duration
};
}
crm_time_hr_t *
crm_time_timeval_hr_convert(crm_time_hr_t *target, struct timeval *tv)
{
crm_time_t dt;
crm_time_hr_t *ret;
crm_time_set_timet(&dt, &tv->tv_sec);
ret = crm_time_hr_convert(target, &dt);
if (ret) {
ret->useconds = tv->tv_usec;
}
return ret;
}
crm_time_hr_t *
crm_time_hr_new(const char *date_time)
{
crm_time_hr_t *hr_dt = NULL;
struct timeval tv_now;
if (!date_time) {
if (gettimeofday(&tv_now, NULL) == 0) {
hr_dt = crm_time_timeval_hr_convert(NULL, &tv_now);
}
} else {
crm_time_t *dt;
dt = parse_date(date_time);
hr_dt = crm_time_hr_convert(NULL, dt);
crm_time_free(dt);
}
return hr_dt;
}
void
crm_time_hr_free(crm_time_hr_t * hr_dt)
{
free(hr_dt);
}
char *
crm_time_format_hr(const char *format, crm_time_hr_t * hr_dt)
{
const char *mark_s;
int max = 128, scanned_pos = 0, printed_pos = 0, fmt_pos = 0,
date_len = 0, nano_digits = 0;
char nano_s[10], date_s[max+1], nanofmt_s[5] = "%", *tmp_fmt_s;
struct tm tm;
crm_time_t dt;
if (!format) {
return NULL;
}
crm_time_set_hr_dt(&dt, hr_dt);
ha_get_tm_time(&tm, &dt);
sprintf(nano_s, "%06d000", hr_dt->useconds);
while ((format[scanned_pos]) != '\0') {
mark_s = strchr(&format[scanned_pos], '%');
if (mark_s) {
int fmt_len = 1;
fmt_pos = mark_s - format;
while ((format[fmt_pos+fmt_len] != '\0') &&
(format[fmt_pos+fmt_len] >= '0') &&
(format[fmt_pos+fmt_len] <= '9')) {
fmt_len++;
}
scanned_pos = fmt_pos + fmt_len + 1;
if (format[fmt_pos+fmt_len] == 'N') {
nano_digits = atoi(&format[fmt_pos+1]);
nano_digits = (nano_digits > 6)?6:nano_digits;
nano_digits = (nano_digits < 0)?0:nano_digits;
sprintf(&nanofmt_s[1], ".%ds", nano_digits);
} else {
if (format[scanned_pos] != '\0') {
continue;
}
fmt_pos = scanned_pos; /* print till end */
}
} else {
scanned_pos = strlen(format);
fmt_pos = scanned_pos; /* print till end */
}
tmp_fmt_s = strndup(&format[printed_pos], fmt_pos - printed_pos);
#ifdef GCC_FORMAT_NONLITERAL_CHECKING_ENABLED
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
#endif
date_len += strftime(&date_s[date_len], max-date_len, tmp_fmt_s, &tm);
#ifdef GCC_FORMAT_NONLITERAL_CHECKING_ENABLED
#pragma GCC diagnostic pop
#endif
printed_pos = scanned_pos;
free(tmp_fmt_s);
if (nano_digits) {
#ifdef GCC_FORMAT_NONLITERAL_CHECKING_ENABLED
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
#endif
date_len += snprintf(&date_s[date_len], max-date_len,
nanofmt_s, nano_s);
#ifdef GCC_FORMAT_NONLITERAL_CHECKING_ENABLED
#pragma GCC diagnostic pop
#endif
nano_digits = 0;
}
}
return (date_len == 0)?NULL:strdup(date_s);
}
/*!
* \internal
- * \brief Return human-friendly string representing current time
+ * \brief Return human-friendly string corresponding to a time
*
- * \return Current time as string (as by ctime() but without newline) on success
- * or "Could not determine current time" on error
+ * \param[in] when Pointer to epoch time value (or NULL for current time)
+ *
+ * \return Current time as string (as by ctime() but without newline) on
+ * success, NULL otherwise
* \note The return value points to a statically allocated string which might be
* overwritten by subsequent calls to any of the C library date and time functions.
*/
const char *
crm_now_string(time_t *when)
{
char *since_epoch = NULL;
if (when == NULL) {
time_t a_time = time(NULL);
if (a_time == (time_t) -1) {
return NULL;
} else {
since_epoch = ctime(&a_time);
}
} else {
since_epoch = ctime(when);
}
if (since_epoch == NULL) {
return NULL;
} else {
return crm_strip_trailing_newline(since_epoch);
}
}
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 64c1979bf8..d33775838c 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -1,3774 +1,3842 @@
/*
* Copyright 2004-2019 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#include
+#include
+#include
#include
#include
#include
#include
#include
#include
#include
#include
+#include
#include
#include
CRM_TRACE_INIT_DATA(pe_status);
#define set_config_flag(data_set, option, flag) do { \
const char *tmp = pe_pref(data_set->config_hash, option); \
if(tmp) { \
if(crm_is_true(tmp)) { \
set_bit(data_set->flags, flag); \
} else { \
clear_bit(data_set->flags, flag); \
} \
} \
} while(0)
static void unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op,
xmlNode **last_failure,
enum action_fail_response *failed,
pe_working_set_t *data_set);
static gboolean determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node);
static void add_node_attrs(xmlNode *attrs, pe_node_t *node, bool overwrite,
pe_working_set_t *data_set);
// Bitmask for warnings we only want to print once
uint32_t pe_wo = 0;
static gboolean
is_dangling_guest_node(node_t *node)
{
/* we are looking for a remote-node that was supposed to be mapped to a
* container resource, but all traces of that container have disappeared
* from both the config and the status section. */
if (pe__is_guest_or_remote_node(node) &&
node->details->remote_rsc &&
node->details->remote_rsc->container == NULL &&
is_set(node->details->remote_rsc->flags, pe_rsc_orphan_container_filler)) {
return TRUE;
}
return FALSE;
}
/*!
* \brief Schedule a fence action for a node
*
* \param[in,out] data_set Current working set of cluster
* \param[in,out] node Node to fence
* \param[in] reason Text description of why fencing is needed
*/
void
pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
{
CRM_CHECK(node, return);
/* A guest node is fenced by marking its container as failed */
if (pe__is_guest_node(node)) {
resource_t *rsc = node->details->remote_rsc->container;
if (is_set(rsc->flags, pe_rsc_failed) == FALSE) {
if (!is_set(rsc->flags, pe_rsc_managed)) {
crm_notice("Not fencing guest node %s "
"(otherwise would because %s): "
"its guest resource %s is unmanaged",
node->details->uname, reason, rsc->id);
} else {
crm_warn("Guest node %s will be fenced "
"(by recovering its guest resource %s): %s",
node->details->uname, rsc->id, reason);
/* We don't mark the node as unclean because that would prevent the
* node from running resources. We want to allow it to run resources
* in this transition if the recovery succeeds.
*/
node->details->remote_requires_reset = TRUE;
set_bit(rsc->flags, pe_rsc_failed);
}
}
} else if (is_dangling_guest_node(node)) {
crm_info("Cleaning up dangling connection for guest node %s: "
"fencing was already done because %s, "
"and guest resource no longer exists",
node->details->uname, reason);
set_bit(node->details->remote_rsc->flags, pe_rsc_failed);
} else if (pe__is_remote_node(node)) {
resource_t *rsc = node->details->remote_rsc;
if (rsc && (!is_set(rsc->flags, pe_rsc_managed))) {
crm_notice("Not fencing remote node %s "
"(otherwise would because %s): connection is unmanaged",
node->details->uname, reason);
} else if(node->details->remote_requires_reset == FALSE) {
node->details->remote_requires_reset = TRUE;
crm_warn("Remote node %s %s: %s",
node->details->uname,
pe_can_fence(data_set, node)? "will be fenced" : "is unclean",
reason);
}
node->details->unclean = TRUE;
pe_fence_op(node, NULL, TRUE, reason, data_set);
} else if (node->details->unclean) {
crm_trace("Cluster node %s %s because %s",
node->details->uname,
pe_can_fence(data_set, node)? "would also be fenced" : "also is unclean",
reason);
} else {
crm_warn("Cluster node %s %s: %s",
node->details->uname,
pe_can_fence(data_set, node)? "will be fenced" : "is unclean",
reason);
node->details->unclean = TRUE;
pe_fence_op(node, NULL, TRUE, reason, data_set);
}
}
// @TODO xpaths can't handle templates, rules, or id-refs
// nvpair with provides or requires set to unfencing
#define XPATH_UNFENCING_NVPAIR XML_CIB_TAG_NVPAIR \
"[(@" XML_NVPAIR_ATTR_NAME "='" XML_RSC_ATTR_PROVIDES "'" \
"or @" XML_NVPAIR_ATTR_NAME "='" XML_RSC_ATTR_REQUIRES "') " \
"and @" XML_NVPAIR_ATTR_VALUE "='unfencing']"
// unfencing in rsc_defaults or any resource
#define XPATH_ENABLE_UNFENCING \
"/" XML_TAG_CIB "/" XML_CIB_TAG_CONFIGURATION "/" XML_CIB_TAG_RESOURCES \
"//" XML_TAG_META_SETS "/" XPATH_UNFENCING_NVPAIR \
"|/" XML_TAG_CIB "/" XML_CIB_TAG_CONFIGURATION "/" XML_CIB_TAG_RSCCONFIG \
"/" XML_TAG_META_SETS "/" XPATH_UNFENCING_NVPAIR
static
void set_if_xpath(unsigned long long flag, const char *xpath,
pe_working_set_t *data_set)
{
xmlXPathObjectPtr result = NULL;
if (is_not_set(data_set->flags, flag)) {
result = xpath_search(data_set->input, xpath);
if (result && (numXpathResults(result) > 0)) {
set_bit(data_set->flags, flag);
}
freeXpathObject(result);
}
}
gboolean
unpack_config(xmlNode * config, pe_working_set_t * data_set)
{
const char *value = NULL;
GHashTable *config_hash = crm_str_table_new();
data_set->config_hash = config_hash;
pe__unpack_dataset_nvpairs(config, XML_CIB_TAG_PROPSET, NULL, config_hash,
CIB_OPTIONS_FIRST, FALSE, data_set);
verify_pe_options(data_set->config_hash);
set_config_flag(data_set, "enable-startup-probes", pe_flag_startup_probes);
if(is_not_set(data_set->flags, pe_flag_startup_probes)) {
crm_info("Startup probes: disabled (dangerous)");
}
value = pe_pref(data_set->config_hash, XML_ATTR_HAVE_WATCHDOG);
if (value && crm_is_true(value)) {
crm_notice("Watchdog will be used via SBD if fencing is required "
"and stonith-watchdog-timeout is nonzero");
set_bit(data_set->flags, pe_flag_have_stonith_resource);
}
/* Set certain flags via xpath here, so they can be used before the relevant
* configuration sections are unpacked.
*/
set_if_xpath(pe_flag_enable_unfencing, XPATH_ENABLE_UNFENCING, data_set);
value = pe_pref(data_set->config_hash, "stonith-timeout");
data_set->stonith_timeout = (int) crm_parse_interval_spec(value);
crm_debug("STONITH timeout: %d", data_set->stonith_timeout);
set_config_flag(data_set, "stonith-enabled", pe_flag_stonith_enabled);
crm_debug("STONITH of failed nodes is %s",
is_set(data_set->flags, pe_flag_stonith_enabled) ? "enabled" : "disabled");
data_set->stonith_action = pe_pref(data_set->config_hash, "stonith-action");
if (!strcmp(data_set->stonith_action, "poweroff")) {
pe_warn_once(pe_wo_poweroff,
"Support for stonith-action of 'poweroff' is deprecated "
"and will be removed in a future release (use 'off' instead)");
data_set->stonith_action = "off";
}
crm_trace("STONITH will %s nodes", data_set->stonith_action);
set_config_flag(data_set, "concurrent-fencing", pe_flag_concurrent_fencing);
crm_debug("Concurrent fencing is %s",
is_set(data_set->flags, pe_flag_concurrent_fencing) ? "enabled" : "disabled");
set_config_flag(data_set, "stop-all-resources", pe_flag_stop_everything);
crm_debug("Stop all active resources: %s",
is_set(data_set->flags, pe_flag_stop_everything) ? "true" : "false");
set_config_flag(data_set, "symmetric-cluster", pe_flag_symmetric_cluster);
if (is_set(data_set->flags, pe_flag_symmetric_cluster)) {
crm_debug("Cluster is symmetric" " - resources can run anywhere by default");
}
value = pe_pref(data_set->config_hash, "no-quorum-policy");
if (safe_str_eq(value, "ignore")) {
data_set->no_quorum_policy = no_quorum_ignore;
} else if (safe_str_eq(value, "freeze")) {
data_set->no_quorum_policy = no_quorum_freeze;
} else if (safe_str_eq(value, "suicide")) {
if (is_set(data_set->flags, pe_flag_stonith_enabled)) {
int do_panic = 0;
crm_element_value_int(data_set->input, XML_ATTR_QUORUM_PANIC,
&do_panic);
if (do_panic || is_set(data_set->flags, pe_flag_have_quorum)) {
data_set->no_quorum_policy = no_quorum_suicide;
} else {
crm_notice("Resetting no-quorum-policy to 'stop': cluster has never had quorum");
data_set->no_quorum_policy = no_quorum_stop;
}
} else {
crm_config_err("Resetting no-quorum-policy to 'stop': stonith is not configured");
data_set->no_quorum_policy = no_quorum_stop;
}
} else {
data_set->no_quorum_policy = no_quorum_stop;
}
switch (data_set->no_quorum_policy) {
case no_quorum_freeze:
crm_debug("On loss of quorum: Freeze resources");
break;
case no_quorum_stop:
crm_debug("On loss of quorum: Stop ALL resources");
break;
case no_quorum_suicide:
crm_notice("On loss of quorum: Fence all remaining nodes");
break;
case no_quorum_ignore:
crm_notice("On loss of quorum: Ignore");
break;
}
set_config_flag(data_set, "stop-orphan-resources", pe_flag_stop_rsc_orphans);
crm_trace("Orphan resources are %s",
is_set(data_set->flags, pe_flag_stop_rsc_orphans) ? "stopped" : "ignored");
set_config_flag(data_set, "stop-orphan-actions", pe_flag_stop_action_orphans);
crm_trace("Orphan resource actions are %s",
is_set(data_set->flags, pe_flag_stop_action_orphans) ? "stopped" : "ignored");
set_config_flag(data_set, "remove-after-stop", pe_flag_remove_after_stop);
crm_trace("Stopped resources are removed from the status section: %s",
is_set(data_set->flags, pe_flag_remove_after_stop) ? "true" : "false");
set_config_flag(data_set, "maintenance-mode", pe_flag_maintenance_mode);
crm_trace("Maintenance mode: %s",
is_set(data_set->flags, pe_flag_maintenance_mode) ? "true" : "false");
set_config_flag(data_set, "start-failure-is-fatal", pe_flag_start_failure_fatal);
crm_trace("Start failures are %s",
is_set(data_set->flags,
pe_flag_start_failure_fatal) ? "always fatal" : "handled by failcount");
if (is_set(data_set->flags, pe_flag_stonith_enabled)) {
set_config_flag(data_set, "startup-fencing", pe_flag_startup_fencing);
}
if (is_set(data_set->flags, pe_flag_startup_fencing)) {
crm_trace("Unseen nodes will be fenced");
} else {
pe_warn_once(pe_wo_blind, "Blind faith: not fencing unseen nodes");
}
node_score_red = char2score(pe_pref(data_set->config_hash, "node-health-red"));
node_score_green = char2score(pe_pref(data_set->config_hash, "node-health-green"));
node_score_yellow = char2score(pe_pref(data_set->config_hash, "node-health-yellow"));
crm_debug("Node scores: 'red' = %s, 'yellow' = %s, 'green' = %s",
pe_pref(data_set->config_hash, "node-health-red"),
pe_pref(data_set->config_hash, "node-health-yellow"),
pe_pref(data_set->config_hash, "node-health-green"));
data_set->placement_strategy = pe_pref(data_set->config_hash, "placement-strategy");
crm_trace("Placement strategy: %s", data_set->placement_strategy);
return TRUE;
}
static void
destroy_digest_cache(gpointer ptr)
{
op_digest_cache_t *data = ptr;
free_xml(data->params_all);
free_xml(data->params_secure);
free_xml(data->params_restart);
free(data->digest_all_calc);
free(data->digest_restart_calc);
free(data->digest_secure_calc);
free(data);
}
node_t *
pe_create_node(const char *id, const char *uname, const char *type,
const char *score, pe_working_set_t * data_set)
{
node_t *new_node = NULL;
if (pe_find_node(data_set->nodes, uname) != NULL) {
crm_config_warn("Detected multiple node entries with uname=%s"
" - this is rarely intended", uname);
}
new_node = calloc(1, sizeof(node_t));
if (new_node == NULL) {
return NULL;
}
new_node->weight = char2score(score);
new_node->fixed = FALSE;
new_node->details = calloc(1, sizeof(struct pe_node_shared_s));
if (new_node->details == NULL) {
free(new_node);
return NULL;
}
crm_trace("Creating node for entry %s/%s", uname, id);
new_node->details->id = id;
new_node->details->uname = uname;
new_node->details->online = FALSE;
new_node->details->shutdown = FALSE;
new_node->details->rsc_discovery_enabled = TRUE;
new_node->details->running_rsc = NULL;
new_node->details->type = node_ping;
if (safe_str_eq(type, "remote")) {
new_node->details->type = node_remote;
set_bit(data_set->flags, pe_flag_have_remote_nodes);
} else if ((type == NULL) || safe_str_eq(type, "member")) {
new_node->details->type = node_member;
}
new_node->details->attrs = crm_str_table_new();
if (pe__is_guest_or_remote_node(new_node)) {
g_hash_table_insert(new_node->details->attrs, strdup(CRM_ATTR_KIND),
strdup("remote"));
} else {
g_hash_table_insert(new_node->details->attrs, strdup(CRM_ATTR_KIND),
strdup("cluster"));
}
new_node->details->utilization = crm_str_table_new();
new_node->details->digest_cache = g_hash_table_new_full(crm_str_hash,
g_str_equal, free,
destroy_digest_cache);
data_set->nodes = g_list_insert_sorted(data_set->nodes, new_node, sort_node_uname);
return new_node;
}
bool
remote_id_conflict(const char *remote_name, pe_working_set_t *data)
{
bool match = FALSE;
#if 1
pe_find_resource(data->resources, remote_name);
#else
if (data->name_check == NULL) {
data->name_check = g_hash_table_new(crm_str_hash, g_str_equal);
for (xml_rsc = __xml_first_child_element(parent); xml_rsc != NULL;
xml_rsc = __xml_next_element(xml_rsc)) {
const char *id = ID(xml_rsc);
/* avoiding heap allocation here because we know the duration of this hashtable allows us to */
g_hash_table_insert(data->name_check, (char *) id, (char *) id);
}
}
if (g_hash_table_lookup(data->name_check, remote_name)) {
match = TRUE;
}
#endif
if (match) {
crm_err("Invalid remote-node name, a resource called '%s' already exists.", remote_name);
return NULL;
}
return match;
}
static const char *
expand_remote_rsc_meta(xmlNode *xml_obj, xmlNode *parent, pe_working_set_t *data)
{
xmlNode *attr_set = NULL;
xmlNode *attr = NULL;
const char *container_id = ID(xml_obj);
const char *remote_name = NULL;
const char *remote_server = NULL;
const char *remote_port = NULL;
const char *connect_timeout = "60s";
const char *remote_allow_migrate=NULL;
const char *is_managed = NULL;
for (attr_set = __xml_first_child_element(xml_obj); attr_set != NULL;
attr_set = __xml_next_element(attr_set)) {
if (safe_str_neq((const char *)attr_set->name, XML_TAG_META_SETS)) {
continue;
}
for (attr = __xml_first_child_element(attr_set); attr != NULL;
attr = __xml_next_element(attr)) {
const char *value = crm_element_value(attr, XML_NVPAIR_ATTR_VALUE);
const char *name = crm_element_value(attr, XML_NVPAIR_ATTR_NAME);
if (safe_str_eq(name, XML_RSC_ATTR_REMOTE_NODE)) {
remote_name = value;
} else if (safe_str_eq(name, "remote-addr")) {
remote_server = value;
} else if (safe_str_eq(name, "remote-port")) {
remote_port = value;
} else if (safe_str_eq(name, "remote-connect-timeout")) {
connect_timeout = value;
} else if (safe_str_eq(name, "remote-allow-migrate")) {
remote_allow_migrate=value;
} else if (safe_str_eq(name, XML_RSC_ATTR_MANAGED)) {
is_managed = value;
}
}
}
if (remote_name == NULL) {
return NULL;
}
if (remote_id_conflict(remote_name, data)) {
return NULL;
}
pe_create_remote_xml(parent, remote_name, container_id,
remote_allow_migrate, is_managed,
connect_timeout, remote_server, remote_port);
return remote_name;
}
static void
handle_startup_fencing(pe_working_set_t *data_set, node_t *new_node)
{
if ((new_node->details->type == node_remote) && (new_node->details->remote_rsc == NULL)) {
/* Ignore fencing for remote nodes that don't have a connection resource
* associated with them. This happens when remote node entries get left
* in the nodes section after the connection resource is removed.
*/
return;
}
if (is_set(data_set->flags, pe_flag_startup_fencing)) {
// All nodes are unclean until we've seen their status entry
new_node->details->unclean = TRUE;
} else {
// Blind faith ...
new_node->details->unclean = FALSE;
}
/* We need to be able to determine if a node's status section
* exists or not separate from whether the node is unclean. */
new_node->details->unseen = TRUE;
}
gboolean
unpack_nodes(xmlNode * xml_nodes, pe_working_set_t * data_set)
{
xmlNode *xml_obj = NULL;
node_t *new_node = NULL;
const char *id = NULL;
const char *uname = NULL;
const char *type = NULL;
const char *score = NULL;
for (xml_obj = __xml_first_child_element(xml_nodes); xml_obj != NULL;
xml_obj = __xml_next_element(xml_obj)) {
if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_NODE, TRUE)) {
new_node = NULL;
id = crm_element_value(xml_obj, XML_ATTR_ID);
uname = crm_element_value(xml_obj, XML_ATTR_UNAME);
type = crm_element_value(xml_obj, XML_ATTR_TYPE);
score = crm_element_value(xml_obj, XML_RULE_ATTR_SCORE);
crm_trace("Processing node %s/%s", uname, id);
if (id == NULL) {
crm_config_err("Must specify id tag in ");
continue;
}
new_node = pe_create_node(id, uname, type, score, data_set);
if (new_node == NULL) {
return FALSE;
}
/* if(data_set->have_quorum == FALSE */
/* && data_set->no_quorum_policy == no_quorum_stop) { */
/* /\* start shutting resources down *\/ */
/* new_node->weight = -INFINITY; */
/* } */
handle_startup_fencing(data_set, new_node);
add_node_attrs(xml_obj, new_node, FALSE, data_set);
pe__unpack_dataset_nvpairs(xml_obj, XML_TAG_UTILIZATION, NULL,
new_node->details->utilization, NULL,
FALSE, data_set);
crm_trace("Done with node %s", crm_element_value(xml_obj, XML_ATTR_UNAME));
}
}
if (data_set->localhost && pe_find_node(data_set->nodes, data_set->localhost) == NULL) {
crm_info("Creating a fake local node");
pe_create_node(data_set->localhost, data_set->localhost, NULL, 0,
data_set);
}
return TRUE;
}
static void
setup_container(resource_t * rsc, pe_working_set_t * data_set)
{
const char *container_id = NULL;
if (rsc->children) {
GListPtr gIter = rsc->children;
for (; gIter != NULL; gIter = gIter->next) {
resource_t *child_rsc = (resource_t *) gIter->data;
setup_container(child_rsc, data_set);
}
return;
}
container_id = g_hash_table_lookup(rsc->meta, XML_RSC_ATTR_CONTAINER);
if (container_id && safe_str_neq(container_id, rsc->id)) {
resource_t *container = pe_find_resource(data_set->resources, container_id);
if (container) {
rsc->container = container;
set_bit(container->flags, pe_rsc_is_container);
container->fillers = g_list_append(container->fillers, rsc);
pe_rsc_trace(rsc, "Resource %s's container is %s", rsc->id, container_id);
} else {
pe_err("Resource %s: Unknown resource container (%s)", rsc->id, container_id);
}
}
}
gboolean
unpack_remote_nodes(xmlNode * xml_resources, pe_working_set_t * data_set)
{
xmlNode *xml_obj = NULL;
/* Create remote nodes and guest nodes from the resource configuration
* before unpacking resources.
*/
for (xml_obj = __xml_first_child_element(xml_resources); xml_obj != NULL;
xml_obj = __xml_next_element(xml_obj)) {
const char *new_node_id = NULL;
/* Check for remote nodes, which are defined by ocf:pacemaker:remote
* primitives.
*/
if (xml_contains_remote_node(xml_obj)) {
new_node_id = ID(xml_obj);
/* The "pe_find_node" check is here to make sure we don't iterate over
* an expanded node that has already been added to the node list. */
if (new_node_id && pe_find_node(data_set->nodes, new_node_id) == NULL) {
crm_trace("Found remote node %s defined by resource %s",
new_node_id, ID(xml_obj));
pe_create_node(new_node_id, new_node_id, "remote", NULL,
data_set);
}
continue;
}
/* Check for guest nodes, which are defined by special meta-attributes
* of a primitive of any type (for example, VirtualDomain or Xen).
*/
if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_RESOURCE, TRUE)) {
/* This will add an ocf:pacemaker:remote primitive to the
* configuration for the guest node's connection, to be unpacked
* later.
*/
new_node_id = expand_remote_rsc_meta(xml_obj, xml_resources, data_set);
if (new_node_id && pe_find_node(data_set->nodes, new_node_id) == NULL) {
crm_trace("Found guest node %s in resource %s",
new_node_id, ID(xml_obj));
pe_create_node(new_node_id, new_node_id, "remote", NULL,
data_set);
}
continue;
}
/* Check for guest nodes inside a group. Clones are currently not
* supported as guest nodes.
*/
if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_GROUP, TRUE)) {
xmlNode *xml_obj2 = NULL;
for (xml_obj2 = __xml_first_child_element(xml_obj); xml_obj2 != NULL;
xml_obj2 = __xml_next_element(xml_obj2)) {
new_node_id = expand_remote_rsc_meta(xml_obj2, xml_resources, data_set);
if (new_node_id && pe_find_node(data_set->nodes, new_node_id) == NULL) {
crm_trace("Found guest node %s in resource %s inside group %s",
new_node_id, ID(xml_obj2), ID(xml_obj));
pe_create_node(new_node_id, new_node_id, "remote", NULL,
data_set);
}
}
}
}
return TRUE;
}
/* Call this after all the nodes and resources have been
* unpacked, but before the status section is read.
*
* A remote node's online status is reflected by the state
* of the remote node's connection resource. We need to link
* the remote node to this connection resource so we can have
* easy access to the connection resource during the PE calculations.
*/
static void
link_rsc2remotenode(pe_working_set_t *data_set, resource_t *new_rsc)
{
node_t *remote_node = NULL;
if (new_rsc->is_remote_node == FALSE) {
return;
}
if (is_set(data_set->flags, pe_flag_quick_location)) {
/* remote_nodes and remote_resources are not linked in quick location calculations */
return;
}
remote_node = pe_find_node(data_set->nodes, new_rsc->id);
CRM_CHECK(remote_node != NULL, return;);
pe_rsc_trace(new_rsc, "Linking remote connection resource %s to node %s",
new_rsc->id, remote_node->details->uname);
remote_node->details->remote_rsc = new_rsc;
if (new_rsc->container == NULL) {
/* Handle start-up fencing for remote nodes (as opposed to guest nodes)
* the same as is done for cluster nodes.
*/
handle_startup_fencing(data_set, remote_node);
} else {
/* pe_create_node() marks the new node as "remote" or "cluster"; now
* that we know the node is a guest node, update it correctly.
*/
g_hash_table_replace(remote_node->details->attrs, strdup(CRM_ATTR_KIND),
strdup("container"));
}
}
static void
destroy_tag(gpointer data)
{
tag_t *tag = data;
if (tag) {
free(tag->id);
g_list_free_full(tag->refs, free);
free(tag);
}
}
/*!
* \internal
* \brief Parse configuration XML for resource information
*
* \param[in] xml_resources Top of resource configuration XML
* \param[in,out] data_set Where to put resource information
*
* \return TRUE
*
* \note unpack_remote_nodes() MUST be called before this, so that the nodes can
* be used when common_unpack() calls resource_location()
*/
gboolean
unpack_resources(xmlNode * xml_resources, pe_working_set_t * data_set)
{
xmlNode *xml_obj = NULL;
GListPtr gIter = NULL;
data_set->template_rsc_sets = g_hash_table_new_full(crm_str_hash,
g_str_equal, free,
destroy_tag);
for (xml_obj = __xml_first_child_element(xml_resources); xml_obj != NULL;
xml_obj = __xml_next_element(xml_obj)) {
resource_t *new_rsc = NULL;
if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_RSC_TEMPLATE, TRUE)) {
const char *template_id = ID(xml_obj);
if (template_id && g_hash_table_lookup_extended(data_set->template_rsc_sets,
template_id, NULL, NULL) == FALSE) {
/* Record the template's ID for the knowledge of its existence anyway. */
g_hash_table_insert(data_set->template_rsc_sets, strdup(template_id), NULL);
}
continue;
}
crm_trace("Beginning unpack... <%s id=%s... >", crm_element_name(xml_obj), ID(xml_obj));
if (common_unpack(xml_obj, &new_rsc, NULL, data_set)) {
data_set->resources = g_list_append(data_set->resources, new_rsc);
pe_rsc_trace(new_rsc, "Added resource %s", new_rsc->id);
} else {
crm_config_err("Failed unpacking %s %s",
crm_element_name(xml_obj), crm_element_value(xml_obj, XML_ATTR_ID));
if (new_rsc != NULL && new_rsc->fns != NULL) {
new_rsc->fns->free(new_rsc);
}
}
}
for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) {
resource_t *rsc = (resource_t *) gIter->data;
setup_container(rsc, data_set);
link_rsc2remotenode(data_set, rsc);
}
data_set->resources = g_list_sort(data_set->resources, sort_rsc_priority);
if (is_set(data_set->flags, pe_flag_quick_location)) {
/* Ignore */
} else if (is_set(data_set->flags, pe_flag_stonith_enabled)
&& is_set(data_set->flags, pe_flag_have_stonith_resource) == FALSE) {
crm_config_err("Resource start-up disabled since no STONITH resources have been defined");
crm_config_err("Either configure some or disable STONITH with the stonith-enabled option");
crm_config_err("NOTE: Clusters with shared data need STONITH to ensure data integrity");
}
return TRUE;
}
gboolean
unpack_tags(xmlNode * xml_tags, pe_working_set_t * data_set)
{
xmlNode *xml_tag = NULL;
data_set->tags = g_hash_table_new_full(crm_str_hash, g_str_equal, free,
destroy_tag);
for (xml_tag = __xml_first_child_element(xml_tags); xml_tag != NULL;
xml_tag = __xml_next_element(xml_tag)) {
xmlNode *xml_obj_ref = NULL;
const char *tag_id = ID(xml_tag);
if (crm_str_eq((const char *)xml_tag->name, XML_CIB_TAG_TAG, TRUE) == FALSE) {
continue;
}
if (tag_id == NULL) {
crm_config_err("Failed unpacking %s: %s should be specified",
crm_element_name(xml_tag), XML_ATTR_ID);
continue;
}
for (xml_obj_ref = __xml_first_child_element(xml_tag); xml_obj_ref != NULL;
xml_obj_ref = __xml_next_element(xml_obj_ref)) {
const char *obj_ref = ID(xml_obj_ref);
if (crm_str_eq((const char *)xml_obj_ref->name, XML_CIB_TAG_OBJ_REF, TRUE) == FALSE) {
continue;
}
if (obj_ref == NULL) {
crm_config_err("Failed unpacking %s for tag %s: %s should be specified",
crm_element_name(xml_obj_ref), tag_id, XML_ATTR_ID);
continue;
}
if (add_tag_ref(data_set->tags, tag_id, obj_ref) == FALSE) {
return FALSE;
}
}
}
return TRUE;
}
/* The ticket state section:
* "/cib/status/tickets/ticket_state" */
static gboolean
unpack_ticket_state(xmlNode * xml_ticket, pe_working_set_t * data_set)
{
const char *ticket_id = NULL;
const char *granted = NULL;
const char *last_granted = NULL;
const char *standby = NULL;
xmlAttrPtr xIter = NULL;
ticket_t *ticket = NULL;
ticket_id = ID(xml_ticket);
if (ticket_id == NULL || strlen(ticket_id) == 0) {
return FALSE;
}
crm_trace("Processing ticket state for %s", ticket_id);
ticket = g_hash_table_lookup(data_set->tickets, ticket_id);
if (ticket == NULL) {
ticket = ticket_new(ticket_id, data_set);
if (ticket == NULL) {
return FALSE;
}
}
for (xIter = xml_ticket->properties; xIter; xIter = xIter->next) {
const char *prop_name = (const char *)xIter->name;
const char *prop_value = crm_element_value(xml_ticket, prop_name);
if (crm_str_eq(prop_name, XML_ATTR_ID, TRUE)) {
continue;
}
g_hash_table_replace(ticket->state, strdup(prop_name), strdup(prop_value));
}
granted = g_hash_table_lookup(ticket->state, "granted");
if (granted && crm_is_true(granted)) {
ticket->granted = TRUE;
crm_info("We have ticket '%s'", ticket->id);
} else {
ticket->granted = FALSE;
crm_info("We do not have ticket '%s'", ticket->id);
}
last_granted = g_hash_table_lookup(ticket->state, "last-granted");
if (last_granted) {
ticket->last_granted = crm_parse_int(last_granted, 0);
}
standby = g_hash_table_lookup(ticket->state, "standby");
if (standby && crm_is_true(standby)) {
ticket->standby = TRUE;
if (ticket->granted) {
crm_info("Granted ticket '%s' is in standby-mode", ticket->id);
}
} else {
ticket->standby = FALSE;
}
crm_trace("Done with ticket state for %s", ticket_id);
return TRUE;
}
static gboolean
unpack_tickets_state(xmlNode * xml_tickets, pe_working_set_t * data_set)
{
xmlNode *xml_obj = NULL;
for (xml_obj = __xml_first_child_element(xml_tickets); xml_obj != NULL;
xml_obj = __xml_next_element(xml_obj)) {
if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_TICKET_STATE, TRUE) == FALSE) {
continue;
}
unpack_ticket_state(xml_obj, data_set);
}
return TRUE;
}
static void
unpack_handle_remote_attrs(node_t *this_node, xmlNode *state, pe_working_set_t * data_set)
{
const char *resource_discovery_enabled = NULL;
xmlNode *attrs = NULL;
resource_t *rsc = NULL;
if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE) == FALSE) {
return;
}
if ((this_node == NULL) || !pe__is_guest_or_remote_node(this_node)) {
return;
}
crm_trace("Processing remote node id=%s, uname=%s", this_node->details->id, this_node->details->uname);
this_node->details->remote_maintenance =
crm_atoi(crm_element_value(state, XML_NODE_IS_MAINTENANCE), "0");
rsc = this_node->details->remote_rsc;
if (this_node->details->remote_requires_reset == FALSE) {
this_node->details->unclean = FALSE;
this_node->details->unseen = FALSE;
}
attrs = find_xml_node(state, XML_TAG_TRANSIENT_NODEATTRS, FALSE);
add_node_attrs(attrs, this_node, TRUE, data_set);
if (pe__shutdown_requested(this_node)) {
crm_info("Node %s is shutting down", this_node->details->uname);
this_node->details->shutdown = TRUE;
if (rsc) {
rsc->next_role = RSC_ROLE_STOPPED;
}
}
if (crm_is_true(pe_node_attribute_raw(this_node, "standby"))) {
crm_info("Node %s is in standby-mode", this_node->details->uname);
this_node->details->standby = TRUE;
}
if (crm_is_true(pe_node_attribute_raw(this_node, "maintenance")) ||
(rsc && !is_set(rsc->flags, pe_rsc_managed))) {
crm_info("Node %s is in maintenance-mode", this_node->details->uname);
this_node->details->maintenance = TRUE;
}
resource_discovery_enabled = pe_node_attribute_raw(this_node, XML_NODE_ATTR_RSC_DISCOVERY);
if (resource_discovery_enabled && !crm_is_true(resource_discovery_enabled)) {
if (pe__is_remote_node(this_node)
&& is_not_set(data_set->flags, pe_flag_stonith_enabled)) {
crm_warn("Ignoring %s attribute on remote node %s because stonith is disabled",
XML_NODE_ATTR_RSC_DISCOVERY, this_node->details->uname);
} else {
/* This is either a remote node with fencing enabled, or a guest
* node. We don't care whether fencing is enabled when fencing guest
* nodes, because they are "fenced" by recovering their containing
* resource.
*/
crm_info("Node %s has resource discovery disabled", this_node->details->uname);
this_node->details->rsc_discovery_enabled = FALSE;
}
}
}
static bool
unpack_node_loop(xmlNode * status, bool fence, pe_working_set_t * data_set)
{
bool changed = false;
xmlNode *lrm_rsc = NULL;
for (xmlNode *state = __xml_first_child_element(status); state != NULL;
state = __xml_next_element(state)) {
const char *id = NULL;
const char *uname = NULL;
node_t *this_node = NULL;
bool process = FALSE;
if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE) == FALSE) {
continue;
}
id = crm_element_value(state, XML_ATTR_ID);
uname = crm_element_value(state, XML_ATTR_UNAME);
this_node = pe_find_node_any(data_set->nodes, id, uname);
if (this_node == NULL) {
crm_info("Node %s is unknown", id);
continue;
} else if (this_node->details->unpacked) {
crm_info("Node %s is already processed", id);
continue;
} else if (!pe__is_guest_or_remote_node(this_node)
&& is_set(data_set->flags, pe_flag_stonith_enabled)) {
// A redundant test, but preserves the order for regression tests
process = TRUE;
} else if (pe__is_guest_or_remote_node(this_node)) {
bool check = FALSE;
resource_t *rsc = this_node->details->remote_rsc;
if(fence) {
check = TRUE;
} else if(rsc == NULL) {
/* Not ready yet */
} else if (pe__is_guest_node(this_node)
&& rsc->role == RSC_ROLE_STARTED
&& rsc->container->role == RSC_ROLE_STARTED) {
/* Both the connection and its containing resource need to be
* known to be up before we process resources running in it.
*/
check = TRUE;
crm_trace("Checking node %s/%s/%s status %d/%d/%d", id, rsc->id, rsc->container->id, fence, rsc->role, RSC_ROLE_STARTED);
} else if (!pe__is_guest_node(this_node)
&& rsc->role == RSC_ROLE_STARTED) {
check = TRUE;
crm_trace("Checking node %s/%s status %d/%d/%d", id, rsc->id, fence, rsc->role, RSC_ROLE_STARTED);
}
if (check) {
determine_remote_online_status(data_set, this_node);
unpack_handle_remote_attrs(this_node, state, data_set);
process = TRUE;
}
} else if (this_node->details->online) {
process = TRUE;
} else if (fence) {
process = TRUE;
}
if(process) {
crm_trace("Processing lrm resource entries on %shealthy%s node: %s",
fence?"un":"",
(pe__is_guest_or_remote_node(this_node)? " remote" : ""),
this_node->details->uname);
changed = TRUE;
this_node->details->unpacked = TRUE;
lrm_rsc = find_xml_node(state, XML_CIB_TAG_LRM, FALSE);
lrm_rsc = find_xml_node(lrm_rsc, XML_LRM_TAG_RESOURCES, FALSE);
unpack_lrm_resources(this_node, lrm_rsc, data_set);
}
}
return changed;
}
/* remove nodes that are down, stopping */
/* create positive rsc_to_node constraints between resources and the nodes they are running on */
/* anything else? */
gboolean
unpack_status(xmlNode * status, pe_working_set_t * data_set)
{
const char *id = NULL;
const char *uname = NULL;
xmlNode *state = NULL;
node_t *this_node = NULL;
crm_trace("Beginning unpack");
if (data_set->tickets == NULL) {
data_set->tickets = g_hash_table_new_full(crm_str_hash, g_str_equal,
free, destroy_ticket);
}
for (state = __xml_first_child_element(status); state != NULL;
state = __xml_next_element(state)) {
if (crm_str_eq((const char *)state->name, XML_CIB_TAG_TICKETS, TRUE)) {
unpack_tickets_state((xmlNode *) state, data_set);
} else if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE)) {
xmlNode *attrs = NULL;
const char *resource_discovery_enabled = NULL;
id = crm_element_value(state, XML_ATTR_ID);
uname = crm_element_value(state, XML_ATTR_UNAME);
this_node = pe_find_node_any(data_set->nodes, id, uname);
if (uname == NULL) {
/* error */
continue;
} else if (this_node == NULL) {
crm_config_warn("Node %s in status section no longer exists", uname);
continue;
} else if (pe__is_guest_or_remote_node(this_node)) {
/* online state for remote nodes is determined by the
* rsc state after all the unpacking is done. we do however
* need to mark whether or not the node has been fenced as this plays
* a role during unpacking cluster node resource state */
this_node->details->remote_was_fenced =
crm_atoi(crm_element_value(state, XML_NODE_IS_FENCED), "0");
continue;
}
crm_trace("Processing node id=%s, uname=%s", id, uname);
/* Mark the node as provisionally clean
* - at least we have seen it in the current cluster's lifetime
*/
this_node->details->unclean = FALSE;
this_node->details->unseen = FALSE;
attrs = find_xml_node(state, XML_TAG_TRANSIENT_NODEATTRS, FALSE);
add_node_attrs(attrs, this_node, TRUE, data_set);
if (crm_is_true(pe_node_attribute_raw(this_node, "standby"))) {
crm_info("Node %s is in standby-mode", this_node->details->uname);
this_node->details->standby = TRUE;
}
if (crm_is_true(pe_node_attribute_raw(this_node, "maintenance"))) {
crm_info("Node %s is in maintenance-mode", this_node->details->uname);
this_node->details->maintenance = TRUE;
}
resource_discovery_enabled = pe_node_attribute_raw(this_node, XML_NODE_ATTR_RSC_DISCOVERY);
if (resource_discovery_enabled && !crm_is_true(resource_discovery_enabled)) {
crm_warn("ignoring %s attribute on node %s, disabling resource discovery is not allowed on cluster nodes",
XML_NODE_ATTR_RSC_DISCOVERY, this_node->details->uname);
}
crm_trace("determining node state");
determine_online_status(state, this_node, data_set);
if (is_not_set(data_set->flags, pe_flag_have_quorum)
&& this_node->details->online
&& (data_set->no_quorum_policy == no_quorum_suicide)) {
/* Everything else should flow from this automatically
* At least until the PE becomes able to migrate off healthy resources
*/
pe_fence_node(data_set, this_node, "cluster does not have quorum");
}
}
}
while(unpack_node_loop(status, FALSE, data_set)) {
crm_trace("Start another loop");
}
// Now catch any nodes we didn't see
unpack_node_loop(status, is_set(data_set->flags, pe_flag_stonith_enabled), data_set);
/* Now that we know where resources are, we can schedule stops of containers
* with failed bundle connections
*/
if (data_set->stop_needed != NULL) {
for (GList *item = data_set->stop_needed; item; item = item->next) {
pe_resource_t *container = item->data;
pe_node_t *node = pe__current_node(container);
if (node) {
stop_action(container, node, FALSE);
}
}
g_list_free(data_set->stop_needed);
data_set->stop_needed = NULL;
}
for (GListPtr gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
node_t *this_node = gIter->data;
if (this_node == NULL) {
continue;
} else if (!pe__is_guest_or_remote_node(this_node)) {
continue;
} else if(this_node->details->unpacked) {
continue;
}
determine_remote_online_status(data_set, this_node);
}
return TRUE;
}
static gboolean
determine_online_status_no_fencing(pe_working_set_t * data_set, xmlNode * node_state,
node_t * this_node)
{
gboolean online = FALSE;
const char *join = crm_element_value(node_state, XML_NODE_JOIN_STATE);
const char *is_peer = crm_element_value(node_state, XML_NODE_IS_PEER);
const char *in_cluster = crm_element_value(node_state, XML_NODE_IN_CLUSTER);
const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED);
if (!crm_is_true(in_cluster)) {
crm_trace("Node is down: in_cluster=%s", crm_str(in_cluster));
} else if (safe_str_eq(is_peer, ONLINESTATUS)) {
if (safe_str_eq(join, CRMD_JOINSTATE_MEMBER)) {
online = TRUE;
} else {
crm_debug("Node is not ready to run resources: %s", join);
}
} else if (this_node->details->expected_up == FALSE) {
crm_trace("Controller is down: in_cluster=%s", crm_str(in_cluster));
crm_trace("\tis_peer=%s, join=%s, expected=%s",
crm_str(is_peer), crm_str(join), crm_str(exp_state));
} else {
/* mark it unclean */
pe_fence_node(data_set, this_node, "peer is unexpectedly down");
crm_info("\tin_cluster=%s, is_peer=%s, join=%s, expected=%s",
crm_str(in_cluster), crm_str(is_peer), crm_str(join), crm_str(exp_state));
}
return online;
}
static gboolean
determine_online_status_fencing(pe_working_set_t * data_set, xmlNode * node_state,
node_t * this_node)
{
gboolean online = FALSE;
gboolean do_terminate = FALSE;
bool crmd_online = FALSE;
const char *join = crm_element_value(node_state, XML_NODE_JOIN_STATE);
const char *is_peer = crm_element_value(node_state, XML_NODE_IS_PEER);
const char *in_cluster = crm_element_value(node_state, XML_NODE_IN_CLUSTER);
const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED);
const char *terminate = pe_node_attribute_raw(this_node, "terminate");
/*
- XML_NODE_IN_CLUSTER ::= true|false
- XML_NODE_IS_PEER ::= online|offline
- XML_NODE_JOIN_STATE ::= member|down|pending|banned
- XML_NODE_EXPECTED ::= member|down
*/
if (crm_is_true(terminate)) {
do_terminate = TRUE;
} else if (terminate != NULL && strlen(terminate) > 0) {
/* could be a time() value */
char t = terminate[0];
if (t != '0' && isdigit(t)) {
do_terminate = TRUE;
}
}
crm_trace("%s: in_cluster=%s, is_peer=%s, join=%s, expected=%s, term=%d",
this_node->details->uname, crm_str(in_cluster), crm_str(is_peer),
crm_str(join), crm_str(exp_state), do_terminate);
online = crm_is_true(in_cluster);
crmd_online = safe_str_eq(is_peer, ONLINESTATUS);
if (exp_state == NULL) {
exp_state = CRMD_JOINSTATE_DOWN;
}
if (this_node->details->shutdown) {
crm_debug("%s is shutting down", this_node->details->uname);
/* Slightly different criteria since we can't shut down a dead peer */
online = crmd_online;
} else if (in_cluster == NULL) {
pe_fence_node(data_set, this_node, "peer has not been seen by the cluster");
} else if (safe_str_eq(join, CRMD_JOINSTATE_NACK)) {
pe_fence_node(data_set, this_node, "peer failed the pacemaker membership criteria");
} else if (do_terminate == FALSE && safe_str_eq(exp_state, CRMD_JOINSTATE_DOWN)) {
if (crm_is_true(in_cluster) || crmd_online) {
crm_info("- Node %s is not ready to run resources", this_node->details->uname);
this_node->details->standby = TRUE;
this_node->details->pending = TRUE;
} else {
crm_trace("%s is down or still coming up", this_node->details->uname);
}
} else if (do_terminate && safe_str_eq(join, CRMD_JOINSTATE_DOWN)
&& crm_is_true(in_cluster) == FALSE && !crmd_online) {
crm_info("Node %s was just shot", this_node->details->uname);
online = FALSE;
} else if (crm_is_true(in_cluster) == FALSE) {
pe_fence_node(data_set, this_node, "peer is no longer part of the cluster");
} else if (!crmd_online) {
pe_fence_node(data_set, this_node, "peer process is no longer available");
/* Everything is running at this point, now check join state */
} else if (do_terminate) {
pe_fence_node(data_set, this_node, "termination was requested");
} else if (safe_str_eq(join, CRMD_JOINSTATE_MEMBER)) {
crm_info("Node %s is active", this_node->details->uname);
} else if (safe_str_eq(join, CRMD_JOINSTATE_PENDING)
|| safe_str_eq(join, CRMD_JOINSTATE_DOWN)) {
crm_info("Node %s is not ready to run resources", this_node->details->uname);
this_node->details->standby = TRUE;
this_node->details->pending = TRUE;
} else {
pe_fence_node(data_set, this_node, "peer was in an unknown state");
crm_warn("%s: in-cluster=%s, is-peer=%s, join=%s, expected=%s, term=%d, shutdown=%d",
this_node->details->uname, crm_str(in_cluster), crm_str(is_peer),
crm_str(join), crm_str(exp_state), do_terminate, this_node->details->shutdown);
}
return online;
}
static gboolean
determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node)
{
resource_t *rsc = this_node->details->remote_rsc;
resource_t *container = NULL;
pe_node_t *host = NULL;
/* If there is a node state entry for a (former) Pacemaker Remote node
* but no resource creating that node, the node's connection resource will
* be NULL. Consider it an offline remote node in that case.
*/
if (rsc == NULL) {
this_node->details->online = FALSE;
goto remote_online_done;
}
container = rsc->container;
if (container && (g_list_length(rsc->running_on) == 1)) {
host = rsc->running_on->data;
}
/* If the resource is currently started, mark it online. */
if (rsc->role == RSC_ROLE_STARTED) {
crm_trace("%s node %s presumed ONLINE because connection resource is started",
(container? "Guest" : "Remote"), this_node->details->id);
this_node->details->online = TRUE;
}
/* consider this node shutting down if transitioning start->stop */
if (rsc->role == RSC_ROLE_STARTED && rsc->next_role == RSC_ROLE_STOPPED) {
crm_trace("%s node %s shutting down because connection resource is stopping",
(container? "Guest" : "Remote"), this_node->details->id);
this_node->details->shutdown = TRUE;
}
/* Now check all the failure conditions. */
if(container && is_set(container->flags, pe_rsc_failed)) {
crm_trace("Guest node %s UNCLEAN because guest resource failed",
this_node->details->id);
this_node->details->online = FALSE;
this_node->details->remote_requires_reset = TRUE;
} else if(is_set(rsc->flags, pe_rsc_failed)) {
crm_trace("%s node %s OFFLINE because connection resource failed",
(container? "Guest" : "Remote"), this_node->details->id);
this_node->details->online = FALSE;
} else if (rsc->role == RSC_ROLE_STOPPED
|| (container && container->role == RSC_ROLE_STOPPED)) {
crm_trace("%s node %s OFFLINE because its resource is stopped",
(container? "Guest" : "Remote"), this_node->details->id);
this_node->details->online = FALSE;
this_node->details->remote_requires_reset = FALSE;
} else if (host && (host->details->online == FALSE)
&& host->details->unclean) {
crm_trace("Guest node %s UNCLEAN because host is unclean",
this_node->details->id);
this_node->details->online = FALSE;
this_node->details->remote_requires_reset = TRUE;
}
remote_online_done:
crm_trace("Remote node %s online=%s",
this_node->details->id, this_node->details->online ? "TRUE" : "FALSE");
return this_node->details->online;
}
gboolean
determine_online_status(xmlNode * node_state, node_t * this_node, pe_working_set_t * data_set)
{
gboolean online = FALSE;
const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED);
if (this_node == NULL) {
crm_config_err("No node to check");
return online;
}
this_node->details->shutdown = FALSE;
this_node->details->expected_up = FALSE;
if (pe__shutdown_requested(this_node)) {
this_node->details->shutdown = TRUE;
} else if (safe_str_eq(exp_state, CRMD_JOINSTATE_MEMBER)) {
this_node->details->expected_up = TRUE;
}
if (this_node->details->type == node_ping) {
this_node->details->unclean = FALSE;
online = FALSE; /* As far as resource management is concerned,
* the node is safely offline.
* Anyone caught abusing this logic will be shot
*/
} else if (is_set(data_set->flags, pe_flag_stonith_enabled) == FALSE) {
online = determine_online_status_no_fencing(data_set, node_state, this_node);
} else {
online = determine_online_status_fencing(data_set, node_state, this_node);
}
if (online) {
this_node->details->online = TRUE;
} else {
/* remove node from contention */
this_node->fixed = TRUE;
this_node->weight = -INFINITY;
}
if (online && this_node->details->shutdown) {
/* don't run resources here */
this_node->fixed = TRUE;
this_node->weight = -INFINITY;
}
if (this_node->details->type == node_ping) {
crm_info("Node %s is not a pacemaker node", this_node->details->uname);
} else if (this_node->details->unclean) {
pe_proc_warn("Node %s is unclean", this_node->details->uname);
} else if (this_node->details->online) {
crm_info("Node %s is %s", this_node->details->uname,
this_node->details->shutdown ? "shutting down" :
this_node->details->pending ? "pending" :
this_node->details->standby ? "standby" :
this_node->details->maintenance ? "maintenance" : "online");
} else {
crm_trace("Node %s is offline", this_node->details->uname);
}
return online;
}
/*!
* \internal
* \brief Find the end of a resource's name, excluding any clone suffix
*
* \param[in] id Resource ID to check
*
* \return Pointer to last character of resource's base name
*/
const char *
pe_base_name_end(const char *id)
{
if (!crm_strlen_zero(id)) {
const char *end = id + strlen(id) - 1;
for (const char *s = end; s > id; --s) {
switch (*s) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
break;
case ':':
return (s == end)? s : (s - 1);
default:
return end;
}
}
return end;
}
return NULL;
}
/*!
* \internal
* \brief Get a resource name excluding any clone suffix
*
* \param[in] last_rsc_id Resource ID to check
*
* \return Pointer to newly allocated string with resource's base name
* \note It is the caller's responsibility to free() the result.
* This asserts on error, so callers can assume result is not NULL.
*/
char *
clone_strip(const char *last_rsc_id)
{
const char *end = pe_base_name_end(last_rsc_id);
char *basename = NULL;
CRM_ASSERT(end);
basename = strndup(last_rsc_id, end - last_rsc_id + 1);
CRM_ASSERT(basename);
return basename;
}
/*!
* \internal
* \brief Get the name of the first instance of a cloned resource
*
* \param[in] last_rsc_id Resource ID to check
*
* \return Pointer to newly allocated string with resource's base name plus :0
* \note It is the caller's responsibility to free() the result.
* This asserts on error, so callers can assume result is not NULL.
*/
char *
clone_zero(const char *last_rsc_id)
{
const char *end = pe_base_name_end(last_rsc_id);
size_t base_name_len = end - last_rsc_id + 1;
char *zero = NULL;
CRM_ASSERT(end);
zero = calloc(base_name_len + 3, sizeof(char));
CRM_ASSERT(zero);
memcpy(zero, last_rsc_id, base_name_len);
zero[base_name_len] = ':';
zero[base_name_len + 1] = '0';
return zero;
}
static resource_t *
create_fake_resource(const char *rsc_id, xmlNode * rsc_entry, pe_working_set_t * data_set)
{
resource_t *rsc = NULL;
xmlNode *xml_rsc = create_xml_node(NULL, XML_CIB_TAG_RESOURCE);
copy_in_properties(xml_rsc, rsc_entry);
crm_xml_add(xml_rsc, XML_ATTR_ID, rsc_id);
crm_log_xml_debug(xml_rsc, "Orphan resource");
if (!common_unpack(xml_rsc, &rsc, NULL, data_set)) {
return NULL;
}
if (xml_contains_remote_node(xml_rsc)) {
node_t *node;
crm_debug("Detected orphaned remote node %s", rsc_id);
node = pe_find_node(data_set->nodes, rsc_id);
if (node == NULL) {
node = pe_create_node(rsc_id, rsc_id, "remote", NULL, data_set);
}
link_rsc2remotenode(data_set, rsc);
if (node) {
crm_trace("Setting node %s as shutting down due to orphaned connection resource", rsc_id);
node->details->shutdown = TRUE;
}
}
if (crm_element_value(rsc_entry, XML_RSC_ATTR_CONTAINER)) {
/* This orphaned rsc needs to be mapped to a container. */
crm_trace("Detected orphaned container filler %s", rsc_id);
set_bit(rsc->flags, pe_rsc_orphan_container_filler);
}
set_bit(rsc->flags, pe_rsc_orphan);
data_set->resources = g_list_append(data_set->resources, rsc);
return rsc;
}
/*!
* \internal
* \brief Create orphan instance for anonymous clone resource history
*/
static pe_resource_t *
create_anonymous_orphan(pe_resource_t *parent, const char *rsc_id,
pe_node_t *node, pe_working_set_t *data_set)
{
pe_resource_t *top = pe__create_clone_child(parent, data_set);
// find_rsc() because we might be a cloned group
pe_resource_t *orphan = top->fns->find_rsc(top, rsc_id, NULL, pe_find_clone);
pe_rsc_debug(parent, "Created orphan %s for %s: %s on %s",
top->id, parent->id, rsc_id, node->details->uname);
return orphan;
}
/*!
* \internal
* \brief Check a node for an instance of an anonymous clone
*
* Return a child instance of the specified anonymous clone, in order of
* preference: (1) the instance running on the specified node, if any;
* (2) an inactive instance (i.e. within the total of clone-max instances);
* (3) a newly created orphan (i.e. clone-max instances are already active).
*
* \param[in] data_set Cluster information
* \param[in] node Node on which to check for instance
* \param[in] parent Clone to check
* \param[in] rsc_id Name of cloned resource in history (without instance)
*/
static resource_t *
find_anonymous_clone(pe_working_set_t * data_set, node_t * node, resource_t * parent,
const char *rsc_id)
{
GListPtr rIter = NULL;
pe_resource_t *rsc = NULL;
pe_resource_t *inactive_instance = NULL;
gboolean skip_inactive = FALSE;
CRM_ASSERT(parent != NULL);
CRM_ASSERT(pe_rsc_is_clone(parent));
CRM_ASSERT(is_not_set(parent->flags, pe_rsc_unique));
// Check for active (or partially active, for cloned groups) instance
pe_rsc_trace(parent, "Looking for %s on %s in %s", rsc_id, node->details->uname, parent->id);
for (rIter = parent->children; rsc == NULL && rIter; rIter = rIter->next) {
GListPtr locations = NULL;
resource_t *child = rIter->data;
/* Check whether this instance is already known to be active or pending
* anywhere, at this stage of unpacking. Because this function is called
* for a resource before the resource's individual operation history
* entries are unpacked, locations will generally not contain the
* desired node.
*
* However, there are three exceptions:
* (1) when child is a cloned group and we have already unpacked the
* history of another member of the group on the same node;
* (2) when we've already unpacked the history of another numbered
* instance on the same node (which can happen if globally-unique
* was flipped from true to false); and
* (3) when we re-run calculations on the same data set as part of a
* simulation.
*/
child->fns->location(child, &locations, 2);
if (locations) {
/* We should never associate the same numbered anonymous clone
* instance with multiple nodes, and clone instances can't migrate,
* so there must be only one location, regardless of history.
*/
CRM_LOG_ASSERT(locations->next == NULL);
if (((pe_node_t *)locations->data)->details == node->details) {
/* This child instance is active on the requested node, so check
* for a corresponding configured resource. We use find_rsc()
* instead of child because child may be a cloned group, and we
* need the particular member corresponding to rsc_id.
*
* If the history entry is orphaned, rsc will be NULL.
*/
rsc = parent->fns->find_rsc(child, rsc_id, NULL, pe_find_clone);
if (rsc) {
/* If there are multiple instance history entries for an
* anonymous clone in a single node's history (which can
* happen if globally-unique is switched from true to
* false), we want to consider the instances beyond the
* first as orphans, even if there are inactive instance
* numbers available.
*/
if (rsc->running_on) {
crm_notice("Active (now-)anonymous clone %s has "
"multiple (orphan) instance histories on %s",
parent->id, node->details->uname);
skip_inactive = TRUE;
rsc = NULL;
} else {
pe_rsc_trace(parent, "Resource %s, active", rsc->id);
}
}
}
g_list_free(locations);
} else {
pe_rsc_trace(parent, "Resource %s, skip inactive", child->id);
if (!skip_inactive && !inactive_instance
&& is_not_set(child->flags, pe_rsc_block)) {
// Remember one inactive instance in case we don't find active
inactive_instance = parent->fns->find_rsc(child, rsc_id, NULL,
pe_find_clone);
/* ... but don't use it if it was already associated with a
* pending action on another node
*/
if (inactive_instance && inactive_instance->pending_node
&& (inactive_instance->pending_node->details != node->details)) {
inactive_instance = NULL;
}
}
}
}
if ((rsc == NULL) && !skip_inactive && (inactive_instance != NULL)) {
pe_rsc_trace(parent, "Resource %s, empty slot", inactive_instance->id);
rsc = inactive_instance;
}
/* If the resource has "requires" set to "quorum" or "nothing", and we don't
* have a clone instance for every node, we don't want to consume a valid
* instance number for unclean nodes. Such instances may appear to be active
* according to the history, but should be considered inactive, so we can
* start an instance elsewhere. Treat such instances as orphans.
*
* An exception is instances running on guest nodes -- since guest node
* "fencing" is actually just a resource stop, requires shouldn't apply.
*
* @TODO Ideally, we'd use an inactive instance number if it is not needed
* for any clean instances. However, we don't know that at this point.
*/
if ((rsc != NULL) && is_not_set(rsc->flags, pe_rsc_needs_fencing)
&& (!node->details->online || node->details->unclean)
&& !pe__is_guest_node(node)
&& !pe__is_universal_clone(parent, data_set)) {
rsc = NULL;
}
if (rsc == NULL) {
rsc = create_anonymous_orphan(parent, rsc_id, node, data_set);
pe_rsc_trace(parent, "Resource %s, orphan", rsc->id);
}
return rsc;
}
static resource_t *
unpack_find_resource(pe_working_set_t * data_set, node_t * node, const char *rsc_id,
xmlNode * rsc_entry)
{
resource_t *rsc = NULL;
resource_t *parent = NULL;
crm_trace("looking for %s", rsc_id);
rsc = pe_find_resource(data_set->resources, rsc_id);
if (rsc == NULL) {
/* If we didn't find the resource by its name in the operation history,
* check it again as a clone instance. Even when clone-max=0, we create
* a single :0 orphan to match against here.
*/
char *clone0_id = clone_zero(rsc_id);
resource_t *clone0 = pe_find_resource(data_set->resources, clone0_id);
if (clone0 && is_not_set(clone0->flags, pe_rsc_unique)) {
rsc = clone0;
parent = uber_parent(clone0);
crm_trace("%s found as %s (%s)", rsc_id, clone0_id, parent->id);
} else {
crm_trace("%s is not known as %s either (orphan)",
rsc_id, clone0_id);
}
free(clone0_id);
} else if (rsc->variant > pe_native) {
crm_trace("Resource history for %s is orphaned because it is no longer primitive",
rsc_id);
return NULL;
} else {
parent = uber_parent(rsc);
}
if (pe_rsc_is_anon_clone(parent)) {
if (pe_rsc_is_bundled(parent)) {
rsc = pe__find_bundle_replica(parent->parent, node);
} else {
char *base = clone_strip(rsc_id);
rsc = find_anonymous_clone(data_set, node, parent, base);
free(base);
CRM_ASSERT(rsc != NULL);
}
}
if (rsc && safe_str_neq(rsc_id, rsc->id)
&& safe_str_neq(rsc_id, rsc->clone_name)) {
free(rsc->clone_name);
rsc->clone_name = strdup(rsc_id);
pe_rsc_debug(rsc, "Internally renamed %s on %s to %s%s",
rsc_id, node->details->uname, rsc->id,
(is_set(rsc->flags, pe_rsc_orphan)? " (ORPHAN)" : ""));
}
return rsc;
}
static resource_t *
process_orphan_resource(xmlNode * rsc_entry, node_t * node, pe_working_set_t * data_set)
{
resource_t *rsc = NULL;
const char *rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID);
crm_debug("Detected orphan resource %s on %s", rsc_id, node->details->uname);
rsc = create_fake_resource(rsc_id, rsc_entry, data_set);
if (is_set(data_set->flags, pe_flag_stop_rsc_orphans) == FALSE) {
clear_bit(rsc->flags, pe_rsc_managed);
} else {
CRM_CHECK(rsc != NULL, return NULL);
pe_rsc_trace(rsc, "Added orphan %s", rsc->id);
resource_location(rsc, NULL, -INFINITY, "__orphan_do_not_run__", data_set);
}
return rsc;
}
static void
process_rsc_state(resource_t * rsc, node_t * node,
enum action_fail_response on_fail,
xmlNode * migrate_op, pe_working_set_t * data_set)
{
node_t *tmpnode = NULL;
char *reason = NULL;
CRM_ASSERT(rsc);
pe_rsc_trace(rsc, "Resource %s is %s on %s: on_fail=%s",
rsc->id, role2text(rsc->role), node->details->uname, fail2text(on_fail));
/* process current state */
if (rsc->role != RSC_ROLE_UNKNOWN) {
resource_t *iter = rsc;
while (iter) {
if (g_hash_table_lookup(iter->known_on, node->details->id) == NULL) {
node_t *n = node_copy(node);
pe_rsc_trace(rsc, "%s (aka. %s) known on %s", rsc->id, rsc->clone_name,
n->details->uname);
g_hash_table_insert(iter->known_on, (gpointer) n->details->id, n);
}
if (is_set(iter->flags, pe_rsc_unique)) {
break;
}
iter = iter->parent;
}
}
/* If a managed resource is believed to be running, but node is down ... */
if (rsc->role > RSC_ROLE_STOPPED
&& node->details->online == FALSE
&& node->details->maintenance == FALSE
&& is_set(rsc->flags, pe_rsc_managed)) {
gboolean should_fence = FALSE;
/* If this is a guest node, fence it (regardless of whether fencing is
* enabled, because guest node fencing is done by recovery of the
* container resource rather than by the fencer). Mark the resource
* we're processing as failed. When the guest comes back up, its
* operation history in the CIB will be cleared, freeing the affected
* resource to run again once we are sure we know its state.
*/
if (pe__is_guest_node(node)) {
set_bit(rsc->flags, pe_rsc_failed);
should_fence = TRUE;
} else if (is_set(data_set->flags, pe_flag_stonith_enabled)) {
if (pe__is_remote_node(node) && node->details->remote_rsc
&& is_not_set(node->details->remote_rsc->flags, pe_rsc_failed)) {
/* Setting unseen means that fencing of the remote node will
* occur only if the connection resource is not going to start
* somewhere. This allows connection resources on a failed
* cluster node to move to another node without requiring the
* remote nodes to be fenced as well.
*/
node->details->unseen = TRUE;
reason = crm_strdup_printf("%s is active there (fencing will be"
" revoked if remote connection can "
"be re-established elsewhere)",
rsc->id);
}
should_fence = TRUE;
}
if (should_fence) {
if (reason == NULL) {
reason = crm_strdup_printf("%s is thought to be active there", rsc->id);
}
pe_fence_node(data_set, node, reason);
}
free(reason);
}
if (node->details->unclean) {
/* No extra processing needed
* Also allows resources to be started again after a node is shot
*/
on_fail = action_fail_ignore;
}
switch (on_fail) {
case action_fail_ignore:
/* nothing to do */
break;
case action_fail_fence:
/* treat it as if it is still running
* but also mark the node as unclean
*/
reason = crm_strdup_printf("%s failed there", rsc->id);
pe_fence_node(data_set, node, reason);
free(reason);
break;
case action_fail_standby:
node->details->standby = TRUE;
node->details->standby_onfail = TRUE;
break;
case action_fail_block:
/* is_managed == FALSE will prevent any
* actions being sent for the resource
*/
clear_bit(rsc->flags, pe_rsc_managed);
set_bit(rsc->flags, pe_rsc_block);
break;
case action_fail_migrate:
/* make sure it comes up somewhere else
* or not at all
*/
resource_location(rsc, node, -INFINITY, "__action_migration_auto__", data_set);
break;
case action_fail_stop:
rsc->next_role = RSC_ROLE_STOPPED;
break;
case action_fail_recover:
if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) {
set_bit(rsc->flags, pe_rsc_failed);
stop_action(rsc, node, FALSE);
}
break;
case action_fail_restart_container:
set_bit(rsc->flags, pe_rsc_failed);
if (rsc->container && pe_rsc_is_bundled(rsc)) {
/* A bundle's remote connection can run on a different node than
* the bundle's container. We don't necessarily know where the
* container is running yet, so remember it and add a stop
* action for it later.
*/
data_set->stop_needed = g_list_prepend(data_set->stop_needed,
rsc->container);
} else if (rsc->container) {
stop_action(rsc->container, node, FALSE);
} else if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) {
stop_action(rsc, node, FALSE);
}
break;
case action_fail_reset_remote:
set_bit(rsc->flags, pe_rsc_failed);
if (is_set(data_set->flags, pe_flag_stonith_enabled)) {
tmpnode = NULL;
if (rsc->is_remote_node) {
tmpnode = pe_find_node(data_set->nodes, rsc->id);
}
if (tmpnode &&
pe__is_remote_node(tmpnode) &&
tmpnode->details->remote_was_fenced == 0) {
/* The remote connection resource failed in a way that
* should result in fencing the remote node.
*/
pe_fence_node(data_set, tmpnode,
"remote connection is unrecoverable");
}
}
/* require the stop action regardless if fencing is occurring or not. */
if (rsc->role > RSC_ROLE_STOPPED) {
stop_action(rsc, node, FALSE);
}
/* if reconnect delay is in use, prevent the connection from exiting the
* "STOPPED" role until the failure is cleared by the delay timeout. */
if (rsc->remote_reconnect_ms) {
rsc->next_role = RSC_ROLE_STOPPED;
}
break;
}
/* ensure a remote-node connection failure forces an unclean remote-node
* to be fenced. By setting unseen = FALSE, the remote-node failure will
* result in a fencing operation regardless if we're going to attempt to
* reconnect to the remote-node in this transition or not. */
if (is_set(rsc->flags, pe_rsc_failed) && rsc->is_remote_node) {
tmpnode = pe_find_node(data_set->nodes, rsc->id);
if (tmpnode && tmpnode->details->unclean) {
tmpnode->details->unseen = FALSE;
}
}
if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) {
if (is_set(rsc->flags, pe_rsc_orphan)) {
if (is_set(rsc->flags, pe_rsc_managed)) {
crm_config_warn("Detected active orphan %s running on %s",
rsc->id, node->details->uname);
} else {
crm_config_warn("Cluster configured not to stop active orphans."
" %s must be stopped manually on %s",
rsc->id, node->details->uname);
}
}
native_add_running(rsc, node, data_set);
if (on_fail != action_fail_ignore) {
set_bit(rsc->flags, pe_rsc_failed);
}
} else if (rsc->clone_name && strchr(rsc->clone_name, ':') != NULL) {
/* Only do this for older status sections that included instance numbers
* Otherwise stopped instances will appear as orphans
*/
pe_rsc_trace(rsc, "Resetting clone_name %s for %s (stopped)", rsc->clone_name, rsc->id);
free(rsc->clone_name);
rsc->clone_name = NULL;
} else {
GList *possible_matches = pe__resource_actions(rsc, node, RSC_STOP,
FALSE);
GListPtr gIter = possible_matches;
for (; gIter != NULL; gIter = gIter->next) {
action_t *stop = (action_t *) gIter->data;
stop->flags |= pe_action_optional;
}
g_list_free(possible_matches);
}
}
/* create active recurring operations as optional */
static void
process_recurring(node_t * node, resource_t * rsc,
int start_index, int stop_index,
GListPtr sorted_op_list, pe_working_set_t * data_set)
{
int counter = -1;
const char *task = NULL;
const char *status = NULL;
GListPtr gIter = sorted_op_list;
CRM_ASSERT(rsc);
pe_rsc_trace(rsc, "%s: Start index %d, stop index = %d", rsc->id, start_index, stop_index);
for (; gIter != NULL; gIter = gIter->next) {
xmlNode *rsc_op = (xmlNode *) gIter->data;
guint interval_ms = 0;
char *key = NULL;
const char *id = ID(rsc_op);
const char *interval_ms_s = NULL;
counter++;
if (node->details->online == FALSE) {
pe_rsc_trace(rsc, "Skipping %s/%s: node is offline", rsc->id, node->details->uname);
break;
/* Need to check if there's a monitor for role="Stopped" */
} else if (start_index < stop_index && counter <= stop_index) {
pe_rsc_trace(rsc, "Skipping %s/%s: resource is not active", id, node->details->uname);
continue;
} else if (counter < start_index) {
pe_rsc_trace(rsc, "Skipping %s/%s: old %d", id, node->details->uname, counter);
continue;
}
interval_ms_s = crm_element_value(rsc_op, XML_LRM_ATTR_INTERVAL_MS);
interval_ms = crm_parse_ms(interval_ms_s);
if (interval_ms == 0) {
pe_rsc_trace(rsc, "Skipping %s/%s: non-recurring", id, node->details->uname);
continue;
}
status = crm_element_value(rsc_op, XML_LRM_ATTR_OPSTATUS);
if (safe_str_eq(status, "-1")) {
pe_rsc_trace(rsc, "Skipping %s/%s: status", id, node->details->uname);
continue;
}
task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK);
/* create the action */
key = generate_op_key(rsc->id, task, interval_ms);
pe_rsc_trace(rsc, "Creating %s/%s", key, node->details->uname);
custom_action(rsc, key, task, node, TRUE, TRUE, data_set);
}
}
void
calculate_active_ops(GListPtr sorted_op_list, int *start_index, int *stop_index)
{
int counter = -1;
int implied_monitor_start = -1;
int implied_clone_start = -1;
const char *task = NULL;
const char *status = NULL;
GListPtr gIter = sorted_op_list;
*stop_index = -1;
*start_index = -1;
for (; gIter != NULL; gIter = gIter->next) {
xmlNode *rsc_op = (xmlNode *) gIter->data;
counter++;
task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK);
status = crm_element_value(rsc_op, XML_LRM_ATTR_OPSTATUS);
if (safe_str_eq(task, CRMD_ACTION_STOP)
&& safe_str_eq(status, "0")) {
*stop_index = counter;
} else if (safe_str_eq(task, CRMD_ACTION_START) || safe_str_eq(task, CRMD_ACTION_MIGRATED)) {
*start_index = counter;
} else if ((implied_monitor_start <= *stop_index) && safe_str_eq(task, CRMD_ACTION_STATUS)) {
const char *rc = crm_element_value(rsc_op, XML_LRM_ATTR_RC);
if (safe_str_eq(rc, "0") || safe_str_eq(rc, "8")) {
implied_monitor_start = counter;
}
} else if (safe_str_eq(task, CRMD_ACTION_PROMOTE) || safe_str_eq(task, CRMD_ACTION_DEMOTE)) {
implied_clone_start = counter;
}
}
if (*start_index == -1) {
if (implied_clone_start != -1) {
*start_index = implied_clone_start;
} else if (implied_monitor_start != -1) {
*start_index = implied_monitor_start;
}
}
}
static resource_t *
unpack_lrm_rsc_state(node_t * node, xmlNode * rsc_entry, pe_working_set_t * data_set)
{
GListPtr gIter = NULL;
int stop_index = -1;
int start_index = -1;
enum rsc_role_e req_role = RSC_ROLE_UNKNOWN;
const char *task = NULL;
const char *rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID);
resource_t *rsc = NULL;
GListPtr op_list = NULL;
GListPtr sorted_op_list = NULL;
xmlNode *migrate_op = NULL;
xmlNode *rsc_op = NULL;
xmlNode *last_failure = NULL;
enum action_fail_response on_fail = FALSE;
enum rsc_role_e saved_role = RSC_ROLE_UNKNOWN;
crm_trace("[%s] Processing %s on %s",
crm_element_name(rsc_entry), rsc_id, node->details->uname);
/* extract operations */
op_list = NULL;
sorted_op_list = NULL;
for (rsc_op = __xml_first_child_element(rsc_entry); rsc_op != NULL;
rsc_op = __xml_next_element(rsc_op)) {
if (crm_str_eq((const char *)rsc_op->name, XML_LRM_TAG_RSC_OP, TRUE)) {
op_list = g_list_prepend(op_list, rsc_op);
}
}
if (op_list == NULL) {
/* if there are no operations, there is nothing to do */
return NULL;
}
/* find the resource */
rsc = unpack_find_resource(data_set, node, rsc_id, rsc_entry);
if (rsc == NULL) {
rsc = process_orphan_resource(rsc_entry, node, data_set);
}
CRM_ASSERT(rsc != NULL);
/* process operations */
saved_role = rsc->role;
on_fail = action_fail_ignore;
rsc->role = RSC_ROLE_UNKNOWN;
sorted_op_list = g_list_sort(op_list, sort_op_by_callid);
for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) {
xmlNode *rsc_op = (xmlNode *) gIter->data;
task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK);
if (safe_str_eq(task, CRMD_ACTION_MIGRATED)) {
migrate_op = rsc_op;
}
unpack_rsc_op(rsc, node, rsc_op, &last_failure, &on_fail, data_set);
}
/* create active recurring operations as optional */
calculate_active_ops(sorted_op_list, &start_index, &stop_index);
process_recurring(node, rsc, start_index, stop_index, sorted_op_list, data_set);
/* no need to free the contents */
g_list_free(sorted_op_list);
process_rsc_state(rsc, node, on_fail, migrate_op, data_set);
if (get_target_role(rsc, &req_role)) {
if (rsc->next_role == RSC_ROLE_UNKNOWN || req_role < rsc->next_role) {
pe_rsc_debug(rsc, "%s: Overwriting calculated next role %s"
" with requested next role %s",
rsc->id, role2text(rsc->next_role), role2text(req_role));
rsc->next_role = req_role;
} else if (req_role > rsc->next_role) {
pe_rsc_info(rsc, "%s: Not overwriting calculated next role %s"
" with requested next role %s",
rsc->id, role2text(rsc->next_role), role2text(req_role));
}
}
if (saved_role > rsc->role) {
rsc->role = saved_role;
}
return rsc;
}
static void
handle_orphaned_container_fillers(xmlNode * lrm_rsc_list, pe_working_set_t * data_set)
{
xmlNode *rsc_entry = NULL;
for (rsc_entry = __xml_first_child_element(lrm_rsc_list); rsc_entry != NULL;
rsc_entry = __xml_next_element(rsc_entry)) {
resource_t *rsc;
resource_t *container;
const char *rsc_id;
const char *container_id;
if (safe_str_neq((const char *)rsc_entry->name, XML_LRM_TAG_RESOURCE)) {
continue;
}
container_id = crm_element_value(rsc_entry, XML_RSC_ATTR_CONTAINER);
rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID);
if (container_id == NULL || rsc_id == NULL) {
continue;
}
container = pe_find_resource(data_set->resources, container_id);
if (container == NULL) {
continue;
}
rsc = pe_find_resource(data_set->resources, rsc_id);
if (rsc == NULL ||
is_set(rsc->flags, pe_rsc_orphan_container_filler) == FALSE ||
rsc->container != NULL) {
continue;
}
pe_rsc_trace(rsc, "Mapped container of orphaned resource %s to %s",
rsc->id, container_id);
rsc->container = container;
container->fillers = g_list_append(container->fillers, rsc);
}
}
gboolean
unpack_lrm_resources(node_t * node, xmlNode * lrm_rsc_list, pe_working_set_t * data_set)
{
xmlNode *rsc_entry = NULL;
gboolean found_orphaned_container_filler = FALSE;
CRM_CHECK(node != NULL, return FALSE);
crm_trace("Unpacking resources on %s", node->details->uname);
for (rsc_entry = __xml_first_child_element(lrm_rsc_list); rsc_entry != NULL;
rsc_entry = __xml_next_element(rsc_entry)) {
if (crm_str_eq((const char *)rsc_entry->name, XML_LRM_TAG_RESOURCE, TRUE)) {
resource_t *rsc = unpack_lrm_rsc_state(node, rsc_entry, data_set);
if (!rsc) {
continue;
}
if (is_set(rsc->flags, pe_rsc_orphan_container_filler)) {
found_orphaned_container_filler = TRUE;
}
}
}
/* now that all the resource state has been unpacked for this node
* we have to go back and map any orphaned container fillers to their
* container resource */
if (found_orphaned_container_filler) {
handle_orphaned_container_fillers(lrm_rsc_list, data_set);
}
return TRUE;
}
static void
set_active(resource_t * rsc)
{
resource_t *top = uber_parent(rsc);
if (top && is_set(top->flags, pe_rsc_promotable)) {
rsc->role = RSC_ROLE_SLAVE;
} else {
rsc->role = RSC_ROLE_STARTED;
}
}
static void
set_node_score(gpointer key, gpointer value, gpointer user_data)
{
node_t *node = value;
int *score = user_data;
node->weight = *score;
}
#define STATUS_PATH_MAX 1024
static xmlNode *
find_lrm_op(const char *resource, const char *op, const char *node, const char *source,
bool success_only, pe_working_set_t *data_set)
{
int offset = 0;
char xpath[STATUS_PATH_MAX];
xmlNode *xml = NULL;
offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "//node_state[@uname='%s']", node);
offset +=
snprintf(xpath + offset, STATUS_PATH_MAX - offset, "//" XML_LRM_TAG_RESOURCE "[@id='%s']",
resource);
/* Need to check against transition_magic too? */
if (source && safe_str_eq(op, CRMD_ACTION_MIGRATE)) {
offset +=
snprintf(xpath + offset, STATUS_PATH_MAX - offset,
"/" XML_LRM_TAG_RSC_OP "[@operation='%s' and @migrate_target='%s']", op,
source);
} else if (source && safe_str_eq(op, CRMD_ACTION_MIGRATED)) {
offset +=
snprintf(xpath + offset, STATUS_PATH_MAX - offset,
"/" XML_LRM_TAG_RSC_OP "[@operation='%s' and @migrate_source='%s']", op,
source);
} else {
offset +=
snprintf(xpath + offset, STATUS_PATH_MAX - offset,
"/" XML_LRM_TAG_RSC_OP "[@operation='%s']", op);
}
CRM_LOG_ASSERT(offset > 0);
xml = get_xpath_object(xpath, data_set->input, LOG_DEBUG);
if (xml && success_only) {
int rc = PCMK_OCF_UNKNOWN_ERROR;
int status = PCMK_LRM_OP_ERROR;
crm_element_value_int(xml, XML_LRM_ATTR_RC, &rc);
crm_element_value_int(xml, XML_LRM_ATTR_OPSTATUS, &status);
if ((rc != PCMK_OCF_OK) || (status != PCMK_LRM_OP_DONE)) {
return NULL;
}
}
return xml;
}
static int
pe__call_id(xmlNode *op_xml)
{
int id = 0;
if (op_xml) {
crm_element_value_int(op_xml, XML_LRM_ATTR_CALLID, &id);
}
return id;
}
/*!
* \brief Check whether a stop happened on the same node after some event
*
* \param[in] rsc Resource being checked
* \param[in] node Node being checked
* \param[in] xml_op Event that stop is being compared to
* \param[in] data_set Cluster working set
*
* \return TRUE if stop happened after event, FALSE otherwise
*
* \note This is really unnecessary, but kept as a safety mechanism. We
* currently don't save more than one successful event in history, so this
* only matters when processing really old CIB files that we don't
* technically support anymore, or as preparation for logging an extended
* history in the future.
*/
static bool
stop_happened_after(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op,
pe_working_set_t *data_set)
{
xmlNode *stop_op = find_lrm_op(rsc->id, CRMD_ACTION_STOP,
node->details->uname, NULL, TRUE, data_set);
return (stop_op && (pe__call_id(stop_op) > pe__call_id(xml_op)));
}
static void
unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op,
pe_working_set_t *data_set)
{
/* A successful migration sequence is:
* migrate_to on source node
* migrate_from on target node
* stop on source node
*
* If a migrate_to is followed by a stop, the entire migration (successful
* or failed) is complete, and we don't care what happened on the target.
*
* If no migrate_from has happened, the migration is considered to be
* "partial". If the migrate_from failed, make sure the resource gets
* stopped on both source and target (if up).
*
* If the migrate_to and migrate_from both succeeded (which also implies the
* resource is no longer running on the source), but there is no stop, the
* migration is considered to be "dangling". Schedule a stop on the source
* in this case.
*/
int from_rc = 0;
int from_status = 0;
pe_node_t *target_node = NULL;
pe_node_t *source_node = NULL;
xmlNode *migrate_from = NULL;
const char *source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE);
const char *target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET);
// Sanity check
CRM_CHECK(source && target && !strcmp(source, node->details->uname), return);
if (stop_happened_after(rsc, node, xml_op, data_set)) {
return;
}
// Clones are not allowed to migrate, so role can't be master
rsc->role = RSC_ROLE_STARTED;
target_node = pe_find_node(data_set->nodes, target);
source_node = pe_find_node(data_set->nodes, source);
// Check whether there was a migrate_from action on the target
migrate_from = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATED, target,
source, FALSE, data_set);
if (migrate_from) {
crm_element_value_int(migrate_from, XML_LRM_ATTR_RC, &from_rc);
crm_element_value_int(migrate_from, XML_LRM_ATTR_OPSTATUS, &from_status);
pe_rsc_trace(rsc, "%s op on %s exited with status=%d, rc=%d",
ID(migrate_from), target, from_status, from_rc);
}
if (migrate_from && from_rc == PCMK_OCF_OK
&& from_status == PCMK_LRM_OP_DONE) {
/* The migrate_to and migrate_from both succeeded, so mark the migration
* as "dangling". This will be used to schedule a stop action on the
* source without affecting the target.
*/
pe_rsc_trace(rsc, "Detected dangling migration op: %s on %s", ID(xml_op),
source);
rsc->role = RSC_ROLE_STOPPED;
rsc->dangling_migrations = g_list_prepend(rsc->dangling_migrations, node);
} else if (migrate_from && (from_status != PCMK_LRM_OP_PENDING)) { // Failed
if (target_node && target_node->details->online) {
pe_rsc_trace(rsc, "Marking active on %s %p %d", target, target_node,
target_node->details->online);
native_add_running(rsc, target_node, data_set);
}
} else { // Pending, or complete but erased
if (target_node && target_node->details->online) {
pe_rsc_trace(rsc, "Marking active on %s %p %d", target, target_node,
target_node->details->online);
native_add_running(rsc, target_node, data_set);
if (source_node && source_node->details->online) {
/* This is a partial migration: the migrate_to completed
* successfully on the source, but the migrate_from has not
* completed. Remember the source and target; if the newly
* chosen target remains the same when we schedule actions
* later, we may continue with the migration.
*/
rsc->partial_migration_target = target_node;
rsc->partial_migration_source = source_node;
}
} else {
/* Consider it failed here - forces a restart, prevents migration */
set_bit(rsc->flags, pe_rsc_failed);
clear_bit(rsc->flags, pe_rsc_allow_migrate);
}
}
}
static void
unpack_migrate_to_failure(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op,
pe_working_set_t *data_set)
{
int target_stop_id = 0;
int target_migrate_from_id = 0;
xmlNode *target_stop = NULL;
xmlNode *target_migrate_from = NULL;
const char *source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE);
const char *target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET);
// Sanity check
CRM_CHECK(source && target && !strcmp(source, node->details->uname), return);
/* If a migration failed, we have to assume the resource is active. Clones
* are not allowed to migrate, so role can't be master.
*/
rsc->role = RSC_ROLE_STARTED;
// Check for stop on the target
target_stop = find_lrm_op(rsc->id, CRMD_ACTION_STOP, target, NULL,
TRUE, data_set);
target_stop_id = pe__call_id(target_stop);
// Check for migrate_from on the target
target_migrate_from = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATED, target,
source, TRUE, data_set);
target_migrate_from_id = pe__call_id(target_migrate_from);
if ((target_stop == NULL) || (target_stop_id < target_migrate_from_id)) {
/* There was no stop on the source, or a stop that happened before a
* migrate_from, so assume the resource is still active on the target
* (if it is up).
*/
node_t *target_node = pe_find_node(data_set->nodes, target);
pe_rsc_trace(rsc, "stop (%d) + migrate_from (%d)",
target_stop_id, target_migrate_from_id);
if (target_node && target_node->details->online) {
native_add_running(rsc, target_node, data_set);
}
} else if (target_migrate_from == NULL) {
/* We know there was a stop on the target, but there may not have been a
* migrate_from (the stop could have happened before migrate_from was
* scheduled or attempted).
*
* That means this could be a "dangling" migration. But first, check
* whether there is a newer migrate_from or start on the source node --
* it's possible the failed migration was followed by a successful
* full restart or migration in the reverse direction, in which case we
* don't want to force it to stop.
*/
xmlNode *source_migrate_from = NULL;
xmlNode *source_start = NULL;
int source_migrate_to_id = pe__call_id(xml_op);
source_migrate_from = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATED, source,
NULL, TRUE, data_set);
if (pe__call_id(source_migrate_from) > source_migrate_to_id) {
return;
}
source_start = find_lrm_op(rsc->id, CRMD_ACTION_START, source, NULL,
TRUE, data_set);
if (pe__call_id(source_start) > source_migrate_to_id) {
return;
}
// Mark node as having dangling migration so we can force a stop later
rsc->dangling_migrations = g_list_prepend(rsc->dangling_migrations, node);
}
}
static void
unpack_migrate_from_failure(pe_resource_t *rsc, pe_node_t *node,
xmlNode *xml_op, pe_working_set_t *data_set)
{
xmlNode *source_stop = NULL;
xmlNode *source_migrate_to = NULL;
const char *source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE);
const char *target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET);
// Sanity check
CRM_CHECK(source && target && !strcmp(target, node->details->uname), return);
/* If a migration failed, we have to assume the resource is active. Clones
* are not allowed to migrate, so role can't be master.
*/
rsc->role = RSC_ROLE_STARTED;
// Check for a stop on the source
source_stop = find_lrm_op(rsc->id, CRMD_ACTION_STOP, source, NULL,
TRUE, data_set);
// Check for a migrate_to on the source
source_migrate_to = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATE,
source, target, TRUE, data_set);
if ((source_stop == NULL)
|| (pe__call_id(source_stop) < pe__call_id(source_migrate_to))) {
/* There was no stop on the source, or a stop that happened before
* migrate_to, so assume the resource is still active on the source (if
* it is up).
*/
pe_node_t *source_node = pe_find_node(data_set->nodes, source);
if (source_node && source_node->details->online) {
native_add_running(rsc, source_node, data_set);
}
}
}
static void
record_failed_op(xmlNode *op, const pe_node_t *node,
const pe_resource_t *rsc, pe_working_set_t *data_set)
{
xmlNode *xIter = NULL;
const char *op_key = crm_element_value(op, XML_LRM_ATTR_TASK_KEY);
if (node->details->online == FALSE) {
return;
}
for (xIter = data_set->failed->children; xIter; xIter = xIter->next) {
const char *key = crm_element_value(xIter, XML_LRM_ATTR_TASK_KEY);
const char *uname = crm_element_value(xIter, XML_ATTR_UNAME);
if(safe_str_eq(op_key, key) && safe_str_eq(uname, node->details->uname)) {
crm_trace("Skipping duplicate entry %s on %s", op_key, node->details->uname);
return;
}
}
crm_trace("Adding entry %s on %s", op_key, node->details->uname);
crm_xml_add(op, XML_ATTR_UNAME, node->details->uname);
crm_xml_add(op, XML_LRM_ATTR_RSCID, rsc->id);
add_node_copy(data_set->failed, op);
}
static const char *get_op_key(xmlNode *xml_op)
{
const char *key = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY);
if(key == NULL) {
key = ID(xml_op);
}
return key;
}
+static const char *
+last_change_str(xmlNode *xml_op)
+{
+ time_t when;
+ const char *when_s = NULL;
+
+ if (crm_element_value_epoch(xml_op, XML_RSC_OP_LAST_CHANGE,
+ &when) == pcmk_ok) {
+ when_s = crm_now_string(&when);
+ if (when_s) {
+ // Skip day of week to make message shorter
+ when_s = strchr(when_s, ' ');
+ if (when_s) {
+ ++when_s;
+ }
+ }
+ }
+ return ((when_s && *when_s)? when_s : "unknown time");
+}
+
static void
unpack_rsc_op_failure(resource_t * rsc, node_t * node, int rc, xmlNode * xml_op, xmlNode ** last_failure,
enum action_fail_response * on_fail, pe_working_set_t * data_set)
{
guint interval_ms = 0;
- bool is_probe = FALSE;
+ bool is_probe = false;
action_t *action = NULL;
const char *key = get_op_key(xml_op);
const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK);
+ const char *exit_reason = crm_element_value(xml_op,
+ XML_LRM_ATTR_EXIT_REASON);
CRM_ASSERT(rsc);
+ CRM_CHECK(task != NULL, return);
*last_failure = xml_op;
crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms);
- if ((interval_ms == 0) && safe_str_eq(task, CRMD_ACTION_STATUS)) {
- is_probe = TRUE;
- pe_rsc_trace(rsc, "is a probe: %s", key);
+ if ((interval_ms == 0) && !strcmp(task, CRMD_ACTION_STATUS)) {
+ is_probe = true;
}
- if (rc != PCMK_OCF_NOT_INSTALLED || is_set(data_set->flags, pe_flag_symmetric_cluster)) {
- crm_warn("Processing failed %s of %s on %s: %s " CRM_XS " rc=%d",
+ if (exit_reason == NULL) {
+ exit_reason = "";
+ }
+
+ if (is_not_set(data_set->flags, pe_flag_symmetric_cluster)
+ && (rc == PCMK_OCF_NOT_INSTALLED)) {
+ crm_trace("Unexpected result (%s%s%s) was recorded for "
+ "%s of %s on %s at %s " CRM_XS " rc=%d id=%s",
+ services_ocf_exitcode_str(rc),
+ (*exit_reason? ": " : ""), exit_reason,
+ (is_probe? "probe" : task), rsc->id, node->details->uname,
+ last_change_str(xml_op), rc, ID(xml_op));
+ } else {
+ crm_warn("Unexpected result (%s%s%s) was recorded for "
+ "%s of %s on %s at %s " CRM_XS " rc=%d id=%s",
+ services_ocf_exitcode_str(rc),
+ (*exit_reason? ": " : ""), exit_reason,
(is_probe? "probe" : task), rsc->id, node->details->uname,
- services_ocf_exitcode_str(rc), rc);
+ last_change_str(xml_op), rc, ID(xml_op));
if (is_probe && (rc != PCMK_OCF_OK)
&& (rc != PCMK_OCF_NOT_RUNNING)
&& (rc != PCMK_OCF_RUNNING_MASTER)) {
/* A failed (not just unexpected) probe result could mean the user
* didn't know resources will be probed even where they can't run.
*/
crm_notice("If it is not possible for %s to run on %s, see "
"the resource-discovery option for location constraints",
rsc->id, node->details->uname);
}
record_failed_op(xml_op, node, rsc, data_set);
-
- } else {
- crm_trace("Processing failed op %s for %s on %s: %s (%d)",
- task, rsc->id, node->details->uname, services_ocf_exitcode_str(rc),
- rc);
}
action = custom_action(rsc, strdup(key), task, NULL, TRUE, FALSE, data_set);
if ((action->on_fail <= action_fail_fence && *on_fail < action->on_fail) ||
(action->on_fail == action_fail_reset_remote && *on_fail <= action_fail_recover) ||
(action->on_fail == action_fail_restart_container && *on_fail <= action_fail_recover) ||
(*on_fail == action_fail_restart_container && action->on_fail >= action_fail_migrate)) {
pe_rsc_trace(rsc, "on-fail %s -> %s for %s (%s)", fail2text(*on_fail),
fail2text(action->on_fail), action->uuid, key);
*on_fail = action->on_fail;
}
- if (safe_str_eq(task, CRMD_ACTION_STOP)) {
+ if (!strcmp(task, CRMD_ACTION_STOP)) {
resource_location(rsc, node, -INFINITY, "__stop_fail__", data_set);
- } else if (safe_str_eq(task, CRMD_ACTION_MIGRATE)) {
+ } else if (!strcmp(task, CRMD_ACTION_MIGRATE)) {
unpack_migrate_to_failure(rsc, node, xml_op, data_set);
- } else if (safe_str_eq(task, CRMD_ACTION_MIGRATED)) {
+ } else if (!strcmp(task, CRMD_ACTION_MIGRATED)) {
unpack_migrate_from_failure(rsc, node, xml_op, data_set);
- } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) {
+ } else if (!strcmp(task, CRMD_ACTION_PROMOTE)) {
rsc->role = RSC_ROLE_MASTER;
- } else if (safe_str_eq(task, CRMD_ACTION_DEMOTE)) {
+ } else if (!strcmp(task, CRMD_ACTION_DEMOTE)) {
if (action->on_fail == action_fail_block) {
rsc->role = RSC_ROLE_MASTER;
rsc->next_role = RSC_ROLE_STOPPED;
} else if(rc == PCMK_OCF_NOT_RUNNING) {
rsc->role = RSC_ROLE_STOPPED;
} else {
/*
* Staying in master role would put the PE/TE into a loop. Setting
* slave role is not dangerous because the resource will be stopped
* as part of recovery, and any master promotion will be ordered
* after that stop.
*/
rsc->role = RSC_ROLE_SLAVE;
}
}
if(is_probe && rc == PCMK_OCF_NOT_INSTALLED) {
/* leave stopped */
pe_rsc_trace(rsc, "Leaving %s stopped", rsc->id);
rsc->role = RSC_ROLE_STOPPED;
} else if (rsc->role < RSC_ROLE_STARTED) {
pe_rsc_trace(rsc, "Setting %s active", rsc->id);
set_active(rsc);
}
pe_rsc_trace(rsc, "Resource %s: role=%s, unclean=%s, on_fail=%s, fail_role=%s",
rsc->id, role2text(rsc->role),
node->details->unclean ? "true" : "false",
fail2text(action->on_fail), role2text(action->fail_role));
if (action->fail_role != RSC_ROLE_STARTED && rsc->next_role < action->fail_role) {
rsc->next_role = action->fail_role;
}
if (action->fail_role == RSC_ROLE_STOPPED) {
int score = -INFINITY;
resource_t *fail_rsc = rsc;
if (fail_rsc->parent) {
resource_t *parent = uber_parent(fail_rsc);
if (pe_rsc_is_clone(parent)
&& is_not_set(parent->flags, pe_rsc_unique)) {
/* For clone resources, if a child fails on an operation
* with on-fail = stop, all the resources fail. Do this by preventing
* the parent from coming up again. */
fail_rsc = parent;
}
}
- crm_warn("Making sure %s doesn't come up again", fail_rsc->id);
+ crm_notice("%s will not be started under current conditions",
+ fail_rsc->id);
/* make sure it doesn't come up again */
if (fail_rsc->allowed_nodes != NULL) {
g_hash_table_destroy(fail_rsc->allowed_nodes);
}
fail_rsc->allowed_nodes = node_hash_from_list(data_set->nodes);
g_hash_table_foreach(fail_rsc->allowed_nodes, set_node_score, &score);
}
pe_free_action(action);
}
/*!
* \internal
* \brief Remap operation status based on action result
*
* Given an action result, determine an appropriate operation status for the
* purposes of responding to the action (the status provided by the executor is
* not directly usable since the executor does not know what was expected).
*
* \param[in,out] rsc Resource that operation history entry is for
* \param[in] rc Actual return code of operation
* \param[in] target_rc Expected return code of operation
* \param[in] node Node where operation was executed
* \param[in] xml_op Operation history entry XML from CIB status
* \param[in,out] on_fail What should be done about the result
* \param[in] data_set Current cluster working set
*
* \return Operation status based on return code and action info
* \note This may update the resource's current and next role.
*/
static int
determine_op_status(
resource_t *rsc, int rc, int target_rc, node_t * node, xmlNode * xml_op, enum action_fail_response * on_fail, pe_working_set_t * data_set)
{
guint interval_ms = 0;
+ bool is_probe = false;
int result = PCMK_LRM_OP_DONE;
-
const char *key = get_op_key(xml_op);
const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK);
-
- bool is_probe = FALSE;
+ const char *exit_reason = crm_element_value(xml_op,
+ XML_LRM_ATTR_EXIT_REASON);
CRM_ASSERT(rsc);
+ CRM_CHECK(task != NULL, return PCMK_LRM_OP_ERROR);
+
+ if (exit_reason == NULL) {
+ exit_reason = "";
+ }
crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms);
- if ((interval_ms == 0) && safe_str_eq(task, CRMD_ACTION_STATUS)) {
- is_probe = TRUE;
+ if ((interval_ms == 0) && !strcmp(task, CRMD_ACTION_STATUS)) {
+ is_probe = true;
+ task = "probe";
}
if (target_rc < 0) {
/* Pre-1.0 Pacemaker versions, and Pacemaker 1.1.6 or earlier with
* Heartbeat 2.0.7 or earlier as the cluster layer, did not include the
* target_rc in the transition key, which (along with the similar case
* of a corrupted transition key in the CIB) will be reported to this
* function as -1. Pacemaker 2.0+ does not support rolling upgrades from
* those versions or processing of saved CIB files from those versions,
* so we do not need to care much about this case.
*/
result = PCMK_LRM_OP_ERROR;
crm_warn("Expected result not found for %s on %s (corrupt or obsolete CIB?)",
key, node->details->uname);
} else if (target_rc != rc) {
result = PCMK_LRM_OP_ERROR;
- pe_rsc_debug(rsc, "%s on %s returned '%s' (%d) instead of the expected value: '%s' (%d)",
+ pe_rsc_debug(rsc, "%s on %s: expected %d (%s), got %d (%s%s%s)",
key, node->details->uname,
- services_ocf_exitcode_str(rc), rc,
- services_ocf_exitcode_str(target_rc), target_rc);
+ target_rc, services_ocf_exitcode_str(target_rc),
+ rc, services_ocf_exitcode_str(rc),
+ (*exit_reason? ": " : ""), exit_reason);
}
switch (rc) {
case PCMK_OCF_OK:
- // @TODO Should this be (rc != target_rc)?
if (is_probe && (target_rc == PCMK_OCF_NOT_RUNNING)) {
result = PCMK_LRM_OP_DONE;
- pe_rsc_info(rsc, "Operation %s found resource %s active on %s",
- task, rsc->id, node->details->uname);
+ pe_rsc_info(rsc, "Probe found %s active on %s at %s",
+ rsc->id, node->details->uname,
+ last_change_str(xml_op));
}
break;
case PCMK_OCF_NOT_RUNNING:
if (is_probe || target_rc == rc || is_not_set(rsc->flags, pe_rsc_managed)) {
result = PCMK_LRM_OP_DONE;
rsc->role = RSC_ROLE_STOPPED;
/* clear any previous failure actions */
*on_fail = action_fail_ignore;
rsc->next_role = RSC_ROLE_UNKNOWN;
}
break;
case PCMK_OCF_RUNNING_MASTER:
if (is_probe && (rc != target_rc)) {
result = PCMK_LRM_OP_DONE;
- pe_rsc_info(rsc, "Operation %s found resource %s active in master mode on %s",
- task, rsc->id, node->details->uname);
+ pe_rsc_info(rsc,
+ "Probe found %s active and promoted on %s at %s",
+ rsc->id, node->details->uname,
+ last_change_str(xml_op));
}
rsc->role = RSC_ROLE_MASTER;
break;
case PCMK_OCF_DEGRADED_MASTER:
case PCMK_OCF_FAILED_MASTER:
rsc->role = RSC_ROLE_MASTER;
result = PCMK_LRM_OP_ERROR;
break;
case PCMK_OCF_NOT_CONFIGURED:
result = PCMK_LRM_OP_ERROR_FATAL;
break;
case PCMK_OCF_UNIMPLEMENT_FEATURE:
if (interval_ms > 0) {
result = PCMK_LRM_OP_NOTSUPPORTED;
break;
}
// fall through
case PCMK_OCF_NOT_INSTALLED:
case PCMK_OCF_INVALID_PARAM:
case PCMK_OCF_INSUFFICIENT_PRIV:
if (!pe_can_fence(data_set, node)
- && safe_str_eq(task, CRMD_ACTION_STOP)) {
+ && !strcmp(task, CRMD_ACTION_STOP)) {
/* If a stop fails and we can't fence, there's nothing else we can do */
- pe_proc_err("No further recovery can be attempted for %s: %s action failed with '%s' (%d)",
- rsc->id, task, services_ocf_exitcode_str(rc), rc);
+ pe_proc_err("No further recovery can be attempted for %s "
+ "because %s on %s failed (%s%s%s) at %s "
+ CRM_XS " rc=%d id=%s", rsc->id, task,
+ node->details->uname, services_ocf_exitcode_str(rc),
+ (*exit_reason? ": " : ""), exit_reason,
+ last_change_str(xml_op), rc, ID(xml_op));
clear_bit(rsc->flags, pe_rsc_managed);
set_bit(rsc->flags, pe_rsc_block);
}
result = PCMK_LRM_OP_ERROR_HARD;
break;
default:
if (result == PCMK_LRM_OP_DONE) {
- crm_info("Treating unknown return code %d for %s on %s as failure",
- rc, key, node->details->uname);
+ crm_info("Treating unknown exit status %d from %s of %s "
+ "on %s at %s as failure",
+ rc, task, rsc->id, node->details->uname,
+ last_change_str(xml_op));
result = PCMK_LRM_OP_ERROR;
}
break;
}
return result;
}
// return TRUE if start or monitor last failure but parameters changed
static bool
should_clear_for_param_change(xmlNode *xml_op, const char *task,
pe_resource_t *rsc, pe_node_t *node,
pe_working_set_t *data_set)
{
if (!strcmp(task, "start") || !strcmp(task, "monitor")) {
if (pe__bundle_needs_remote_name(rsc)) {
/* We haven't allocated resources yet, so we can't reliably
* substitute addr parameters for the REMOTE_CONTAINER_HACK.
* When that's needed, defer the check until later.
*/
pe__add_param_check(xml_op, rsc, node, pe_check_last_failure,
data_set);
} else {
op_digest_cache_t *digest_data = NULL;
digest_data = rsc_action_digest_cmp(rsc, xml_op, node, data_set);
switch (digest_data->rc) {
case RSC_DIGEST_UNKNOWN:
crm_trace("Resource %s history entry %s on %s"
" has no digest to compare",
rsc->id, get_op_key(xml_op), node->details->id);
break;
case RSC_DIGEST_MATCH:
break;
default:
return TRUE;
}
}
}
return FALSE;
}
// Order action after fencing of remote node, given connection rsc
static void
order_after_remote_fencing(pe_action_t *action, pe_resource_t *remote_conn,
pe_working_set_t *data_set)
{
pe_node_t *remote_node = pe_find_node(data_set->nodes, remote_conn->id);
if (remote_node) {
pe_action_t *fence = pe_fence_op(remote_node, NULL, TRUE, NULL,
data_set);
order_actions(fence, action, pe_order_implies_then);
}
}
static bool
should_ignore_failure_timeout(pe_resource_t *rsc, xmlNode *xml_op,
const char *task, guint interval_ms,
bool is_last_failure, pe_working_set_t *data_set)
{
/* Clearing failures of recurring monitors has special concerns. The
* executor reports only changes in the monitor result, so if the
* monitor is still active and still getting the same failure result,
* that will go undetected after the failure is cleared.
*
* Also, the operation history will have the time when the recurring
* monitor result changed to the given code, not the time when the
* result last happened.
*
* @TODO We probably should clear such failures only when the failure
* timeout has passed since the last occurrence of the failed result.
* However we don't record that information. We could maybe approximate
* that by clearing only if there is a more recent successful monitor or
* stop result, but we don't even have that information at this point
* since we are still unpacking the resource's operation history.
*
* This is especially important for remote connection resources with a
* reconnect interval, so in that case, we skip clearing failures
* if the remote node hasn't been fenced.
*/
if (rsc->remote_reconnect_ms
&& is_set(data_set->flags, pe_flag_stonith_enabled)
&& (interval_ms != 0) && safe_str_eq(task, CRMD_ACTION_STATUS)) {
pe_node_t *remote_node = pe_find_node(data_set->nodes, rsc->id);
if (remote_node && !remote_node->details->remote_was_fenced) {
if (is_last_failure) {
crm_info("Waiting to clear monitor failure for remote node %s"
" until fencing has occurred", rsc->id);
}
return TRUE;
}
}
return FALSE;
}
/*!
* \internal
* \brief Check operation age and schedule failure clearing when appropriate
*
* This function has two distinct purposes. The first is to check whether an
* operation history entry is expired (i.e. the resource has a failure timeout,
* the entry is older than the timeout, and the resource either has no fail
* count or its fail count is entirely older than the timeout). The second is to
* schedule fail count clearing when appropriate (i.e. the operation is expired
* and either the resource has an expired fail count or the operation is a
* last_failure for a remote connection resource with a reconnect interval,
* or the operation is a last_failure for a start or monitor operation and the
* resource's parameters have changed since the operation).
*
* \param[in] rsc Resource that operation happened to
* \param[in] node Node that operation happened on
* \param[in] rc Actual result of operation
* \param[in] xml_op Operation history entry XML
* \param[in] data_set Current working set
*
* \return TRUE if operation history entry is expired, FALSE otherwise
*/
static bool
check_operation_expiry(pe_resource_t *rsc, pe_node_t *node, int rc,
xmlNode *xml_op, pe_working_set_t *data_set)
{
bool expired = FALSE;
bool is_last_failure = crm_ends_with(ID(xml_op), "_last_failure_0");
time_t last_run = 0;
guint interval_ms = 0;
int unexpired_fail_count = 0;
const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK);
const char *clear_reason = NULL;
crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms);
if ((rsc->failure_timeout > 0)
&& (crm_element_value_epoch(xml_op, XML_RSC_OP_LAST_CHANGE,
&last_run) == 0)) {
// Resource has a failure-timeout, and history entry has a timestamp
time_t now = get_effective_time(data_set);
time_t last_failure = 0;
// Is this particular operation history older than the failure timeout?
if ((now >= (last_run + rsc->failure_timeout))
&& !should_ignore_failure_timeout(rsc, xml_op, task, interval_ms,
is_last_failure, data_set)) {
expired = TRUE;
}
// Does the resource as a whole have an unexpired fail count?
unexpired_fail_count = pe_get_failcount(node, rsc, &last_failure,
pe_fc_effective, xml_op,
data_set);
// Update scheduler recheck time according to *last* failure
crm_trace("%s@%lld is %sexpired @%lld with unexpired_failures=%d timeout=%ds"
" last-failure@%lld",
ID(xml_op), (long long) last_run, (expired? "" : "not "),
(long long) now, unexpired_fail_count, rsc->failure_timeout,
(long long) last_failure);
last_failure += rsc->failure_timeout + 1;
if (unexpired_fail_count && (now < last_failure)) {
pe__update_recheck_time(last_failure, data_set);
}
}
if (expired) {
if (pe_get_failcount(node, rsc, NULL, pe_fc_default, xml_op, data_set)) {
// There is a fail count ignoring timeout
if (unexpired_fail_count == 0) {
// There is no fail count considering timeout
clear_reason = "it expired";
} else {
/* This operation is old, but there is an unexpired fail count.
* In a properly functioning cluster, this should only be
* possible if this operation is not a failure (otherwise the
* fail count should be expired too), so this is really just a
* failsafe.
*/
expired = FALSE;
}
} else if (is_last_failure && rsc->remote_reconnect_ms) {
/* Clear any expired last failure when reconnect interval is set,
* even if there is no fail count.
*/
clear_reason = "reconnect interval is set";
}
}
if (!expired && is_last_failure
&& should_clear_for_param_change(xml_op, task, rsc, node, data_set)) {
clear_reason = "resource parameters have changed";
}
if (clear_reason != NULL) {
// Schedule clearing of the fail count
pe_action_t *clear_op = pe__clear_failcount(rsc, node, clear_reason,
data_set);
if (is_set(data_set->flags, pe_flag_stonith_enabled)
&& rsc->remote_reconnect_ms) {
/* If we're clearing a remote connection due to a reconnect
* interval, we want to wait until any scheduled fencing
* completes.
*
* We could limit this to remote_node->details->unclean, but at
* this point, that's always true (it won't be reliable until
* after unpack_node_loop() is done).
*/
crm_info("Clearing %s failure will wait until any scheduled "
"fencing of %s completes", task, rsc->id);
order_after_remote_fencing(clear_op, rsc, data_set);
}
}
if (expired && (interval_ms == 0) && safe_str_eq(task, CRMD_ACTION_STATUS)) {
switch(rc) {
case PCMK_OCF_OK:
case PCMK_OCF_NOT_RUNNING:
case PCMK_OCF_RUNNING_MASTER:
case PCMK_OCF_DEGRADED:
case PCMK_OCF_DEGRADED_MASTER:
// Don't expire probes that return these values
expired = FALSE;
break;
}
}
return expired;
}
int pe__target_rc_from_xml(xmlNode *xml_op)
{
int target_rc = 0;
const char *key = crm_element_value(xml_op, XML_ATTR_TRANSITION_KEY);
if (key == NULL) {
return -1;
}
decode_transition_key(key, NULL, NULL, NULL, &target_rc);
return target_rc;
}
static enum action_fail_response
get_action_on_fail(resource_t *rsc, const char *key, const char *task, pe_working_set_t * data_set)
{
int result = action_fail_recover;
action_t *action = custom_action(rsc, strdup(key), task, NULL, TRUE, FALSE, data_set);
result = action->on_fail;
pe_free_action(action);
return result;
}
static void
update_resource_state(resource_t * rsc, node_t * node, xmlNode * xml_op, const char * task, int rc,
xmlNode * last_failure, enum action_fail_response * on_fail, pe_working_set_t * data_set)
{
gboolean clear_past_failure = FALSE;
CRM_ASSERT(rsc);
CRM_ASSERT(xml_op);
if (rc == PCMK_OCF_NOT_RUNNING) {
clear_past_failure = TRUE;
} else if (rc == PCMK_OCF_NOT_INSTALLED) {
rsc->role = RSC_ROLE_STOPPED;
} else if (safe_str_eq(task, CRMD_ACTION_STATUS)) {
if (last_failure) {
const char *op_key = get_op_key(xml_op);
const char *last_failure_key = get_op_key(last_failure);
if (safe_str_eq(op_key, last_failure_key)) {
clear_past_failure = TRUE;
}
}
if (rsc->role < RSC_ROLE_STARTED) {
set_active(rsc);
}
} else if (safe_str_eq(task, CRMD_ACTION_START)) {
rsc->role = RSC_ROLE_STARTED;
clear_past_failure = TRUE;
} else if (safe_str_eq(task, CRMD_ACTION_STOP)) {
rsc->role = RSC_ROLE_STOPPED;
clear_past_failure = TRUE;
} else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) {
rsc->role = RSC_ROLE_MASTER;
clear_past_failure = TRUE;
} else if (safe_str_eq(task, CRMD_ACTION_DEMOTE)) {
/* Demote from Master does not clear an error */
rsc->role = RSC_ROLE_SLAVE;
} else if (safe_str_eq(task, CRMD_ACTION_MIGRATED)) {
rsc->role = RSC_ROLE_STARTED;
clear_past_failure = TRUE;
} else if (safe_str_eq(task, CRMD_ACTION_MIGRATE)) {
unpack_migrate_to_success(rsc, node, xml_op, data_set);
} else if (rsc->role < RSC_ROLE_STARTED) {
pe_rsc_trace(rsc, "%s active on %s", rsc->id, node->details->uname);
set_active(rsc);
}
/* clear any previous failure actions */
if (clear_past_failure) {
switch (*on_fail) {
case action_fail_stop:
case action_fail_fence:
case action_fail_migrate:
case action_fail_standby:
pe_rsc_trace(rsc, "%s.%s is not cleared by a completed stop",
rsc->id, fail2text(*on_fail));
break;
case action_fail_block:
case action_fail_ignore:
case action_fail_recover:
case action_fail_restart_container:
*on_fail = action_fail_ignore;
rsc->next_role = RSC_ROLE_UNKNOWN;
break;
case action_fail_reset_remote:
if (rsc->remote_reconnect_ms == 0) {
/* With no reconnect interval, the connection is allowed to
* start again after the remote node is fenced and
* completely stopped. (With a reconnect interval, we wait
* for the failure to be cleared entirely before attempting
* to reconnect.)
*/
*on_fail = action_fail_ignore;
rsc->next_role = RSC_ROLE_UNKNOWN;
}
break;
}
}
}
/*!
* \internal
* \brief Remap informational monitor results to usual values
*
* Certain OCF result codes are for providing extended information to the
* user about services that aren't yet failed but not entirely healthy either.
* These must be treated as the "normal" result by pacemaker.
*
* \param[in] rc Actual result of a monitor action
* \param[in] xml_op Operation history XML
* \param[in] node Node that operation happened on
* \param[in] rsc Resource that operation happened to
* \param[in] data_set Cluster working set
*
* \return Result code that pacemaker should use
*
* \note If the result is remapped, and the node is not shutting down or failed,
* the operation will be recorded in the data set's list of failed
* operations, to highlight it for the user.
*/
static int
remap_monitor_rc(int rc, xmlNode *xml_op, const pe_node_t *node,
const pe_resource_t *rsc, pe_working_set_t *data_set)
{
int remapped_rc = rc;
switch (rc) {
case PCMK_OCF_DEGRADED:
remapped_rc = PCMK_OCF_OK;
break;
case PCMK_OCF_DEGRADED_MASTER:
remapped_rc = PCMK_OCF_RUNNING_MASTER;
break;
default:
break;
}
if (rc != remapped_rc) {
crm_trace("Remapping monitor result %d to %d", rc, remapped_rc);
if (!node->details->shutdown || node->details->online) {
record_failed_op(xml_op, node, rsc, data_set);
}
}
return remapped_rc;
}
static void
unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op,
xmlNode **last_failure, enum action_fail_response *on_fail,
pe_working_set_t *data_set)
{
- int task_id = 0;
-
- const char *task = NULL;
- const char *task_key = NULL;
-
int rc = 0;
+ int task_id = 0;
+ int target_rc = 0;
int status = PCMK_LRM_OP_UNKNOWN;
- int target_rc = pe__target_rc_from_xml(xml_op);
guint interval_ms = 0;
-
+ const char *task = NULL;
+ const char *task_key = NULL;
+ const char *exit_reason = NULL;
bool expired = FALSE;
resource_t *parent = rsc;
enum action_fail_response failure_strategy = action_fail_recover;
- CRM_CHECK(rsc != NULL, return);
- CRM_CHECK(node != NULL, return);
- CRM_CHECK(xml_op != NULL, return);
+ CRM_CHECK(rsc && node && xml_op, return);
+ target_rc = pe__target_rc_from_xml(xml_op);
task_key = get_op_key(xml_op);
-
task = crm_element_value(xml_op, XML_LRM_ATTR_TASK);
+ exit_reason = crm_element_value(xml_op, XML_LRM_ATTR_EXIT_REASON);
+ if (exit_reason == NULL) {
+ exit_reason = "";
+ }
crm_element_value_int(xml_op, XML_LRM_ATTR_RC, &rc);
crm_element_value_int(xml_op, XML_LRM_ATTR_CALLID, &task_id);
crm_element_value_int(xml_op, XML_LRM_ATTR_OPSTATUS, &status);
crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms);
CRM_CHECK(task != NULL, return);
CRM_CHECK(status <= PCMK_LRM_OP_INVALID, return);
CRM_CHECK(status >= PCMK_LRM_OP_PENDING, return);
- if (safe_str_eq(task, CRMD_ACTION_NOTIFY) ||
- safe_str_eq(task, CRMD_ACTION_METADATA)) {
+ if (!strcmp(task, CRMD_ACTION_NOTIFY) ||
+ !strcmp(task, CRMD_ACTION_METADATA)) {
/* safe to ignore these */
return;
}
if (is_not_set(rsc->flags, pe_rsc_unique)) {
parent = uber_parent(rsc);
}
pe_rsc_trace(rsc, "Unpacking task %s/%s (call_id=%d, status=%d, rc=%d) on %s (role=%s)",
task_key, task, task_id, status, rc, node->details->uname, role2text(rsc->role));
if (node->details->unclean) {
pe_rsc_trace(rsc, "Node %s (where %s is running) is unclean."
" Further action depends on the value of the stop's on-fail attribute",
node->details->uname, rsc->id);
}
/* It should be possible to call remap_monitor_rc() first then call
* check_operation_expiry() only if rc != target_rc, because there should
* never be a fail count without at least one unexpected result in the
* resource history. That would be more efficient by avoiding having to call
* check_operation_expiry() for expected results.
*
* However, we do have such configurations in the scheduler regression
* tests, even if it shouldn't be possible with the current code. It's
* probably a good idea anyway, but that would require updating the test
* inputs to something currently possible.
*/
if ((status != PCMK_LRM_OP_NOT_INSTALLED)
&& check_operation_expiry(rsc, node, rc, xml_op, data_set)) {
expired = TRUE;
}
if (!strcmp(task, CRMD_ACTION_STATUS)) {
rc = remap_monitor_rc(rc, xml_op, node, rsc, data_set);
}
if (expired && (rc != target_rc)) {
const char *magic = crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC);
if (interval_ms == 0) {
crm_notice("Ignoring expired %s failure on %s "
CRM_XS " actual=%d expected=%d magic=%s",
task_key, node->details->uname, rc, target_rc, magic);
goto done;
} else if(node->details->online && node->details->unclean == FALSE) {
/* Reschedule the recurring monitor. CancelXmlOp() won't work at
* this stage, so as a hacky workaround, forcibly change the restart
* digest so check_action_definition() does what we want later.
*
* @TODO We should skip this if there is a newer successful monitor.
* Also, this causes rescheduling only if the history entry
* has an op-digest (which the expire-non-blocked-failure
* scheduler regression test doesn't, but that may not be a
* realistic scenario in production).
*/
crm_notice("Rescheduling %s after failure expired on %s "
CRM_XS " actual=%d expected=%d magic=%s",
task_key, node->details->uname, rc, target_rc, magic);
crm_xml_add(xml_op, XML_LRM_ATTR_RESTART_DIGEST, "calculated-failure-timeout");
goto done;
}
}
/* If the executor reported an operation status of anything but done or
* error, consider that final. But for done or error, we know better whether
* it should be treated as a failure or not, because we know the expected
* result.
*/
if(status == PCMK_LRM_OP_DONE || status == PCMK_LRM_OP_ERROR) {
status = determine_op_status(rsc, rc, target_rc, node, xml_op, on_fail, data_set);
+ pe_rsc_trace(rsc, "Remapped %s status to %d", task_key, status);
}
- pe_rsc_trace(rsc, "Handling status: %d", status);
switch (status) {
case PCMK_LRM_OP_CANCELLED:
- /* do nothing?? */
- pe_err("Don't know what to do for cancelled ops yet");
+ // Should never happen
+ pe_err("Resource history contains cancellation '%s' "
+ "(%s of %s on %s at %s)",
+ ID(xml_op), task, rsc->id, node->details->uname,
+ last_change_str(xml_op));
break;
case PCMK_LRM_OP_PENDING:
- if (safe_str_eq(task, CRMD_ACTION_START)) {
+ if (!strcmp(task, CRMD_ACTION_START)) {
set_bit(rsc->flags, pe_rsc_start_pending);
set_active(rsc);
- } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) {
+ } else if (!strcmp(task, CRMD_ACTION_PROMOTE)) {
rsc->role = RSC_ROLE_MASTER;
- } else if (safe_str_eq(task, CRMD_ACTION_MIGRATE) && node->details->unclean) {
+ } else if (!strcmp(task, CRMD_ACTION_MIGRATE) && node->details->unclean) {
/* If a pending migrate_to action is out on a unclean node,
* we have to force the stop action on the target. */
const char *migrate_target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET);
node_t *target = pe_find_node(data_set->nodes, migrate_target);
if (target) {
stop_action(rsc, target, FALSE);
}
}
if (rsc->pending_task == NULL) {
- if (safe_str_eq(task, CRMD_ACTION_STATUS) && (interval_ms == 0)) {
+ if ((interval_ms != 0) || strcmp(task, CRMD_ACTION_STATUS)) {
+ rsc->pending_task = strdup(task);
+ rsc->pending_node = node;
+ } else {
/* Pending probes are not printed, even if pending
* operations are requested. If someone ever requests that
- * behavior, uncomment this and the corresponding part of
+ * behavior, enable the below and the corresponding part of
* native.c:native_pending_task().
*/
- /*rsc->pending_task = strdup("probe");*/
- /*rsc->pending_node = node;*/
- } else {
- rsc->pending_task = strdup(task);
+#if 0
+ rsc->pending_task = strdup("probe");
rsc->pending_node = node;
+#endif
}
}
break;
case PCMK_LRM_OP_DONE:
- pe_rsc_trace(rsc, "%s/%s completed on %s", rsc->id, task, node->details->uname);
+ pe_rsc_trace(rsc, "%s of %s on %s completed at %s " CRM_XS " id=%s",
+ task, rsc->id, node->details->uname,
+ last_change_str(xml_op), ID(xml_op));
update_resource_state(rsc, node, xml_op, task, rc, *last_failure, on_fail, data_set);
break;
case PCMK_LRM_OP_NOT_INSTALLED:
failure_strategy = get_action_on_fail(rsc, task_key, task, data_set);
if (failure_strategy == action_fail_ignore) {
- crm_warn("Cannot ignore failed %s (status=%d, rc=%d) on %s: "
- "Resource agent doesn't exist",
- task_key, status, rc, node->details->uname);
+ crm_warn("Cannot ignore failed %s of %s on %s: "
+ "Resource agent doesn't exist "
+ CRM_XS " status=%d rc=%d id=%s",
+ task, rsc->id, node->details->uname, status, rc,
+ ID(xml_op));
/* Also for printing it as "FAILED" by marking it as pe_rsc_failed later */
*on_fail = action_fail_migrate;
}
resource_location(parent, node, -INFINITY, "hard-error", data_set);
unpack_rsc_op_failure(rsc, node, rc, xml_op, last_failure, on_fail, data_set);
break;
case PCMK_LRM_OP_NOT_CONNECTED:
if (pe__is_guest_or_remote_node(node)
&& is_set(node->details->remote_rsc->flags, pe_rsc_managed)) {
/* We should never get into a situation where a managed remote
* connection resource is considered OK but a resource action
* behind the connection gets a "not connected" status. But as a
* fail-safe in case a bug or unusual circumstances do lead to
* that, ensure the remote connection is considered failed.
*/
set_bit(node->details->remote_rsc->flags, pe_rsc_failed);
}
// fall through
case PCMK_LRM_OP_ERROR:
case PCMK_LRM_OP_ERROR_HARD:
case PCMK_LRM_OP_ERROR_FATAL:
case PCMK_LRM_OP_TIMEOUT:
case PCMK_LRM_OP_NOTSUPPORTED:
case PCMK_LRM_OP_INVALID:
failure_strategy = get_action_on_fail(rsc, task_key, task, data_set);
if ((failure_strategy == action_fail_ignore)
|| (failure_strategy == action_fail_restart_container
- && safe_str_eq(task, CRMD_ACTION_STOP))) {
+ && !strcmp(task, CRMD_ACTION_STOP))) {
- crm_warn("Pretending the failure of %s (rc=%d) on %s succeeded",
- task_key, rc, node->details->uname);
+ crm_warn("Pretending failed %s (%s%s%s) of %s on %s at %s "
+ "succeeded " CRM_XS " rc=%d id=%s",
+ task, services_ocf_exitcode_str(rc),
+ (*exit_reason? ": " : ""), exit_reason, rsc->id,
+ node->details->uname, last_change_str(xml_op), rc,
+ ID(xml_op));
update_resource_state(rsc, node, xml_op, task, target_rc, *last_failure, on_fail, data_set);
crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname);
set_bit(rsc->flags, pe_rsc_failure_ignored);
record_failed_op(xml_op, node, rsc, data_set);
if (failure_strategy == action_fail_restart_container && *on_fail <= action_fail_recover) {
*on_fail = failure_strategy;
}
} else {
unpack_rsc_op_failure(rsc, node, rc, xml_op, last_failure, on_fail, data_set);
if(status == PCMK_LRM_OP_ERROR_HARD) {
do_crm_log(rc != PCMK_OCF_NOT_INSTALLED?LOG_ERR:LOG_NOTICE,
- "Preventing %s from re-starting on %s: operation %s failed '%s' (%d)",
+ "Preventing %s from restarting on %s because "
+ "of hard failure (%s%s%s)" CRM_XS " rc=%d id=%s",
parent->id, node->details->uname,
- task, services_ocf_exitcode_str(rc), rc);
-
+ services_ocf_exitcode_str(rc),
+ (*exit_reason? ": " : ""), exit_reason,
+ rc, ID(xml_op));
resource_location(parent, node, -INFINITY, "hard-error", data_set);
} else if(status == PCMK_LRM_OP_ERROR_FATAL) {
- crm_err("Preventing %s from re-starting anywhere: operation %s failed '%s' (%d)",
- parent->id, task, services_ocf_exitcode_str(rc), rc);
-
+ crm_err("Preventing %s from restarting anywhere because "
+ "of fatal failure (%s%s%s) " CRM_XS " rc=%d id=%s",
+ parent->id, services_ocf_exitcode_str(rc),
+ (*exit_reason? ": " : ""), exit_reason,
+ rc, ID(xml_op));
resource_location(parent, NULL, -INFINITY, "fatal-error", data_set);
}
}
break;
}
done:
pe_rsc_trace(rsc, "Resource %s after %s: role=%s, next=%s",
rsc->id, task, role2text(rsc->role),
role2text(rsc->next_role));
}
static void
add_node_attrs(xmlNode *xml_obj, pe_node_t *node, bool overwrite,
pe_working_set_t *data_set)
{
const char *cluster_name = NULL;
g_hash_table_insert(node->details->attrs,
strdup(CRM_ATTR_UNAME), strdup(node->details->uname));
g_hash_table_insert(node->details->attrs, strdup(CRM_ATTR_ID),
strdup(node->details->id));
if (safe_str_eq(node->details->id, data_set->dc_uuid)) {
data_set->dc_node = node;
node->details->is_dc = TRUE;
g_hash_table_insert(node->details->attrs,
strdup(CRM_ATTR_IS_DC), strdup(XML_BOOLEAN_TRUE));
} else {
g_hash_table_insert(node->details->attrs,
strdup(CRM_ATTR_IS_DC), strdup(XML_BOOLEAN_FALSE));
}
cluster_name = g_hash_table_lookup(data_set->config_hash, "cluster-name");
if (cluster_name) {
g_hash_table_insert(node->details->attrs, strdup(CRM_ATTR_CLUSTER_NAME),
strdup(cluster_name));
}
pe__unpack_dataset_nvpairs(xml_obj, XML_TAG_ATTR_SETS, NULL,
node->details->attrs, NULL, overwrite, data_set);
if (pe_node_attribute_raw(node, CRM_ATTR_SITE_NAME) == NULL) {
const char *site_name = pe_node_attribute_raw(node, "site-name");
if (site_name) {
g_hash_table_insert(node->details->attrs,
strdup(CRM_ATTR_SITE_NAME),
strdup(site_name));
} else if (cluster_name) {
/* Default to cluster-name if unset */
g_hash_table_insert(node->details->attrs,
strdup(CRM_ATTR_SITE_NAME),
strdup(cluster_name));
}
}
}
static GListPtr
extract_operations(const char *node, const char *rsc, xmlNode * rsc_entry, gboolean active_filter)
{
int counter = -1;
int stop_index = -1;
int start_index = -1;
xmlNode *rsc_op = NULL;
GListPtr gIter = NULL;
GListPtr op_list = NULL;
GListPtr sorted_op_list = NULL;
/* extract operations */
op_list = NULL;
sorted_op_list = NULL;
for (rsc_op = __xml_first_child_element(rsc_entry);
rsc_op != NULL; rsc_op = __xml_next_element(rsc_op)) {
if (crm_str_eq((const char *)rsc_op->name, XML_LRM_TAG_RSC_OP, TRUE)) {
crm_xml_add(rsc_op, "resource", rsc);
crm_xml_add(rsc_op, XML_ATTR_UNAME, node);
op_list = g_list_prepend(op_list, rsc_op);
}
}
if (op_list == NULL) {
/* if there are no operations, there is nothing to do */
return NULL;
}
sorted_op_list = g_list_sort(op_list, sort_op_by_callid);
/* create active recurring operations as optional */
if (active_filter == FALSE) {
return sorted_op_list;
}
op_list = NULL;
calculate_active_ops(sorted_op_list, &start_index, &stop_index);
for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) {
xmlNode *rsc_op = (xmlNode *) gIter->data;
counter++;
if (start_index < stop_index) {
crm_trace("Skipping %s: not active", ID(rsc_entry));
break;
} else if (counter < start_index) {
crm_trace("Skipping %s: old", ID(rsc_op));
continue;
}
op_list = g_list_append(op_list, rsc_op);
}
g_list_free(sorted_op_list);
return op_list;
}
GListPtr
find_operations(const char *rsc, const char *node, gboolean active_filter,
pe_working_set_t * data_set)
{
GListPtr output = NULL;
GListPtr intermediate = NULL;
xmlNode *tmp = NULL;
xmlNode *status = find_xml_node(data_set->input, XML_CIB_TAG_STATUS, TRUE);
node_t *this_node = NULL;
xmlNode *node_state = NULL;
for (node_state = __xml_first_child_element(status); node_state != NULL;
node_state = __xml_next_element(node_state)) {
if (crm_str_eq((const char *)node_state->name, XML_CIB_TAG_STATE, TRUE)) {
const char *uname = crm_element_value(node_state, XML_ATTR_UNAME);
if (node != NULL && safe_str_neq(uname, node)) {
continue;
}
this_node = pe_find_node(data_set->nodes, uname);
if(this_node == NULL) {
CRM_LOG_ASSERT(this_node != NULL);
continue;
} else if (pe__is_guest_or_remote_node(this_node)) {
determine_remote_online_status(data_set, this_node);
} else {
determine_online_status(node_state, this_node, data_set);
}
if (this_node->details->online || is_set(data_set->flags, pe_flag_stonith_enabled)) {
/* offline nodes run no resources...
* unless stonith is enabled in which case we need to
* make sure rsc start events happen after the stonith
*/
xmlNode *lrm_rsc = NULL;
tmp = find_xml_node(node_state, XML_CIB_TAG_LRM, FALSE);
tmp = find_xml_node(tmp, XML_LRM_TAG_RESOURCES, FALSE);
for (lrm_rsc = __xml_first_child_element(tmp); lrm_rsc != NULL;
lrm_rsc = __xml_next_element(lrm_rsc)) {
if (crm_str_eq((const char *)lrm_rsc->name, XML_LRM_TAG_RESOURCE, TRUE)) {
const char *rsc_id = crm_element_value(lrm_rsc, XML_ATTR_ID);
if (rsc != NULL && safe_str_neq(rsc_id, rsc)) {
continue;
}
intermediate = extract_operations(uname, rsc_id, lrm_rsc, active_filter);
output = g_list_concat(output, intermediate);
}
}
}
}
}
return output;
}
diff --git a/tools/crm_mon_output.c b/tools/crm_mon_output.c
index 9a9f3ce74b..d012ef8d5d 100644
--- a/tools/crm_mon_output.c
+++ b/tools/crm_mon_output.c
@@ -1,1203 +1,1205 @@
/*
* Copyright 2019 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "crm_mon.h"
static char *
time_t_string(time_t when) {
crm_time_t *crm_when = crm_time_new(NULL);
char *buf = NULL;
crm_time_set_timet(crm_when, &when);
buf = crm_time_as_string(crm_when, crm_time_log_date | crm_time_log_timeofday | crm_time_log_with_timezone);
crm_time_free(crm_when);
return buf;
}
static char *
failed_action_string(xmlNodePtr xml_op) {
const char *op_key = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY);
int rc = crm_parse_int(crm_element_value(xml_op, XML_LRM_ATTR_RC), "0");
int status = crm_parse_int(crm_element_value(xml_op, XML_LRM_ATTR_OPSTATUS), "0");
const char *exit_reason = crm_element_value(xml_op, XML_LRM_ATTR_EXIT_REASON);
time_t last_change = 0;
if (crm_element_value_epoch(xml_op, XML_RSC_OP_LAST_CHANGE,
&last_change) == pcmk_ok) {
char *time = time_t_string(last_change);
char *buf = crm_strdup_printf("%s on %s '%s' (%d): call=%s, status='%s', exitreason='%s', last-rc-change='%s', queued=%sms, exec=%sms",
op_key ? op_key : ID(xml_op),
crm_element_value(xml_op, XML_ATTR_UNAME),
services_ocf_exitcode_str(rc), rc,
crm_element_value(xml_op, XML_LRM_ATTR_CALLID),
services_lrm_status_str(status),
exit_reason ? exit_reason : "none",
time,
crm_element_value(xml_op, XML_RSC_OP_T_QUEUE),
crm_element_value(xml_op, XML_RSC_OP_T_EXEC));
free(time);
return buf;
} else {
return crm_strdup_printf("%s on %s '%s' (%d): call=%s, status=%s, exitreason='%s'",
op_key ? op_key : ID(xml_op),
crm_element_value(xml_op, XML_ATTR_UNAME),
services_ocf_exitcode_str(rc), rc,
crm_element_value(xml_op, XML_LRM_ATTR_CALLID),
services_lrm_status_str(status),
exit_reason ? exit_reason : "none");
}
}
static char *
last_changed_string(const char *last_written, const char *user,
const char *client, const char *origin) {
if (last_written != NULL || user != NULL || client != NULL || origin != NULL) {
return crm_strdup_printf("%s%s%s%s%s%s%s",
last_written ? last_written : "",
user ? " by " : "",
user ? user : "",
client ? " via " : "",
client ? client : "",
origin ? " on " : "",
origin ? origin : "");
} else {
return strdup("");
}
}
static char *
op_history_string(xmlNode *xml_op, const char *task, const char *interval_ms_s,
int rc, unsigned int mon_ops) {
const char *call = crm_element_value(xml_op, XML_LRM_ATTR_CALLID);
char *interval_str = NULL;
char *buf = NULL;
if (interval_ms_s && safe_str_neq(interval_ms_s, "0")) {
char *pair = pcmk_format_nvpair("interval", interval_ms_s, "ms");
interval_str = crm_strdup_printf(" %s", pair);
free(pair);
}
if (is_set(mon_ops, mon_op_print_timing)) {
char *last_change_str = NULL;
char *last_run_str = NULL;
char *exec_str = NULL;
char *queue_str = NULL;
const char *value = NULL;
time_t epoch = 0;
if ((crm_element_value_epoch(xml_op, XML_RSC_OP_LAST_CHANGE, &epoch) == pcmk_ok)
&& (epoch > 0)) {
char *time = pcmk_format_named_time(XML_RSC_OP_LAST_CHANGE, epoch);
last_change_str = crm_strdup_printf(" %s", time);
free(time);
}
if ((crm_element_value_epoch(xml_op, XML_RSC_OP_LAST_RUN, &epoch) == pcmk_ok)
&& (epoch > 0)) {
char *time = pcmk_format_named_time(XML_RSC_OP_LAST_RUN, epoch);
last_run_str = crm_strdup_printf(" %s", time);
free(time);
}
value = crm_element_value(xml_op, XML_RSC_OP_T_EXEC);
if (value) {
char *pair = pcmk_format_nvpair(XML_RSC_OP_T_EXEC, value, "ms");
exec_str = crm_strdup_printf(" %s", pair);
free(pair);
}
value = crm_element_value(xml_op, XML_RSC_OP_T_QUEUE);
if (value) {
char *pair = pcmk_format_nvpair(XML_RSC_OP_T_QUEUE, value, "ms");
queue_str = crm_strdup_printf(" %s", pair);
free(pair);
}
buf = crm_strdup_printf("(%s) %s:%s%s%s%s%s rc=%d (%s)", call, task,
interval_str ? interval_str : "",
last_change_str ? last_change_str : "",
last_run_str ? last_run_str : "",
exec_str ? exec_str : "",
queue_str ? queue_str : "",
rc, services_ocf_exitcode_str(rc));
if (last_change_str) {
free(last_change_str);
}
if (last_run_str) {
free(last_run_str);
}
if (exec_str) {
free(exec_str);
}
if (queue_str) {
free(queue_str);
}
} else {
buf = crm_strdup_printf("(%s) %s:%s", call, task,
interval_str ? interval_str : "");
}
if (interval_str) {
free(interval_str);
}
return buf;
}
static char *
resource_history_string(resource_t *rsc, const char *rsc_id, gboolean all,
int failcount, time_t last_failure) {
char *buf = NULL;
if (rsc == NULL) {
buf = crm_strdup_printf("%s: orphan", rsc_id);
} else if (all || failcount || last_failure > 0) {
char *failcount_s = failcount > 0 ? crm_strdup_printf(" %s=%d", CRM_FAIL_COUNT_PREFIX, failcount) : strdup("");
char *lastfail_s = last_failure > 0 ? crm_strdup_printf(" %s=%s", CRM_LAST_FAILURE_PREFIX,
crm_now_string(&last_failure)) : strdup("");
buf = crm_strdup_printf("%s: migration-threshold=%d%s%s",
rsc_id, rsc->migration_threshold, failcount_s, lastfail_s);
free(failcount_s);
free(lastfail_s);
} else {
buf = crm_strdup_printf("%s:", rsc_id);
}
return buf;
}
static int
ban_html(pcmk__output_t *out, va_list args) {
pe_node_t *pe_node = va_arg(args, pe_node_t *);
pe__location_t *location = va_arg(args, pe__location_t *);
unsigned int mon_ops = va_arg(args, unsigned int);
char *node_name = get_node_display_name(pe_node, mon_ops);
char *buf = crm_strdup_printf("%s\tprevents %s from running %son %s",
location->id, location->rsc_lh->id,
location->role_filter == RSC_ROLE_MASTER ? "as Master " : "",
node_name);
pcmk__output_create_html_node(out, "li", NULL, NULL, buf);
free(node_name);
free(buf);
return 0;
}
static int
ban_text(pcmk__output_t *out, va_list args) {
pe_node_t *pe_node = va_arg(args, pe_node_t *);
pe__location_t *location = va_arg(args, pe__location_t *);
unsigned int mon_ops = va_arg(args, unsigned int);
char *node_name = get_node_display_name(pe_node, mon_ops);
out->list_item(out, NULL, "%s\tprevents %s from running %son %s",
location->id, location->rsc_lh->id,
location->role_filter == RSC_ROLE_MASTER ? "as Master " : "",
node_name);
free(node_name);
return 0;
}
static int
ban_xml(pcmk__output_t *out, va_list args) {
xmlNodePtr node = pcmk__output_create_xml_node(out, "ban");
pe_node_t *pe_node = va_arg(args, pe_node_t *);
pe__location_t *location = va_arg(args, pe__location_t *);
char *weight_s = crm_itoa(pe_node->weight);
xmlSetProp(node, (pcmkXmlStr) "id", (pcmkXmlStr) location->id);
xmlSetProp(node, (pcmkXmlStr) "resource", (pcmkXmlStr) location->rsc_lh->id);
xmlSetProp(node, (pcmkXmlStr) "node", (pcmkXmlStr) pe_node->details->uname);
xmlSetProp(node, (pcmkXmlStr) "weight", (pcmkXmlStr) weight_s);
xmlSetProp(node, (pcmkXmlStr) "master_only",
(pcmkXmlStr) (location->role_filter == RSC_ROLE_MASTER ? "true" : "false"));
free(weight_s);
return 0;
}
static int
cluster_counts_html(pcmk__output_t *out, va_list args) {
xmlNodePtr nodes_node = pcmk__output_create_xml_node(out, "li");
xmlNodePtr resources_node = pcmk__output_create_xml_node(out, "li");
unsigned int nnodes = va_arg(args, unsigned int);
unsigned int nresources = va_arg(args, unsigned int);
unsigned int ndisabled = va_arg(args, unsigned int);
unsigned int nblocked = va_arg(args, unsigned int);
char *nnodes_str = crm_strdup_printf("%d node%s configured", nnodes, s_if_plural(nnodes));
pcmk_create_html_node(nodes_node, "span", NULL, NULL, nnodes_str);
free(nnodes_str);
if (ndisabled && nblocked) {
char *s = crm_strdup_printf("%d resource instance%s configured (%d ",
nresources, s_if_plural(nresources),
ndisabled);
pcmk_create_html_node(resources_node, "span", NULL, NULL, s);
free(s);
pcmk_create_html_node(resources_node, "span", NULL, "bold", "DISABLED");
s = crm_strdup_printf(", %d ", nblocked);
pcmk_create_html_node(resources_node, "span", NULL, NULL, s);
free(s);
pcmk_create_html_node(resources_node, "span", NULL, "bold", "BLOCKED");
- pcmk_create_html_node(resources_node, "span", NULL, NULL, " from starting due to failure)");
+ pcmk_create_html_node(resources_node, "span", NULL, NULL,
+ " from further action due to failure)");
} else if (ndisabled && !nblocked) {
char *s = crm_strdup_printf("%d resource instance%s configured (%d ",
nresources, s_if_plural(nresources),
ndisabled);
pcmk_create_html_node(resources_node, "span", NULL, NULL, s);
free(s);
pcmk_create_html_node(resources_node, "span", NULL, "bold", "DISABLED");
pcmk_create_html_node(resources_node, "span", NULL, NULL, ")");
} else if (!ndisabled && nblocked) {
char *s = crm_strdup_printf("%d resource instance%s configured (%d ",
nresources, s_if_plural(nresources),
nblocked);
pcmk_create_html_node(resources_node, "span", NULL, NULL, s);
free(s);
pcmk_create_html_node(resources_node, "span", NULL, "bold", "BLOCKED");
- pcmk_create_html_node(resources_node, "span", NULL, NULL, " from starting due to failure)");
+ pcmk_create_html_node(resources_node, "span", NULL, NULL,
+ " from further action due to failure)");
} else {
char *s = crm_strdup_printf("%d resource instance%s configured",
nresources, s_if_plural(nresources));
pcmk_create_html_node(resources_node, "span", NULL, NULL, s);
free(s);
}
return 0;
}
static int
cluster_counts_text(pcmk__output_t *out, va_list args) {
unsigned int nnodes = va_arg(args, unsigned int);
unsigned int nresources = va_arg(args, unsigned int);
unsigned int ndisabled = va_arg(args, unsigned int);
unsigned int nblocked = va_arg(args, unsigned int);
out->list_item(out, NULL, "%d node%s configured", nnodes, s_if_plural(nnodes));
if (ndisabled && nblocked) {
out->list_item(out, NULL, "%d resource instance%s configured "
"(%d DISABLED, %d BLOCKED from "
"further action due to failure)",
nresources, s_if_plural(nresources), ndisabled,
nblocked);
} else if (ndisabled && !nblocked) {
out->list_item(out, NULL, "%d resource instance%s configured "
"(%d DISABLED)",
nresources, s_if_plural(nresources), ndisabled);
} else if (!ndisabled && nblocked) {
out->list_item(out, NULL, "%d resource instance%s configured "
"(%d BLOCKED from further action "
"due to failure)",
nresources, s_if_plural(nresources), nblocked);
} else {
out->list_item(out, NULL, "%d resource instance%s configured",
nresources, s_if_plural(nresources));
}
return 0;
}
static int
cluster_counts_xml(pcmk__output_t *out, va_list args) {
xmlNodePtr nodes_node = pcmk__output_create_xml_node(out, "nodes_configured");
xmlNodePtr resources_node = pcmk__output_create_xml_node(out, "resources_configured");
unsigned int nnodes = va_arg(args, unsigned int);
unsigned int nresources = va_arg(args, unsigned int);
unsigned int ndisabled = va_arg(args, unsigned int);
unsigned int nblocked = va_arg(args, unsigned int);
char *s = crm_itoa(nnodes);
xmlSetProp(nodes_node, (pcmkXmlStr) "number", (pcmkXmlStr) s);
free(s);
s = crm_itoa(nresources);
xmlSetProp(resources_node, (pcmkXmlStr) "number", (pcmkXmlStr) s);
free(s);
s = crm_itoa(ndisabled);
xmlSetProp(resources_node, (pcmkXmlStr) "disabled", (pcmkXmlStr) s);
free(s);
s = crm_itoa(nblocked);
xmlSetProp(resources_node, (pcmkXmlStr) "blocked", (pcmkXmlStr) s);
free(s);
return 0;
}
static int
cluster_dc_html(pcmk__output_t *out, va_list args) {
xmlNodePtr node = pcmk__output_create_xml_node(out, "li");
node_t *dc = va_arg(args, node_t *);
const char *quorum = va_arg(args, const char *);
const char *dc_version_s = va_arg(args, const char *);
const char *dc_name = va_arg(args, const char *);
pcmk_create_html_node(node, "span", NULL, "bold", "Current DC: ");
if (dc) {
if (crm_is_true(quorum)) {
char *buf = crm_strdup_printf("%s (version %s) - partition with quorum",
dc_name, dc_version_s ? dc_version_s : "unknown");
pcmk_create_html_node(node, "span", NULL, NULL, buf);
free(buf);
} else {
char *buf = crm_strdup_printf("%s (version %s) - partition",
dc_name, dc_version_s ? dc_version_s : "unknown");
pcmk_create_html_node(node, "span", NULL, NULL, buf);
free(buf);
pcmk_create_html_node(node, "span", NULL, "warning", "WITHOUT");
pcmk_create_html_node(node, "span", NULL, NULL, "quorum");
}
} else {
pcmk_create_html_node(node ,"span", NULL, "warning", "NONE");
}
return 0;
}
static int
cluster_dc_text(pcmk__output_t *out, va_list args) {
node_t *dc = va_arg(args, node_t *);
const char *quorum = va_arg(args, const char *);
const char *dc_version_s = va_arg(args, const char *);
const char *dc_name = va_arg(args, const char *);
if (dc) {
out->list_item(out, "Current DC", "%s (version %s) - partition %s quorum",
dc_name, dc_version_s ? dc_version_s : "unknown",
crm_is_true(quorum) ? "with" : "WITHOUT");
} else {
out->list_item(out, "Current DC", "NONE");
}
return 0;
}
static int
cluster_dc_xml(pcmk__output_t *out, va_list args) {
xmlNodePtr node = pcmk__output_create_xml_node(out, "current_dc");
node_t *dc = va_arg(args, node_t *);
const char *quorum = va_arg(args, const char *);
const char *dc_version_s = va_arg(args, const char *);
if (dc) {
xmlSetProp(node, (pcmkXmlStr) "present", (pcmkXmlStr) "true");
xmlSetProp(node, (pcmkXmlStr) "version", (pcmkXmlStr) (dc_version_s ? dc_version_s : ""));
xmlSetProp(node, (pcmkXmlStr) "name", (pcmkXmlStr) dc->details->uname);
xmlSetProp(node, (pcmkXmlStr) "id", (pcmkXmlStr) dc->details->id);
xmlSetProp(node, (pcmkXmlStr) "with_quorum", (pcmkXmlStr) (crm_is_true(quorum) ? "true" : "false"));
} else {
xmlSetProp(node, (pcmkXmlStr) "present", (pcmkXmlStr) "false");
}
return 0;
}
static int
cluster_options_html(pcmk__output_t *out, va_list args) {
pe_working_set_t *data_set = va_arg(args, pe_working_set_t *);
/* Kind of a hack - close the list started by print_cluster_summary so we
* can put all the options in their own list, but just for HTML output.
*/
out->end_list(out);
/* And then this list will be closed by print_cluster_summary since it
* wants to close the list it created unconditionally.
*/
out->begin_list(out, NULL, NULL, "Config Options");
out->list_item(out, NULL, "STONITH of failed nodes %s",
is_set(data_set->flags, pe_flag_stonith_enabled) ? "enabled" : "disabled");
out->list_item(out, NULL, "Cluster is %s",
is_set(data_set->flags, pe_flag_symmetric_cluster) ? "symmetric" : "asymmetric");
switch (data_set->no_quorum_policy) {
case no_quorum_freeze:
out->list_item(out, NULL, "No Quorum policy: Freeze resources");
break;
case no_quorum_stop:
out->list_item(out, NULL, "No Quorum policy: Stop ALL resources");
break;
case no_quorum_ignore:
out->list_item(out, NULL, "No Quorum policy: Ignore");
break;
case no_quorum_suicide:
out->list_item(out, NULL, "No Quorum policy: Suicide");
break;
}
if (is_set(data_set->flags, pe_flag_maintenance_mode)) {
xmlNodePtr node = pcmk__output_create_xml_node(out, "li");
pcmk_create_html_node(node, "span", NULL, "bold", "DISABLED");
pcmk_create_html_node(node, "span", NULL, NULL,
" (the cluster will not attempt to start, stop, or recover services)");
} else {
out->list_item(out, NULL, "Resource management enabled");
}
return 0;
}
static int
cluster_options_text(pcmk__output_t *out, va_list args) {
pe_working_set_t *data_set = va_arg(args, pe_working_set_t *);
if (is_set(data_set->flags, pe_flag_maintenance_mode)) {
fprintf(out->dest, "\n *** Resource management is DISABLED ***");
fprintf(out->dest, "\n The cluster will not attempt to start, stop or recover services");
fprintf(out->dest, "\n");
}
return 0;
}
static int
cluster_options_xml(pcmk__output_t *out, va_list args) {
xmlNodePtr node = pcmk__output_create_xml_node(out, "cluster_options");
pe_working_set_t *data_set = va_arg(args, pe_working_set_t *);
xmlSetProp(node, (pcmkXmlStr) "stonith-enabled",
(pcmkXmlStr) (is_set(data_set->flags, pe_flag_stonith_enabled) ? "true" : "false"));
xmlSetProp(node, (pcmkXmlStr) "symmetric-cluster",
(pcmkXmlStr) (is_set(data_set->flags, pe_flag_symmetric_cluster) ? "true" : "false"));
switch (data_set->no_quorum_policy) {
case no_quorum_freeze:
xmlSetProp(node, (pcmkXmlStr) "no-quorum-policy", (pcmkXmlStr) "freeze");
break;
case no_quorum_stop:
xmlSetProp(node, (pcmkXmlStr) "no-quorum-policy", (pcmkXmlStr) "stop");
break;
case no_quorum_ignore:
xmlSetProp(node, (pcmkXmlStr) "no-quorum-policy", (pcmkXmlStr) "ignore");
break;
case no_quorum_suicide:
xmlSetProp(node, (pcmkXmlStr) "no-quorum-policy", (pcmkXmlStr) "suicide");
break;
}
xmlSetProp(node, (pcmkXmlStr) "maintenance-mode",
(pcmkXmlStr) (is_set(data_set->flags, pe_flag_maintenance_mode) ? "true" : "false"));
return 0;
}
static int
cluster_stack_html(pcmk__output_t *out, va_list args) {
xmlNodePtr node = pcmk__output_create_xml_node(out, "li");
const char *stack_s = va_arg(args, const char *);
pcmk_create_html_node(node, "span", NULL, "bold", "Stack: ");
pcmk_create_html_node(node, "span", NULL, NULL, stack_s);
return 0;
}
static int
cluster_stack_text(pcmk__output_t *out, va_list args) {
const char *stack_s = va_arg(args, const char *);
out->list_item(out, "Stack", "%s", stack_s);
return 0;
}
static int
cluster_stack_xml(pcmk__output_t *out, va_list args) {
xmlNodePtr node = pcmk__output_create_xml_node(out, "stack");
const char *stack_s = va_arg(args, const char *);
xmlSetProp(node, (pcmkXmlStr) "type", (pcmkXmlStr) stack_s);
return 0;
}
static int
cluster_times_html(pcmk__output_t *out, va_list args) {
xmlNodePtr updated_node = pcmk__output_create_xml_node(out, "li");
xmlNodePtr changed_node = pcmk__output_create_xml_node(out, "li");
const char *last_written = va_arg(args, const char *);
const char *user = va_arg(args, const char *);
const char *client = va_arg(args, const char *);
const char *origin = va_arg(args, const char *);
char *buf = last_changed_string(last_written, user, client, origin);
pcmk_create_html_node(updated_node, "span", NULL, "bold", "Last updated: ");
pcmk_create_html_node(updated_node, "span", NULL, NULL, crm_now_string(NULL));
pcmk_create_html_node(changed_node, "span", NULL, "bold", "Last change: ");
pcmk_create_html_node(changed_node, "span", NULL, NULL, buf);
free(buf);
return 0;
}
static int
cluster_times_xml(pcmk__output_t *out, va_list args) {
xmlNodePtr updated_node = pcmk__output_create_xml_node(out, "last_update");
xmlNodePtr changed_node = pcmk__output_create_xml_node(out, "last_change");
const char *last_written = va_arg(args, const char *);
const char *user = va_arg(args, const char *);
const char *client = va_arg(args, const char *);
const char *origin = va_arg(args, const char *);
xmlSetProp(updated_node, (pcmkXmlStr) "time", (pcmkXmlStr) crm_now_string(NULL));
xmlSetProp(changed_node, (pcmkXmlStr) "time", (pcmkXmlStr) (last_written ? last_written : ""));
xmlSetProp(changed_node, (pcmkXmlStr) "user", (pcmkXmlStr) (user ? user : ""));
xmlSetProp(changed_node, (pcmkXmlStr) "client", (pcmkXmlStr) (client ? client : ""));
xmlSetProp(changed_node, (pcmkXmlStr) "origin", (pcmkXmlStr) (origin ? origin : ""));
return 0;
}
static int
cluster_times_text(pcmk__output_t *out, va_list args) {
const char *last_written = va_arg(args, const char *);
const char *user = va_arg(args, const char *);
const char *client = va_arg(args, const char *);
const char *origin = va_arg(args, const char *);
char *buf = last_changed_string(last_written, user, client, origin);
out->list_item(out, "Last updated", "%s", crm_now_string(NULL));
out->list_item(out, "Last change", " %s", buf);
free(buf);
return 0;
}
static int
failed_action_console(pcmk__output_t *out, va_list args) {
xmlNodePtr xml_op = va_arg(args, xmlNodePtr);
char *s = failed_action_string(xml_op);
curses_indented_printf(out, "%s\n", s);
free(s);
return 0;
}
static int
failed_action_html(pcmk__output_t *out, va_list args) {
xmlNodePtr xml_op = va_arg(args, xmlNodePtr);
char *s = failed_action_string(xml_op);
pcmk__output_create_html_node(out, "li", NULL, NULL, s);
free(s);
return 0;
}
static int
failed_action_text(pcmk__output_t *out, va_list args) {
xmlNodePtr xml_op = va_arg(args, xmlNodePtr);
char *s = failed_action_string(xml_op);
pcmk__indented_printf(out, "%s\n", s);
free(s);
return 0;
}
static int
failed_action_xml(pcmk__output_t *out, va_list args) {
xmlNodePtr xml_op = va_arg(args, xmlNodePtr);
const char *op_key = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY);
const char *last = crm_element_value(xml_op, XML_RSC_OP_LAST_CHANGE);
int rc = crm_parse_int(crm_element_value(xml_op, XML_LRM_ATTR_RC), "0");
int status = crm_parse_int(crm_element_value(xml_op, XML_LRM_ATTR_OPSTATUS), "0");
const char *exit_reason = crm_element_value(xml_op, XML_LRM_ATTR_EXIT_REASON);
char *rc_s = crm_itoa(rc);
char *reason_s = crm_xml_escape(exit_reason ? exit_reason : "none");
xmlNodePtr node = pcmk__output_create_xml_node(out, "failure");
xmlSetProp(node, (pcmkXmlStr) (op_key ? "op_key" : "id"),
(pcmkXmlStr) (op_key ? op_key : "id"));
xmlSetProp(node, (pcmkXmlStr) "node",
(pcmkXmlStr) crm_element_value(xml_op, XML_ATTR_UNAME));
xmlSetProp(node, (pcmkXmlStr) "exitstatus",
(pcmkXmlStr) services_ocf_exitcode_str(rc));
xmlSetProp(node, (pcmkXmlStr) "exitreason", (pcmkXmlStr) reason_s);
xmlSetProp(node, (pcmkXmlStr) "exitcode", (pcmkXmlStr) rc_s);
xmlSetProp(node, (pcmkXmlStr) "call",
(pcmkXmlStr) crm_element_value(xml_op, XML_LRM_ATTR_CALLID));
xmlSetProp(node, (pcmkXmlStr) "status",
(pcmkXmlStr) services_lrm_status_str(status));
if (last) {
char *s = crm_itoa(crm_parse_ms(crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL_MS)));
char *rc_change = time_t_string(crm_parse_int(last, "0"));
xmlSetProp(node, (pcmkXmlStr) "last-rc-change", (pcmkXmlStr) rc_change);
xmlSetProp(node, (pcmkXmlStr) "queued",
(pcmkXmlStr) crm_element_value(xml_op, XML_RSC_OP_T_QUEUE));
xmlSetProp(node, (pcmkXmlStr) "exec",
(pcmkXmlStr) crm_element_value(xml_op, XML_RSC_OP_T_EXEC));
xmlSetProp(node, (pcmkXmlStr) "interval", (pcmkXmlStr) s);
xmlSetProp(node, (pcmkXmlStr) "task",
(pcmkXmlStr) crm_element_value(xml_op, XML_LRM_ATTR_TASK));
free(s);
free(rc_change);
}
free(reason_s);
free(rc_s);
return 0;
}
static int
node_html(pcmk__output_t *out, va_list args) {
node_t *node = va_arg(args, node_t *);
unsigned int mon_ops = va_arg(args, unsigned int);
gboolean full = va_arg(args, gboolean);
char *node_name = get_node_display_name(node, mon_ops);
char *buf = crm_strdup_printf("Node: %s", node_name);
int print_opts = get_resource_display_options(mon_ops, mon_output_html);
if (full) {
xmlNodePtr item_node = pcmk__output_create_xml_node(out, "li");
pcmk_create_html_node(item_node, "span", NULL, NULL, buf);
if (node->details->standby_onfail && node->details->online) {
pcmk_create_html_node(item_node, "span", NULL, "standby", " standby (on-fail)");
} else if (node->details->standby && node->details->online) {
char *s = crm_strdup_printf(" standby%s", node->details->running_rsc ? " (with active resources)" : "");
pcmk_create_html_node(item_node, "span", NULL, " standby", s);
free(s);
} else if (node->details->standby) {
pcmk_create_html_node(item_node, "span", NULL, "offline", " OFFLINE (standby)");
} else if (node->details->maintenance && node->details->online) {
pcmk_create_html_node(item_node, "span", NULL, "maint", " maintenance");
} else if (node->details->maintenance) {
pcmk_create_html_node(item_node, "span", NULL, "offline", " OFFLINE (maintenance)");
} else if (node->details->online) {
pcmk_create_html_node(item_node, "span", NULL, "online", " online");
} else {
pcmk_create_html_node(item_node, "span", NULL, "offline", " OFFLINE");
}
if (is_set(mon_ops, mon_op_print_brief) && is_set(mon_ops, mon_op_group_by_node)) {
out->begin_list(out, NULL, NULL, NULL);
pe__rscs_brief_output(out, node->details->running_rsc, print_opts | pe_print_rsconly,
FALSE);
out->end_list(out);
} else if (is_set(mon_ops, mon_op_group_by_node)) {
GListPtr lpc2 = NULL;
out->begin_list(out, NULL, NULL, NULL);
for (lpc2 = node->details->running_rsc; lpc2 != NULL; lpc2 = lpc2->next) {
resource_t *rsc = (resource_t *) lpc2->data;
out->message(out, crm_map_element_name(rsc->xml), print_opts | pe_print_rsconly, rsc);
}
out->end_list(out);
}
} else {
out->begin_list(out, NULL, NULL, "%s", buf);
}
free(buf);
free(node_name);
return 0;
}
static int
node_text(pcmk__output_t *out, va_list args) {
node_t *node = va_arg(args, node_t *);
unsigned int mon_ops = va_arg(args, unsigned int);
gboolean full = va_arg(args, gboolean);
if (full) {
const char *node_mode = va_arg(args, const char *);
char *node_name = get_node_display_name(node, mon_ops);
int print_opts = get_resource_display_options(mon_ops, mon_output_xml);
char *buf = NULL;
/* Print the node name and status */
if (pe__is_guest_node(node)) {
buf = crm_strdup_printf("GuestNode %s: %s", node_name, node_mode);
} else if (pe__is_remote_node(node)) {
buf = crm_strdup_printf("RemoteNode %s: %s", node_name, node_mode);
} else {
buf = crm_strdup_printf("Node %s: %s", node_name, node_mode);
}
/* If we're grouping by node, print its resources */
if (is_set(mon_ops, mon_op_group_by_node)) {
out->begin_list(out, NULL, NULL, "%s", buf);
out->begin_list(out, NULL, NULL, "Resources");
if (is_set(mon_ops, mon_op_print_brief)) {
pe__rscs_brief_output(out, node->details->running_rsc,
print_opts | pe_print_rsconly, FALSE);
} else {
GListPtr gIter2 = NULL;
for (gIter2 = node->details->running_rsc; gIter2 != NULL; gIter2 = gIter2->next) {
resource_t *rsc = (resource_t *) gIter2->data;
out->message(out, crm_map_element_name(rsc->xml), print_opts | pe_print_rsconly, rsc);
}
}
out->end_list(out);
out->end_list(out);
} else {
out->list_item(out, NULL, "%s", buf);
}
free(buf);
free(node_name);
} else {
out->begin_list(out, NULL, NULL, "Node: %s", get_node_display_name(node, mon_ops));
}
return 0;
}
static int
node_xml(pcmk__output_t *out, va_list args) {
node_t *node = va_arg(args, node_t *);
unsigned int mon_ops G_GNUC_UNUSED = va_arg(args, unsigned int);
gboolean full = va_arg(args, gboolean);
if (full) {
const char *node_type = "unknown";
int print_opts = get_resource_display_options(mon_ops, mon_output_xml);
char *length_s = crm_itoa(g_list_length(node->details->running_rsc));
switch (node->details->type) {
case node_member:
node_type = "member";
break;
case node_remote:
node_type = "remote";
break;
case node_ping:
node_type = "ping";
break;
}
pe__name_and_nvpairs_xml(out, true, "node", 13,
"name", node->details->uname,
"id", node->details->id,
"online", node->details->online ? "true" : "false",
"standby", node->details->standby ? "true" : "false",
"standby_onfail", node->details->standby_onfail ? "true" : "false",
"maintenance", node->details->maintenance ? "true" : "false",
"pending", node->details->pending ? "true" : "false",
"unclean", node->details->unclean ? "true" : "false",
"shutdown", node->details->shutdown ? "true" : "false",
"expected_up", node->details->expected_up ? "true" : "false",
"is_dc", node->details->is_dc ? "true" : "false",
"resources_running", length_s,
"type", node_type);
if (pe__is_guest_node(node)) {
xmlNodePtr xml_node = pcmk__output_xml_peek_parent(out);
xmlSetProp(xml_node, (pcmkXmlStr) "id_as_resource",
(pcmkXmlStr) node->details->remote_rsc->container->id);
}
if (is_set(mon_ops, mon_op_group_by_node)) {
GListPtr lpc = NULL;
for (lpc = node->details->running_rsc; lpc != NULL; lpc = lpc->next) {
resource_t *rsc = (resource_t *) lpc->data;
out->message(out, crm_map_element_name(rsc->xml), print_opts | pe_print_rsconly, rsc);
}
}
free(length_s);
out->end_list(out);
} else {
xmlNodePtr parent = pcmk__output_xml_create_parent(out, "node");
xmlSetProp(parent, (pcmkXmlStr) "name", (pcmkXmlStr) node->details->uname);
}
return 0;
}
static int
node_attribute_text(pcmk__output_t *out, va_list args) {
const char *name = va_arg(args, const char *);
const char *value = va_arg(args, const char *);
gboolean add_extra = va_arg(args, gboolean);
int expected_score = va_arg(args, int);
if (add_extra) {
int v = crm_parse_int(value, "0");
if (v <= 0) {
out->list_item(out, NULL, "%-32s\t: %-10s\t: Connectivity is lost", name, value);
} else if (v < expected_score) {
out->list_item(out, NULL, "%-32s\t: %-10s\t: Connectivity is degraded (Expected=%d)", name, value, expected_score);
} else {
out->list_item(out, NULL, "%-32s\t: %-10s", name, value);
}
} else {
out->list_item(out, NULL, "%-32s\t: %-10s", name, value);
}
return 0;
}
static int
node_attribute_html(pcmk__output_t *out, va_list args) {
const char *name = va_arg(args, const char *);
const char *value = va_arg(args, const char *);
gboolean add_extra = va_arg(args, gboolean);
int expected_score = va_arg(args, int);
if (add_extra) {
int v = crm_parse_int(value, "0");
char *s = crm_strdup_printf("%s: %s", name, value);
xmlNodePtr item_node = pcmk__output_create_xml_node(out, "li");
pcmk_create_html_node(item_node, "span", NULL, NULL, s);
free(s);
if (v <= 0) {
pcmk_create_html_node(item_node, "span", NULL, "bold", "(connectivity is lost)");
} else if (v < expected_score) {
char *buf = crm_strdup_printf("(connectivity is degraded -- expected %d", expected_score);
pcmk_create_html_node(item_node, "span", NULL, "bold", buf);
free(buf);
}
} else {
out->list_item(out, NULL, "%s: %s", name, value);
}
return 0;
}
static int
node_attribute_xml(pcmk__output_t *out, va_list args) {
const char *name = va_arg(args, const char *);
const char *value = va_arg(args, const char *);
gboolean add_extra = va_arg(args, gboolean);
int expected_score = va_arg(args, int);
xmlNodePtr node = pcmk__output_create_xml_node(out, "attribute");
xmlSetProp(node, (pcmkXmlStr) "name", (pcmkXmlStr) name);
xmlSetProp(node, (pcmkXmlStr) "value", (pcmkXmlStr) value);
if (add_extra) {
char *buf = crm_itoa(expected_score);
xmlSetProp(node, (pcmkXmlStr) "expected", (pcmkXmlStr) buf);
free(buf);
}
return 0;
}
static int
op_history_text(pcmk__output_t *out, va_list args) {
xmlNode *xml_op = va_arg(args, xmlNode *);
const char *task = va_arg(args, const char *);
const char *interval_ms_s = va_arg(args, const char *);
int rc = va_arg(args, int);
unsigned int mon_ops = va_arg(args, unsigned int);
char *buf = op_history_string(xml_op, task, interval_ms_s, rc, mon_ops);
out->list_item(out, NULL, "%s", buf);
free(buf);
return 0;
}
static int
op_history_xml(pcmk__output_t *out, va_list args) {
xmlNode *xml_op = va_arg(args, xmlNode *);
const char *task = va_arg(args, const char *);
const char *interval_ms_s = va_arg(args, const char *);
int rc = va_arg(args, int);
unsigned int mon_ops = va_arg(args, unsigned int);
char *rc_s = NULL;
xmlNodePtr node = pcmk__output_create_xml_node(out, "operation_history");
xmlSetProp(node, (pcmkXmlStr) "call",
(pcmkXmlStr) crm_element_value(xml_op, XML_LRM_ATTR_CALLID));
xmlSetProp(node, (pcmkXmlStr) "task", (pcmkXmlStr) task);
if (interval_ms_s && safe_str_neq(interval_ms_s, "0")) {
char *s = crm_strdup_printf("%sms", interval_ms_s);
xmlSetProp(node, (pcmkXmlStr) "interval", (pcmkXmlStr) s);
free(s);
}
if (is_set(mon_ops, mon_op_print_timing)) {
const char *value = NULL;
value = crm_element_value(xml_op, XML_RSC_OP_LAST_CHANGE);
if (value) {
time_t int_value = (time_t) crm_parse_int(value, NULL);
if (int_value > 0) {
xmlSetProp(node, (pcmkXmlStr) XML_RSC_OP_LAST_CHANGE,
(pcmkXmlStr) crm_now_string(&int_value));
}
}
value = crm_element_value(xml_op, XML_RSC_OP_LAST_RUN);
if (value) {
time_t int_value = (time_t) crm_parse_int(value, NULL);
if (int_value > 0) {
xmlSetProp(node, (pcmkXmlStr) XML_RSC_OP_LAST_RUN,
(pcmkXmlStr) crm_now_string(&int_value));
}
}
value = crm_element_value(xml_op, XML_RSC_OP_T_EXEC);
if (value) {
char *s = crm_strdup_printf("%sms", value);
xmlSetProp(node, (pcmkXmlStr) XML_RSC_OP_T_EXEC, (pcmkXmlStr) s);
free(s);
}
value = crm_element_value(xml_op, XML_RSC_OP_T_QUEUE);
if (value) {
char *s = crm_strdup_printf("%sms", value);
xmlSetProp(node, (pcmkXmlStr) XML_RSC_OP_T_QUEUE, (pcmkXmlStr) s);
free(s);
}
}
rc_s = crm_itoa(rc);
xmlSetProp(node, (pcmkXmlStr) "rc", (pcmkXmlStr) rc_s);
xmlSetProp(node, (pcmkXmlStr) "rc_text", (pcmkXmlStr) services_ocf_exitcode_str(rc));
free(rc_s);
return 0;
}
static int
resource_history_text(pcmk__output_t *out, va_list args) {
resource_t *rsc = va_arg(args, resource_t *);
const char *rsc_id = va_arg(args, const char *);
gboolean all = va_arg(args, gboolean);
int failcount = va_arg(args, int);
time_t last_failure = va_arg(args, int);
char *buf = resource_history_string(rsc, rsc_id, all, failcount, last_failure);
out->begin_list(out, NULL, NULL, "%s", buf);
free(buf);
return 0;
}
static int
resource_history_xml(pcmk__output_t *out, va_list args) {
resource_t *rsc = va_arg(args, resource_t *);
const char *rsc_id = va_arg(args, const char *);
gboolean all = va_arg(args, gboolean);
int failcount = va_arg(args, int);
time_t last_failure = va_arg(args, int);
xmlNodePtr node = pcmk__output_xml_create_parent(out, "resource_history");
xmlSetProp(node, (pcmkXmlStr) "id", (pcmkXmlStr) rsc_id);
if (rsc == NULL) {
xmlSetProp(node, (pcmkXmlStr) "orphan", (pcmkXmlStr) "true");
} else if (all || failcount || last_failure > 0) {
char *migration_s = crm_itoa(rsc->migration_threshold);
xmlSetProp(node, (pcmkXmlStr) "orphan", (pcmkXmlStr) "false");
xmlSetProp(node, (pcmkXmlStr) "migration-threshold",
(pcmkXmlStr) migration_s);
free(migration_s);
if (failcount > 0) {
char *s = crm_itoa(failcount);
xmlSetProp(node, (pcmkXmlStr) CRM_FAIL_COUNT_PREFIX, (pcmkXmlStr) s);
free(s);
}
if (last_failure > 0) {
xmlSetProp(node, (pcmkXmlStr) CRM_LAST_FAILURE_PREFIX,
(pcmkXmlStr) crm_now_string(&last_failure));
}
}
return 0;
}
static int
stonith_event_console(pcmk__output_t *out, va_list args) {
stonith_history_t *event = va_arg(args, stonith_history_t *);
int full_history = va_arg(args, int);
gboolean later_succeeded = va_arg(args, gboolean);
char *buf = NULL;
buf = time_t_string(event->completed);
switch (event->state) {
case st_failed:
curses_indented_printf(out, "%s of %s failed: delegate=%s, client=%s, origin=%s, %s='%s %s'\n",
stonith_action_str(event->action), event->target,
event->delegate ? event->delegate : "",
event->client, event->origin,
full_history ? "completed" : "last-failed", buf,
later_succeeded ? "(a later attempt succeeded)" : "");
break;
case st_done:
curses_indented_printf(out, "%s of %s successful: delegate=%s, client=%s, origin=%s, %s='%s'\n",
stonith_action_str(event->action), event->target,
event->delegate ? event->delegate : "",
event->client, event->origin,
full_history ? "completed" : "last-successful", buf);
break;
default:
curses_indented_printf(out, "%s of %s pending: client=%s, origin=%s\n",
stonith_action_str(event->action), event->target,
event->client, event->origin);
break;
}
free(buf);
return 0;
}
static int
ticket_console(pcmk__output_t *out, va_list args) {
ticket_t *ticket = va_arg(args, ticket_t *);
if (ticket->last_granted > -1) {
char *time = pcmk_format_named_time("last-granted", ticket->last_granted);
out->list_item(out, ticket->id, "\t%s%s %s",
ticket->granted ? "granted" : "revoked",
ticket->standby ? " [standby]" : "",
time);
free(time);
} else {
out->list_item(out, ticket->id, "\t%s%s",
ticket->granted ? "granted" : "revoked",
ticket->standby ? " [standby]" : "");
}
return 0;
}
static pcmk__message_entry_t fmt_functions[] = {
{ "ban", "console", ban_text },
{ "ban", "html", ban_html },
{ "ban", "text", ban_text },
{ "ban", "xml", ban_xml },
{ "bundle", "console", pe__bundle_text },
{ "clone", "console", pe__clone_text },
{ "cluster-counts", "console", cluster_counts_text },
{ "cluster-counts", "html", cluster_counts_html },
{ "cluster-counts", "text", cluster_counts_text },
{ "cluster-counts", "xml", cluster_counts_xml },
{ "cluster-dc", "console", cluster_dc_text },
{ "cluster-dc", "html", cluster_dc_html },
{ "cluster-dc", "text", cluster_dc_text },
{ "cluster-dc", "xml", cluster_dc_xml },
{ "cluster-options", "console", cluster_options_text },
{ "cluster-options", "html", cluster_options_html },
{ "cluster-options", "text", cluster_options_text },
{ "cluster-options", "xml", cluster_options_xml },
{ "cluster-stack", "console", cluster_stack_text },
{ "cluster-stack", "html", cluster_stack_html },
{ "cluster-stack", "text", cluster_stack_text },
{ "cluster-stack", "xml", cluster_stack_xml },
{ "cluster-times", "console", cluster_times_text },
{ "cluster-times", "html", cluster_times_html },
{ "cluster-times", "text", cluster_times_text },
{ "cluster-times", "xml", cluster_times_xml },
{ "failed-action", "console", failed_action_console },
{ "failed-action", "html", failed_action_html },
{ "failed-action", "text", failed_action_text },
{ "failed-action", "xml", failed_action_xml },
{ "group", "console", pe__group_text },
{ "node", "console", node_text },
{ "node", "html", node_html },
{ "node", "text", node_text },
{ "node", "xml", node_xml },
{ "node-attribute", "console", node_attribute_text },
{ "node-attribute", "html", node_attribute_html },
{ "node-attribute", "text", node_attribute_text },
{ "node-attribute", "xml", node_attribute_xml },
{ "op-history", "console", op_history_text },
{ "op-history", "html", op_history_text },
{ "op-history", "text", op_history_text },
{ "op-history", "xml", op_history_xml },
{ "primitive", "console", pe__resource_text },
{ "resource-history", "console", resource_history_text },
{ "resource-history", "html", resource_history_text },
{ "resource-history", "text", resource_history_text },
{ "resource-history", "xml", resource_history_xml },
{ "stonith-event", "console", stonith_event_console },
{ "ticket", "console", ticket_console },
{ NULL, NULL, NULL }
};
void
crm_mon_register_messages(pcmk__output_t *out) {
pcmk__register_messages(out, fmt_functions);
}