diff --git a/COPYING b/COPYING
index 3f7fb37b2e..7936df300a 100644
--- a/COPYING
+++ b/COPYING
@@ -1,14 +1,14 @@
 Except where noted otherwise in the file itself, the source code for all
 Pacemaker programs is licensed under version 2 or later of the GNU General
 Public License (GPLv2+), its headers and libraries under version 2.1 or
 later of the less restrictive GNU Lesser General Public License (LGPLv2.1+),
 its documentation under version 4.0 or later of the Creative Commons
 Attribution-ShareAlike International Public License (CC-BY-SA v4.0+),
 and its init scripts under the Revised BSD license.
 
 The text of these licenses is provided in the "licenses" subdirectory.
 
 If you find any deviations from this policy, or wish to inquire about alternate
-licensing arrangements, please e-mail andrew@beekhof.net.
-Licensing issues are further discussed on the ClusterLabs wiki
-(at http://clusterlabs.org/wiki/License).
+licensing arrangements, please e-mail the developers@ClusterLabs.org mailing
+list. Licensing issues are further discussed on the ClusterLabs wiki
+(at https://wiki.clusterlabs.org/wiki/License).
diff --git a/cts/CTStests.py b/cts/CTStests.py
index f1d6cd38a9..6a4aa51fc5 100644
--- a/cts/CTStests.py
+++ b/cts/CTStests.py
@@ -1,3115 +1,3111 @@
 """ Test-specific classes for Pacemaker's Cluster Test Suite (CTS)
 """
 
 # Pacemaker targets compatibility with Python 2.7 and 3.2+
 from __future__ import print_function, unicode_literals, absolute_import, division
 
 __copyright__ = """Copyright 2000, 2001 Alan Robertson <alanr@unix.sh>
 Add RecourceRecover testcase Zhao Kai <zhaokai@cn.ibm.com>
 """
 
 __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
 
 #
 #        SPECIAL NOTE:
 #
 #        Tests may NOT implement any cluster-manager-specific code in them.
 #        EXTEND the ClusterManager object to provide the base capabilities
 #        the test needs if you need to do something that the current CM classes
 #        do not.  Otherwise you screw up the whole point of the object structure
 #        in CTS.
 #
 #                Thank you.
 #
 
 import os
 import re
 import time
 import subprocess
 import tempfile
 
 from stat import *
 from cts import CTS
 from cts.CTSaudits import *
 from cts.CTSvars   import *
 from cts.patterns  import PatternSelector
 from cts.logging   import LogFactory
 from cts.remote    import RemoteFactory, input_wrapper
 from cts.watcher   import LogWatcher
 from cts.environment import EnvFactory
 
 # Test classes register themselves by appending to this list (see the
 # AllTestClasses.append(...) calls that follow the class definitions below)
 AllTestClasses = [ ]
 
 
 class CTSTest(object):
     '''
     A Cluster test.
     We implement the basic set of properties and behaviors for a generic
     cluster test.
 
     Cluster tests track their own statistics.
     We keep each of the kinds of counts we track as separate {name,value}
     pairs in self.Stats.
 
     This base class does not set self.name; the subclasses in this file
     assign it in their own __init__ and override __call__.
     '''
 
     def __init__(self, cm):
         '''Initialize counters, helper objects and test flags for cluster manager cm'''
         self.Stats = {
             "calls": 0,
             "success": 0,
             "failure": 0,
             "skipped": 0,
             "auditfail": 0,
         }
 
         self.CM = cm
         self.Env = EnvFactory().getInstance()
         self.rsh = RemoteFactory().getInstance()
         self.logger = LogFactory()
         self.templates = PatternSelector(cm["Name"])
         self.Audits = []
         self.timeout = 120
         self.passed = 1
         self.is_loop = 0
         self.is_unsafe = 0
         self.is_docker_unsafe = 0
         self.is_experimental = 0
         self.is_container = 0
         self.is_valgrind = 0
         self.benchmark = 0  # which tests to benchmark
         self.timer = {}  # timers
 
     def log(self, args):
         '''Forward args to the logger's log()'''
         self.logger.log(args)
 
     def debug(self, args):
         '''Forward args to the logger's debug()'''
         self.logger.debug(args)
 
     def has_key(self, key):
         '''Return True if a statistic named key has been recorded'''
         return key in self.Stats
 
     def __setitem__(self, key, value):
         self.Stats[key] = value
 
     def __getitem__(self, key):
         '''Return the statistic named key, or None if it was never recorded'''
         # Guards against "foo in X" falling back to integer indexing
         if str(key) == "0":
             raise ValueError("Bad call to 'foo in X', should reference 'foo in X.Stats' instead")
 
         return self.Stats.get(key)
 
     def log_mark(self, msg):
         '''Write a timestamped MARK line for this test to the debug log'''
         self.debug("MARK: test %s %s %d" % (self.name, msg, time.time()))
 
     def get_timer(self, key="test"):
         '''Return the start time stored for timer key, or 0 if it is not set'''
         return self.timer.get(key, 0)
 
     def set_timer(self, key="test"):
         '''Start (or restart) timer key and return its start time'''
         self.timer[key] = time.time()
         return self.timer[key]
 
     def log_timer(self, key="test"):
         '''Log and return the seconds elapsed on timer key, then clear it
 
         Returns 0 without logging if the timer was never started.
         '''
         if key not in self.timer:
             return 0
 
         elapsed = time.time() - self.timer[key]
         if key == "test":
             label = self.name
         else:
             label = "%s:%s" % (self.name, key)
         self.debug("%s runtime: %.2f" % (label, elapsed))
         del self.timer[key]
         return elapsed
 
     def incr(self, name):
         '''Increment (or initialize) the value associated with the given name'''
         self.Stats[name] = self.Stats.get(name, 0) + 1
 
         # Reset the test passed boolean
         if name == "calls":
             self.passed = 1
 
     def failure(self, reason="none"):
         '''Increment the failure count, log the reason and return None'''
         self.passed = 0
         self.incr("failure")
         self.logger.log(("Test %s" % self.name).ljust(35) + " FAILED: %s" % reason)
         return None
 
     def success(self):
         '''Increment the success count'''
         self.incr("success")
         return 1
 
     def skipped(self):
         '''Increment the skipped count'''
         self.incr("skipped")
         return 1
 
     def __call__(self, node):
         '''Perform the given test (abstract here: always raises ValueError)'''
         raise ValueError("Abstract Class member (__call__)")
 
     def audit(self):
         '''Run every audit in self.Audits; return 1 if all pass, else 0'''
         passed = 1
         for audit in self.Audits:
             if not audit():
                 self.logger.log("Internal %s Audit %s FAILED." % (self.name, audit.name()))
                 self.incr("auditfail")
                 passed = 0
         return passed
 
     def setup(self, node):
         '''Setup the given test'''
         return self.success()
 
     def teardown(self, node):
         '''Tear down the given test'''
         return self.success()
 
     def create_watch(self, patterns, timeout, name=None):
         '''Return a LogWatcher for patterns, named after this test by default'''
         if not name:
             name = self.name
         return LogWatcher(self.Env["LogFileName"], patterns, name, timeout, kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
 
     def local_badnews(self, prefix, watch, local_ignore=None):
         '''Log lines found by watch that match no ignore pattern
 
         prefix is prepended to each logged line (default "LocalBadNews:").
         local_ignore is an optional list of extra regexes to ignore.
         Stops after 100 reported lines. Returns the number reported.
         '''
         if not prefix:
             prefix = "LocalBadNews:"
 
         # Always ignore our own output so reported lines are not re-reported
         ignorelist = [" CTS: ", prefix]
         if local_ignore:
             ignorelist.extend(local_ignore)
 
         errcount = 0
         while errcount < 100:
             match = watch.look(0)
             if not match:
                 break
             if not any(re.search(ignore, match) for ignore in ignorelist):
                 self.logger.log(prefix + " " + match)
                 errcount += 1
         else:
             # Only reached when the loop ended because the limit was hit
             self.logger.log("Too many errors!")
 
         watch.end()
         return errcount
 
     def is_applicable(self):
         '''Return TRUE if we are applicable in the current test configuration'''
         return self.is_applicable_common()
 
     def is_applicable_common(self):
         '''Return TRUE if this test's flags are compatible with the environment'''
         if self.is_loop and not self.Env["loop-tests"]:
             return 0
         elif self.is_unsafe and not self.Env["unsafe-tests"]:
             return 0
         elif self.is_valgrind and not self.Env["valgrind-tests"]:
             return 0
         elif self.is_experimental and not self.Env["experimental-tests"]:
             return 0
         elif self.is_docker_unsafe and self.Env["docker"]:
             return 0
         elif self.is_container and not self.Env["container-tests"]:
             return 0
         elif self.Env["benchmark"] and self.benchmark == 0:
             # In benchmark mode only tests flagged for benchmarking run
             return 0
 
         return 1
 
     def find_ocfs2_resources(self, node):
         '''Populate self.r_o2cb / self.r_ocfs2 from "crm_resource -c" on node
 
         Returns the number of resources colocated with the o2cb parent.
         '''
         self.r_o2cb = None
         self.r_ocfs2 = []
 
         (rc, lines) = self.rsh(node, "crm_resource -c", None)
         for line in lines:
             if re.search("^Resource", line):
                 r = AuditResource(self.CM, line)
                 if r.rtype == "o2cb" and r.parent != "NA":
                     # Assign before logging so the message shows the new value
                     self.r_o2cb = r.parent
                     self.debug("Found o2cb: %s" % self.r_o2cb)
             if re.search("^Constraint", line):
                 c = AuditConstraint(self.CM, line)
                 if c.type == "rsc_colocation" and c.target == self.r_o2cb:
                     self.r_ocfs2.append(c.rsc)
 
         self.debug("Found ocfs2 filesystems: %s" % repr(self.r_ocfs2))
         return len(self.r_ocfs2)
 
     def canrunnow(self, node):
         '''Return TRUE if we can meaningfully run right now'''
         return 1
 
     def errorstoignore(self):
         '''Return list of errors which are 'normal' and should be ignored'''
         return []
 
 
 class StopTest(CTSTest):
     '''Deactivate the cluster manager on a single node'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "Stop"
 
     def __call__(self, node):
         '''Run the 'stop' test against node; skipped unless node should be up'''
         self.incr("calls")
         if self.CM.ShouldBeStatus[node] != "up":
             return self.skipped()
 
         # The node being stopped should always report its own shutdown
         expected = [self.templates["Pat:We_stopped"] % node]
 
         # Every other active node should notice the departure
         # (this does not hold if there are multiple partitions)
         for peer in self.Env["nodes"]:
             if self.CM.ShouldBeStatus[peer] != "up" or peer == node:
                 continue
             expected.append(self.templates["Pat:They_stopped"] %(peer, self.CM.key_for_node(node)))
 
         watch = self.create_watch(expected, self.Env["DeadTime"])
         watch.setwatch()
 
         # Track which kind of stop this was
         if node == self.CM.OurNode:
             self.incr("us")
         elif self.CM.upcount() <= 1:
             self.incr("all")
         else:
             self.incr("them")
 
         self.CM.StopaCM(node)
         # Return value is not needed; watch.unmatched is inspected instead
         watch.lookforall()
 
         failreason = None
         missing = "||"
         if watch.unmatched:
             # Dump diagnostics from the node to the debug log
             for command in ("/bin/ps axf", "/usr/sbin/dlm_tool dump"):
                 (rc, output) = self.rsh(node, command, None)
                 for line in output:
                     self.debug(line)
 
             for regex in watch.unmatched:
                 self.logger.log ("ERROR: Shutdown pattern not found: %s" % (regex))
                 missing = missing + regex + "||"
                 failreason = "Missing shutdown pattern"
 
         self.CM.cluster_stable(self.Env["DeadTime"])
 
         if not watch.unmatched or self.CM.upcount() == 0:
             return self.success()
 
         if len(watch.unmatched) >= self.CM.upcount():
             return self.failure("no match against (%s)" % missing)
 
         if failreason is None:
             return self.success()
         return self.failure(failreason)
 #
 # StopTest is deliberately not registered in AllTestClasses: it is better
 # when called by another test...
 #
 
 
 class StartTest(CTSTest):
     '''Activate the cluster manager on a single node'''
     def __init__(self, cm, debug=None):
         CTSTest.__init__(self, cm)
         self.name = "start"
         # NOTE(review): this instance attribute shadows the inherited
         # CTSTest.debug() method -- confirm that is intended
         self.debug = debug
 
     def __call__(self, node):
         '''Run the 'start' test against node; skipped unless node should be down'''
         self.incr("calls")
 
         # Tallied before the skip check, so skipped calls are counted as well
         counter = "us" if self.CM.upcount() == 0 else "them"
         self.incr(counter)
 
         if self.CM.ShouldBeStatus[node] != "down":
             return self.skipped()
         if self.CM.StartaCM(node):
             return self.success()
         return self.failure("Startup %s on node %s failed"
                             % (self.Env["Name"], node))
 
 #
 # StartTest is deliberately not registered in AllTestClasses: it is better
 # when called by another test...
 #
 
 
 class FlipTest(CTSTest):
     '''Toggle a node: stop it if it should be up, start it if it should be down'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "Flip"
         self.start = StartTest(cm)
         self.stop = StopTest(cm)
 
     def __call__(self, node):
         '''Run the 'Flip' test against node; skipped for any other expected state'''
         self.incr("calls")
 
         expected_state = self.CM.ShouldBeStatus[node]
         if expected_state == "up":
             self.incr("stopped")
             ret = self.stop(node)
             transition = "up->down"
             # Let the rest of the cluster notice the node is gone
             time.sleep(self.Env["StableTime"])
         elif expected_state == "down":
             self.incr("started")
             ret = self.start(node)
             transition = "down->up"
         else:
             return self.skipped()
 
         # Per-direction counter
         self.incr(transition)
         if not ret:
             return self.failure("%s failure" % transition)
         return self.success()
 
 # Make FlipTest available to the test scheduler
 AllTestClasses.append(FlipTest)
 
 
 class RestartTest(CTSTest):
     '''Stop a node and then start it again'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "Restart"
         self.start = StartTest(cm)
         self.stop = StopTest(cm)
         self.benchmark = 1
 
     def __call__(self, node):
         '''Run the 'restart' test against node'''
         self.incr("calls")
 
         # Per-node call counter
         self.incr("node:" + node)
 
         # NOTE(review): the counter name suggests the node was down, yet this
         # branch runs when StataCM() is truthy -- confirm StataCM semantics
         if self.CM.StataCM(node):
             self.incr("WasStopped")
             if not self.start(node):
                 return self.failure("start (setup) failure: "+node)
 
         # Only the stop/start cycle itself is timed
         self.set_timer()
         for step, message in ((self.stop, "stop failure: "), (self.start, "start failure: ")):
             if not step(node):
                 return self.failure(message+node)
         return self.success()
 
 # Make RestartTest available to the test scheduler
 AllTestClasses.append(RestartTest)
 
 
 class StonithdTest(CTSTest):
     '''Fence a node with stonith_admin and check that the cluster logs the
        fencing and becomes stable again
     '''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "Stonithd"
         self.startall = SimulStartLite(cm)
         self.benchmark = 1
 
     def __call__(self, node):
         '''Perform the 'Stonithd' test against node'''
         self.incr("calls")
         # Skipped when fewer than two nodes are configured
         if len(self.Env["nodes"]) < 2:
             return self.skipped()
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed")
 
         # NOTE(review): is_dc is not used anywhere below
         is_dc = self.CM.is_node_dc(node)
 
         watchpats = []
         watchpats.append(self.templates["Pat:FenceOpOK"] % node)
         watchpats.append(self.templates["Pat:NodeFenced"] % node)
 
         if self.Env["at-boot"] == 0:
             self.debug("Expecting %s to stay down" % node)
             self.CM.ShouldBeStatus[node] = "down"
         else:
             self.debug("Expecting %s to come up again %d" % (node, self.Env["at-boot"]))
             watchpats.append("%s.* S_STARTING -> S_PENDING" % node)
             watchpats.append("%s.* S_PENDING -> S_NOT_DC" % node)
 
         watch = self.create_watch(watchpats, 30 + self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"])
         watch.setwatch()
 
         # The fencing request is issued from a randomly chosen node, which
         # may be the target itself
         origin = self.Env.RandomGen.choice(self.Env["nodes"])
 
         rc = self.rsh(origin, "stonith_admin --reboot %s -VVVVVV" % node)
 
         if rc == 194:
             # 194 - 256 = -62 = Timer expired
             #
             # Look for the patterns, usually this means the required
             # device was running on the node to be fenced - or that
             # the required devices were in the process of being loaded
             # and/or moved
             #
             # Effectively the node committed suicide so there will be
             # no confirmation, but pacemaker should be watching and
             # fence the node again
 
             self.logger.log("Fencing command on %s to fence %s timed out" % (origin, node))
 
         elif origin != node and rc != 0:
             self.debug("Waiting for the cluster to recover")
             self.CM.cluster_stable()
 
             self.debug("Waiting for fenced node to come back up")
             self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
 
             self.logger.log("Fencing command on %s failed to fence %s (rc=%d)" % (origin, node, rc))
 
         elif origin == node and rc != 255:
             # 255 == broken pipe, ie. the node was fenced as expected
             self.logger.log("Locally originated fencing returned %d" % rc)
 
         # "fence" times the wait for the log patterns; "reform" times the
         # cluster recovery that follows
         self.set_timer("fence")
         matched = watch.lookforall()
         self.log_timer("fence")
         self.set_timer("reform")
         if watch.unmatched:
             self.logger.log("Patterns not found: " + repr(watch.unmatched))
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
 
         self.debug("Waiting for fenced node to come back up")
         self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
 
         self.debug("Waiting for the cluster to re-stabilize with all nodes")
         is_stable = self.CM.cluster_stable(self.Env["StartTime"])
 
         # Recovery above always runs before the verdict; on either failure
         # path the "reform" timer is left unlogged
         if not matched:
             return self.failure("Didn't find all expected patterns")
         elif not is_stable:
             return self.failure("Cluster did not become stable")
 
         self.log_timer("reform")
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         return [
             self.templates["Pat:Fencing_start"] % ".*",
             self.templates["Pat:Fencing_ok"] % ".*",
             r"error.*: Resource .*stonith::.* is active on 2 nodes attempting recovery",
             r"error.*: Operation reboot of .*by .* for stonith_admin.*: Timer expired",
         ]
 
     def is_applicable(self):
         '''Return the common applicability result, further gated by the
            "DoFencing" environment setting when it is present
         '''
         if not self.is_applicable_common():
             return 0
 
         if "DoFencing" in list(self.Env.keys()):
             return self.Env["DoFencing"]
 
         return 1
 
 AllTestClasses.append(StonithdTest)
 
 
 class StartOnebyOne(CTSTest):
     '''Stop the whole cluster, then start the nodes one after another'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "StartOnebyOne"
         self.stopall = SimulStopLite(cm)
         self.start = StartTest(cm)
         self.ns = CTS.NodeStatus(cm.Env)
 
     def __call__(self, dummy):
         '''Run the 'StartOnebyOne' test (the node argument is ignored)'''
         self.incr("calls")
 
         # Begin from a fully stopped cluster
         if not self.stopall(None):
             return self.failure("Test setup failed")
 
         self.set_timer()
         # Every node is attempted even if an earlier one fails
         failed = [node for node in self.Env["nodes"] if not self.start(node)]
 
         if failed:
             return self.failure("Some node failed to start: " + repr(failed))
 
         return self.success()
 
 # Make StartOnebyOne available to the test scheduler
 AllTestClasses.append(StartOnebyOne)
 
 
 class SimulStart(CTSTest):
     '''Stop the whole cluster, then start all nodes at roughly the same time'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "SimulStart"
         self.stopall = SimulStopLite(cm)
         self.startall = SimulStartLite(cm)
 
     def __call__(self, dummy):
         '''Run the 'SimulStart' test (the node argument is ignored)'''
         self.incr("calls")
 
         # Begin from a fully stopped cluster
         if not self.stopall(None):
             return self.failure("Setup failed")
 
         started = self.startall(None)
         if started:
             return self.success()
         return self.failure("Startall failed")
 
 # Make SimulStart available to the test scheduler
 AllTestClasses.append(SimulStart)
 
 
 class SimulStop(CTSTest):
     '''Start the whole cluster, then stop all nodes at roughly the same time'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "SimulStop"
         self.startall = SimulStartLite(cm)
         self.stopall = SimulStopLite(cm)
 
     def __call__(self, dummy):
         '''Run the 'SimulStop' test (the node argument is ignored)'''
         self.incr("calls")
 
         # Begin from a fully started cluster
         if not self.startall(None):
             return self.failure("Setup failed")
 
         stopped = self.stopall(None)
         if stopped:
             return self.success()
         return self.failure("Stopall failed")
 
 # Make SimulStop available to the test scheduler
 AllTestClasses.append(SimulStop)
 
 
 class StopOnebyOne(CTSTest):
     '''Start the whole cluster, then stop the nodes one after another'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "StopOnebyOne"
         self.startall = SimulStartLite(cm)
         self.stop = StopTest(cm)
 
     def __call__(self, dummy):
         '''Run the 'StopOnebyOne' test (the node argument is ignored)'''
         self.incr("calls")
 
         # Begin from a fully started cluster
         if not self.startall(None):
             return self.failure("Setup failed")
 
         self.set_timer()
         # Every node is attempted even if an earlier one fails
         failed = [node for node in self.Env["nodes"] if not self.stop(node)]
 
         if failed:
             return self.failure("Some node failed to stop: " + repr(failed))
 
         return self.success()
 
 # Make StopOnebyOne available to the test scheduler
 AllTestClasses.append(StopOnebyOne)
 
 
 class RestartOnebyOne(CTSTest):
     '''Start the whole cluster, then restart the nodes one after another'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "RestartOnebyOne"
         self.startall = SimulStartLite(cm)
 
     def __call__(self, dummy):
         '''Run the 'RestartOnebyOne' test (the node argument is ignored)'''
         self.incr("calls")
 
         # Begin from a fully started cluster
         if not self.startall(None):
             return self.failure("Setup failed")
 
         self.set_timer()
         # A fresh RestartTest is created on every call and kept on the instance
         self.restart = RestartTest(self.CM)
         did_fail = [node for node in self.Env["nodes"] if not self.restart(node)]
 
         if not did_fail:
             return self.success()
         return self.failure("Could not restart %d nodes: %s"
                             % (len(did_fail), repr(did_fail)))
 
 # Make RestartOnebyOne available to the test scheduler
 AllTestClasses.append(RestartOnebyOne)
 
 
 class PartialStart(CTSTest):
     '''Begin starting a node, then stop it before startup has completed'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "PartialStart"
         self.startall = SimulStartLite(cm)
         self.stopall = SimulStopLite(cm)
         self.stop = StopTest(cm)
 
     def __call__(self, node):
         '''Run the 'PartialStart' test against node'''
         self.incr("calls")
 
         # Begin from a fully stopped cluster
         if not self.stopall(None):
             return self.failure("Setup failed")
 
         # FIXME!  This should use the CM class to get the pattern
         #         then it would be applicable in general
         startup_pattern = "pacemaker-controld.*Connecting to cluster infrastructure"
         watch = self.create_watch([startup_pattern], self.Env["DeadTime"]+10)
         watch.setwatch()
 
         # Kick off the start without waiting for it to finish
         self.CM.StartaCMnoBlock(node)
         if not watch.lookforall():
             self.logger.log("Patterns not found: " + repr(watch.unmatched))
             return self.failure("Setup of %s failed" % node)
 
         if not self.stop(node):
             return self.failure("%s did not stop in time" % node)
 
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
 
         # Fencing may happen in the 2-node case if startup gets far enough
         fencing_messages = [
             r"Executing reboot fencing operation",
             r"Requesting fencing \([^)]+\) of node ",
         ]
         return fencing_messages
 
 # Make PartialStart available to the test scheduler
 AllTestClasses.append(PartialStart)
 
 
 class StandbyTest(CTSTest):
     '''Put a node into standby and back, checking resources leave it

     Steps:
       - make sure the node is active
       - set the node to standby mode
       - check resources: none should be running on the node
       - set the node back to active mode
     '''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "Standby"
         self.benchmark = 1
 
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
 
     def __call__(self, node):
         '''Run the 'Standby' test against node'''
         self.incr("calls")
         if not self.startall(None):
             return self.failure("Start all nodes failed")
 
         self.debug("Make sure node %s is active" % node)
         if self.CM.StandbyStatus(node) != "off":
             if not self.CM.SetStandbyMode(node, "off"):
                 return self.failure("can't set node %s to active mode" % node)
 
         self.CM.cluster_stable()
 
         state = self.CM.StandbyStatus(node)
         if state != "off":
             return self.failure("standby status of %s is [%s] but we expect [off]" % (node, state))
 
         self.debug("Getting resources running on node %s" % node)
         # The result is not used; the query itself is kept as in the original
         self.CM.active_resources(node)
 
         transition_pattern = r"State transition .* -> S_POLICY_ENGINE"
         watch = self.create_watch([transition_pattern], self.Env["DeadTime"]+10)
         watch.setwatch()
 
         self.debug("Setting node %s to standby mode" % node)
         if not self.CM.SetStandbyMode(node, "on"):
             return self.failure("can't set node %s to standby mode" % node)
 
         self.set_timer("on")
 
         if not watch.lookforall():
             self.logger.log("Patterns not found: " + repr(watch.unmatched))
             # Undo the standby request before reporting the failure
             self.CM.SetStandbyMode(node, "off")
             return self.failure("cluster didn't react to standby change on %s" % node)
 
         self.CM.cluster_stable()
 
         state = self.CM.StandbyStatus(node)
         if state != "on":
             return self.failure("standby status of %s is [%s] but we expect [on]" % (node, state))
         self.log_timer("on")
 
         self.debug("Checking resources")
         still_active = self.CM.active_resources(node)
         if len(still_active) > 0:
             # Record the failure first, then return the node to active mode
             result = self.failure("%s set to standby, %s is still running on it" % (node, repr(still_active)))
             self.debug("Setting node %s to active mode" % node)
             self.CM.SetStandbyMode(node, "off")
             return result
 
         self.debug("Setting node %s to active mode" % node)
         if not self.CM.SetStandbyMode(node, "off"):
             return self.failure("can't set node %s to active mode" % node)
 
         self.set_timer("off")
         self.CM.cluster_stable()
 
         state = self.CM.StandbyStatus(node)
         if state != "off":
             return self.failure("standby status of %s is [%s] but we expect [off]" % (node, state))
         self.log_timer("off")
 
         return self.success()
 
 AllTestClasses.append(StandbyTest)
 
 
 class ValgrindTest(CTSTest):
     '''Check for memory leaks'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "Valgrind"
         self.stopall = SimulStopLite(cm)
         self.startall = SimulStartLite(cm)
         self.is_valgrind = 1
         self.is_loop = 1
 
     def setup(self, node):
         '''Restart the whole cluster before the test runs'''
         self.incr("calls")
 
         ret = self.stopall(None)
         if not ret:
             return self.failure("Stop all nodes failed")
 
         # @TODO Edit /etc/sysconfig/pacemaker on all nodes to enable valgrind,
         # and clear any valgrind logs from previous runs. For now, we rely on
         # the user to do this manually.
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Start all nodes failed")
 
         return self.success()
 
     def teardown(self, node):
         '''Stop the whole cluster after the test has run'''
         # Return all nodes to normal
         # @TODO Edit /etc/sysconfig/pacemaker on all nodes to disable valgrind
         ret = self.stopall(None)
         if not ret:
             return self.failure("Stop all nodes failed")
 
         return self.success()
 
     def find_leaks(self):
         '''Stop each node, grep its valgrind logs and return the nodes with errors
 
         (no longer used by this class's __call__ but kept in case the
         feature is restored)
         '''
         leaked = []
         self.stop = StopTest(self.CM)
 
         for node in self.Env["nodes"]:
             rc = self.stop(node)
             if not rc:
                 self.failure("Couldn't shut down %s" % node)
 
             rc = self.rsh(node, "grep -e indirectly.*lost:.*[1-9] -e definitely.*lost:.*[1-9] -e (ERROR|error).*SUMMARY:.*[1-9].*errors %s" % self.logger.logPat, 0)
             # grep exits with 1 when nothing matched, i.e. no errors found
             if rc != 1:
                 leaked.append(node)
                 self.failure("Valgrind errors detected on %s" % node)
                 (rc, output) = self.rsh(node, "grep -e lost: -e SUMMARY: %s" % self.logger.logPat, None)
                 for line in output:
                     self.logger.log(line)
                 (rc, output) = self.rsh(node, "cat %s" % self.logger.logPat, None)
                 for line in output:
                     self.debug(line)
 
             # Clean up on every node: this used to sit after the loop, which
             # only cleaned the last node and raised NameError with no nodes
             self.rsh(node, "rm -f %s" % self.logger.logPat, None)
 
         return leaked
 
     def __call__(self, node):
         '''Always succeeds; leak checking is currently disabled'''
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         return [
             r"pacemaker-based.*: \*\*\*\*\*\*\*\*\*\*\*\*\*",
             r"pacemaker-based.*: .* avoid confusing Valgrind",
             r"HA_VALGRIND_ENABLED",
         ]
 
 
 class StandbyLoopTest(ValgrindTest):
     '''Look for memory leaks by repeatedly toggling standby on a node for
        Env["loop-minutes"] minutes
     '''
     # @TODO This is not a useful test for memory leaks
     def __init__(self, cm):
         ValgrindTest.__init__(self, cm)
         self.name = "StandbyLoop"
 
     def __call__(self, node):
         '''Toggle standby on node until the deadline or the first failure'''
         iteration = 0
         pause = 2
         failed_at = 0
         # Each step is (standby mode to request, message if the request fails)
         steps = (
             ("on", "can't set node %s to standby mode"),
             ("off", "can't set node %s to active mode"),
         )
         deadline = time.time() + self.Env["loop-minutes"] * 60
         while time.time() <= deadline and not failed_at:
             iteration = iteration + 1
 
             # Both steps run in every iteration, even if the first one fails
             for mode, errfmt in steps:
                 time.sleep(pause)
                 if not self.CM.SetStandbyMode(node, mode):
                     self.failure(errfmt % node)
                     failed_at = iteration
 
         leaked = self.find_leaks()
         if failed_at:
             return self.failure("Iteration %d failed" % failed_at)
         if len(leaked) > 0:
             return self.failure("Nodes %s leaked" % repr(leaked))
 
         return self.success()
 
 #AllTestClasses.append(StandbyLoopTest)
 
 
 class BandwidthTest(CTSTest):
     '''Test the bandwidth which the cluster uses'''
     #        Tests should not be cluster-manager-specific
     #        If you need to find out cluster manager configuration to do this, then
     #        it should be added to the generic cluster manager API.
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "Bandwidth"
         self.start = StartTest(cm)
         self.__setitem__("min",0)
         self.__setitem__("max",0)
         self.__setitem__("totalbandwidth",0)
         # Local scratch file that receives the copied tcpdump capture
         (handle, self.tempfile) = tempfile.mkstemp(".cts")
         os.close(handle)
         self.startall = SimulStartLite(cm)
 
     def __call__(self, node):
         '''Perform the Bandwidth test
 
         Captures cluster UDP traffic on `node` with tcpdump, copies the capture
         locally and records the measured bandwidth in the "min", "max" and
         "totalbandwidth" statistics.
 
         Returns self.skipped() when no node is up or the cluster communication
         is not IP based, self.failure() when setup or tcpdump fails, and
         self.success() otherwise (including when no bandwidth could be computed).
         '''
         self.incr("calls")
 
         if self.CM.upcount() < 1:
             return self.skipped()
 
         Path = self.CM.InternalCommConfig()
         if "ip" not in Path["mediatype"]:
             return self.skipped()
 
         port = int(Path["port"][0])
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Test setup failed")
         time.sleep(5)  # We get extra messages right after startup.
 
         fstmpfile = "/var/run/band_estimate"
         dumpcmd = "tcpdump -p -n -c 102 -i any udp port %d > %s 2>&1" % (port, fstmpfile)
 
         rc = self.rsh(node, dumpcmd)
         if rc != 0:
             return self.failure("no response from tcpdump command [%d]!" % rc)
 
         farfile = "root@%s:%s" % (node, fstmpfile)
         self.rsh.cp(farfile, self.tempfile)
         try:
             bandwidth = self.countbandwidth(self.tempfile)
         finally:
             # Remove both copies of the capture on every path, not only
             # when a bandwidth figure was obtained
             self.rsh(node, "rm -f %s" % fstmpfile)
             if os.path.exists(self.tempfile):
                 os.unlink(self.tempfile)
 
         if not bandwidth:
             self.logger.log("Could not compute bandwidth.")
             return self.success()
 
         intband = int(bandwidth + 0.5)
         self.logger.log("...bandwidth: %d bits/sec" % intband)
         self.Stats["totalbandwidth"] = self.Stats["totalbandwidth"] + bandwidth
         # A stored minimum of 0 means "no sample recorded yet"
         if self.Stats["min"] == 0:
             self.Stats["min"] = bandwidth
         if bandwidth > self.Stats["max"]:
             self.Stats["max"] = bandwidth
         if bandwidth < self.Stats["min"]:
             self.Stats["min"] = bandwidth
         return self.success()
 
     @staticmethod
     def _packet_length(fields):
         '''Return the packet length found in the split fields of a tcpdump line
 
         The length is the field that follows the first "udp" or "length:"
         token; when neither token is present the final field is tried.
         Returns None when there is no such field or it is not an integer.
         '''
         if len(fields) < 2:
             return None
 
         marker = len(fields) - 2
         for (index, field) in enumerate(fields[:-1]):
             if field in ("udp", "length:"):
                 marker = index
                 break
 
         try:
             return int(fields[marker + 1])
         except ValueError:
             return None
 
     @staticmethod
     def _timestamp_seconds(stamp):
         '''Convert an "HH:MM:SS.ffffff" timestamp into seconds since midnight'''
         hms = stamp.split(":")
         sec_parts = hms[2].split(".")
         # NOTE(review): the fraction is scaled as microseconds, which assumes
         # tcpdump prints exactly six fractional digits -- confirm
         return ((int(hms[0]) * 60 + int(hms[1])) * 60
                 + int(sec_parts[0]) + int(sec_parts[1]) * 0.000001)
 
     def countbandwidth(self, file):
         '''Estimate bandwidth in bits/sec from the first 100 UDP lines of a capture
 
         Arguments:
         file -- path of a local text file holding tcpdump output
 
         Returns the integer bits/sec between the first and the 100th matching
         line, 0 if the elapsed time between them is not positive, or None if
         there are fewer than 100 matching lines or a length cannot be parsed.
         '''
         packets = 0
         total_bytes = 0
         first_stamp = None
         last_stamp = None
 
         # The context manager closes the capture on every return path
         with open(file, "r") as fp:
             for line in fp:
                 if not (re.search("udp", line) or re.search("UDP,", line)):
                     continue
 
                 fields = line.split(" ")
                 length = self._packet_length(fields)
                 if length is None:
                     self.logger.log("Invalid tcpdump line: %s" % line)
                     return None
 
                 total_bytes += length
                 packets += 1
                 last_stamp = fields[0]
                 if first_stamp is None:
                     first_stamp = last_stamp
                 if packets >= 100:
                     break
 
         if packets < 100:
             return None
 
         elapsed = self._timestamp_seconds(last_stamp) - self._timestamp_seconds(first_stamp)
         if elapsed <= 0:
             return 0
         return int((total_bytes * 8) / elapsed)
 
     def is_applicable(self):
         '''BandwidthTest never applicable'''
         return 0
 
 AllTestClasses.append(BandwidthTest)
 
 
 ###################################################################
 class MaintenanceMode(CTSTest):
 ###################################################################
     '''Check that a failed resource is left alone until maintenance mode is turned off'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "MaintenanceMode"
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
         # Dummy resource that gets failed while maintenance mode is on
         self.rid = "maintenanceDummy"
         self.action = "asyncmon"
         self.interval = 0
         self.max = 30
         self.benchmark = 1
         #self.is_unsafe = 1
 
     def _await_patterns(self, watch, timer, complaint):
         '''Time watch.lookforall(); return repr() of the unmatched patterns, or ""'''
         self.set_timer(timer)
         watch.lookforall()
         self.log_timer(timer)
 
         if not watch.unmatched:
             return ""
         self.debug(complaint)
         return repr(watch.unmatched)
 
     def toggleMaintenanceMode(self, node, action):
         '''Switch maintenance mode "On" or "Off" from `node` and wait for the expected logs
 
         Returns repr() of the patterns that were not seen, or "" if all matched.
         '''
         enabling = (action == "On")
         expected = [self.templates["Pat:DC_IDLE"]]
 
         # fail the resource right after turning Maintenance mode on
         # verify it is not recovered until maintenance mode is turned off
         if enabling:
             expected.append(r"schedulerd.*:\s+warning:.*Processing failed %s of %s on" % (self.action, self.rid))
         else:
             expected += [
                 self.templates["Pat:RscOpOK"] % ("stop", self.rid),
                 self.templates["Pat:RscOpOK"] % ("start", self.rid),
             ]
 
         watch = self.create_watch(expected, 60)
         watch.setwatch()
 
         self.debug("Turning maintenance mode %s" % action)
         self.rsh(node, self.templates["MaintenanceMode%s" % (action)])
         if enabling:
             self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))
 
         return self._await_patterns(
             watch, "recover%s" % (action),
             "Failed to find patterns when turning maintenance mode %s" % action)
 
     def insertMaintenanceDummy(self, node):
         '''Add the dummy resource via `node` and wait for its start to be logged
 
         Returns repr() of the patterns that were not seen, or "" if all matched.
         '''
         started = self.templates["Pat:RscOpOK"] % ("start", self.rid)
         watch = self.create_watch([("%s.*" % node) + started], 60)
         watch.setwatch()
 
         self.CM.AddDummyRsc(node, self.rid)
 
         return self._await_patterns(
             watch, "addDummy",
             "Failed to find patterns when adding maintenance dummy resource")
 
     def removeMaintenanceDummy(self, node):
         '''Remove the dummy resource via `node` and wait for its stop to be logged
 
         Returns repr() of the patterns that were not seen, or "" if all matched.
         '''
         stopped = self.templates["Pat:RscOpOK"] % ("stop", self.rid)
         watch = self.create_watch([stopped], 60)
         watch.setwatch()
 
         self.CM.RemoveDummyRsc(node, self.rid)
 
         return self._await_patterns(
             watch, "removeDummy",
             "Failed to find patterns when removing maintenance dummy resource")
 
     def managedRscList(self, node):
         '''Return the ids of the resources that "crm_resource -c" on `node` shows as managed'''
         (_, lines) = self.rsh(node, "crm_resource -c", None)
         resources = (AuditResource(self.CM, line)
                      for line in lines if re.search("^Resource", line))
         return [rsc.id for rsc in resources if rsc.managed()]
 
     def verifyResources(self, node, rscList, managed):
         '''Check that every id in rscList is currently managed (or unmanaged)
 
         Arguments:
         node    -- node on which "crm_resource -c" is run
         rscList -- resource ids that must all be found
         managed -- true to require the managed state, false for unmanaged
 
         Returns True if all ids were found in the requested state, else False.
         '''
         managed_str = "managed" if managed else "unmanaged"
         remaining = list(rscList)
 
         (_, lines) = self.rsh(node, "crm_resource -c", None)
         for line in lines:
             if not re.search("^Resource", line):
                 continue
 
             rsc = AuditResource(self.CM, line)
             # Only resources in the requested state count towards the list
             if bool(rsc.managed()) != bool(managed):
                 continue
             if rsc.id in remaining:
                 remaining.remove(rsc.id)
 
         if not remaining:
             self.debug("Found all %s resources on %s" % (managed_str, node))
             return True
 
         self.logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, remaining))
         return False
 
     def __call__(self, node):
         '''Perform the 'MaintenanceMode' test. '''
         self.incr("calls")
 
         if not self.startall(None):
             return self.failure("Setup failed")
 
         # get a list of all the managed resources. We use this list
         # after enabling maintenance mode to verify all managed resources
         # become un-managed.  After maintenance mode is turned off, we use
         # this list to verify all the resources become managed again.
         managedResources = self.managedRscList(node)
         if len(managedResources) == 0:
             self.logger.log("No managed resources on %s" % node)
             return self.skipped()
 
         # insert a fake resource we can fail during maintenance mode
         # so we can verify recovery does not take place until after maintenance
         # mode is disabled.
         unmatched = self.insertMaintenanceDummy(node)
 
         # toggle maintenance mode ON, then fail dummy resource.
         unmatched += self.toggleMaintenanceMode(node, "On")
 
         # verify all the resources are now unmanaged
         became_unmanaged = self.verifyResources(node, managedResources, False)
 
         # Toggle maintenance mode  OFF, verify dummy is recovered.
         unmatched += self.toggleMaintenanceMode(node, "Off")
 
         # verify all the resources are now managed again
         became_managed = self.verifyResources(node, managedResources, True)
 
         # Remove our maintenance dummy resource.
         unmatched += self.removeMaintenanceDummy(node)
 
         self.CM.cluster_stable()
 
         if unmatched != "":
             return self.failure("Unmatched patterns: %s" % (unmatched))
         if not became_unmanaged:
             return self.failure("Failed to verify resources became unmanaged during maintenance mode")
         if not became_managed:
             return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode")
 
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         monitor_ok = self.templates["Pat:RscOpOK"] % (self.action, self.rid)
         return [
             r"Updating failcount for %s" % self.rid,
             r"schedulerd.*: Recover %s\s*\(.*\)" % self.rid,
             r"Unknown operation: fail",
             monitor_ok,
             r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
         ]
 
 AllTestClasses.append(MaintenanceMode)
 
 
 class ResourceRecover(CTSTest):
     '''Fail a randomly chosen active resource on a node and check that it is recovered'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "ResourceRecover"
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
         self.max = 30
         self.benchmark = 1
         #self.is_unsafe = 1
 
         # Resource picked by the latest run; rid_alt keeps the originally
         # chosen id while rid may be replaced by the clone id
         self.rid = None
         self.rid_alt = None
 
         # these are the values used for the new LRM API call
         self.action = "asyncmon"
         self.interval = 0
 
     def _find_resource(self, node, rid):
         '''Return the AuditResource listed by "crm_resource -c" whose id is `rid`, or None'''
         (_, lines) = self.rsh(node, "crm_resource -c", None)
         for line in lines:
             if not re.search("^Resource", line):
                 continue
             candidate = AuditResource(self.CM, line)
             if candidate.id == rid:
                 return candidate
         return None
 
     def __call__(self, node):
         '''Perform the 'ResourceRecover' test. '''
         self.incr("calls")
 
         if not self.startall(None):
             return self.failure("Setup failed")
 
         # Nothing to shoot if the node runs no resources
         resourcelist = self.CM.active_resources(node)
         if len(resourcelist) == 0:
             self.logger.log("No active resources on %s" % node)
             return self.skipped()
 
         self.rid = self.Env.RandomGen.choice(resourcelist)
         self.rid_alt = self.rid
 
         rsc = self._find_resource(node, self.rid)
         if rsc is not None:
             # Handle anonymous clones that get renamed
             self.rid = rsc.clone_id
 
         if not rsc:
             return self.failure("Could not find %s in the resource list" % self.rid)
 
         self.debug("Shooting %s aka. %s" % (rsc.clone_id, rsc.id))
 
         pats = [
             r"schedulerd.*:\s+warning:.*Processing failed %s of (%s|%s) on" % (self.action,
                 rsc.id, rsc.clone_id),
         ]
 
         if rsc.managed():
             pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
             # Anonymous clones may get restarted with a different clone number
             restarted = self.rid if rsc.unique() else ".*"
             pats.append(self.templates["Pat:RscOpOK"] % ("start", restarted))
 
         watch = self.create_watch(pats, 60)
         watch.setwatch()
 
         self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))
 
         self.set_timer("recover")
         watch.lookforall()
         self.log_timer("recover")
 
         self.CM.cluster_stable()
         recovered = self.CM.ResourceLocation(self.rid)
 
         if watch.unmatched:
             return self.failure("Patterns not found: %s" % repr(watch.unmatched))
 
         if rsc.unique() and len(recovered) > 1:
             return self.failure("%s is now active on more than one node: %s"%(self.rid, repr(recovered)))
 
         if len(recovered) > 0:
             self.debug("%s is running on: %s" % (self.rid, repr(recovered)))
         elif rsc.managed():
             return self.failure("%s was not recovered and is inactive" % self.rid)
 
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         monitor_ok = self.templates["Pat:RscOpOK"] % (self.action, self.rid)
         return [
             r"Updating failcount for %s" % self.rid,
             r"schedulerd.*: Recover (%s|%s)\s*\(.*\)" % (self.rid, self.rid_alt),
             r"Unknown operation: fail",
             monitor_ok,
             r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
         ]
 
 AllTestClasses.append(ResourceRecover)
 
 
 class ComponentFail(CTSTest):
     '''Kill a randomly chosen cluster component on a node and check that the cluster recovers'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "ComponentFail"
         # TODO make this work correctly in docker.
         self.is_docker_unsafe = 1
         self.startall = SimulStartLite(cm)
         # Candidate components to kill, as reported by the cluster manager
         self.complist = cm.Components()
         # Both lists are rebuilt by every __call__ and read by errorstoignore()
         self.patterns = []
         self.okerrpatterns = []
         self.is_unsafe = 1
 
     def __call__(self, node):
         '''Perform the 'ComponentFail' test.
 
         Starts all nodes, kills one randomly chosen component on `node`, then
         waits for the cluster to stabilize again.
 
         Returns self.failure() if setup fails, the expected log patterns are
         not all found, or the cluster does not become stable afterwards.
         Returns self.success() otherwise, including when `node` was fenced
         (the expected logs are not checked in that case).
         '''
         self.incr("calls")
         self.patterns = []
         self.okerrpatterns = []
 
         # start all nodes
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed")
 
         if not self.CM.cluster_stable(self.Env["StableTime"]):
             return self.failure("Setup failed - unstable")
 
         node_is_dc = self.CM.is_node_dc(node, None)
 
         # select a component to kill
         # (re-draw while the choice is DC-only but this node is not the DC)
         chosen = self.Env.RandomGen.choice(self.complist)
         while chosen.dc_only == 1 and node_is_dc == 0:
             chosen = self.Env.RandomGen.choice(self.complist)
 
         self.debug("...component %s (dc=%d,boot=%d)" % (chosen.name, node_is_dc,chosen.triggersreboot))
         self.incr(chosen.name)
 
         # The child killed/respawn patterns are expected for every component
         # except corosync
         if chosen.name != "corosync":
             self.patterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name))
             self.patterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name))
 
         self.patterns.extend(chosen.pats)
         if node_is_dc:
           self.patterns.extend(chosen.dc_pats)
 
         if chosen.name == "pacemaker-fenced":
             # Ignore actions for STONITH resources
             (rc, lines) = self.rsh(node, "crm_resource -c", None)
             for line in lines:
                 if re.search("^Resource", line):
                     r = AuditResource(self.CM, line)
                     if r.rclass == "stonith":
                         self.okerrpatterns.append(self.templates["Pat:Fencing_recover"] % r.id)
 
         # supply a copy so self.patterns doesn't end up empty
         tmpPats = []
         tmpPats.extend(self.patterns)
         # badnews_ignore is added after the copy was taken, so it is only
         # reported through errorstoignore() and not watched for below
         self.patterns.extend(chosen.badnews_ignore)
 
         # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status
         stonithPats = []
         stonithPats.append(self.templates["Pat:Fencing_ok"] % node)
         stonith = self.create_watch(stonithPats, 0)
         stonith.setwatch()
 
         # set the watch for stable
         watch = self.create_watch(
             tmpPats, self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"])
         watch.setwatch()
 
         # kill the component
         chosen.kill(node)
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
 
         self.debug("Waiting for any fenced node to come back up")
         self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
 
         self.debug("Waiting for the cluster to re-stabilize with all nodes")
         self.CM.cluster_stable(self.Env["StartTime"])
 
         self.debug("Checking if %s was shot" % node)
         shot = stonith.look(60)
         if shot:
             self.debug("Found: " + repr(shot))
             self.okerrpatterns.append(self.templates["Pat:Fencing_start"] % node)
 
             if self.Env["at-boot"] == 0:
                 self.CM.ShouldBeStatus[node] = "down"
 
             # If fencing occurred, chances are many (if not all) the expected logs
             # will not be sent - or will be lost when the node reboots
             return self.success()
 
         # check for logs indicating a graceful recovery
         matched = watch.lookforall(allow_multiple_matches=1)
         if watch.unmatched:
             self.logger.log("Patterns not found: " + repr(watch.unmatched))
 
         self.debug("Waiting for the cluster to re-stabilize with all nodes")
         is_stable = self.CM.cluster_stable(self.Env["StartTime"])
 
         if not matched:
             return self.failure("Didn't find all expected %s patterns" % chosen.name)
         elif not is_stable:
             return self.failure("Cluster did not become stable after killing %s" % chosen.name)
 
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         # Note that okerrpatterns refers to the last time we ran this test
         # The good news is that this works fine for us...
         # NOTE(review): this extends self.okerrpatterns in place, so the list
         # grows by len(self.patterns) on every call made between two runs
         self.okerrpatterns.extend(self.patterns)
         return self.okerrpatterns
 
 AllTestClasses.append(ComponentFail)
 
 
 class SplitBrainTest(CTSTest):
     '''It is used to test split-brain. when the path between the two nodes break
        check the two nodes both take over the resource'''
     def __init__(self,cm):
         CTSTest.__init__(self,cm)
         self.name = "SplitBrain"
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
         self.is_experimental = 1
 
     def _nodes_outside(self, partition, log_suffix=""):
         '''Return the configured nodes that are not members of `partition`
 
         A partition member that is missing from Env["nodes"] is logged (with
         `log_suffix` appended to the message) and otherwise ignored.
         '''
         other_nodes = list(self.Env["nodes"])
         for member in partition:
             try:
                 other_nodes.remove(member)
             except ValueError:
                 self.logger.log("Node "+member+" not in " + repr(self.Env["nodes"]) + log_suffix)
         return other_nodes
 
     def _wait_until(self, condition, attempts=30, delay=10):
         '''Poll condition() up to `attempts` times, sleeping `delay` seconds after each miss
 
         Returns True as soon as condition() is true, False once the attempts
         are used up.
         '''
         for _ in range(attempts):
             if condition():
                 return True
             time.sleep(delay)
         return False
 
     def isolate_partition(self, partition):
         '''Cut every node in `partition` off from all nodes outside it
 
         Returns 1 on success (or when there is nobody to be cut off from),
         0 as soon as one node could not be isolated.
         '''
         other_nodes = self._nodes_outside(partition, " from " +repr(partition))
         if len(other_nodes) == 0:
             return 1
 
         self.debug("Creating partition: " + repr(partition))
         self.debug("Everyone else: " + repr(other_nodes))
 
         for member in partition:
             if not self.CM.isolate_node(member, other_nodes):
                 self.logger.log("Could not isolate %s" % member)
                 return 0
 
         return 1
 
     def heal_partition(self, partition):
         '''Reconnect every node in `partition` to all nodes outside it
 
         Always returns 1; the normal path previously fell off the end and
         returned None while the nothing-to-do path returned 1.
         '''
         other_nodes = self._nodes_outside(partition)
         if len(other_nodes) == 0:
             return 1
 
         self.debug("Healing partition: " + repr(partition))
         self.debug("Everyone else: " + repr(other_nodes))
 
         for member in partition:
             self.CM.unisolate_node(member, other_nodes)
 
         return 1
 
     def __call__(self, node):
         '''Perform split-brain test
 
         Randomly splits the configured nodes into several partitions with
         fencing disabled, audits the partitioned cluster, heals the
         partitions and waits for a single full-size cluster to reform.
         The crm_attribute commands are run on `node`.
 
         Returns self.success() if no step reported a failure, otherwise
         self.failure().  Raises ValueError if the reformed cluster is not
         stable and the operator answers "n" to the continue prompt.
         '''
         self.incr("calls")
         self.passed = 1
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed")
 
         while True:
             # Retry until we get multiple partitions
             partitions = {}
             p_max = len(self.Env["nodes"])
             # The loop variable must not be called `node`: that would replace
             # the caller's node used for the crm_attribute commands below
             for member in self.Env["nodes"]:
                 p = self.Env.RandomGen.randint(1, p_max)
                 partitions.setdefault(p, []).append(member)
             p_max = len(partitions)
             if p_max > 1:
                 break
             # else, try again
 
         self.debug("Created %d partitions" % p_max)
         for key in partitions:
             self.debug("Partition["+str(key)+"]:\t"+repr(partitions[key]))
 
         # Disabling STONITH to reduce test complexity for now
         self.rsh(node, "crm_attribute -V -n stonith-enabled -v false")
 
         for key in partitions:
             self.isolate_partition(partitions[key])
 
         # Bounded wait; this loop used to spin forever because its attempt
         # counter was never decremented
         if not self._wait_until(lambda: len(self.CM.find_partitions()) == p_max):
             self.failure("Expected partitions were not created")
 
         # Target number of partitions formed - wait for stability
         if not self.CM.cluster_stable():
             self.failure("Partitioned cluster not stable")
 
         # Now audit the cluster state
         self.CM.partitions_expected = p_max
         if not self.audit():
             self.failure("Audits failed")
         self.CM.partitions_expected = 1
 
         # And heal them again
         for key in partitions:
             self.heal_partition(partitions[key])
 
         # Wait for a single partition to form
         if not self._wait_until(lambda: len(self.CM.find_partitions()) == 1):
             self.failure("Cluster did not reform")
 
         # Wait for it to have the right number of members
         def fully_reformed():
             found = self.CM.find_partitions()
             members = found[0].split() if len(found) > 0 else []
             return len(members) == len(self.Env["nodes"])
 
         if not self._wait_until(fully_reformed):
             self.failure("Cluster did not completely reform")
 
         # Wait up to 20 minutes - the delay is more preferable than
         # trying to continue with in a messed up state
         if not self.CM.cluster_stable(1200):
             self.failure("Reformed cluster not stable")
             if self.Env["continue"] == 1:
                 answer = "Y"
             else:
                 try:
                     answer = input_wrapper('Continue? [nY]')
                 except EOFError:
                     answer = "n"
             if answer == "n":
                 raise ValueError("Reformed cluster not stable")
 
         # Turn fencing back on
         if self.Env["DoFencing"]:
             self.rsh(node, "crm_attribute -V -D -n stonith-enabled")
 
         self.CM.cluster_stable()
 
         # self.passed is set to 1 above; the failure() calls are relied on
         # to clear it -- presumably done by CTSTest.failure(), confirm
         if self.passed:
             return self.success()
         return self.failure("See previous errors")
 
     def errorstoignore(self):
         '''Return list of errors which are 'normal' and should be ignored'''
         return [
             r"Another DC detected:",
             r"(ERROR|error).*: .*Application of an update diff failed",
             r"pacemaker-controld.*:.*not in our membership list",
             r"CRIT:.*node.*returning after partition",
         ]
 
     def is_applicable(self):
         '''Return true only if the common checks pass and more than two nodes are configured'''
         if not self.is_applicable_common():
             return 0
         return len(self.Env["nodes"]) > 2
 
 AllTestClasses.append(SplitBrainTest)
 
 
 class Reattach(CTSTest):
     '''Restart the whole cluster with resource management disabled and
        check that no resource actions are logged while doing so'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "Reattach"
         self.startall = SimulStartLite(cm)
         self.restart1 = RestartTest(cm)
         self.stopall = SimulStopLite(cm)
         self.is_unsafe = 0 # Handled by canrunnow()
 
     def _is_managed(self, node):
         '''Return True if the is-managed resource default queried on `node` is "true"'''
         # "-d true" supplies the value to report when the option is unset.
         # NOTE(review): the third argument (1) is expected to make rsh return
         # the command output as a single string -- confirm against the rsh helper
         is_managed = self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", 1)
         is_managed = is_managed[:-1] # Strip off the newline
         return is_managed == "true"
 
     def _set_unmanaged(self, node):
         '''Set the is-managed resource default to false, using `node` to run the command'''
         self.debug("Disable resource management")
         self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false")
 
     def _set_managed(self, node):
         '''Delete the is-managed resource default, using `node` to run the command'''
         self.debug("Re-enable resource management")
         self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D")
 
     def setup(self, node):
         '''Start all nodes and wait for the cluster to be stable
 
         Returns 1 on success, None if the nodes could not be started or the
         cluster is still unstable after the sixth stability check.
         '''
         attempt = 0
         if not self.startall(None):
             return None
 
         # Make sure we are really _really_ stable and that all
         # resources, including those that depend on transient node
         # attributes, are started
         while not self.CM.cluster_stable(double_check=True):
             if attempt < 5:
                 attempt += 1
                 self.debug("Not stable yet, re-testing")
             else:
                 self.logger.log("Cluster is not stable")
                 return None
 
         return 1
 
     def teardown(self, node):
         '''Make sure resource management is enabled again after the test
 
         Returns 1 if resources are managed when done, 0 if re-enabling failed.
         '''
 
         # Make sure 'node' is up
         start = StartTest(self.CM)
         start(node)
 
         if not self._is_managed(node):
             self.logger.log("Attempting to re-enable resource management on %s" % node)
             self._set_managed(node)
             self.CM.cluster_stable()
             if not self._is_managed(node):
                 self.logger.log("Could not re-enable resource management")
                 return 0
 
         return 1
 
     def canrunnow(self, node):
         '''Return TRUE if we can meaningfully run right now'''
         if self.find_ocfs2_resources(node):
             self.logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present")
             return 0
         return 1
 
     def __call__(self, node):
         '''Perform the 'Reattach' test.
 
         Disables resource management, stops and restarts the whole cluster,
         re-enables management, and fails if resource activity was detected
         during either phase.
         '''
         self.incr("calls")
 
         # NOTE(review): this first assignment is never read; pats is
         # re-initialized before its first use further down
         pats = []
         # Conveniently, the scheduler will display this message when disabling
         # management, even if fencing is not enabled, so we can rely on it.
         managed = self.create_watch(["Delaying fencing operations"], 60)
         managed.setwatch()
 
         self._set_unmanaged(node)
 
         # This early return leaves management disabled; teardown() checks for
         # that and re-enables it
         if not managed.lookforall():
             self.logger.log("Patterns not found: " + repr(managed.unmatched))
             return self.failure("Resource management not disabled")
 
         # Any successful resource operation of these kinds counts as
         # unwanted activity in the checks below
         pats = []
         pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))
         pats.append(self.templates["Pat:RscOpOK"] % ("stop", ".*"))
         pats.append(self.templates["Pat:RscOpOK"] % ("promote", ".*"))
         pats.append(self.templates["Pat:RscOpOK"] % ("demote", ".*"))
         pats.append(self.templates["Pat:RscOpOK"] % ("migrate", ".*"))
 
         watch = self.create_watch(pats, 60, "ShutdownActivity")
         watch.setwatch()
 
         self.debug("Shutting down the cluster")
         ret = self.stopall(None)
         if not ret:
             self._set_managed(node)
             return self.failure("Couldn't shut down the cluster")
 
         self.debug("Bringing the cluster back up")
         ret = self.startall(None)
         time.sleep(5) # allow ping to update the CIB
         if not ret:
             self._set_managed(node)
             return self.failure("Couldn't restart the cluster")
 
         # NOTE(review): local_badnews() is defined outside this view; it is
         # treated here as returning true when the watch saw a match -- confirm
         if self.local_badnews("ResourceActivity:", watch):
             self._set_managed(node)
             return self.failure("Resources stopped or started during cluster restart")
 
         # Fresh watch for activity caused by re-enabling management
         watch = self.create_watch(pats, 60, "StartupActivity")
         watch.setwatch()
 
         # Re-enable resource management (and verify it happened).
         self._set_managed(node)
         self.CM.cluster_stable()
         if not self._is_managed(node):
             return self.failure("Could not re-enable resource management")
 
         # Ignore actions for STONITH resources
         ignore = []
         (rc, lines) = self.rsh(node, "crm_resource -c", None)
         for line in lines:
             if re.search("^Resource", line):
                 r = AuditResource(self.CM, line)
                 if r.rclass == "stonith":
 
                     self.debug("Ignoring start actions for %s" % r.id)
                     ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id))
 
         if self.local_badnews("ResourceActivity:", watch, ignore):
             return self.failure("Resources stopped or started after resource management was re-enabled")
 
         # NOTE(review): this returns the (true) startall() result instead of
         # calling self.success() like the other tests in this file -- confirm
         # whether skipping success() is intended
         return ret
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         return [
             r"resource( was|s were) active at shutdown",
         ]
 
     def is_applicable(self):
         '''Always applicable (returns 1)'''
         return 1
 
 AllTestClasses.append(Reattach)
 
 
 class SpecialTest1(CTSTest):
     '''Set up a custom test to cause quorum failure issues for Andrew'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "SpecialTest1"
         self.startall = SimulStartLite(cm)
         self.restart1 = RestartTest(cm)
         self.stopall = SimulStopLite(cm)
 
     def __call__(self, node):
         '''Perform the 'SpecialTest1' test for Andrew.
 
         Stops every node, deletes the cib* files on `node`, restarts that
         node first and then starts all remaining nodes.
         '''
         self.incr("calls")
 
         # Shut down all the nodes...
         if not self.stopall(None):
             return self.failure("Could not stop all nodes")
 
         # Test config recovery when the other nodes come up
         wipe_cib = "rm -f "+CTSvars.CRM_CONFIG_DIR+"/cib*"
         self.rsh(node, wipe_cib)
 
         # Start the selected node
         if not self.restart1(node):
             return self.failure("Could not start "+node)
 
         # Start all remaining nodes
         if not self.startall(None):
             return self.failure("Could not start the remaining nodes")
 
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         # Errors that occur as a result of the CIB being wiped
         cib_wiped = [
             r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed",
             r"error.*: Resource start-up disabled since no STONITH resources have been defined",
             r"error.*: Either configure some or disable STONITH with the stonith-enabled option",
             r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity",
         ]
         return cib_wiped
 
 AllTestClasses.append(SpecialTest1)
 
 
 class HAETest(CTSTest):
     '''Shared base class for the HAE loop tests
 
     It is not registered in AllTestClasses itself.  It supplies setup and
     teardown (start/stop every node) plus helpers that find the parent of
     the controld (dlm) resource and poll crm_resource for a resource's state.
     '''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "HAETest"
         self.stopall = SimulStopLite(cm)
         self.startall = SimulStartLite(cm)
         self.is_loop = 1
 
     def setup(self, node):
         '''Start all nodes (the node argument is not used)'''
         ret = self.startall(None)
         if not ret:
             return self.failure("Couldn't start all nodes")
         return self.success()
 
     def teardown(self, node):
         '''Stop all nodes (the node argument is not used)'''
         ret = self.stopall(None)
         if not ret:
             return self.failure("Couldn't stop all nodes")
         return self.success()
 
     def wait_on_state(self, node, resource, expected_clones, attempts=240):
         '''Poll crm_resource once a second until resource is listed expected_clones times
 
         Returns 1 as soon as the number of output lines equals
         expected_clones.  Returns 0 when crm_resource reports an unknown
         resource (rc 234), an inactive cluster (rc 246) or any other
         unexpected non-zero rc, and also once all attempts are used up.
         '''
         while attempts > 0:
             active = 0
             (rc, lines) = self.rsh(node, "crm_resource -r %s -W -Q" % resource, stdout=None)
 
             # Hack until crm_resource does the right thing
             if rc == 0 and lines:
                 active = len(lines)
 
             # NOTE(review): the comparison uses len(lines) regardless of rc,
             # while 'active' only feeds the debug message below - confirm
             # that this is intended before changing it.
             if len(lines) == expected_clones:
                 return 1
 
             elif rc == 1:
                 self.debug("Resource %s is still inactive" % resource)
 
             elif rc == 234:
                 self.logger.log("Unknown resource %s" % resource)
                 return 0
 
             elif rc == 246:
                 self.logger.log("Cluster is inactive")
                 return 0
 
             elif rc != 0:
                 self.logger.log("Call to crm_resource failed, rc=%d" % rc)
                 return 0
 
             else:
                 self.debug("Resource %s is active on %d times instead of %d" % (resource, active, expected_clones))
 
             attempts -= 1
             time.sleep(1)
 
         return 0
 
     def find_dlm(self, node):
         '''Store the parent of the first parented controld resource in self.r_dlm
 
         Returns 1 when such a resource is listed by "crm_resource -c" on
         node, otherwise 0 (self.r_dlm is then left as None).
         '''
         self.r_dlm = None
 
         (rc, lines) = self.rsh(node, "crm_resource -c", None)
         for line in lines:
             if re.search("^Resource", line):
                 r = AuditResource(self.CM, line)
                 if r.rtype == "controld" and r.parent != "NA":
                     # Assign before logging, otherwise the message always says None
                     self.r_dlm = r.parent
                     self.debug("Found dlm: %s" % self.r_dlm)
                     return 1
         return 0
 
     def find_hae_resources(self, node):
         '''Reset the cached resource names and look them up again on node'''
         self.r_dlm = None
         self.r_o2cb = None
         self.r_ocfs2 = []
 
         if self.find_dlm(node):
             # NOTE(review): find_ocfs2_resources() is not defined in this
             # class - confirm that it is provided elsewhere.
             self.find_ocfs2_resources(node)
 
     def is_applicable(self):
         '''Applicable (1) only for the "hae" schema'''
         if not self.is_applicable_common():
             return 0
         if self.Env["Schema"] == "hae":
             return 1
         return None
 
 
 class HAERoleTest(HAETest):
     '''Lars' mount/unmount test for the HA extension
 
     For Env["loop-minutes"] minutes the target-role of the dlm resource is
     switched to Stopped and back to Started, checking each time that the
     dlm, o2cb and ocfs2 resources reach the expected number of instances.
     '''
     def __init__(self, cm):
         HAETest.__init__(self, cm)
         self.name = "HAERoleTest"
 
     def change_state(self, node, resource, target):
         '''Set the target-role meta attribute of resource and return the rsh result'''
         command = "crm_resource -V -r %s -p target-role -v %s  --meta" % (resource, target)
         return self.rsh(node, command)
 
     def __call__(self, node):
         '''Loop until the deadline passes or an iteration fails'''
         self.incr("calls")
         iteration = 0
         failed = 0
         deadline = time.time() + self.Env["loop-minutes"]*60
         self.find_hae_resources(node)
 
         clone_max = len(self.Env["nodes"])
         while time.time() <= deadline and not failed:
             iteration += 1
 
             # Take the dlm resource down everywhere
             self.change_state(node, self.r_dlm, "Stopped")
             if not self.wait_on_state(node, self.r_dlm, 0):
                 self.failure("%s did not go down correctly" % self.r_dlm)
                 failed = iteration
 
             # Bring it back and wait for it and everything else watched
             self.change_state(node, self.r_dlm, "Started")
             pending = [self.r_dlm, self.r_o2cb]
             pending.extend(self.r_ocfs2)
             for rsc in pending:
                 if not self.wait_on_state(node, rsc, clone_max):
                     self.failure("%s did not come up correctly" % rsc)
                     failed = iteration
 
         if not failed:
             return self.success()
         return self.failure("iteration %d failed" % failed)
 
 AllTestClasses.append(HAERoleTest)
 
 
 class HAEStandbyTest(HAETest):
     '''Standby loop test for the HA extension
 
     For Env["loop-minutes"] minutes crm_standby is switched to true and
     back to false on the chosen node, checking each time that the dlm,
     o2cb and ocfs2 resources reach the expected number of instances.
     '''
     def __init__(self, cm):
         HAETest.__init__(self, cm)
         self.name = "HAEStandbyTest"
 
     def change_state(self, node, resource, target):
         '''Run crm_standby with value target on node (resource is not used)'''
         command = "crm_standby -V -l reboot -v %s" % (target)
         return self.rsh(node, command)
 
     def __call__(self, node):
         '''Loop until the deadline passes or an iteration fails'''
         self.incr("calls")
 
         iteration = 0
         failed = 0
         deadline = time.time() + self.Env["loop-minutes"]*60
         self.find_hae_resources(node)
 
         clone_max = len(self.Env["nodes"])
         while time.time() <= deadline and not failed:
             iteration += 1
 
             # With the node in standby one instance fewer is expected
             self.change_state(node, self.r_dlm, "true")
             if not self.wait_on_state(node, self.r_dlm, clone_max-1):
                 self.failure("%s did not go down correctly" % self.r_dlm)
                 failed = iteration
 
             # Leave standby and wait for the full instance count again
             self.change_state(node, self.r_dlm, "false")
             pending = [self.r_dlm, self.r_o2cb]
             pending.extend(self.r_ocfs2)
             for rsc in pending:
                 if not self.wait_on_state(node, rsc, clone_max):
                     self.failure("%s did not come up correctly" % rsc)
                     failed = iteration
 
         if not failed:
             return self.success()
         return self.failure("iteration %d failed" % failed)
 
 AllTestClasses.append(HAEStandbyTest)
 
 
 class NearQuorumPointTest(CTSTest):
     '''
     Bring larger clusters close to the quorum point (50%) while starting
     and stopping nodes at the same time.
 
     Every node is randomly assigned to either a "start" or a "stop" set with
     equal probability.  Nodes whose current state differs from the
     assignment are then stopped or started without blocking, and the logs
     are watched for the matching messages.
     '''
 
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "NearQuorumPoint"
 
     def __call__(self, dummy):
         '''Perform the 'NearQuorumPoint' test (the argument is not used)'''
         self.incr("calls")
         startset = []
         stopset = []
 
         stonith = self.CM.prepare_fencing_watcher("NearQuorumPoint")
 
         # Flip a coin for every node
         for node in self.Env["nodes"]:
             verdict = self.Env.RandomGen.choice(["start","stop"])
             if verdict == "start":
                 startset.append(node)
             elif verdict == "stop":
                 stopset.append(node)
 
         self.debug("start nodes:" + repr(startset))
         self.debug("stop nodes:" + repr(stopset))
 
         # Nodes that are up now but were picked for a stop
         status = self.CM.ShouldBeStatus
         leaving = [n for n in stopset if status[n] == "up"]
 
         # Log patterns expected as a result of the planned actions
         watchpats = [self.templates["Pat:We_stopped"] % n for n in leaving]
         for node in startset:
             if status[node] == "down":
                 watchpats.append(self.templates["Pat:Local_started"] % node)
                 continue
             # This node stays up, so it should notice every departing peer
             for peer in leaving:
                 watchpats.append(self.templates["Pat:They_stopped"] % (node, self.CM.key_for_node(peer)))
 
         if not watchpats:
             return self.skipped()
 
         if startset:
             watchpats.append(self.templates["Pat:DC_IDLE"])
 
         watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
         watch.setwatch()
 
         # Kick off the stops and the starts without waiting for either
         for node in stopset:
             if self.CM.ShouldBeStatus[node] == "up":
                 self.CM.StopaCMnoBlock(node)
 
         for node in startset:
             if self.CM.ShouldBeStatus[node] == "down":
                 self.CM.StartaCMnoBlock(node)
 
         if watch.lookforall():
             self.CM.cluster_stable()
             self.CM.fencing_cleanup("NearQuorumPoint", stonith)
             return self.success()
 
         self.logger.log("Warn: Patterns not found: " + repr(watch.unmatched))
 
         # The logs were inconclusive, so ask the nodes directly
         upnodes = [n for n in stopset if self.CM.StataCM(n) == 1]
         downnodes = [n for n in startset if self.CM.StataCM(n) == 0]
 
         self.CM.fencing_cleanup("NearQuorumPoint", stonith)
         if not upnodes and not downnodes:
             self.CM.cluster_stable()
 
             # Make sure they're completely down with no residue
             for node in stopset:
                 self.rsh(node, self.templates["StopCmd"])
 
             return self.success()
 
         if upnodes:
             self.logger.log("Warn: Unstoppable nodes: " + repr(upnodes))
 
         if downnodes:
             self.logger.log("Warn: Unstartable nodes: " + repr(downnodes))
 
         return self.failure()
 
     def is_applicable(self):
         '''This test is always applicable'''
         return 1
 
 AllTestClasses.append(NearQuorumPointTest)
 
 
 class RollingUpgradeTest(CTSTest):
     '''Perform a rolling upgrade of the cluster
 
     setup() downgrades every node to Env["previous-version"] while the
     cluster is stopped, __call__() upgrades the nodes one at a time to
     Env["current-version"], and teardown() re-installs the current version
     everywhere with the cluster stopped.
     '''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "RollingUpgrade"
         self.start = StartTest(cm)
         self.stop = StopTest(cm)
         self.stopall = SimulStopLite(cm)
         self.startall = SimulStartLite(cm)
 
     def setup(self, node):
         '''Stop the cluster, downgrade every node, then start the cluster'''
         # Stop everything before touching the packages
         ret = self.stopall(None)
         if not ret:
             return self.failure("Couldn't stop all nodes")
 
         # start=None: the nodes are started together further down
         for target in self.Env["nodes"]:
             if not self.downgrade(target, None):
                 return self.failure("Couldn't downgrade %s" % target)
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Couldn't start all nodes")
         return self.success()
 
     def teardown(self, node):
         '''Stop the cluster and install the current version on every node'''
         # Stop everything
         ret = self.stopall(None)
         if not ret:
             return self.failure("Couldn't stop all nodes")
 
         for target in self.Env["nodes"]:
             if not self.upgrade(target, None):
                 return self.failure("Couldn't upgrade %s" % target)
 
         return self.success()
 
     def install(self, node, version, start=1, flags="--force"):
         '''Stop node, copy the RPMs of version to it and install them
 
         The packages are taken from Env["rpm-dir"]/<version> and copied to
         /tmp/rpm-<version> on node.  When start is true the node is started
         again afterwards.  Returns the result of self.success() or
         self.failure().
         '''
 
         target_dir = "/tmp/rpm-%s" % version
         src_dir = "%s/%s" % (self.Env["rpm-dir"], version)
 
         self.logger.log("Installing %s on %s with %s" % (version, node, flags))
         if not self.stop(node):
             return self.failure("stop failure: "+node)
 
         self.rsh(node, "mkdir -p %s" % target_dir)
         self.rsh(node, "rm -f %s/*.rpm" % target_dir)
         (rc, lines) = self.rsh(node, "ls -1 %s/*.rpm" % src_dir, None)
         for line in lines:
             # Drop the last character of each listed path
             # (presumably the trailing newline - TODO confirm)
             line = line[:-1]
             self.rsh.cp("%s" % (line), "%s:%s/" % (node, target_dir))
         self.rsh(node, "rpm -Uvh %s %s/*.rpm" % (flags, target_dir))
 
         if start and not self.start(node):
             return self.failure("start failure: "+node)
 
         return self.success()
 
     def upgrade(self, node, start=1):
         '''Install Env["current-version"] on node'''
         return self.install(node, self.Env["current-version"], start)
 
     def downgrade(self, node, start=1):
         '''Install Env["previous-version"] on node, ignoring dependencies'''
         return self.install(node, self.Env["previous-version"], start, "--force --nodeps")
 
     def __call__(self, node):
         '''Perform the 'Rolling Upgrade' test. '''
         self.incr("calls")
 
         for target in self.Env["nodes"]:
             # upgrade() yields a true value on success, exactly like the
             # calls checked in setup() and teardown()
             if not self.upgrade(target):
                 return self.failure("Couldn't upgrade %s" % target)
 
             self.CM.cluster_stable()
 
         return self.success()
 
     def is_applicable(self):
         '''Applicable only when the RPM location and both versions are configured'''
         if not self.is_applicable_common():
             return None
 
         for required in ("rpm-dir", "current-version", "previous-version"):
             if required not in list(self.Env.keys()):
                 return None
 
         return 1
 
 #        Register RollingUpgradeTest as a good test to run
 AllTestClasses.append(RollingUpgradeTest)
 
 
 class BSC_AddResource(CTSTest):
     '''Add a resource to the cluster'''
     def __init__(self, cm):
         '''Initialise the test with no resources created yet'''
         CTSTest.__init__(self, cm)
         self.name = "AddResource"
         # Number of resources created so far; it becomes part of each new id
         self.resource_offset = 0
         # Filled in with (CIB section, XML snippet)
         self.cib_cmd = """cibadmin -C -o %s -X '%s' """
 
     def __call__(self, node):
         '''Create one more IP resource that prefers node and wait for it to start'''
         self.incr("calls")
         self.resource_offset += 1
 
         r_id = "bsc-rsc-%s-%d" % (node, self.resource_offset)
         start_pat = "pacemaker-controld.*%s_start_0.*confirmed.*ok"
 
         # Watch the logs for the start confirmation of the new resource
         watch = self.create_watch([start_pat % r_id], self.Env["DeadTime"])
         watch.setwatch()
 
         ip = self.NextIP()
         if not self.make_ip_resource(node, r_id, "ocf", "IPaddr", ip):
             return self.failure("Make resource %s failed" % r_id)
 
         watch.lookforall()
         missing = list(watch.unmatched) if watch.unmatched else []
         for regex in missing:
             self.logger.log ("Warn: Pattern not found: %s" % (regex))
 
         if missing:
             return self.failure("Resource pattern(s) not found")
 
         if not self.CM.cluster_stable(self.Env["DeadTime"]):
             return self.failure("Unstable cluster")
 
         return self.success()
 
     def NextIP(self):
         '''Advance Env["IPBase"] by one and return the new address
 
         Only the last component is incremented: the final hexadecimal group
         of an IPv6 address (anything containing ":") or the final decimal
         octet of an IPv4 address.  No carry into earlier components is
         performed.  The new address is stored back into Env["IPBase"] and
         returned with surrounding whitespace stripped.
         '''
         ip = self.Env["IPBase"]
         if ":" in ip:
             # rpartition() returns an immutable tuple, so unpack it
             (prefix, sep, last) = ip.rpartition(":")
             # "%x" gives bare hex digits; hex() would add a "0x" prefix
             last = "%x" % (int(last, 16) + 1)
         else:
             (prefix, sep, last) = ip.rpartition('.')
             last = str(int(last) + 1)
 
         ip = prefix + sep + last
         self.Env["IPBase"] = ip
         return ip.strip()
 
     def make_ip_resource(self, node, id, rclass, type, ip):
         self.logger.log("Creating %s::%s:%s (%s) on %s" % (rclass,type,id,ip,node))
         rsc_xml="""
 <primitive id="%s" class="%s" type="%s"  provider="heartbeat">
     <instance_attributes id="%s"><attributes>
         <nvpair id="%s" name="ip" value="%s"/>
     </attributes></instance_attributes>
 </primitive>""" % (id, rclass, type, id, id, ip)
 
         node_constraint = """
       <rsc_location id="run_%s" rsc="%s">
         <rule id="pref_run_%s" score="100">
           <expression id="%s_loc_expr" attribute="#uname" operation="eq" value="%s"/>
         </rule>
       </rsc_location>""" % (id, id, id, id, node)
 
         rc = 0
         (rc, lines) = self.rsh(node, self.cib_cmd % ("constraints", node_constraint), None)
         if rc != 0:
             self.logger.log("Constraint creation failed: %d" % rc)
             return None
 
         (rc, lines) = self.rsh(node, self.cib_cmd % ("resources", rsc_xml), None)
         if rc != 0:
             self.logger.log("Resource creation failed: %d" % rc)
             return None
 
         return 1
 
     def is_applicable(self):
         '''Applicable (1) only when Env["DoBSC"] is set, otherwise None'''
         return 1 if self.Env["DoBSC"] else None
 
 AllTestClasses.append(BSC_AddResource)
 
 
 class SimulStopLite(CTSTest):
     '''Stop any active nodes ~ simultaneously'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "SimulStopLite"
 
     def __call__(self, dummy):
         '''Stop every node that should be up and confirm it from the logs
 
         The argument is not used.  Succeeds right away when no node is
         expected to be up.
         '''
         self.incr("calls")
 
         self.debug("Setup: " + self.name)
 
         # One "stopped" pattern per node that is currently expected up
         watchpats = []
         for node in self.Env["nodes"]:
             if self.CM.ShouldBeStatus[node] == "up":
                 self.incr("WasStarted")
                 watchpats.append(self.templates["Pat:We_stopped"] % node)
 
         if not watchpats:
             return self.success()
 
         #     Stop all the nodes - at about the same time...
         watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
 
         watch.setwatch()
         self.set_timer()
         for node in self.Env["nodes"]:
             if self.CM.ShouldBeStatus[node] == "up":
                 self.CM.StopaCMnoBlock(node)
 
         if watch.lookforall():
             # Make sure they're completely down with no residue
             for node in self.Env["nodes"]:
                 self.rsh(node, self.templates["StopCmd"])
 
             return self.success()
 
         # Some pattern is missing: check which nodes are really still up
         up_nodes = [n for n in self.Env["nodes"] if self.CM.StataCM(n) == 1]
         if up_nodes:
             return self.failure("Active nodes exist: " + repr(up_nodes))
 
         self.logger.log("Warn: All nodes stopped but CTS didnt detect: "
                     + repr(watch.unmatched))
 
         return self.failure("Missing log message: "+repr(watch.unmatched))
 
     def is_applicable(self):
         '''SimulStopLite is a setup test and never applicable'''
         return 0
 
 
 class SimulStartLite(CTSTest):
     '''Start any stopped nodes ~ simultaneously'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "SimulStartLite"
 
     def __call__(self, dummy):
         '''Perform the 'SimulStartLite' setup work.
 
         The argument is not used.  Every node expected to be down is
         started; the nodes returned by fencing_cleanup() are started again
         in the next round until none are left.  Fails when the cluster
         does not stabilize or when a node is not up or not stable at the
         end.
         '''
         self.incr("calls")
         self.debug("Setup: " + self.name)
 
         #        We ignore the "node" parameter...
         node_list = []
         for node in self.Env["nodes"]:
             if self.CM.ShouldBeStatus[node] == "down":
                 self.incr("WasStopped")
                 node_list.append(node)
 
         self.set_timer()
         while len(node_list) > 0:
             # Repeat until all nodes come up
             watchpats = [ ]
 
             uppat = self.templates["Pat:NonDC_started"]
             if self.CM.upcount() == 0:
                 uppat = self.templates["Pat:Local_started"]
 
             watchpats.append(self.templates["Pat:DC_IDLE"])
             for node in node_list:
                 watchpats.append(uppat % node)
                 watchpats.append(self.templates["Pat:InfraUp"] % node)
                 watchpats.append(self.templates["Pat:PacemakerUp"] % node)
 
             #   Start all the nodes - at about the same time...
             watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
             watch.setwatch()
 
             stonith = self.CM.prepare_fencing_watcher(self.name)
 
             for node in node_list:
                 self.CM.StartaCMnoBlock(node)
 
             watch.lookforall()
 
             node_list = self.CM.fencing_cleanup(self.name, stonith)
 
             if node_list is None:
                 return self.failure("Cluster did not stabilize")
 
             # Remove node_list messages from watch.unmatched
             for node in node_list:
                 self.logger.debug("Dealing with stonith operations for %s" % repr(node_list))
                 if watch.unmatched:
                     startup_pats = (
                         uppat % node,
                         self.templates["Pat:InfraUp"] % node,
                         self.templates["Pat:PacemakerUp"] % node,
                     )
                     for pat in startup_pats:
                         try:
                             watch.unmatched.remove(pat)
                         except ValueError:
                             # remove() only fails when the pattern is not
                             # in the list, i.e. it was matched
                             self.debug("Already matched: %s" % pat)
 
             if watch.unmatched:
                 for regex in watch.unmatched:
                     self.logger.log ("Warn: Startup pattern not found: %s" %(regex))
 
             if not self.CM.cluster_stable():
                 return self.failure("Cluster did not stabilize")
 
         # Every node has to be up ...
         unstarted = [n for n in self.Env["nodes"] if self.CM.StataCM(n) == 0]
         if unstarted:
             return self.failure("Unstarted nodes exist: " + repr(unstarted))
 
         # ... and stable
         unstable = [n for n in self.Env["nodes"] if not self.CM.node_stable(n)]
         if unstable:
             return self.failure("Unstable cluster nodes exist: " + repr(unstable))
 
         return self.success()
 
     def is_applicable(self):
         '''SimulStartLite is a setup test and never applicable'''
         return 0
 
 
 def TestList(cm, audits):
     '''Instantiate every class in AllTestClasses and return the applicable ones
 
     Each applicable test instance gets audits assigned to its Audits
     attribute before it is added to the returned list.
     '''
     applicable = []
     for candidate in AllTestClasses:
         instance = candidate(cm)
         if not instance.is_applicable():
             continue
         instance.Audits = audits
         applicable.append(instance)
     return applicable
 
 
 class RemoteLXC(CTSTest):
     '''Create LXC containers with lxc_autogen.sh, check that they and their
     resources start, then remove them again'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "RemoteLXC"
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
         self.num_containers = 2
         self.is_container = 1
         self.is_docker_unsafe = 1
         # Outcome of the current run, filled in by the helpers below
         self.failed = 0
         self.fail_string = ""
 
     def start_lxc_simple(self, node):
         '''Generate the containers on node and wait for the start/promote operations'''
 
         # restore any artifacts laying around from a previous test.
         self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
 
         # generate the containers, put them in the config, add some resources to them
         # (the watch is created first and the pattern list is filled afterwards)
         pats = [ ]
         watch = self.create_watch(pats, 120)
         watch.setwatch()
         for (operation, rsc) in (("start", "lxc1"), ("start", "lxc2"),
                                  ("start", "lxc-ms"), ("promote", "lxc-ms")):
             pats.append(self.templates["Pat:RscOpOK"] % (operation, rsc))
 
         self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -g -a -m -s -c %d &>/dev/null" % self.num_containers)
         self.set_timer("remoteSimpleInit")
         watch.lookforall()
         self.log_timer("remoteSimpleInit")
         if watch.unmatched:
             self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
             self.failed = 1
 
     def cleanup_lxc_simple(self, node):
         '''Remove the containers again and wait for their stop operations'''
 
         pats = [ ]
         # if the test failed, attempt to clean up the cib and libvirt environment
         # as best as possible
         if self.failed == 1:
             # restore libvirt and cib
             self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
             return
 
         watch = self.create_watch(pats, 120)
         watch.setwatch()
 
         for rsc in ("container1", "container2"):
             pats.append(self.templates["Pat:RscOpOK"] % ("stop", rsc))
 
         self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -p &>/dev/null")
         self.set_timer("remoteSimpleCleanup")
         watch.lookforall()
         self.log_timer("remoteSimpleCleanup")
 
         if watch.unmatched:
             self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
             self.failed = 1
 
         # cleanup libvirt
         self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
 
     def __call__(self, node):
         '''Perform the 'RemoteLXC' test. '''
         self.incr("calls")
 
         if not self.startall(None):
             return self.failure("Setup failed, start all nodes failed.")
 
         # Skip the test when the environment check reports 1
         rc = self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v &>/dev/null")
         if rc == 1:
             self.log("Environment test for lxc support failed.")
             return self.skipped()
 
         self.start_lxc_simple(node)
         self.cleanup_lxc_simple(node)
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
 
         if self.failed == 1:
             return self.failure(self.fail_string)
 
         return self.success()
 
     def errorstoignore(self):
         '''Return the regexes of logged errors that this test tolerates'''
         tolerated = [
             r"Updating failcount for ping",
             r"schedulerd.*: Recover (ping|lxc-ms|container)\s*\(.*\)",
             # The orphaned lxc-ms resource causes an expected transition
             # error: the scheduler does not know that the promotable
             # resource used to be a clone, so it looks as if the resource
             # were running in several places at once.  Here the reason is
             # known and the error is expected.
             r"Calculated [Tt]ransition .*pe-error",
             r"Resource lxc-ms .* is active on 2 nodes attempting recovery",
             r"Unknown operation: fail",
             r"VirtualDomain.*ERROR: Unable to determine emulator",
         ]
         return tolerated
 
 AllTestClasses.append(RemoteLXC)
 
 
 class RemoteDriver(CTSTest):
 
     def __init__(self, cm):
         """ Set up the helper tests and the fixed resource settings, then reset the per-run state. """
         CTSTest.__init__(self, cm)
         # The test is named after whichever class is instantiated
         self.name = self.__class__.__name__
         self.is_docker_unsafe = 1
 
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
         self.stop = StopTest(cm)
 
         self.remote_rsc = "remote-rsc"
         # Filled in with (CIB section, XML snippet)
         self.cib_cmd = """cibadmin -C -o %s -X '%s' """
         self.reset()
 
     def reset(self):
         """ Return all per-run state to its initial values. """
         # Nothing has failed yet
         self.failed = False
         self.fail_string = ""
 
         # Nothing has been started or added yet
         self.pcmk_started = 0
         self.remote_node_added = 0
         self.remote_rsc_added = 0
 
         # Randomly decide whether this run uses a reconnect interval
         self.remote_use_reconnect_interval = self.Env.RandomGen.choice([True,False])
 
     def fail(self, msg):
         """ Flag the test as failed, log msg and keep the earliest message as the test status. """
 
         # Every failure is logged, even when it is not the first one.
         self.logger.log(msg)
         self.failed = True
 
         # The first failure is likely to be the most useful one to report.
         self.fail_string = self.fail_string or msg
 
     def get_othernode(self, node):
         """ Return the first entry of Env["nodes"] that is not node (None if there is none). """
         # we don't want to try and use the cib that we just shutdown, so
         # find a cluster node that is not our soon to be remote-node.
         for candidate in self.Env["nodes"]:
             if candidate != node:
                 return candidate
         return None
 
     def del_rsc(self, node, rsc):
         """ Delete primitive rsc, running crm_resource on a node other than node. """
         command = "crm_resource -D -r %s -t primitive" % (rsc)
         if self.rsh(self.get_othernode(node), command) != 0:
             self.fail("Removal of resource '%s' failed" % rsc)
 
     def add_rsc(self, node, rsc_xml):
         """ Create rsc_xml in the resources section, running cibadmin on a node other than node. """
         command = self.cib_cmd % ("resources", rsc_xml)
         if self.rsh(self.get_othernode(node), command) != 0:
             self.fail("resource creation failed")
 
     def add_primitive_rsc(self, node):
         """ Add the Dummy resource named self.remote_rsc via add_rsc(); remote_rsc_added is set when that did not fail. """
         rsc_xml = """
-<primitive class="ocf" id="%s" provider="heartbeat" type="Dummy">
-    <operations>
-      <op id="remote-rsc-monitor-interval-10s" interval="10s" name="monitor"/>
-    </operations>
-    <meta_attributes id="remote-meta_attributes"/>
-</primitive>""" % (self.remote_rsc)
+<primitive class="ocf" id="%(node)s" provider="heartbeat" type="Dummy">
+  <meta_attributes id="%(node)s-meta_attributes"/>
+  <operations>
+    <op id="%(node)s-monitor-interval-20s" interval="20s" name="monitor"/>
+  </operations>
+</primitive>""" % { "node": self.remote_rsc }
         self.add_rsc(node, rsc_xml)
         if not self.failed:
             self.remote_rsc_added = 1
 
     def add_connection_rsc(self, node):
         """ Add the ocf:pacemaker:remote resource named self.remote_node with node as its server; remote_node_added is set when that did not fail. """
+        rsc_xml = """
+<primitive class="ocf" id="%(node)s" provider="pacemaker" type="remote">
+  <instance_attributes id="%(node)s-instance_attributes">
+    <nvpair id="%(node)s-instance_attributes-server" name="server" value="%(server)s"/>
+""" % { "node": self.remote_node, "server": node }
+
         if self.remote_use_reconnect_interval:
-            # use reconnect interval and make sure to set cluster-recheck-interval as well.
-            rsc_xml = """
-<primitive class="ocf" id="%s" provider="pacemaker" type="remote">
-    <instance_attributes id="remote-instance_attributes"/>
-        <instance_attributes id="remote-instance_attributes">
-          <nvpair id="remote-instance_attributes-server" name="server" value="%s"/>
-          <nvpair id="remote-instance_attributes-reconnect_interval" name="reconnect_interval" value="60s"/>
-        </instance_attributes>
-    <operations>
-      <op id="remote-monitor-interval-60s" interval="60s" name="monitor"/>
-      <op id="remote-name-start-interval-0-timeout-120" interval="0" name="start" timeout="60"/>
-    </operations>
-</primitive>""" % (self.remote_node, node)
+            # Set cluster-recheck-interval lower
             self.rsh(self.get_othernode(node), self.templates["SetCheckInterval"] % ("45s"))
-        else:
-            # not using reconnect interval
-            rsc_xml = """
-<primitive class="ocf" id="%s" provider="pacemaker" type="remote">
-    <instance_attributes id="remote-instance_attributes"/>
-        <instance_attributes id="remote-instance_attributes">
-          <nvpair id="remote-instance_attributes-server" name="server" value="%s"/>
-        </instance_attributes>
-    <operations>
-      <op id="remote-monitor-interval-60s" interval="60s" name="monitor"/>
-      <op id="remote-name-start-interval-0-timeout-120" interval="0" name="start" timeout="120"/>
-    </operations>
-</primitive>""" % (self.remote_node, node)
+
+            # Set reconnect interval on resource
+            rsc_xml = rsc_xml + """
+    <nvpair id="%s-instance_attributes-reconnect_interval" name="reconnect_interval" value="60s"/>
+""" % (self.remote_node)
+
+        rsc_xml = rsc_xml + """
+  </instance_attributes>
+  <operations>
+    <op id="%(node)s-start"       name="start"   interval="0"   timeout="120s"/>
+    <op id="%(node)s-monitor-20s" name="monitor" interval="20s" timeout="45s"/>
+  </operations>
+</primitive>
+""" % { "node": self.remote_node }
 
         self.add_rsc(node, rsc_xml)
         if not self.failed:
             self.remote_node_added = 1
 
     def stop_pcmk_remote(self, node):
         # disable pcmk remote
         for i in range(10):
             rc = self.rsh(node, "service pacemaker_remote stop")
             if rc != 0:
                 time.sleep(6)
             else:
                 break
 
     def start_pcmk_remote(self, node):
         for i in range(10):
             rc = self.rsh(node, "service pacemaker_remote start")
             if rc != 0:
                 time.sleep(6)
             else:
                 self.pcmk_started = 1
                 break
 
     def freeze_pcmk_remote(self, node):
         """ Simulate a Pacemaker Remote daemon failure. """
 
         # We freeze the process.
         self.rsh(node, "killall -STOP pacemaker-remoted")
 
     def resume_pcmk_remote(self, node):
         # We resume the process.
         self.rsh(node, "killall -CONT pacemaker-remoted")
 
     def start_metal(self, node):
         pcmk_started = 0
 
         # make sure the resource doesn't already exist for some reason
         self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_rsc))
         self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_node))
 
         if not self.stop(node):
             self.fail("Failed to shutdown cluster node %s" % node)
             return
 
         self.start_pcmk_remote(node)
 
         if self.pcmk_started == 0:
             self.fail("Failed to start pacemaker_remote on node %s" % node)
             return
 
         # Convert node to baremetal now that it has shutdown the cluster stack
         pats = [ ]
         watch = self.create_watch(pats, 120)
         watch.setwatch()
         pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node))
         pats.append(self.templates["Pat:DC_IDLE"])
 
         self.add_connection_rsc(node)
 
         self.set_timer("remoteMetalInit")
         watch.lookforall()
         self.log_timer("remoteMetalInit")
         if watch.unmatched:
             self.fail("Unmatched patterns: %s" % watch.unmatched)
 
     def migrate_connection(self, node):
         if self.failed:
             return
 
         pats = [ ]
         pats.append(self.templates["Pat:RscOpOK"] % ("migrate_to", self.remote_node))
         pats.append(self.templates["Pat:RscOpOK"] % ("migrate_from", self.remote_node))
         pats.append(self.templates["Pat:DC_IDLE"])
         watch = self.create_watch(pats, 120)
         watch.setwatch()
 
         (rc, lines) = self.rsh(node, "crm_resource -M -r %s" % (self.remote_node), None)
         if rc != 0:
             self.fail("failed to move remote node connection resource")
             return
 
         self.set_timer("remoteMetalMigrate")
         watch.lookforall()
         self.log_timer("remoteMetalMigrate")
 
         if watch.unmatched:
             self.fail("Unmatched patterns: %s" % watch.unmatched)
             return
 
     def fail_rsc(self, node):
         if self.failed:
             return
 
         watchpats = [ ]
         watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("stop", self.remote_rsc, self.remote_node))
         watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
         watchpats.append(self.templates["Pat:DC_IDLE"])
 
         watch = self.create_watch(watchpats, 120)
         watch.setwatch()
 
         self.debug("causing dummy rsc to fail.")
 
         rc = self.rsh(node, "rm -f /var/run/resource-agents/Dummy*")
 
         self.set_timer("remoteRscFail")
         watch.lookforall()
         self.log_timer("remoteRscFail")
         if watch.unmatched:
             self.fail("Unmatched patterns during rsc fail: %s" % watch.unmatched)
 
     def fail_connection(self, node):
         if self.failed:
             return
 
         watchpats = [ ]
         watchpats.append(self.templates["Pat:FenceOpOK"] % self.remote_node)
         watchpats.append(self.templates["Pat:NodeFenced"] % self.remote_node)
 
         watch = self.create_watch(watchpats, 120)
         watch.setwatch()
 
         # freeze the pcmk remote daemon. this will result in fencing
         self.debug("Force stopped active remote node")
         self.freeze_pcmk_remote(node)
 
         self.debug("Waiting for remote node to be fenced.")
         self.set_timer("remoteMetalFence")
         watch.lookforall()
         self.log_timer("remoteMetalFence")
         if watch.unmatched:
             self.fail("Unmatched patterns: %s" % watch.unmatched)
             return
 
         self.debug("Waiting for the remote node to come back up")
         self.CM.ns.WaitForNodeToComeUp(node, 120);
 
         pats = [ ]
         watch = self.create_watch(pats, 240)
         watch.setwatch()
         pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node))
         if self.remote_rsc_added == 1:
             pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
 
         # start the remote node again watch it integrate back into cluster.
         self.start_pcmk_remote(node)
         if self.pcmk_started == 0:
             self.fail("Failed to start pacemaker_remote on node %s" % node)
             return
 
         self.debug("Waiting for remote node to rejoin cluster after being fenced.")
         self.set_timer("remoteMetalRestart")
         watch.lookforall()
         self.log_timer("remoteMetalRestart")
         if watch.unmatched:
             self.fail("Unmatched patterns: %s" % watch.unmatched)
             return
 
     def add_dummy_rsc(self, node):
         if self.failed:
             return
 
         # verify we can put a resource on the remote node
         pats = [ ]
         watch = self.create_watch(pats, 120)
         watch.setwatch()
         pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
         pats.append(self.templates["Pat:DC_IDLE"])
 
         # Add a resource that must live on remote-node
         self.add_primitive_rsc(node)
 
         # force that rsc to prefer the remote node. 
         (rc, line) = self.CM.rsh(node, "crm_resource -M -r %s -N %s -f" % (self.remote_rsc, self.remote_node), None)
         if rc != 0:
             self.fail("Failed to place remote resource on remote node.")
             return
 
         self.set_timer("remoteMetalRsc")
         watch.lookforall()
         self.log_timer("remoteMetalRsc")
         if watch.unmatched:
             self.fail("Unmatched patterns: %s" % watch.unmatched)
 
     def test_attributes(self, node):
         if self.failed:
             return
 
         # This verifies permanent attributes can be set on a remote-node. It also
         # verifies the remote-node can edit its own cib node section remotely.
         (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -v testval -N %s" % (self.remote_node), None)
         if rc != 0:
             self.fail("Failed to set remote-node attribute. rc:%s output:%s" % (rc, line))
             return
 
         (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -q -N %s" % (self.remote_node), None)
         if rc != 0:
             self.fail("Failed to get remote-node attribute")
             return
 
         (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -D -N %s" % (self.remote_node), None)
         if rc != 0:
             self.fail("Failed to delete remote-node attribute")
             return
 
     def cleanup_metal(self, node):
         if self.pcmk_started == 0:
             return
 
         pats = [ ]
 
         watch = self.create_watch(pats, 120)
         watch.setwatch()
 
         if self.remote_rsc_added == 1:
             pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_rsc))
         if self.remote_node_added == 1:
             pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_node))
 
         self.set_timer("remoteMetalCleanup")
 
         self.resume_pcmk_remote(node)
 
         if self.remote_use_reconnect_interval:
             self.debug("Cleaning up re-check interval")
             self.rsh(self.get_othernode(node), self.templates["ClearCheckInterval"])
 
         if self.remote_rsc_added == 1:
 
             # Remove dummy resource added for remote node tests
             self.debug("Cleaning up dummy rsc put on remote node")
             self.rsh(self.get_othernode(node), "crm_resource -U -r %s" % self.remote_rsc)
             self.del_rsc(node, self.remote_rsc)
 
         if self.remote_node_added == 1:
 
             # Remove remote node's connection resource
             self.debug("Cleaning up remote node connection resource")
             self.rsh(self.get_othernode(node), "crm_resource -U -r %s" % (self.remote_node))
             self.del_rsc(node, self.remote_node)
 
         watch.lookforall()
         self.log_timer("remoteMetalCleanup")
 
         if watch.unmatched:
             self.fail("Unmatched patterns: %s" % watch.unmatched)
 
         self.stop_pcmk_remote(node)
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
 
         if self.remote_node_added == 1:
             # Remove remote node itself
             self.debug("Cleaning up node entry for remote node")
             self.rsh(self.get_othernode(node), "crm_node --force --remove %s" % self.remote_node)
 
     def setup_env(self, node):
 
         self.remote_node = "remote-%s" % (node)
 
         # we are assuming if all nodes have a key, that it is
         # the right key... If any node doesn't have a remote
         # key, we regenerate it everywhere.
         if self.rsh.exists_on_all("/etc/pacemaker/authkey", self.Env["nodes"]):
             return
 
         # create key locally
         (handle, keyfile) = tempfile.mkstemp(".cts")
         os.close(handle)
         devnull = open(os.devnull, 'wb')
         subprocess.check_call(["dd", "if=/dev/urandom", "of=%s" % keyfile, "bs=4096", "count=1"],
             stdout=devnull, stderr=devnull)
         devnull.close()
 
         # sync key throughout the cluster
         for node in self.Env["nodes"]:
             self.rsh(node, "mkdir -p --mode=0750 /etc/pacemaker")
             self.rsh.cp(keyfile, "root@%s:/etc/pacemaker/authkey" % node)
             self.rsh(node, "chgrp haclient /etc/pacemaker /etc/pacemaker/authkey")
             self.rsh(node, "chmod 0640 /etc/pacemaker/authkey")
         os.unlink(keyfile)
 
     def is_applicable(self):
         if not self.is_applicable_common():
             return False
 
         for node in self.Env["nodes"]:
             rc = self.rsh(node, "which pacemaker-remoted >/dev/null 2>&1")
             if rc != 0:
                 return False
         return True
 
     def start_new_test(self, node):
         self.incr("calls")
         self.reset()
 
         ret = self.startall(None)
         if not ret:
             return self.failure("setup failed: could not start all nodes")
 
         self.setup_env(node)
         self.start_metal(node)
         self.add_dummy_rsc(node)
         return True
 
     def __call__(self, node):
         return self.failure("This base class is not meant to be called directly.")
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         return [ r"""is running on remote.*which isn't allowed""",
                  r"""Connection terminated""",
                  r"""Could not send remote""",
                 ]
 
 # RemoteDriver is just a base class for other tests, so it is not added to AllTestClasses
 
 
 class RemoteBasic(RemoteDriver):
 
     def __call__(self, node):
         '''Perform the 'RemoteBaremetal' test. '''
 
         if not self.start_new_test(node):
             return self.failure(self.fail_string)
 
         self.test_attributes(node)
         self.cleanup_metal(node)
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
         if self.failed:
             return self.failure(self.fail_string)
 
         return self.success()
 
 AllTestClasses.append(RemoteBasic)
 
 class RemoteStonithd(RemoteDriver):
 
     def __call__(self, node):
         '''Perform the 'RemoteStonithd' test. '''
 
         if not self.start_new_test(node):
             return self.failure(self.fail_string)
 
         self.fail_connection(node)
         self.cleanup_metal(node)
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
         if self.failed:
             return self.failure(self.fail_string)
 
         return self.success()
 
     def is_applicable(self):
         if not RemoteDriver.is_applicable(self):
             return False
 
         if "DoFencing" in list(self.Env.keys()):
             return self.Env["DoFencing"]
 
         return True
 
     def errorstoignore(self):
         ignore_pats = [
             r"Lost connection to Pacemaker Remote node",
             r"Software caused connection abort",
             r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor",
             r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*",
             r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)",
             r"Calculated [Tt]ransition .*pe-error",
             r"error.*: Resource .*ocf::.* is active on 2 nodes attempting recovery",
+            r"error: Result of monitor operation for .* on remote-.*: Error",
         ]
 
         ignore_pats.extend(RemoteDriver.errorstoignore(self))
         return ignore_pats
 
 AllTestClasses.append(RemoteStonithd)
 
 
 class RemoteMigrate(RemoteDriver):
 
     def __call__(self, node):
         '''Perform the 'RemoteMigrate' test. '''
 
         if not self.start_new_test(node):
             return self.failure(self.fail_string)
 
         self.migrate_connection(node)
         self.cleanup_metal(node)
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
         if self.failed:
             return self.failure(self.fail_string)
 
         return self.success()
 
 AllTestClasses.append(RemoteMigrate)
 
 
 class RemoteRscFailure(RemoteDriver):
 
     def __call__(self, node):
         '''Perform the 'RemoteRscFailure' test. '''
 
         if not self.start_new_test(node):
             return self.failure(self.fail_string)
 
         # This is an important step. We are migrating the connection
         # before failing the resource. This verifies that the migration
         # has properly maintained control over the remote-node.
         self.migrate_connection(node)
 
         self.fail_rsc(node)
         self.cleanup_metal(node)
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
         if self.failed:
             return self.failure(self.fail_string)
 
         return self.success()
 
     def errorstoignore(self):
         ignore_pats = [
             r"schedulerd.*: Recover remote-rsc\s*\(.*\)",
             r"Dummy.*: No process state file found",
         ]
 
         ignore_pats.extend(RemoteDriver.errorstoignore(self))
         return ignore_pats
 
 AllTestClasses.append(RemoteRscFailure)
 
 # vim:ts=4:sw=4:et:
diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c
index 9b779aa332..05e1d84f7b 100644
--- a/daemons/attrd/attrd_elections.c
+++ b/daemons/attrd/attrd_elections.c
@@ -1,140 +1,141 @@
 /*
  * Copyright 2013-2018 Andrew Beekhof <andrew@beekhof.net>
  *
  * This source code is licensed under the GNU General Public License version 2
  * or later (GPLv2+) WITHOUT ANY WARRANTY.
  */
 
 #include <crm_internal.h>
 #include <crm/msg_xml.h>
 #include <crm/cluster.h>
 #include <crm/cluster/election.h>
 
 #include "pacemaker-attrd.h"
 
 static char *peer_writer = NULL;
 static election_t *writer = NULL;
 
 void
 attrd_election_init()
 {
     writer = election_init(T_ATTRD, attrd_cluster->uname, 120000,
                            attrd_election_cb);
 }
 
 void
 attrd_election_fini()
 {
     election_fini(writer);
 }
 
 void
 attrd_start_election_if_needed()
 {
     if ((peer_writer == NULL)
         && (election_state(writer) != election_in_progress)) {
         crm_info("Starting an election to determine the writer");
         election_vote(writer);
     }
 }
 
 bool
 attrd_election_won()
 {
     return (election_state(writer) == election_won);
 }
 
 void
 attrd_handle_election_op(const crm_node_t *peer, xmlNode *xml)
 {
     enum election_result rc = 0;
     enum election_result previous = election_state(writer);
 
     crm_xml_add(xml, F_CRM_HOST_FROM, peer->uname);
     rc = election_count_vote(writer, xml, TRUE);
     switch(rc) {
         case election_start:
             free(peer_writer);
             peer_writer = NULL;
             crm_debug("Unsetting writer (was %s) and starting new election",
                       peer_writer? peer_writer : "unset");
             election_vote(writer);
             break;
 
         case election_lost:
-            /* Losing to this peer does not mean this peer definitely won
-             * (another peer may eventually win). However if we don't already
-             * have a writer, we tentatively record this peer as writer so that
-             * we don't enter "peer_writer == NULL" blocks after this point
-             * (which might start new elections).
+            /* The election API should really distinguish between "we just lost
+             * to this peer" and "we already lost previously, and we are
+             * discarding this vote for some reason", but it doesn't.
              *
-             * However, we don't do this if the state was already lost, because
-             * we may just be getting the current state back when processing a
-             * late no-vote.
+             * In the first case, we want to tentatively set the peer writer to
+             * this peer, even though another peer may eventually win (which we
+             * will learn via attrd_check_for_new_writer()), so
+             * attrd_start_election_if_needed() doesn't start a new election.
+             *
+             * Approximate a test for that case as best as possible.
              */
             if ((peer_writer == NULL) || (previous != election_lost)) {
                 free(peer_writer);
                 peer_writer = strdup(peer->uname);
                 crm_debug("Election lost, presuming %s is writer for now",
                           peer_writer);
             }
             break;
 
         case election_in_progress:
             election_check(writer);
             break;
 
         default:
             crm_info("Ignoring election op from %s due to error", peer->uname);
             break;
     }
 }
 
 bool
 attrd_check_for_new_writer(const crm_node_t *peer, const xmlNode *xml)
 {
     int peer_state = 0;
 
     crm_element_value_int(xml, F_ATTRD_WRITER, &peer_state);
     if (peer_state == election_won) {
         if ((election_state(writer) == election_won)
            && safe_str_neq(peer->uname, attrd_cluster->uname)) {
             crm_notice("Detected another attribute writer (%s), starting new election",
                        peer->uname);
             election_vote(writer);
 
         } else if (safe_str_neq(peer->uname, peer_writer)) {
             crm_notice("Recorded new attribute writer: %s (was %s)",
                        peer->uname, (peer_writer? peer_writer : "unset"));
             free(peer_writer);
             peer_writer = strdup(peer->uname);
         }
     }
     return (peer_state == election_won);
 }
 
 void
 attrd_declare_winner()
 {
     crm_notice("Recorded local node as attribute writer (was %s)",
                (peer_writer? peer_writer : "unset"));
     free(peer_writer);
     peer_writer = strdup(attrd_cluster->uname);
 }
 
 void
 attrd_remove_voter(const crm_node_t *peer)
 {
     if (peer_writer && safe_str_eq(peer->uname, peer_writer)) {
         free(peer_writer);
         peer_writer = NULL;
         crm_notice("Lost attribute writer %s", peer->uname);
     }
     election_remove(writer, peer->uname);
 }
 
 void
 attrd_xml_add_writer(xmlNode *xml)
 {
     crm_xml_add_int(xml, F_ATTRD_WRITER, election_state(writer));
 }
diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c
index 8393d5a001..3a3b92939b 100644
--- a/daemons/pacemakerd/pacemakerd.c
+++ b/daemons/pacemakerd/pacemakerd.c
@@ -1,1139 +1,1137 @@
 /*
  * Copyright 2010-2018 Andrew Beekhof <andrew@beekhof.net>
  *
  * This source code is licensed under the GNU General Public License version 2
  * or later (GPLv2+) WITHOUT ANY WARRANTY.
  */
 
 #include <crm_internal.h>
 #include "pacemakerd.h"
 
 #include <pwd.h>
 #include <grp.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <sys/reboot.h>
 
 #include <crm/msg_xml.h>
 #include <crm/common/ipcs.h>
 #include <crm/common/mainloop.h>
 #include <crm/cluster/internal.h>
 #include <crm/cluster.h>
 #ifdef SUPPORT_COROSYNC
 #include <corosync/cfg.h>
 #endif
 
 #include <dirent.h>
 #include <ctype.h>
 
 static gboolean pcmk_quorate = FALSE;
 static gboolean fatal_error = FALSE;
 static GMainLoop *mainloop = NULL;
 
 #define PCMK_PROCESS_CHECK_INTERVAL 5
 
 static const char *local_name = NULL;
 static uint32_t local_nodeid = 0;
 static crm_trigger_t *shutdown_trigger = NULL;
 static const char *pid_file = "/var/run/pacemaker.pid";
 
 typedef struct pcmk_child_s {
     int pid;
     long flag;
     int start_seq;
     int respawn_count;
     gboolean respawn;
     const char *name;
     const char *uid;
     const char *command;
 
     gboolean active_before_startup;
 } pcmk_child_t;
 
 /* Index into the array below */
 #define PCMK_CHILD_CONTROLD  3
 
 static pcmk_child_t pcmk_children[] = {
     {
         0, crm_proc_none,       0, 0, FALSE, "none",
         NULL, NULL
     },
     {
         0, crm_proc_execd,      3, 0, TRUE,  "pacemaker-execd",
         NULL, CRM_DAEMON_DIR "/pacemaker-execd"
     },
     {
         0, crm_proc_based,      1, 0, TRUE,  "pacemaker-based",
         CRM_DAEMON_USER, CRM_DAEMON_DIR "/pacemaker-based"
     },
     {
         0, crm_proc_controld,   6, 0, TRUE, "pacemaker-controld",
         CRM_DAEMON_USER, CRM_DAEMON_DIR "/pacemaker-controld"
     },
     {
         0, crm_proc_attrd,      4, 0, TRUE, "pacemaker-attrd",
         CRM_DAEMON_USER, CRM_DAEMON_DIR "/pacemaker-attrd"
     },
     {
         0, crm_proc_schedulerd, 5, 0, TRUE, "pacemaker-schedulerd",
         CRM_DAEMON_USER, CRM_DAEMON_DIR "/pacemaker-schedulerd"
     },
     {
         0, crm_proc_fenced,     2, 0, TRUE, "pacemaker-fenced",
         NULL, CRM_DAEMON_DIR "/pacemaker-fenced"
     },
 };
 
 static gboolean start_child(pcmk_child_t * child);
-static gboolean check_active_before_startup_processes(gpointer user_data);
 static gboolean update_node_processes(uint32_t id, const char *uname,
                                       uint32_t procs);
 void update_process_clients(crm_client_t *client);
 
 static uint32_t
 get_process_list(void)
 {
     int lpc = 0;
     uint32_t procs = crm_get_cluster_proc();
 
     for (lpc = 0; lpc < SIZEOF(pcmk_children); lpc++) {
         if (pcmk_children[lpc].pid != 0) {
             procs |= pcmk_children[lpc].flag;
         }
     }
     return procs;
 }
 
 static void
 pcmk_process_exit(pcmk_child_t * child)
 {
     child->pid = 0;
     child->active_before_startup = FALSE;
 
     /* Broadcast the fact that one of our processes died ASAP
      *
      * Try to get some logging of the cause out first though
      * because we're probably about to get fenced
      *
      * Potentially do this only if respawn_count > N
      * to allow for local recovery
      */
     update_node_processes(local_nodeid, NULL, get_process_list());
 
     child->respawn_count += 1;
     if (child->respawn_count > MAX_RESPAWN) {
         crm_err("Child respawn count exceeded by %s", child->name);
         child->respawn = FALSE;
     }
 
     if (shutdown_trigger) {
         mainloop_set_trigger(shutdown_trigger);
         update_node_processes(local_nodeid, NULL, get_process_list());
 
     } else if (child->respawn && crm_is_true(getenv("PCMK_fail_fast"))) {
         crm_err("Rebooting system because of %s", child->name);
         pcmk_panic(__FUNCTION__);
 
     } else if (child->respawn) {
         crm_notice("Respawning failed child process: %s", child->name);
         start_child(child);
     }
 }
 
 static void pcmk_exit_with_cluster(int exitcode)
 {
 #ifdef SUPPORT_COROSYNC
     corosync_cfg_handle_t cfg_handle;
     cs_error_t err;
 
     if (exitcode == CRM_EX_FATAL) {
 	    crm_info("Asking Corosync to shut down");
 	    err = corosync_cfg_initialize(&cfg_handle, NULL);
 	    if (err != CS_OK) {
 		    crm_warn("Unable to open handle to corosync to close it down. err=%d", err);
 	    }
 	    err = corosync_cfg_try_shutdown(cfg_handle, COROSYNC_CFG_SHUTDOWN_FLAG_IMMEDIATE);
 	    if (err != CS_OK) {
 		    crm_warn("Corosync shutdown failed. err=%d", err);
 	    }
 	    corosync_cfg_finalize(cfg_handle);
     }
 #endif
     crm_exit(exitcode);
 }
 
 static void
 pcmk_child_exit(mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode)
 {
     pcmk_child_t *child = mainloop_child_userdata(p);
     const char *name = mainloop_child_name(p);
 
     if (signo) {
         do_crm_log(((signo == SIGKILL)? LOG_WARNING : LOG_ERR),
                    "%s[%d] terminated with signal %d (core=%d)",
                    name, pid, signo, core);
 
     } else {
         switch(exitcode) {
             case CRM_EX_OK:
                 crm_info("%s[%d] exited with status %d (%s)",
                          name, pid, exitcode, crm_exit_str(exitcode));
                 break;
 
             case CRM_EX_FATAL:
                 crm_warn("Shutting cluster down because %s[%d] had fatal failure",
                          name, pid);
                 child->respawn = FALSE;
                 fatal_error = TRUE;
                 pcmk_shutdown(SIGTERM);
                 break;
 
             case CRM_EX_PANIC:
                 do_crm_log_always(LOG_EMERG,
                                   "%s[%d] instructed the machine to reset",
                                   name, pid);
                 child->respawn = FALSE;
                 fatal_error = TRUE;
                 pcmk_panic(__FUNCTION__);
                 pcmk_shutdown(SIGTERM);
                 break;
 
             default:
                 crm_err("%s[%d] exited with status %d (%s)",
                         name, pid, exitcode, crm_exit_str(exitcode));
                 break;
         }
     }
 
     pcmk_process_exit(child);
 }
 
 static gboolean
 stop_child(pcmk_child_t * child, int signal)
 {
     if (signal == 0) {
         signal = SIGTERM;
     }
 
     if (child->command == NULL) {
         crm_debug("Nothing to do for child \"%s\"", child->name);
         return TRUE;
     }
 
     if (child->pid <= 0) {
         crm_trace("Client %s not running", child->name);
         return TRUE;
     }
 
     errno = 0;
     if (kill(child->pid, signal) == 0) {
         crm_notice("Stopping %s "CRM_XS" sent signal %d to process %d",
                    child->name, signal, child->pid);
 
     } else {
         crm_perror(LOG_ERR, "Could not stop %s (process %d) with signal %d",
                    child->name, child->pid, signal);
     }
 
     return TRUE;
 }
 
 static char *opts_default[] = { NULL, NULL };
 static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL };
 
 static gboolean
 start_child(pcmk_child_t * child)
 {
     int lpc = 0;
     uid_t uid = 0;
     gid_t gid = 0;
     struct rlimit oflimits;
     gboolean use_valgrind = FALSE;
     gboolean use_callgrind = FALSE;
     const char *devnull = "/dev/null";
     const char *env_valgrind = getenv("PCMK_valgrind_enabled");
     const char *env_callgrind = getenv("PCMK_callgrind_enabled");
 
     child->active_before_startup = FALSE;
 
     if (child->command == NULL) {
         crm_info("Nothing to do for child \"%s\"", child->name);
         return TRUE;
     }
 
     if (env_callgrind != NULL && crm_is_true(env_callgrind)) {
         use_callgrind = TRUE;
         use_valgrind = TRUE;
 
     } else if (env_callgrind != NULL && strstr(env_callgrind, child->name)) {
         use_callgrind = TRUE;
         use_valgrind = TRUE;
 
     } else if (env_valgrind != NULL && crm_is_true(env_valgrind)) {
         use_valgrind = TRUE;
 
     } else if (env_valgrind != NULL && strstr(env_valgrind, child->name)) {
         use_valgrind = TRUE;
     }
 
     if (use_valgrind && strlen(VALGRIND_BIN) == 0) {
         crm_warn("Cannot enable valgrind for %s:"
                  " The location of the valgrind binary is unknown", child->name);
         use_valgrind = FALSE;
     }
 
     if (child->uid) {
         if (crm_user_lookup(child->uid, &uid, &gid) < 0) {
             crm_err("Invalid user (%s) for %s: not found", child->uid, child->name);
             return FALSE;
         }
         crm_info("Using uid=%u and group=%u for process %s", uid, gid, child->name);
     }
 
     child->pid = fork();
     CRM_ASSERT(child->pid != -1);
 
     if (child->pid > 0) {
         /* parent */
         mainloop_child_add(child->pid, 0, child->name, child, pcmk_child_exit);
 
         crm_info("Forked child %d for process %s%s", child->pid, child->name,
                  use_valgrind ? " (valgrind enabled: " VALGRIND_BIN ")" : "");
         update_node_processes(local_nodeid, NULL, get_process_list());
         return TRUE;
 
     } else {
         /* Start a new session */
         (void)setsid();
 
         /* Setup the two alternate arg arrays */
         opts_vgrind[0] = strdup(VALGRIND_BIN);
         if (use_callgrind) {
             opts_vgrind[1] = strdup("--tool=callgrind");
             opts_vgrind[2] = strdup("--callgrind-out-file=" CRM_STATE_DIR "/callgrind.out.%p");
             opts_vgrind[3] = strdup(child->command);
             opts_vgrind[4] = NULL;
         } else {
             opts_vgrind[1] = strdup(child->command);
             opts_vgrind[2] = NULL;
             opts_vgrind[3] = NULL;
             opts_vgrind[4] = NULL;
         }
         opts_default[0] = strdup(child->command);
 
         if(gid) {
             // Whether we need root group access to talk to cluster layer
             bool need_root_group = TRUE;
 
             if (is_corosync_cluster()) {
                 /* Corosync clusters can drop root group access, because we set
                  * uidgid.gid.${gid}=1 via CMAP, which allows these processes to
                  * connect to corosync.
                  */
                 need_root_group = FALSE;
             }
 
             // Drop root group access if not needed
             if (!need_root_group && (setgid(gid) < 0)) {
                 crm_perror(LOG_ERR, "Could not set group to %d", gid);
             }
 
             /* Initialize supplementary groups to only those always granted to
              * the user, plus haclient (so we can access IPC).
              */
             if (initgroups(child->uid, gid) < 0) {
                 crm_err("Cannot initialize groups for %s: %s (%d)", child->uid, pcmk_strerror(errno), errno);
             }
         }
 
         if (uid && setuid(uid) < 0) {
             crm_perror(LOG_ERR, "Could not set user to %d (%s)", uid, child->uid);
         }
 
         /* Close all open file descriptors */
         getrlimit(RLIMIT_NOFILE, &oflimits);
         for (lpc = 0; lpc < oflimits.rlim_cur; lpc++) {
             close(lpc);
         }
 
         (void)open(devnull, O_RDONLY);  /* Stdin:  fd 0 */
         (void)open(devnull, O_WRONLY);  /* Stdout: fd 1 */
         (void)open(devnull, O_WRONLY);  /* Stderr: fd 2 */
 
         if (use_valgrind) {
             (void)execvp(VALGRIND_BIN, opts_vgrind);
         } else {
             (void)execvp(child->command, opts_default);
         }
         crm_perror(LOG_ERR, "FATAL: Cannot exec %s", child->command);
         crm_exit(CRM_EX_FATAL);
     }
     return TRUE;                /* never reached */
 }
 
 /*!
  * \internal
  * \brief Timer callback that forcibly stops a child that is still running
  *
  * \param[in] data  Child (pcmk_child_t *) that was previously asked to stop
  *
  * \return FALSE always
  */
 static gboolean
 escalate_shutdown(gpointer data)
 {
     pcmk_child_t *straggler = data;
 
     if (straggler->pid == 0) {
         /* Nothing left to force */
         return FALSE;
     }
 
     /* SIGSEGV rather than SIGKILL, so a core is created and we can see what
      * the child was up to
      */
     crm_err("Child %s not terminating in a timely manner, forcing", straggler->name);
     stop_child(straggler, SIGSEGV);
     return FALSE;
 }
 
 /*!
  * \internal
  * \brief Stop child daemons in reverse start order, then quit the main loop
  *
  * The current phase is kept in static storage, so each invocation resumes
  * where the previous one stopped. A phase is finished only once every child
  * whose start_seq equals that phase has no pid recorded.
  *
  * \param[in] user_data  Ignored
  *
  * \return TRUE always
  *
  * \note NOTE(review): this is registered via mainloop_add_trigger(); what the
  *       return value means to the trigger is not visible here -- confirm.
  */
 static gboolean
 pcmk_shutdown_worker(gpointer user_data)
 {
     static int phase = 0;       // 0 means shutdown has not begun yet
     static time_t next_log = 0; // earliest time for next "still waiting" log
     static int max = SIZEOF(pcmk_children);
 
     int lpc = 0;
 
     if (phase == 0) {
         crm_notice("Shutting down Pacemaker");
         phase = max;
-
-        /* Add a second, more frequent, check to speed up shutdown */
-        g_timeout_add_seconds(5, check_active_before_startup_processes, NULL);
     }
 
     for (; phase > 0; phase--) {
         /* Don't stop anything with start_seq < 1 */
 
         for (lpc = max - 1; lpc >= 0; lpc--) {
             pcmk_child_t *child = &(pcmk_children[lpc]);
 
             if (phase != child->start_seq) {
                 continue;
             }
 
             if (child->pid) {
                 time_t now = time(NULL);
 
                 if (child->respawn) {
                     /* First visit for this child: disable respawning and
                      * ask it to terminate
                      */
                     next_log = now + 30;
                     child->respawn = FALSE;
                     stop_child(child, SIGTERM);
                     if (phase < pcmk_children[PCMK_CHILD_CONTROLD].start_seq) {
                         // Only children starting before controld get the forced-stop timer
                         g_timeout_add(180000 /* 3m */ , escalate_shutdown, child);
                     }
 
                 } else if (now >= next_log) {
                     next_log = now + 30;
                     crm_notice("Still waiting for %s to terminate "
                                CRM_XS " pid=%d seq=%d",
                                child->name, child->pid, child->start_seq);
                 }
                 // Child is still running: leave phase unchanged and try again later
                 return TRUE;
             }
 
             /* cleanup */
             crm_debug("%s confirmed stopped", child->name);
             child->pid = 0;
         }
     }
 
     /* send_cluster_id(); */
     crm_notice("Shutdown complete");
 
     {
         // Optional pause (configured in "shutdown_delay") before exiting
         const char *delay = daemon_option("shutdown_delay");
         if(delay) {
             sync();
             sleep(crm_get_msec(delay) / 1000);
         }
     }
 
     g_main_loop_quit(mainloop);
 
     if (fatal_error) {
         crm_notice("Shutting down and staying down after fatal error");
         pcmk_exit_with_cluster(CRM_EX_FATAL);
     }
 
     return TRUE;
 }
 
 /*!
  * \internal
  * \brief Signal handler that only logs the signal it received
  *
  * \param[in] nsig  Number of the received signal
  */
 static void
 pcmk_ignore(int nsig)
 {
     const char *signal_name = strsignal(nsig);
 
     crm_info("Ignoring signal %s (%d)", signal_name, nsig);
 }
 
 /*!
  * \internal
  * \brief Signal handler (installed for SIGQUIT in main()) that panics
  *
  * \param[in] nsig  Signal number (unused)
  */
 static void
 pcmk_sigquit(int nsig)
 {
     // The function name is passed as the origin of the panic
     pcmk_panic(__FUNCTION__);
 }
 
 /*!
  * \internal
  * \brief Request a shutdown by firing the shutdown trigger
  *
  * The trigger (which runs pcmk_shutdown_worker()) is created the first time
  * this is called and reused afterward.
  *
  * \param[in] nsig  Signal number (unused)
  */
 void
 pcmk_shutdown(int nsig)
 {
     if (!shutdown_trigger) {
         shutdown_trigger = mainloop_add_trigger(G_PRIORITY_HIGH,
                                                 pcmk_shutdown_worker, NULL);
     }
     mainloop_set_trigger(shutdown_trigger);
 }
 
 /*!
  * \internal
  * \brief IPC callback: register a client object for a new connection
  *
  * \param[in] c    New IPC connection
  * \param[in] uid  User ID passed through to crm_client_new()
  * \param[in] gid  Group ID passed through to crm_client_new()
  *
  * \return 0 if a client object was created, -EIO otherwise
  */
 static int32_t
 pcmk_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid)
 {
     crm_trace("Connection %p", c);
     return (crm_client_new(c, uid, gid) != NULL)? 0 : -EIO;
 }
 
 /*!
  * \internal
  * \brief IPC callback for a newly created connection (only traces it)
  *
  * \param[in] c  IPC connection
  */
 static void
 pcmk_ipc_created(qb_ipcs_connection_t * c)
 {
     crm_trace("Connection %p", c);
 }
 
 /*!
  * \internal
  * \brief IPC callback: handle a request received from a client
  *
  * The request is always acknowledged first. Then, depending on its task:
  * - CRM_OP_QUIT starts a shutdown of this daemon
  * - CRM_OP_RM_NODE_CACHE is relayed to all peers via CPG
  * - anything else is answered with the current process list
  *
  * \param[in] qbc   IPC connection the request arrived on
  * \param[in] data  Raw request data
  * \param[in] size  Size of \p data in bytes
  *
  * \return 0 always
  */
 static int32_t
 pcmk_ipc_dispatch(qb_ipcs_connection_t * qbc, void *data, size_t size)
 {
     uint32_t id = 0;
     uint32_t flags = 0;
     const char *task = NULL;
     crm_client_t *c = crm_client_get(qbc);
     xmlNode *msg = crm_ipcs_recv(c, data, size, &id, &flags);
 
     crm_ipcs_send_ack(c, id, flags, "ack", __FUNCTION__, __LINE__);
     if (msg == NULL) {
         return 0;
     }
 
     task = crm_element_value(msg, F_CRM_TASK);
     if (crm_str_eq(task, CRM_OP_QUIT, TRUE)) {
         /* Time to quit */
         crm_notice("Shutting down in response to ticket %s (%s)",
                    crm_element_value(msg, F_CRM_REFERENCE), crm_element_value(msg, F_CRM_ORIGIN));
         pcmk_shutdown(15);
 
     } else if (crm_str_eq(task, CRM_OP_RM_NODE_CACHE, TRUE)) {
         /* Send to everyone */
         struct iovec *iov = NULL;
         int node_id = 0;    // named so it does not shadow the IPC message id
         const char *name = NULL;
 
         crm_element_value_int(msg, XML_ATTR_ID, &node_id);
         name = crm_element_value(msg, XML_ATTR_UNAME);
         crm_notice("Instructing peers to remove references to node %s/%u",
                    name, (unsigned int) node_id);
 
         iov = calloc(1, sizeof(struct iovec));
         CRM_ASSERT(iov);    // same allocation check as update_process_peers()
         iov->iov_base = dump_xml_unformatted(msg);
         iov->iov_len = 1 + strlen(iov->iov_base);   // include the terminator
 
         // NOTE(review): iov is not freed here; presumably send_cpg_iov()
         // takes ownership of it -- confirm
         send_cpg_iov(iov);
 
     } else {
         update_process_clients(c);
     }
 
     free_xml(msg);
     return 0;
 }
 
 /*!
  * \internal
  * \brief IPC callback: destroy the client object of a closed connection
  *
  * \param[in] c  IPC connection that was closed
  *
  * \return 0 always
  */
 static int32_t
 pcmk_ipc_closed(qb_ipcs_connection_t * c)
 {
     crm_client_t *closed_client = crm_client_get(c);
 
     if (closed_client != NULL) {
         crm_trace("Connection %p", c);
         crm_client_destroy(closed_client);
     }
     return 0;
 }
 
 /*!
  * \internal
  * \brief IPC callback for a destroyed connection
  *
  * \param[in] c  IPC connection being destroyed
  */
 static void
 pcmk_ipc_destroy(qb_ipcs_connection_t * c)
 {
     crm_trace("Connection %p", c);
     // Reuse the "closed" handler so any remaining client object is destroyed
     pcmk_ipc_closed(c);
 }
 
 // Handlers for the IPC server that main() creates under CRM_SYSTEM_MCP
 struct qb_ipcs_service_handlers mcp_ipc_callbacks = {
     .connection_accept = pcmk_ipc_accept,
     .connection_created = pcmk_ipc_created,
     .msg_process = pcmk_ipc_dispatch,
     .connection_closed = pcmk_ipc_closed,
     .connection_destroyed = pcmk_ipc_destroy
 };
 
 /*!
  * \internal
  * \brief Send the process lists of all cached peers to IPC client(s) as XML
  *
  * The message is a "nodes" element with one "node" child (id, uname, state,
  * processes) per entry in the peer cache. On Corosync clusters, the current
  * quorum state is added as a "quorate" attribute.
  *
  * \param[in] client  Client to notify, or NULL to notify every client
  */
 void
 update_process_clients(crm_client_t *client)
 {
     GHashTableIter iter;
     crm_node_t *peer = NULL;
     xmlNode *nodes_xml = create_xml_node(NULL, "nodes");
 
     if (is_corosync_cluster()) {
         crm_xml_add_int(nodes_xml, "quorate", pcmk_quorate);
     }
 
     g_hash_table_iter_init(&iter, crm_peer_cache);
     while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &peer)) {
         xmlNode *peer_xml = create_xml_node(nodes_xml, "node");
 
         crm_xml_add_int(peer_xml, "id", peer->id);
         crm_xml_add(peer_xml, "uname", peer->uname);
         crm_xml_add(peer_xml, "state", peer->state);
         crm_xml_add_int(peer_xml, "processes", peer->processes);
     }
 
     if (client == NULL) {
         /* Broadcast to every connected client */
         crm_client_t *recipient = NULL;
 
         crm_trace("Sending process list to %d clients", crm_hash_table_size(client_connections));
         g_hash_table_iter_init(&iter, client_connections);
         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &recipient)) {
             crm_ipcs_send(recipient, 0, nodes_xml, crm_ipc_server_event);
         }
 
     } else {
         crm_trace("Sending process list to client %s", client->id);
         crm_ipcs_send(client, 0, nodes_xml, crm_ipc_server_event);
     }
 
     free_xml(nodes_xml);
 }
 
 /*!
  * \internal
  * \brief Broadcast the local node's process list to all peers via CPG
  *
  * The message is a single "node" XML element carrying the process list, plus
  * the local node name when that is known.
  */
 static void
 update_process_peers(void)
 {
     struct iovec *message = calloc(1, sizeof(struct iovec));
     char *payload = NULL;
 
     CRM_ASSERT(message);
     if (local_name == NULL) {
         payload = crm_strdup_printf("<node proclist=\"%u\"/>",
                                     get_process_list());
     } else {
         payload = crm_strdup_printf("<node uname=\"%s\" proclist=\"%u\"/>",
                                     local_name, get_process_list());
     }
     message->iov_base = payload;
     message->iov_len = strlen(payload) + 1;     /* include the terminator */
     crm_trace("Sending %s", payload);
     send_cpg_iov(message);
 }
 
 /*!
  * \internal
  * \brief Record a node's process list, notifying clients and peers if needed
  *
  * A process list of 0 is ignored. Clients and peers are notified from here
  * only when the affected node is the local node.
  *
  * \param[in] id     Node ID of affected node
  * \param[in] uname  Uname of affected node
  * \param[in] procs  Affected node's process list mask
  *
  * \return TRUE if the process list changed, FALSE otherwise
  */
 static gboolean
 update_node_processes(uint32_t id, const char *uname, uint32_t procs)
 {
     crm_node_t *peer = crm_get_peer(id, uname);
 
     if (procs == 0) {
         return FALSE;
     }
 
     if (procs == peer->processes) {
         crm_trace("Node %s still has process list: %.32x", peer->uname, procs);
         return FALSE;
     }
 
     crm_debug("Node %s now has process list: %.32x (was %.32x)",
               peer->uname, procs, peer->processes);
     peer->processes = procs;
 
     /* If local node's processes have changed, notify clients/peers */
     if (id == local_nodeid) {
         update_process_clients(NULL);
         update_process_peers();
     }
     return TRUE;
 }
 
 
 /* *INDENT-OFF* */
 // Command-line options, registered in main() via crm_set_options(); the
 // fourth field of each entry is the flag character handled in main()'s switch
 static struct crm_option long_options[] = {
     /* Top-level Options */
     {"help",           0, 0, '?', "\tThis text"},
     {"version",        0, 0, '$', "\tVersion information"  },
     {"verbose",        0, 0, 'V', "\tIncrease debug output"},
     {"shutdown",       0, 0, 'S', "\tInstruct Pacemaker to shutdown on this machine"},
     {"features",       0, 0, 'F', "\tDisplay the full version and list of features Pacemaker was built with"},
 
     {"-spacer-",       1, 0, '-', "\nAdditional Options:"},
     {"foreground",     0, 0, 'f', "\t(Ignored) Pacemaker always runs in the foreground"},
     {"pid-file",       1, 0, 'p', "\t(Ignored) Daemon pid file location"},
     {"standby",        0, 0, 's', "\tStart node in standby state"},
 
     // All-zero entry ends the list
     {NULL, 0, 0, 0}
 };
 /* *INDENT-ON* */
 
 /*!
  * \internal
  * \brief Change ownership of a path, logging a warning on failure
  *
  * \param[in] path  Path whose ownership should be changed
  * \param[in] uid   New owner
  * \param[in] gid   New group
  */
 static void
 mcp_chown(const char *path, uid_t uid, gid_t gid)
 {
     if (chown(path, uid, gid) >= 0) {
         return;
     }
 
     /* Failure is not fatal, so just report it */
     crm_warn("Cannot change the ownership of %s to user %s and gid %d: %s",
              path, CRM_DAEMON_USER, gid, pcmk_strerror(errno));
 }
 
+#if SUPPORT_PROCFS
 /*!
  * \internal
  * \brief Recurring timer: check on processes found running at startup
  *
  * Any such process that is no longer active is reported and handed to
  * pcmk_process_exit().
  *
  * \param[in] user_data  Ignored
  *
  * \return TRUE if at least one process found at startup is still active
  *         (keeping the recurring timer around), FALSE otherwise
  */
 static gboolean
 check_active_before_startup_processes(gpointer user_data)
 {
     static int max = SIZEOF(pcmk_children);
     int seq = 0;
     int idx = 0;
     gboolean still_tracking = FALSE;
 
     for (seq = 1; seq < max; seq++) {
         for (idx = 0; idx < max; idx++) {
             pcmk_child_t *child = &(pcmk_children[idx]);
 
             if (child->active_before_startup == FALSE) {
                 /* we are already tracking it as a child process. */
                 continue;
             }
             if (seq != child->start_seq) {
                 continue;
             }
             if (crm_pid_active(child->pid, child->name) != 1) {
                 crm_notice("Process %s terminated (pid=%d)",
                        child->name, child->pid);
                 pcmk_process_exit(child);
                 continue;
             }
 
             /* at least one of the processes found at startup
              * is still going, so keep this recurring timer around */
             still_tracking = TRUE;
         }
     }
 
     return still_tracking;
 }
+#endif // SUPPORT_PROCFS
 
 /*!
  * \internal
  * \brief Scan /proc for already-running child daemons and start tracking them
  *
  * Each match has its pid recorded and is flagged as active before startup.
  * If anything matched, a recurring check of those processes is scheduled.
  * Without procfs support, this only logs a notice.
  */
 static void
 find_and_track_existing_processes(void)
 {
 #if SUPPORT_PROCFS
     DIR *proc_dir = opendir("/proc");
     struct dirent *proc_entry = NULL;
     bool found_existing = FALSE;
     char proc_name[16];
 
     if (proc_dir == NULL) {
         /* no proc directory to search through */
         crm_notice("Can not read /proc directory to track existing components");
         return;
     }
 
     while ((proc_entry = readdir(proc_dir)) != NULL) {
         int pid;
         int n_children = SIZEOF(pcmk_children);
         int idx;
 
         if (crm_procfs_process_info(proc_entry, proc_name, &pid) < 0) {
             continue;
         }
 
         for (idx = 0; idx < n_children; idx++) {
             pcmk_child_t *child = &(pcmk_children[idx]);
 
             if (child->start_seq == 0) {
                 continue;
             }
             if (strncmp(proc_name, child->name, 15) != 0) {
                 continue;
             }
             if (crm_pid_active(pid, NULL) != 1) {
                 continue;
             }
 
             crm_notice("Tracking existing %s process (pid=%d)",
                        child->name, pid);
             child->pid = pid;
             child->active_before_startup = TRUE;
             found_existing = TRUE;
             break;
         }
     }
 
     if (found_existing) {
         g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
                               check_active_before_startup_processes, NULL);
     }
     closedir(proc_dir);
 #else
     crm_notice("No procfs support, so skipping check for existing components");
 #endif // SUPPORT_PROCFS
 }
 
 static void
 init_children_processes(void)
 {
     int start_seq = 1, lpc = 0;
     static int max = SIZEOF(pcmk_children);
 
     /* start any children that have not been detected */
     for (start_seq = 1; start_seq < max; start_seq++) {
         /* don't start anything with start_seq < 1 */
         for (lpc = 0; lpc < max; lpc++) {
             if (pcmk_children[lpc].pid) {
                 /* we are already tracking it */
                 continue;
             }
 
             if (start_seq == pcmk_children[lpc].start_seq) {
                 start_child(&(pcmk_children[lpc]));
             }
         }
     }
 
     /* From this point on, any daemons being started will be due to
      * respawning rather than node start.
      *
      * This may be useful for the daemons to know
      */
     setenv("PCMK_respawned", "true", 1);
 }
 
 /*!
  * \internal
  * \brief Cluster "destroy" callback (set as cluster.destroy in main())
  *
  * \param[in] user_data  Ignored
  */
 static void
 mcp_cpg_destroy(gpointer user_data)
 {
     crm_crit("Lost connection to cluster layer, shutting down");
     crm_exit(CRM_EX_DISCONNECT);
 }
 
 /*!
  * \internal
  * \brief Handle a CPG message (peer process list or peer cache removal)
  *
  * A message without a task is treated as a process list; the local node's
  * own process list is ignored.
  *
  * \param[in] handle     CPG connection (ignored)
  * \param[in] groupName  CPG group name (ignored)
  * \param[in] nodeid     ID of affected node
  * \param[in] pid        Process ID (ignored)
  * \param[in] msg        CPG XML message
  * \param[in] msg_len    Length of msg in bytes (ignored)
  */
 static void
 mcp_cpg_deliver(cpg_handle_t handle,
                  const struct cpg_name *groupName,
                  uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
 {
     xmlNode *parsed = string2xml(msg);
     const char *task = crm_element_value(parsed, F_CRM_TASK);
 
     crm_trace("Received CPG message (%s): %.200s",
               (task? task : "process list"), (char*)msg);
 
     if (task != NULL) {
         if (crm_str_eq(task, CRM_OP_RM_NODE_CACHE, TRUE)) {
             int remove_id = 0;
             const char *remove_name = NULL;
 
             crm_element_value_int(parsed, XML_ATTR_ID, &remove_id);
             remove_name = crm_element_value(parsed, XML_ATTR_UNAME);
             reap_crm_member(remove_id, remove_name);
         }
 
     } else if (nodeid == local_nodeid) {
         crm_debug("Ignoring message with local node's process list");
 
     } else {
         uint32_t procs = 0;
         const char *uname = crm_element_value(parsed, "uname");
 
         crm_element_value_int(parsed, "proclist", (int *)&procs);
         if (update_node_processes(nodeid, uname, procs)) {
             update_process_clients(NULL);
         }
     }
 
     if (parsed != NULL) {
         free_xml(parsed);
     }
 }
 
 /*!
  * \internal
  * \brief CPG membership-change callback (set as cpg_confchg_fn in main())
  *
  * All arguments are passed unchanged to pcmk_cpg_membership().
  *
  * \param[in] handle               CPG connection
  * \param[in] groupName            CPG group name
  * \param[in] member_list          Member list
  * \param[in] member_list_entries  Number of entries in \p member_list
  * \param[in] left_list            Left list
  * \param[in] left_list_entries    Number of entries in \p left_list
  * \param[in] joined_list          Joined list
  * \param[in] joined_list_entries  Number of entries in \p joined_list
  */
 static void
 mcp_cpg_membership(cpg_handle_t handle,
                     const struct cpg_name *groupName,
                     const struct cpg_address *member_list, size_t member_list_entries,
                     const struct cpg_address *left_list, size_t left_list_entries,
                     const struct cpg_address *joined_list, size_t joined_list_entries)
 {
     /* Update peer cache if needed */
     pcmk_cpg_membership(handle, groupName, member_list, member_list_entries,
                         left_list, left_list_entries,
                         joined_list, joined_list_entries);
 
     /* Always broadcast our own presence after any membership change */
     update_process_peers();
 }
 
 /*!
  * \internal
  * \brief Quorum callback: remember the latest quorum state in pcmk_quorate
  *
  * \param[in] seq      Unused here
  * \param[in] quorate  New quorum state
  *
  * \return TRUE always
  */
 static gboolean
 mcp_quorum_callback(unsigned long long seq, gboolean quorate)
 {
     pcmk_quorate = quorate;
     return TRUE;
 }
 
 /*!
  * \internal
  * \brief Quorum "destroy" callback (passed to cluster_connect_quorum())
  *
  * \param[in] user_data  Ignored
  */
 static void
 mcp_quorum_destroy(gpointer user_data)
 {
     crm_info("connection lost");
 }
 
 /*!
  * \brief Entry point: parse options, then either ask a running instance to
  *        shut down or start up as the Pacemaker master control process
  *
  * Startup order: check for an existing instance over IPC, read the
  * configuration, raise the core file size limit, create the working
  * directories, start the IPC server, connect to the cluster services, start
  * the child daemons, and run the main loop.
  *
  * \param[in] argc  Number of command-line arguments
  * \param[in] argv  Command-line arguments
  *
  * \return Exit status (the result of crm_exit())
  */
 int
 main(int argc, char **argv)
 {
     int rc;
     int flag;
     int argerr = 0;
 
     int option_index = 0;
     gboolean shutdown = FALSE;
 
     uid_t pcmk_uid = 0;
     gid_t pcmk_gid = 0;
     struct rlimit cores;
     crm_ipc_t *old_instance = NULL;
     qb_ipcs_service_t *ipcs = NULL;
     static crm_cluster_t cluster;
 
     crm_log_preinit(NULL, argc, argv);
     crm_set_options(NULL, "mode [options]", long_options, "Start/Stop Pacemaker\n");
     mainloop_add_signal(SIGHUP, pcmk_ignore);
     mainloop_add_signal(SIGQUIT, pcmk_sigquit);
 
     // Process command-line options (flag characters come from long_options)
     while (1) {
         flag = crm_get_option(argc, argv, &option_index);
         if (flag == -1)
             break;
 
         switch (flag) {
             case 'V':
                 crm_bump_log_level(argc, argv);
                 break;
             case 'f':
                 /* Legacy */
                 break;
             case 'p':
                 pid_file = optarg;
                 break;
             case 's':
                 set_daemon_option("node_start_state", "standby");
                 break;
             case '$':
             case '?':
                 crm_help(flag, CRM_EX_OK);
                 break;
             case 'S':
                 shutdown = TRUE;
                 break;
             case 'F':
                 printf("Pacemaker %s (Build: %s)\n Supporting v%s: %s\n", PACEMAKER_VERSION, BUILD_VERSION,
                        CRM_FEATURE_SET, CRM_FEATURES);
                 // No break follows: presumably crm_exit() does not return -- confirm
                 crm_exit(CRM_EX_OK);
             default:
                 printf("Argument code 0%o (%c) is not (?yet?) supported\n", flag, flag);
                 ++argerr;
                 break;
         }
     }
 
     if (optind < argc) {
         printf("non-option ARGV-elements: ");
         while (optind < argc)
             printf("%s ", argv[optind++]);
         printf("\n");
     }
     if (argerr) {
         crm_help('?', CRM_EX_USAGE);
     }
 
 
     setenv("LC_ALL", "C", 1);
 
     set_daemon_option("mcp", "true");
 
     crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE);
 
     // Try to connect to an already running instance via its IPC server
     crm_debug("Checking for old instances of %s", CRM_SYSTEM_MCP);
     old_instance = crm_ipc_new(CRM_SYSTEM_MCP, 0);
     crm_ipc_connect(old_instance);
 
     if (shutdown) {
         // --shutdown: keep sending quit requests while the connection lasts
         crm_debug("Terminating previous instance");
         while (crm_ipc_connected(old_instance)) {
             xmlNode *cmd =
                 create_request(CRM_OP_QUIT, NULL, NULL, CRM_SYSTEM_MCP, CRM_SYSTEM_MCP, NULL);
 
             crm_debug(".");
             crm_ipc_send(old_instance, cmd, 0, 0, NULL);
             free_xml(cmd);
 
             sleep(2);
         }
         crm_ipc_close(old_instance);
         crm_ipc_destroy(old_instance);
         crm_exit(CRM_EX_OK);
 
     } else if (crm_ipc_connected(old_instance)) {
         // Normal start, but another instance answered: refuse to start
         crm_ipc_close(old_instance);
         crm_ipc_destroy(old_instance);
         crm_err("Pacemaker is already active, aborting startup");
         crm_exit(CRM_EX_FATAL);
     }
 
     crm_ipc_close(old_instance);
     crm_ipc_destroy(old_instance);
 
     if (mcp_read_config() == FALSE) {
         crm_notice("Could not obtain corosync config data, exiting");
         crm_exit(CRM_EX_UNAVAILABLE);
     }
 
     // OCF shell functions and cluster-glue need facility under different name
     {
         const char *facility = daemon_option("logfacility");
 
         if (facility && safe_str_neq(facility, "none")) {
             setenv("HA_LOGFACILITY", facility, 1);
         }
     }
 
     crm_notice("Starting Pacemaker %s "CRM_XS" build=%s features:%s",
                PACEMAKER_VERSION, BUILD_VERSION, CRM_FEATURES);
     mainloop = g_main_loop_new(NULL, FALSE);
     sysrq_init();
 
     // Raise the soft core file size limit to the hard limit
     rc = getrlimit(RLIMIT_CORE, &cores);
     if (rc < 0) {
         crm_perror(LOG_ERR, "Cannot determine current maximum core size.");
     } else {
         if (cores.rlim_max == 0 && geteuid() == 0) {
             // Running as root with a hard limit of 0: request unlimited size
             cores.rlim_max = RLIM_INFINITY;
         } else {
             crm_info("Maximum core file size is: %lu", (unsigned long)cores.rlim_max);
         }
         cores.rlim_cur = cores.rlim_max;
 
         rc = setrlimit(RLIMIT_CORE, &cores);
         if (rc < 0) {
             crm_perror(LOG_ERR,
                        "Core file generation will remain disabled."
                        " Core files are an important diagnostic tool, so"
                        " please consider enabling them by default.");
         }
     }
 
     if (crm_user_lookup(CRM_DAEMON_USER, &pcmk_uid, &pcmk_gid) < 0) {
         crm_err("Cluster user %s does not exist, aborting Pacemaker startup", CRM_DAEMON_USER);
         crm_exit(CRM_EX_NOUSER);
     }
 
     // The return value of mkdir() is not checked here
     mkdir(CRM_STATE_DIR, 0750);
     mcp_chown(CRM_STATE_DIR, pcmk_uid, pcmk_gid);
 
     /* Used to store core/blackbox/scheduler/cib files in */
     crm_build_path(CRM_PACEMAKER_DIR, 0750);
     mcp_chown(CRM_PACEMAKER_DIR, pcmk_uid, pcmk_gid);
 
     /* Used to store core files in */
     crm_build_path(CRM_CORE_DIR, 0750);
     mcp_chown(CRM_CORE_DIR, pcmk_uid, pcmk_gid);
 
     /* Used to store blackbox dumps in */
     crm_build_path(CRM_BLACKBOX_DIR, 0750);
     mcp_chown(CRM_BLACKBOX_DIR, pcmk_uid, pcmk_gid);
 
     // Used to store scheduler inputs in
     crm_build_path(PE_STATE_DIR, 0750);
     mcp_chown(PE_STATE_DIR, pcmk_uid, pcmk_gid);
 
     /* Used to store the cluster configuration */
     crm_build_path(CRM_CONFIG_DIR, 0750);
     mcp_chown(CRM_CONFIG_DIR, pcmk_uid, pcmk_gid);
 
     // Don't build CRM_RSCTMP_DIR, pacemaker-execd will do it
 
     ipcs = mainloop_add_ipc_server(CRM_SYSTEM_MCP, QB_IPC_NATIVE, &mcp_ipc_callbacks);
     if (ipcs == NULL) {
         crm_err("Couldn't start IPC server");
         crm_exit(CRM_EX_OSERR);
     }
 
     /* Allows us to block shutdown */
     if (cluster_connect_cfg(&local_nodeid) == FALSE) {
         crm_err("Couldn't connect to Corosync's CFG service");
         crm_exit(CRM_EX_PROTOCOL);
     }
 
     // Export the result of pcmk_locate_sbd() via the environment
     if(pcmk_locate_sbd() > 0) {
         setenv("PCMK_watchdog", "true", 1);
     } else {
         setenv("PCMK_watchdog", "false", 1);
     }
 
     find_and_track_existing_processes();
 
     cluster.destroy = mcp_cpg_destroy;
     cluster.cpg.cpg_deliver_fn = mcp_cpg_deliver;
     cluster.cpg.cpg_confchg_fn = mcp_cpg_membership;
 
     crm_set_autoreap(FALSE);
 
     rc = pcmk_ok;
 
     if (cluster_connect_cpg(&cluster) == FALSE) {
         crm_err("Couldn't connect to Corosync's CPG service");
         rc = -ENOPROTOOPT;
 
     } else if (cluster_connect_quorum(mcp_quorum_callback, mcp_quorum_destroy)
                == FALSE) {
         rc = -ENOTCONN;
 
     } else {
         local_name = get_local_node_name();
         update_node_processes(local_nodeid, local_name, get_process_list());
 
         // From here on, SIGTERM and SIGINT request a shutdown
         mainloop_add_signal(SIGTERM, pcmk_shutdown);
         mainloop_add_signal(SIGINT, pcmk_shutdown);
 
         init_children_processes();
 
         crm_info("Starting mainloop");
 
         g_main_loop_run(mainloop);
     }
 
     // Cleanup shared by the failure paths above and a finished main loop
     if (ipcs) {
         crm_trace("Closing IPC server");
         mainloop_del_ipc_server(ipcs);
         ipcs = NULL;
     }
 
     g_main_loop_unref(mainloop);
 
     cluster_disconnect_cpg(&cluster);
     cluster_disconnect_cfg();
 
     return crm_exit(crm_errno2exit(rc));
 }
diff --git a/doc/Clusters_from_Scratch/en-US/Book_Info.xml b/doc/Clusters_from_Scratch/en-US/Book_Info.xml
index cfabd51e1c..68be4424c1 100644
--- a/doc/Clusters_from_Scratch/en-US/Book_Info.xml
+++ b/doc/Clusters_from_Scratch/en-US/Book_Info.xml
@@ -1,71 +1,69 @@
 <?xml version='1.0' encoding='utf-8' ?>
 <!DOCTYPE bookinfo PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
 <!ENTITY % BOOK_ENTITIES SYSTEM "Clusters_from_Scratch.ent">
 %BOOK_ENTITIES;
 ]>
 <bookinfo id="book-Clusters_from_Scratch-Clusters_from_Scratch">
-	<title>Clusters from Scratch</title>
-	<subtitle>Step-by-Step Instructions for Building Your First High-Availability Cluster</subtitle>
-	<productname>Pacemaker</productname>
-	<productnumber>2.0</productnumber>
-	<!--
-		EDITION-PUBSNUMBER should match REVNUMBER in Revision_History.xml.
-		Increment EDITION when the syntax of the documented software
-		changes (OS, pacemaker, corosync, pcs), and PUBSNUMBER for
-		simple textual changes (corrections, translations, etc.).
-	-->
-	<edition>10</edition>
-	<pubsnumber>1</pubsnumber>
-	<abstract>
-		<para>
-			This document provides a step-by-step guide to building a simple high-availability cluster using Pacemaker.
-		</para>
-		<para>
-			The example cluster will use:
-			<orderedlist>
-				<listitem>
-					<para>
-						&DISTRO; &DISTRO_VERSION; as the host operating system
-					</para>
-				</listitem>
-				<listitem>
-					<para>
-						Corosync to provide messaging and membership services,
-					</para>
-				</listitem>
-				<listitem>
-					<para>
-						Pacemaker 1.1.18
-						<footnote><para>While this guide is part of the document set for
-						Pacemaker 2.0, it demonstrates the version available in
-						the standard &DISTRO; repositories</para></footnote>
-						to perform resource management,
-					</para>
-				</listitem>
-				<listitem>
-					<para>
-						DRBD as a cost-effective alternative to shared storage,
-					</para>
-				</listitem>
-				<listitem>
-					<para>
-						GFS2 as the cluster filesystem (in active/active mode)
-					</para>
-				</listitem>
-			</orderedlist>
-		</para>
-		<para>
-			Given the graphical nature of the install process, a number of screenshots are included. However the guide is primarily composed of commands, the reasons for executing them and their expected outputs.
-		</para>
-	</abstract>
-	<corpauthor>
-		<inlinemediaobject>
-			<imageobject>
-				<imagedata fileref="Common_Content/images/title_logo.svg" format="SVG" />
-			</imageobject>
-		</inlinemediaobject>
-	</corpauthor>
-	<xi:include href="Common_Content/Legal_Notice.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
-	<xi:include href="Author_Group.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
+        <title>Clusters from Scratch</title>
+        <subtitle>Step-by-Step Instructions for Building Your First High-Availability Cluster</subtitle>
+        <productname>Pacemaker</productname>
+        <productnumber>2.0</productnumber>
+        <!--
+                EDITION-PUBSNUMBER should match REVNUMBER in Revision_History.xml.
+                Increment EDITION when the syntax of the documented software
+                changes (OS, pacemaker, corosync, pcs), and PUBSNUMBER for
+                simple textual changes (corrections, translations, etc.).
+        -->
+        <edition>10</edition>
+        <pubsnumber>2</pubsnumber>
+        <abstract>
+                <para>
+                        This document provides a step-by-step guide to building a simple high-availability cluster using Pacemaker.
+                </para>
+                <para>
+                        The example cluster will use:
+                        <orderedlist>
+                                <listitem>
+                                        <para>
+                                                &DISTRO; &DISTRO_VERSION; as the host operating system
+                                        </para>
+                                </listitem>
+                                <listitem>
+                                        <para>
+                                                Corosync to provide messaging and membership services,
+                                        </para>
+                                </listitem>
+                                <listitem>
+                                        <para>
+                                                Pacemaker 1.1.18
+                                                <footnote><para>While this guide is part of the document set for
+                                                Pacemaker 2.0, it demonstrates the version available in
+                                                the standard &DISTRO; repositories.</para></footnote>
+                                        </para>
+                                </listitem>
+                                <listitem>
+                                        <para>
+                                                DRBD as a cost-effective alternative to shared storage,
+                                        </para>
+                                </listitem>
+                                <listitem>
+                                        <para>
+                                                GFS2 as the cluster filesystem (in active/active mode)
+                                        </para>
+                                </listitem>
+                        </orderedlist>
+                </para>
+                <para>
+                        Given the graphical nature of the install process, a number of screenshots are included. However, the guide is primarily composed of commands, the reasons for executing them and their expected outputs.
+                </para>
+        </abstract>
+        <corpauthor>
+                <inlinemediaobject>
+                        <imageobject>
+                                <imagedata fileref="Common_Content/images/title_logo.svg" format="SVG" />
+                        </imageobject>
+                </inlinemediaobject>
+        </corpauthor>
+        <xi:include href="Common_Content/Legal_Notice.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
+        <xi:include href="Author_Group.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
 </bookinfo>
-
diff --git a/doc/Clusters_from_Scratch/en-US/Revision_History.xml b/doc/Clusters_from_Scratch/en-US/Revision_History.xml
index 537b31e9da..05ef1be65a 100644
--- a/doc/Clusters_from_Scratch/en-US/Revision_History.xml
+++ b/doc/Clusters_from_Scratch/en-US/Revision_History.xml
@@ -1,85 +1,93 @@
 <?xml version='1.0' encoding='utf-8' ?>
 <!DOCTYPE appendix PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
 <!ENTITY % BOOK_ENTITIES SYSTEM "Clusters_from_Scratch.ent">
 %BOOK_ENTITIES;
 ]>
 <appendix id="appe-Clusters_from_Scratch-Revision_History">
-	<!-- see comment in Book_Info.xml for revision numbering -->
-	<title>Revision History</title>
-	<simpara>
-		<revhistory>
-			<revision>
-			  <revnumber>1-0</revnumber>
-			  <date>Mon May 17 2010</date>
-			  <author><firstname>Andrew</firstname><surname>Beekhof</surname><email>andrew@beekhof.net</email></author>
-			  <revdescription><simplelist><member>Import from Pages.app</member></simplelist></revdescription>
-			</revision>
-			<revision>
-			  <revnumber>2-0</revnumber>
-			  <date>Wed Sep 22 2010</date>
-			  <author><firstname>Raoul</firstname><surname>Scarazzini</surname><email>rasca@miamammausalinux.org</email></author>
-			  <revdescription><simplelist><member>Italian translation</member></simplelist></revdescription>
-			</revision>
-			<revision>
-			  <revnumber>3-0</revnumber>
-			  <date>Wed Feb 9 2011</date>
-			  <author><firstname>Andrew</firstname><surname>Beekhof</surname><email>andrew@beekhof.net</email></author>
-			  <revdescription><simplelist><member>Updated for Fedora 13</member></simplelist></revdescription>
-			</revision>
-			<revision>
-			  <revnumber>4-0</revnumber>
-			  <date>Wed Oct 5 2011</date>
-			  <author><firstname>Andrew</firstname><surname>Beekhof</surname><email>andrew@beekhof.net</email></author>
-			  <revdescription><simplelist><member>Update the GFS2 section to use CMAN</member></simplelist></revdescription>
-			</revision>
-			<revision>
-			  <revnumber>5-0</revnumber>
-			  <date>Fri Feb 10 2012</date>
-			  <author><firstname>Andrew</firstname><surname>Beekhof</surname><email>andrew@beekhof.net</email></author>
-			  <revdescription><simplelist><member>Generate docbook content from asciidoc sources</member></simplelist></revdescription>
-			</revision>
-			<revision>
-			  <revnumber>6-0</revnumber>
-			  <date>Tues July 3 2012</date>
-			  <author><firstname>Andrew</firstname><surname>Beekhof</surname><email>andrew@beekhof.net</email></author>
-			  <revdescription><simplelist><member>Updated for Fedora 17</member></simplelist></revdescription>
-			</revision>
-			<revision>
-			  <revnumber>7-0</revnumber>
-			  <date>Fri Sept 14 2012</date>
-			  <author><firstname>David</firstname><surname>Vossel</surname><email>davidvossel@gmail.com</email></author>
-			  <revdescription><simplelist><member>Updated for pcs</member></simplelist></revdescription>
-			</revision>
-			<revision>
-			  <revnumber>8-0</revnumber>
-			  <date>Mon Jan 05 2015</date>
-			  <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
-			  <revdescription><simplelist><member>Updated for Fedora 21</member></simplelist></revdescription>
-			</revision>
-			<revision>
-			  <revnumber>8-1</revnumber>
-			  <date>Thu Jan 08 2015</date>
-			  <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
-			  <revdescription><simplelist><member>Minor corrections, plus use include file for intro</member></simplelist></revdescription>
-			</revision>
-			<revision>
-			  <revnumber>9-0</revnumber>
-			  <date>Fri Aug 14 2015</date>
-			  <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
-			  <revdescription><simplelist><member>Update for CentOS 7.1 and leaving firewalld/SELinux enabled</member></simplelist></revdescription>
-			</revision>
-			<revision>
-			  <revnumber>10-0</revnumber>
-			  <date>Fri Jan 12 2018</date>
-			  <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
-			  <revdescription><simplelist><member>Update banner for Pacemaker 2.0 and content for CentOS 7.4 with Pacemaker 1.1.16</member></simplelist></revdescription>
-			</revision>
-			<revision>
-			  <revnumber>10-1</revnumber>
-			  <date>Wed Sep 5 2018</date>
-			  <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
-			  <revdescription><simplelist><member>Update for CentOS 7.5 with Pacemaker 1.1.18</member></simplelist></revdescription>
-			</revision>
-		</revhistory>
-	</simpara>
+        <!-- see comment in Book_Info.xml for revision numbering -->
+        <title>Revision History</title>
+        <simpara>
+                <revhistory>
+                        <revision>
+                          <revnumber>1-0</revnumber>
+                          <date>Mon May 17 2010</date>
+                          <author><firstname>Andrew</firstname><surname>Beekhof</surname><email>andrew@beekhof.net</email></author>
+                          <revdescription><simplelist><member>Import from Pages.app</member></simplelist></revdescription>
+                        </revision>
+                        <revision>
+                          <revnumber>2-0</revnumber>
+                          <date>Wed Sep 22 2010</date>
+                          <author><firstname>Raoul</firstname><surname>Scarazzini</surname><email>rasca@miamammausalinux.org</email></author>
+                          <revdescription><simplelist><member>Italian translation</member></simplelist></revdescription>
+                        </revision>
+                        <revision>
+                          <revnumber>3-0</revnumber>
+                          <date>Wed Feb 9 2011</date>
+                          <author><firstname>Andrew</firstname><surname>Beekhof</surname><email>andrew@beekhof.net</email></author>
+                          <revdescription><simplelist><member>Updated for Fedora 13</member></simplelist></revdescription>
+                        </revision>
+                        <revision>
+                          <revnumber>4-0</revnumber>
+                          <date>Wed Oct 5 2011</date>
+                          <author><firstname>Andrew</firstname><surname>Beekhof</surname><email>andrew@beekhof.net</email></author>
+                          <revdescription><simplelist><member>Update the GFS2 section to use CMAN</member></simplelist></revdescription>
+                        </revision>
+                        <revision>
+                          <revnumber>5-0</revnumber>
+                          <date>Fri Feb 10 2012</date>
+                          <author><firstname>Andrew</firstname><surname>Beekhof</surname><email>andrew@beekhof.net</email></author>
+                          <revdescription><simplelist><member>Generate docbook content from asciidoc sources</member></simplelist></revdescription>
+                        </revision>
+                        <revision>
+                          <revnumber>6-0</revnumber>
+                          <date>Tues July 3 2012</date>
+                          <author><firstname>Andrew</firstname><surname>Beekhof</surname><email>andrew@beekhof.net</email></author>
+                          <revdescription><simplelist><member>Updated for Fedora 17</member></simplelist></revdescription>
+                        </revision>
+                        <revision>
+                          <revnumber>7-0</revnumber>
+                          <date>Fri Sept 14 2012</date>
+                          <author><firstname>David</firstname><surname>Vossel</surname><email>davidvossel@gmail.com</email></author>
+                          <revdescription><simplelist><member>Updated for pcs</member></simplelist></revdescription>
+                        </revision>
+                        <revision>
+                          <revnumber>8-0</revnumber>
+                          <date>Mon Jan 05 2015</date>
+                          <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
+                          <revdescription><simplelist><member>Updated for Fedora 21</member></simplelist></revdescription>
+                        </revision>
+                        <revision>
+                          <revnumber>8-1</revnumber>
+                          <date>Thu Jan 08 2015</date>
+                          <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
+                          <revdescription><simplelist><member>Minor corrections, plus use include file for intro</member></simplelist></revdescription>
+                        </revision>
+                        <revision>
+                          <revnumber>9-0</revnumber>
+                          <date>Fri Aug 14 2015</date>
+                          <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
+                          <revdescription><simplelist><member>Update for CentOS 7.1 and leaving firewalld/SELinux enabled</member></simplelist></revdescription>
+                        </revision>
+                        <revision>
+                          <revnumber>10-0</revnumber>
+                          <date>Fri Jan 12 2018</date>
+                          <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
+                          <revdescription><simplelist><member>Update banner for Pacemaker 2.0 and content for CentOS 7.4 with Pacemaker 1.1.16</member></simplelist></revdescription>
+                        </revision>
+                        <revision>
+                          <revnumber>10-1</revnumber>
+                          <date>Wed Sep 5 2018</date>
+                          <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
+                          <revdescription><simplelist><member>Update for CentOS 7.5 with Pacemaker 1.1.18</member></simplelist></revdescription>
+                        </revision>
+                        <revision>
+                          <revnumber>10-2</revnumber>
+                          <date>Fri Dec 7 2018</date>
+                          <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
+                          <author><firstname>Jan</firstname><surname>Pokorný</surname><email>jpokorny@redhat.com</email></author>
+                          <author><firstname>Chris</firstname><surname>Lumens</surname><email>clumens@redhat.com</email></author>
+                          <revdescription><simplelist><member>Minor clarifications and formatting changes</member></simplelist></revdescription>
+                        </revision>
+                </revhistory>
+        </simpara>
 </appendix>
diff --git a/doc/Pacemaker_Administration/en-US/Book_Info.xml b/doc/Pacemaker_Administration/en-US/Book_Info.xml
index 8622da75c6..fd1bc36d72 100644
--- a/doc/Pacemaker_Administration/en-US/Book_Info.xml
+++ b/doc/Pacemaker_Administration/en-US/Book_Info.xml
@@ -1,36 +1,36 @@
 <?xml version='1.0' encoding='utf-8' ?>
 <!DOCTYPE bookinfo PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
 <!ENTITY % BOOK_ENTITIES SYSTEM "Pacemaker_Administration.ent">
 %BOOK_ENTITIES;
 ]>
 <bookinfo id="book-Pacemaker_Administration-Pacemaker_Administration">
 	<title>Pacemaker Administration</title>
 	<subtitle>Managing Pacemaker Clusters</subtitle>
 	<!--
 		EDITION-PUBSNUMBER should match REVNUMBER in Revision_History.xml.
 		Increment EDITION when the syntax of the documented software
 		changes (OS, pacemaker, corosync, pcs), and PUBSNUMBER for
 		simple textual changes (corrections, translations, etc.).
 		Changing the revision is only necessary when releasing a new
 		version of Pacemaker or publishing the documents to the Web.
 	-->
 	<edition>1</edition>
-	<pubsnumber>0</pubsnumber>
+	<pubsnumber>1</pubsnumber>
 	<abstract>
 		<para>
 			This document has instructions and tips for system
 			administrators who need to manage high-availability
 			clusters using Pacemaker.
 		</para>
 	</abstract>
 	<corpauthor>
 		<inlinemediaobject>
 			<imageobject>
 				<imagedata fileref="Common_Content/images/title_logo.svg" format="SVG" />
 			</imageobject>
 		</inlinemediaobject>
 	</corpauthor>
 	<xi:include href="Common_Content/Legal_Notice.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
 	<xi:include href="Author_Group.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
 </bookinfo>
 
diff --git a/doc/Pacemaker_Administration/en-US/Ch-Agents.txt b/doc/Pacemaker_Administration/en-US/Ch-Agents.txt
index c5afcb6b4a..0d8ff1f1fb 100644
--- a/doc/Pacemaker_Administration/en-US/Ch-Agents.txt
+++ b/doc/Pacemaker_Administration/en-US/Ch-Agents.txt
@@ -1,338 +1,350 @@
 :compat-mode: legacy
 = Resource Agents =
 
+== Resource Agent Actions ==
+
+If one resource depends on another resource via constraints, the cluster will
+interpret an expected result as sufficient to continue with dependent actions.
+This may cause timing issues if the resource agent start returns before the
+service is fully ready to perform its function (not merely launched), or if the
+resource agent stop returns before the service has fully released all its
+claims on system resources. At a minimum, the start or stop should not return
+before a status command would return the expected (started or stopped) result.
+
 == OCF Resource Agents ==
 
 === Location of Custom Scripts ===
 
 indexterm:[OCF Resource Agents]
 OCF Resource Agents are found in +/usr/lib/ocf/resource.d/pass:[<replaceable>provider</replaceable>]+
 
 When creating your own agents, you are encouraged to create a new
 directory under +/usr/lib/ocf/resource.d/+ so that they are not
 confused with (or overwritten by) the agents shipped by existing providers.
 
 So, for example, if you choose the provider name of bigCorp and want
 a new resource named bigApp, you would create a resource agent called
 +/usr/lib/ocf/resource.d/bigCorp/bigApp+ and define a resource:
  
 [source,XML]
 ----
 <primitive id="custom-app" class="ocf" provider="bigCorp" type="bigApp"/>
 ----
 
 === Actions ===
 
 All OCF resource agents are required to implement the following actions.
 
 .Required Actions for OCF Agents
 [width="95%",cols="3m,3,7",options="header",align="center"]
 |=========================================================
 |Action
 |Description
 |Instructions
 
 |start
 |Start the resource
 |Return 0 on success and an appropriate error code otherwise. Must not
  report success until the resource is fully active.
  indexterm:[start,OCF Action]
  indexterm:[OCF,Action,start]
 
 |stop
 |Stop the resource
 |Return 0 on success and an appropriate error code otherwise. Must not
  report success until the resource is fully stopped.
  indexterm:[stop,OCF Action]
  indexterm:[OCF,Action,stop]
 
 |monitor
 |Check the resource's state 
 
 |Exit 0 if the resource is running, 7 if it is stopped, and anything
  else if it is failed. 
  indexterm:[monitor,OCF Action]
  indexterm:[OCF,Action,monitor]
 
 NOTE: The monitor script should test the state of the resource on the local machine only.
 
 |meta-data
 |Describe the resource
 |Provide information about this resource as an XML snippet. Exit with 0.
  indexterm:[meta-data,OCF Action]
  indexterm:[OCF,Action,meta-data]
 
 NOTE: This is _not_ performed as root.
 
 |validate-all
 |Verify the supplied parameters
 |Return 0 if parameters are valid, 2 if not valid, and 6 if resource is not configured.
  indexterm:[validate-all,OCF Action]
  indexterm:[OCF,Action,validate-all]
 
 |=========================================================
 
 Additional requirements (not part of the OCF specification) are placed on
 agents that will be used for advanced concepts such as clone resources.
 
 .Optional Actions for OCF Resource Agents
 [width="95%",cols="2m,6,3",options="header",align="center"]
 |=========================================================
 
 |Action
 |Description
 |Instructions
 
 |promote
 |Promote the local instance of a promotable clone resource to the master (primary) state.
 |Return 0 on success
  indexterm:[promote,OCF Action]
  indexterm:[OCF,Action,promote]
 
 |demote
 |Demote the local instance of a promotable clone resource to the slave (secondary) state.
 |Return 0 on success
  indexterm:[demote,OCF Action]
  indexterm:[OCF,Action,demote]
 
 |notify
 |Used by the cluster to send the agent pre- and post-notification
  events telling the resource what has happened and will happen.
 |Must not fail. Must exit with 0
  indexterm:[notify,OCF Action]
  indexterm:[OCF,Action,notify]
 
 |=========================================================
 
 One action specified in the OCF specs, +recover+, is not currently used by the
 cluster. It is intended to be a variant of the +start+ action that tries to
 recover a resource locally.
 
 [IMPORTANT]
 ====
 If you create a new OCF resource agent, use indexterm:[ocf-tester]`ocf-tester`
 to verify that the agent complies with the OCF standard properly.
 ====
 
 === How are OCF Return Codes Interpreted? ===
 
 The first thing the cluster does is to check the return code against
 the expected result.  If the result does not match the expected value,
 then the operation is considered to have failed, and recovery action is
 initiated.
 
 There are three types of failure recovery:
 
 .Types of recovery performed by the cluster
 [width="95%",cols="1m,4,4",options="header",align="center"]
 |=========================================================
 
 |Type
 |Description
 |Action Taken by the Cluster
 
 |soft
 |A transient error occurred
 |Restart the resource or move it to a new location
 indexterm:[soft,OCF error]
 indexterm:[OCF,error,soft]
 
 |hard
 |A non-transient error that may be specific to the current node occurred
 |Move the resource elsewhere and prevent it from being retried on the current node
 indexterm:[hard,OCF error]
 indexterm:[OCF,error,hard]
 
 |fatal
 |A non-transient error that will be common to all cluster nodes (e.g. a bad configuration was specified)
 |Stop the resource and prevent it from being started on any cluster node
 indexterm:[fatal,OCF error]
 indexterm:[OCF,error,fatal]
 
 |=========================================================
 
 [[s-ocf-return-codes]]
 === OCF Return Codes ===
 
 The following table outlines the different OCF return codes and the type of
 recovery the cluster will initiate when a failure code is received.
 Although counterintuitive, even actions that return 0
 (aka. +OCF_SUCCESS+) can be considered to have failed, if 0 was not
 the expected return value.
 
 .OCF Return Codes and their Recovery Types
 [width="95%",cols="1m,<4m,<6,1m",options="header",align="center"]
 |=========================================================
 
 |RC
 |OCF Alias
 |Description
 |RT
 
 |0
 |OCF_SUCCESS
 |Success. The command completed successfully. This is the expected result for all start, stop, promote and demote commands.
 indexterm:[Return Code,OCF_SUCCESS]
 indexterm:[Return Code,0,OCF_SUCCESS]
 |soft
 
 |1
 |OCF_ERR_GENERIC
 |Generic "there was a problem" error code.
 indexterm:[Return Code,OCF_ERR_GENERIC]
 indexterm:[Return Code,1,OCF_ERR_GENERIC]
 |soft
 
 |2
 |OCF_ERR_ARGS
 |The resource's configuration is not valid on this machine. E.g. it refers to a location not found on the node. 
 indexterm:[Return Code,OCF_ERR_ARGS]
 indexterm:[Return Code,2,OCF_ERR_ARGS]
 |hard
 
 |3
 |OCF_ERR_UNIMPLEMENTED
 |The requested action is not implemented.
 indexterm:[Return Code,OCF_ERR_UNIMPLEMENTED]
 indexterm:[Return Code,3,OCF_ERR_UNIMPLEMENTED]
 |hard
 
 |4
 |OCF_ERR_PERM
 |The resource agent does not have sufficient privileges to complete the task.
 indexterm:[Return Code,OCF_ERR_PERM]
 indexterm:[Return Code,4,OCF_ERR_PERM]
 |hard
 
 |5
 |OCF_ERR_INSTALLED
 |The tools required by the resource are not installed on this machine.
 indexterm:[Return Code,OCF_ERR_INSTALLED]
 indexterm:[Return Code,5,OCF_ERR_INSTALLED]
 |hard
 
 |6
 |OCF_ERR_CONFIGURED
 |The resource's configuration is invalid. E.g. required parameters are missing.
 indexterm:[Return Code,OCF_ERR_CONFIGURED]
 indexterm:[Return Code,6,OCF_ERR_CONFIGURED]
 |fatal
 
 |7
 |OCF_NOT_RUNNING
 |The resource is safely stopped. The cluster will not attempt to stop a resource that returns this for any action.
 indexterm:[Return Code,OCF_NOT_RUNNING]
 indexterm:[Return Code,7,OCF_NOT_RUNNING]
 |N/A
 
 |8
 |OCF_RUNNING_MASTER
 |The resource is running in master mode.
 indexterm:[Return Code,OCF_RUNNING_MASTER]
 indexterm:[Return Code,8,OCF_RUNNING_MASTER]
 |soft
 
 |9
 |OCF_FAILED_MASTER
 |The resource is in master mode but has failed. The resource will be demoted,
 stopped and then started (and possibly promoted) again.
 indexterm:[Return Code,OCF_FAILED_MASTER]
 indexterm:[Return Code,9,OCF_FAILED_MASTER]
 |soft
 
 |other
 |N/A
 |Custom error code.
 indexterm:[Return Code,other]
 |soft
 
 |=========================================================
 
 Exceptions to the recovery handling described above:
 
 * Probes (non-recurring monitor actions) that find a resource active
   (or in master mode) will not result in recovery action unless it is
   also found active elsewhere.
 * The recovery action taken when a resource is found active more than
   once is determined by the resource's +multiple-active+ property.
 * Recurring actions that return +OCF_ERR_UNIMPLEMENTED+
   do not cause any type of recovery.
 
-== Init Script LSB Compliance ==
+== LSB Resource Agents (Init Scripts) ==
+
+=== LSB Compliance ===
 
 The relevant part of the
 http://refspecs.linuxfoundation.org/lsb.shtml[LSB specifications]
 includes a description of all the return codes listed here.
     
 Assuming `some_service` is configured correctly and currently
 inactive, the following sequence will help you determine if it is
 LSB-compatible:
 
 . Start (stopped):
 +
 ----
 # /etc/init.d/some_service start ; echo "result: $?"
 ----
 +
   .. Did the service start?
-  .. Did the command print *result: 0* (in addition to its usual output)?
+  .. Did the echo command print *result: 0* (in addition to the init script's usual output)?
 +
 . Status (running):
 +
 ----
 # /etc/init.d/some_service status ; echo "result: $?"
 ----
 +
   .. Did the script accept the command?
   .. Did the script indicate the service was running?
-  .. Did the command print *result: 0* (in addition to its usual output)?
+  .. Did the echo command print *result: 0* (in addition to the init script's usual output)?
 +
 . Start (running):
 +
 ----
 # /etc/init.d/some_service start ; echo "result: $?"
 ----
 +
   .. Is the service still running?
-  .. Did the command print *result: 0* (in addition to its usual output)?
+  .. Did the echo command print *result: 0* (in addition to the init script's usual output)?
 +
 . Stop (running):
 +
 ----
 # /etc/init.d/some_service stop ; echo "result: $?"
 ----
 +
   .. Was the service stopped?
-  .. Did the command print *result: 0* (in addition to its usual output)?
+  .. Did the echo command print *result: 0* (in addition to the init script's usual output)?
 +
 . Status (stopped):
 +
 ----
 # /etc/init.d/some_service status ; echo "result: $?"
 ----
 +
   .. Did the script accept the command?
   .. Did the script indicate the service was not running?
-  .. Did the command print *result: 3* (in addition to its usual output)?
+  .. Did the echo command print *result: 3* (in addition to the init script's usual output)?
 +
 . Stop (stopped):
 +
 ----
 # /etc/init.d/some_service stop ; echo "result: $?"
 ----
 +
   .. Is the service still stopped?
-  .. Did the command print *result: 0* (in addition to its usual output)?
+  .. Did the echo command print *result: 0* (in addition to the init script's usual output)?
 +
 . Status (failed):
 +
 .. This step is not readily testable and relies on manual inspection of the script.
 +
 The script can use one of the error codes (other than 3) listed in the
 LSB spec to indicate that it is active but failed. This tells the
 cluster that before moving the resource to another node, it needs to
 stop it on the existing one first.
 
 If the answer to any of the above questions is no, then the script is
 not LSB-compliant. Your options are then to either fix the script or
 write an OCF agent based on the existing script.
diff --git a/doc/Pacemaker_Administration/en-US/Ch-Troubleshooting.txt b/doc/Pacemaker_Administration/en-US/Ch-Troubleshooting.txt
new file mode 100644
index 0000000000..f01d2f04cf
--- /dev/null
+++ b/doc/Pacemaker_Administration/en-US/Ch-Troubleshooting.txt
@@ -0,0 +1,64 @@
+:compat-mode: legacy
+= Troubleshooting Cluster Problems =
+
+== Logging ==
+
+Pacemaker by default logs messages of notice severity and higher to the system
+log, and messages of info severity and higher to the detail log, which by
+default is +/var/log/pacemaker/pacemaker.log+.
+
+Logging options can be controlled via environment variables at Pacemaker
+start-up. Where these are set varies by operating system (often
++/etc/sysconfig/pacemaker+ or +/etc/default/pacemaker+).
+
+Because cluster problems are often highly complex, involving multiple machines,
+cluster daemons, and managed services, Pacemaker logs rather verbosely to
+provide as much context as possible. It is an ongoing priority to make these
+logs more user-friendly, but by necessity there is a lot of obscure, low-level
+information that can make them difficult to follow.
+
+The default log rotation configuration shipped with Pacemaker (typically
+installed in +/etc/logrotate.d/pacemaker+) rotates the log when it reaches 100MB
+in size, or weekly, whichever comes first.
+
+If you configure debug or (Heaven forbid) trace-level logging, the logs can
+grow enormous quite quickly. Because rotated logs are by default named with the
+year, month, and day only, this can cause name collisions if your logs exceed
+100MB in a single day. You can add +dateformat -%Y%m%d-%H+ to the rotation
+configuration to avoid this.
+
+== Transitions ==
+
+A key concept in understanding how a Pacemaker cluster functions is a
+'transition'. A transition is a set of actions that need to be taken to bring
+the cluster from its current state to the desired state (as expressed by the
+configuration).
+
+Whenever a relevant event happens (a node joining or leaving the cluster,
+a resource failing, etc.), the controller will ask the scheduler to recalculate
+the status of the cluster, which generates a new transition. The controller
+then performs the actions in the transition in the proper order.
+
+Each transition can be identified in the logs by a line like:
+
+----
+Nov 30 20:28:16 rhel7-1 pacemaker-schedulerd[36417] (process_pe_message)        notice: Calculated transition 19, saving inputs in /var/lib/pacemaker/pengine/pe-input-1463.bz2
+----
+
+The file listed as the "inputs" is a snapshot of the cluster configuration and
+state at that moment (the CIB). This file can help determine why particular
+actions were scheduled. The `crm_simulate` command, described in
+<<s-config-testing-changes>>, can be used to replay the file.
+
+== Further Information About Troubleshooting ==
+
+Andrew Beekhof wrote a series of articles about troubleshooting on his blog,
+http://blog.clusterlabs.org/[The Cluster Guy]:
+
+* http://blog.clusterlabs.org/blog/2013/debugging-pacemaker[Debugging Pacemaker]
+* http://blog.clusterlabs.org/blog/2013/debugging-pengine[Debugging the Policy Engine]
+* http://blog.clusterlabs.org/blog/2013/pacemaker-logging[Pacemaker Logging]
+
+The articles were written for an earlier version of Pacemaker, so many of the
+specific names and log messages to look for have changed, but the concepts are
+still valid.
diff --git a/doc/Pacemaker_Administration/en-US/Ch-Upgrading.txt b/doc/Pacemaker_Administration/en-US/Ch-Upgrading.txt
index 166a98c4f7..a7e60e3a90 100644
--- a/doc/Pacemaker_Administration/en-US/Ch-Upgrading.txt
+++ b/doc/Pacemaker_Administration/en-US/Ch-Upgrading.txt
@@ -1,455 +1,456 @@
 :compat-mode: legacy
 = Upgrading a Pacemaker Cluster =
 
 == Pacemaker Versioning ==
 
 Pacemaker has an overall release version, plus separate version numbers for
 certain internal components.
 
 * *Pacemaker release version:* This version consists of three numbers
   (_x.y.z_).
 +
 The major version number (the _x_ in _x.y.z_) increases when at least some
 rolling upgrades are not possible from the previous major version. For example,
 a rolling upgrade from 1.0.8 to 1.1.15 should always be supported, but a
 rolling upgrade from 1.0.8 to 2.0.0 may not be possible.
 +
 The minor version (the _y_ in _x.y.z_) increases when there are significant
 changes in cluster default behavior, tool behavior, and/or the API interface
 (for software that utilizes Pacemaker libraries). The main benefit is to alert
 you to pay closer attention to the release notes, to see if you might be
 affected.
 +
 The release counter (the _z_ in _x.y.z_) is increased with all public releases
 of Pacemaker, which typically include both bug fixes and new features.
 
 * *CRM feature set:* This version number applies to the communication between
   full cluster nodes, and is used to avoid problems in mixed-version clusters.
 +
 The major version number increases when nodes with different versions would not
 work (rolling upgrades are not allowed). The minor version number increases
 when mixed-version clusters are allowed only during rolling upgrades. The
 minor-minor version number is ignored, but allows resource agents to detect
 cluster support for various features. footnote:[
 Before CRM feature set 3.1.0 (Pacemaker 2.0.0), the minor-minor
 version number was treated the same as the minor version.
 ]
 +
 Pacemaker ensures that the longest-running node is the cluster's DC. This
 ensures new features are not enabled until all nodes are upgraded to support
 them.
 
 * *LRMD protocol version:* This version applies to communication between a
   Pacemaker Remote node and the cluster. It increases when an older cluster
   node would have problems hosting the connection to a newer Pacemaker Remote
   node. To avoid these problems, Pacemaker Remote nodes will accept connections
   only from cluster nodes with the same or newer LRMD protocol version.
 +
 Unlike with CRM feature set differences between full cluster nodes,
 mixed LRMD protocol versions between Pacemaker Remote nodes and full cluster
 nodes are fine, as long as the Pacemaker Remote nodes have the older version.
 This can be useful, for example, to host a legacy application in an
 older operating system version used as a Pacemaker Remote node.
 
 * *XML schema version:* Pacemaker’s configuration syntax — what's allowed in
   the Configuration Information Base (CIB) — has its own version. This allows
   the configuration syntax to evolve over time while still allowing clusters
   with older configurations to work without change.
 
 == Upgrading Cluster Software ==
 
 There are three approaches to upgrading a cluster, each with advantages and
 disadvantages.
 
 .Upgrade Methods
 [width="95%",cols="s,6*",options="header",align="center"]
 |=========================================================
 
 |Method
 |Available between all versions
 |Can be used with Pacemaker Remote nodes
 |Service outage during upgrade
 |Service recovery during upgrade
 |Exercises failover logic
 |Allows change of messaging layer
 indexterm:[Cluster,switching between stacks]
 indexterm:[Changing cluster stack]
 footnote:[Currently, Corosync version 2 and greater is the only supported
 cluster stack, but other stacks have been supported by past versions, and may
 be supported by future versions.]
 
 |Complete cluster shutdown
 indexterm:[upgrade,shutdown]
 indexterm:[shutdown upgrade]
 |yes
 |yes
 |always
 |N/A
 |no
 |yes
 
 |Rolling (node by node)
 indexterm:[upgrade,rolling]
 indexterm:[rolling upgrade]
 |no
 |yes
 |always
 footnote:[Any active resources will be moved off the node being upgraded,
 so there will be at least a brief outage unless all resources can be
 migrated "live".]
 |yes
 |yes
 |no
 
 |Detach and reattach
 indexterm:[upgrade,reattach]
 indexterm:[reattach upgrade]
 |yes
 |no
 |only due to failure
 |no
 |no
 |yes
 
 |=========================================================
 
 === Complete Cluster Shutdown ===
 
 In this scenario, one shuts down all cluster nodes and resources,
 then upgrades all the nodes before restarting the cluster.
 
 . On each node:
 .. Shutdown the cluster software (pacemaker and the messaging layer).
 .. Upgrade the Pacemaker software. This may also include upgrading the
    messaging layer and/or the underlying operating system.
 .. Check the configuration with the `crm_verify` tool.
 . On each node:
 .. Start the cluster software.
    Currently, only Corosync version 2 and greater is supported as the cluster
    layer, but if another stack is supported in the future, the stack does not
    need to be the same one before the upgrade.
 
 One variation of this approach is to build a new cluster on new hosts.
 This allows the new version to be tested beforehand, and minimizes downtime by
 having the new nodes ready to be placed in production as soon as the old nodes
 are shut down.
 
 === Rolling (node by node) ===
 
 In this scenario, each node is removed from the cluster, upgraded, and then
 brought back online, until all nodes are running the newest version.
 
 Special considerations when planning a rolling upgrade:
 
 * If you plan to upgrade other cluster software -- such as the messaging layer --
   at the same time, consult that software's documentation for its compatibility
   with a rolling upgrade.
 
 * If the major version number is changing in the Pacemaker version you are
   upgrading to, a rolling upgrade may not be possible. Read the new version's
   release notes (as well the information here) for what limitations may exist.
 
 * If the CRM feature set is changing in the Pacemaker version you are upgrading
   to, you should run a mixed-version cluster only during a small rolling
   upgrade window. If one of the older nodes drops out of the cluster for any
   reason, it will not be able to rejoin until it is upgraded.
 
 * If the LRMD protocol version is changing, all cluster nodes should be
   upgraded before upgrading any Pacemaker Remote nodes.
 
 See the ClusterLabs wiki's
 http://clusterlabs.org/wiki/ReleaseCalendar[Release Calendar] to figure out
 whether the CRM feature set and/or LRMD protocol version changed between the
 the Pacemaker release versions in your rolling upgrade.
 
 To perform a rolling upgrade, on each node in turn:
 
 . Put the node into standby mode, and wait for any active resources
   to be moved cleanly to another node. (This step is optional, but
   allows you to deal with any resource issues before the upgrade.)
 . Shutdown the cluster software (pacemaker and the messaging layer) on the node.
 . Upgrade the Pacemaker software. This may also include upgrading the
   messaging layer and/or the underlying operating system.
 . If this is the first node to be upgraded, check the configuration
   with the `crm_verify` tool.
 . Start the messaging layer.
   This must be the same messaging layer (currently only Corosync version 2 and
   greater is supported) that the rest of the cluster is using.
 
 [NOTE]
 ====
 Even if a rolling upgrade from the current version of the cluster to the newest
 version is not directly possible, it may be possible to perform a rolling
 upgrade in multiple steps, by upgrading to an intermediate version first.
 
 .Version Compatibility Table
 [width="95%",cols="2*",options="header",align="center"]
 |=========================================================
 
 |Version being Installed
 |Oldest Compatible Version
 
 |Pacemaker 2.y.z
 |Pacemaker 1.1.11
 footnote:[Rolling upgrades from Pacemaker 1.1.z to 2.y.z are possible only if
 the cluster uses corosync version 2 or greater as its messaging layer, and the
 Cluster Information Base (CIB) uses schema 1.0 or higher in its validate-with
 property.]
 
 |Pacemaker 1.y.z
 |Pacemaker 1.0.0
 
 |Pacemaker 0.7.z
 |Pacemaker 0.6.z
 
 |=========================================================
 ====
 
 === Detach and Reattach ===
 
 The reattach method is a variant of a complete cluster shutdown, where the
 resources are left active and get re-detected when the cluster is restarted.
 
 This method may not be used if the cluster contains any Pacemaker Remote nodes.
 
 . Tell the cluster to stop managing services. This is required to allow the
   services to remain active after the cluster shuts down.
 +
 ----
 # crm_attribute --name maintenance-mode --update true
 ----
 
 . On each node, shutdown the cluster software (pacemaker and the messaging
   layer), and upgrade the Pacemaker software. This may also include upgrading
   the messaging layer. While the underlying operating system may be upgraded
   at the same time, that will be more likely to cause outages in the detached
   services (certainly, if a reboot is required).
 . Check the configuration with the `crm_verify` tool.
 . On each node, start the cluster software.
   Currently, only Corosync version 2 and greater is supported as the cluster
   layer, but if another stack is supported in the future, the stack does not
   need to be the same one before the upgrade.
 . Verify that the cluster re-detected all resources correctly.
 . Allow the cluster to resume managing resources again:
 +
 ----
 # crm_attribute --name maintenance-mode --delete
 ----
 
 == Upgrading the Configuration ==
 
 indexterm:[upgrade,Configuration]
 indexterm:[Configuration,upgrading]
 
 The CIB schema version can change from one Pacemaker version to another.
 
 After cluster software is upgraded, the cluster will continue to use
 the older schema version that it was previously using. This can be useful, for
 example, when administrators have written tools that modify the configuration,
 and are based on the older syntax.
 footnote:[As of Pacemaker 2.0.0, only schema versions pacemaker-1.0 and higher
 are supported (excluding pacemaker-1.1, which was an experimental schema
 now known as pacemaker-next).]
 
 However, when using an older syntax, new features may be unavailable, and there
 is a performance impact, since the cluster must do a non-persistent
 configuration upgrade before each transition. So while using the old syntax is
 possible, it is not advisable to continue using it indefinitely.
 
 Even if you wish to continue using the old syntax, it is a good idea to
 follow the upgrade procedure outlined below, except for the last step, to ensure
 that the new software has no problems with your existing configuration (since it
 will perform much the same task internally).
 
 If you are brave, it is sufficient simply to run `cibadmin --upgrade`.
 
 A more cautious approach would proceed like this:
 
 . Create a shadow copy of the configuration. The later commands will automatically
   operate on this copy, rather than the live configuration.
 +
 -----
 # crm_shadow --create shadow
 -----
 . Verify the configuration is valid with the new software (which may be
   stricter about syntax mistakes, or may have dropped support for deprecated
   features):
 indexterm:[Configuration,verify]
 indexterm:[verify,Configuration]
 +
 -----
 # crm_verify --live-check
 -----
 . Fix any errors or warnings.
 . Perform the upgrade:
 +
 -----
 # cibadmin --upgrade
 -----
 . If this step fails, there are three main possibilities:
 .. The configuration was not valid to start with (did you do steps 2 and 3?).
 .. The transformation failed - http://bugs.clusterlabs.org/[report a bug] or
    mailto:users@clusterlabs.org?subject=Transformation%20failed%20during%20upgrade[email the project].
 .. The transformation was successful but produced an invalid result.
 +
 If the result of the transformation is invalid, you may see a number of errors
 from the validation library. If these are not helpful, visit the
 http://clusterlabs.org/wiki/Validation_FAQ[Validation FAQ wiki page] and/or try
 the manual upgrade procedure described below.
 +        
 . Check the changes:
 +
 -----
 # crm_shadow --diff
 -----
 +
 If at this point there is anything about the upgrade that you wish to fine-tune
 (for example, to change some of the automatic IDs), now is the time to do so:
 +
 -----
 # crm_shadow --edit
 -----
 +
 This will open the configuration in your favorite editor (whichever is
 specified by the standard *$EDITOR* environment variable).
 +
 . Preview how the cluster will react:
 +
 ------
 # crm_simulate --live-check --save-dotfile shadow.dot -S
-# graphviz shadow.dot
+# dot -Tsvg shadow.dot -o shadow.svg
 ------
 +
+You can then view shadow.svg with any compatible image viewer or web browser.
 Verify that either no resource actions will occur or that you are
 happy with any that are scheduled.  If the output contains actions you
 do not expect (possibly due to changes to the score calculations), you
 may need to make further manual changes.  See
 <<s-config-testing-changes>> for further details on how to interpret
-the output of `crm_simulate` and `graphviz`.
+the output of `crm_simulate` and `dot`.
 +
 . Upload the changes:
 +
 -----
 # crm_shadow --commit shadow --force
 -----
 +
 In the unlikely event this step fails, please report a bug.
 
 [NOTE]
 ====
 indexterm:[Configuration,upgrade manually]
 It is also possible to perform the configuration upgrade steps manually:
 
 . Locate the +upgrade*.xsl+ conversion scripts provided with the source code. These will often
   be installed in a location such as +/usr/share/pacemaker+, or may be obtained from
   the https://github.com/ClusterLabs/pacemaker/tree/master/xml[source repository].
           
 . Run the conversion scripts that apply to your older version, for example:
   indexterm:[XML,convert]
 +
 -----
 # xsltproc /path/to/upgrade06.xsl config06.xml > config10.xml
 -----
 +          
 . Locate the +pacemaker.rng+ script (from the same location as the xsl files).
 . Check the XML validity: indexterm:[validate configuration]indexterm:[Configuration,validate XML]
 +
 ----
 # xmllint --relaxng /path/to/pacemaker.rng config10.xml
 ----
 
 The advantage of this method is that it can be performed without the
 cluster running, and any validation errors are often more informative.
 ====
 
 == What Changed in 2.0 ==
 
 The main goal of the 2.0 release was to remove support for deprecated syntax,
 along with some small changes in default configuration behavior and tool
 behavior. Highlights:
 
 * Only Corosync version 2 and greater is now supported as the underlying
   cluster layer. Support for Heartbeat and Corosync 1 (including CMAN) is
   removed.
 
 * The Pacemaker detail log file is now stored in
   /var/log/pacemaker/pacemaker.log by default.
 
 * The record-pending cluster property now defaults to true, which
   allows status tools such as crm_mon to show operations that are in
   progress.
 
 * Support for a number of deprecated build options, environment variables,
   and configuration settings has been removed.
 
 * The +master+ tag has been deprecated in favor of using a +clone+ tag with the
   new +promotable+ meta-attribute set to +true+. "Master/slave" clone resources
   are now referred to as "promotable" clone resources, though it will take
   longer for the full terminology change to be completed.
 
 * The public API for Pacemaker libraries that software applications can use
   has changed significantly.
 
 For a detailed list of changes, see the release notes and the
 https://wiki.clusterlabs.org/wiki/Pacemaker_2.0_Changes[Pacemaker 2.0 Changes]
 page on the ClusterLabs wiki.
 
 == What Changed in 1.0 ==
 
 === New ===
 
 * Failure timeouts.
 * New section for resource and operation defaults.
 * Tool for making offline configuration changes.
 * +Rules, instance_attributes, meta_attributes+ and sets of operations can be defined once and referenced in multiple places.
 * The CIB now accepts XPath-based create/modify/delete operations. See the pass:[<command>cibadmin</command>] help text.
 * Multi-dimensional colocation and ordering constraints.
 * The ability to connect to the CIB from non-cluster machines.
 * Allow recurring actions to be triggered at known times.
 
 
 === Changed ===
 
 * Syntax
 ** All resource and cluster options now use dashes (-) instead of underscores (_)
 ** +master_slave+ was renamed to +master+
 ** The +attributes+ container tag was removed
 ** The operation field +pre-req+ has been renamed +requires+
 ** All operations must have an +interval+, +start+/+stop+ must have it set to zero
 * The +stonith-enabled+ option now defaults to true.
 * The cluster will refuse to start resources if +stonith-enabled+ is true (or unset) and no STONITH resources have been defined
 * The attributes of colocation and ordering constraints were renamed for clarity.
 * +resource-failure-stickiness+ has been replaced by +migration-threshold+.
 * The parameters for command-line tools have been made consistent
 * Switched to 'RelaxNG' schema validation and 'libxml2' parser
 ** id fields are now XML IDs which have the following limitations:
 *** id's cannot contain colons (:)
 *** id's cannot begin with a number
 *** id's must be globally unique (not just unique for that tag)
 ** Some fields (such as those in constraints that refer to resources) are IDREFs.
 +
 This means that they must reference existing resources or objects in
 order for the configuration to be valid.  Removing an object which is
 referenced elsewhere will therefore fail.
 +              
 ** The CIB representation, from which a MD5 digest is calculated to verify CIBs on the nodes, has changed.
 +
 This means that every CIB update will require a full refresh on any
 upgraded nodes until the cluster is fully upgraded to 1.0.  This will
 result in significant performance degradation and it is therefore
 highly inadvisable to run a mixed 1.0/0.6 cluster for any longer than
 absolutely necessary.
 +              
 * Ping node information no longer needs to be added to _ha.cf_.
 +
 Simply include the lists of hosts in your ping resource(s).
 
 
 === Removed ===
 
 
 * Syntax
 ** It is no longer possible to set resource meta options as top-level
    attributes.  Use meta attributes instead.
 ** Resource and operation defaults are no longer read from
    +crm_config+.
diff --git a/doc/Pacemaker_Administration/en-US/Pacemaker_Administration.xml b/doc/Pacemaker_Administration/en-US/Pacemaker_Administration.xml
index 07a6b77ddc..03ce6bcbc0 100644
--- a/doc/Pacemaker_Administration/en-US/Pacemaker_Administration.xml
+++ b/doc/Pacemaker_Administration/en-US/Pacemaker_Administration.xml
@@ -1,18 +1,19 @@
 <?xml version='1.0' encoding='utf-8' ?>
 <!DOCTYPE Book PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
 <!ENTITY % BOOK_ENTITIES SYSTEM "Pacemaker_Administration.ent">
 %BOOK_ENTITIES;
 ]>
 <book>
     <xi:include href="Book_Info.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="Preface.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="Ch-Intro.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="Ch-Installing.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="Ch-Cluster.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
-    <xi:include href="Ch-Monitoring.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="Ch-Configuring.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
+    <xi:include href="Ch-Monitoring.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
+    <xi:include href="Ch-Troubleshooting.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="Ch-Upgrading.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="Ch-Agents.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="Revision_History.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <index></index>
 </book>
diff --git a/doc/Pacemaker_Administration/en-US/Revision_History.xml b/doc/Pacemaker_Administration/en-US/Revision_History.xml
index 56d3c70687..eaaacd6457 100644
--- a/doc/Pacemaker_Administration/en-US/Revision_History.xml
+++ b/doc/Pacemaker_Administration/en-US/Revision_History.xml
@@ -1,28 +1,45 @@
 <?xml version='1.0' encoding='utf-8' ?>
 <!DOCTYPE appendix PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
 <!ENTITY % BOOK_ENTITIES SYSTEM "Pacemaker_Administration.ent">
 %BOOK_ENTITIES;
 ]>
 <appendix id="appe-Pacemaker_Administration-Revision_History">
   <!-- see comment in Book_Info.xml for revision numbering -->
   <title>Revision History</title>
   <simpara>
     <revhistory>
 
+      <revision>
+        <revnumber>1-1</revnumber>
+        <date>Tue Dec 4 2018</date>
+        <author>
+          <firstname>Ken</firstname><surname>Gaillot</surname>
+          <email>kgaillot@redhat.com</email>
+        </author>
+        <author>
+          <firstname>Jan</firstname><surname>Pokorný</surname>
+          <email>jpokorny@redhat.com</email>
+        </author>
+        <revdescription>
+          <simplelist><member>Add "Troubleshooting" chapter, minor
+                              clarifications and reformatting</member></simplelist>
+        </revdescription>
+      </revision>
+
       <revision>
         <revnumber>1-0</revnumber>
         <date>Tue Jan 23 2018</date>
         <author>
           <firstname>Ken</firstname><surname>Gaillot</surname>
           <email>kgaillot@redhat.com</email>
         </author>
         <revdescription>
           <simplelist><member>Move administration-oriented information from
-			      Pacemaker Explained into its own
-			      book</member></simplelist>
+                              Pacemaker Explained into its own
+                              book</member></simplelist>
         </revdescription>
       </revision>
 
     </revhistory>
   </simpara>
 </appendix>
diff --git a/doc/Pacemaker_Development/en-US/Book_Info.xml b/doc/Pacemaker_Development/en-US/Book_Info.xml
index cc88a7caf7..b08acf54f3 100644
--- a/doc/Pacemaker_Development/en-US/Book_Info.xml
+++ b/doc/Pacemaker_Development/en-US/Book_Info.xml
@@ -1,36 +1,35 @@
 <?xml version='1.0' encoding='utf-8' ?>
 <!DOCTYPE bookinfo PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
 <!ENTITY % BOOK_ENTITIES SYSTEM "Pacemaker_Development.ent">
 %BOOK_ENTITIES;
 ]>
 <bookinfo id="book-Pacemaker_Development-Pacemaker_Development">
-	<title>Pacemaker Development</title>
-	<subtitle>Working with the Pacemaker Code Base</subtitle>
-	<!--
-		EDITION-PUBSNUMBER should match REVNUMBER in Revision_History.xml.
-		Increment EDITION when the syntax of the documented software
-		changes (OS, pacemaker, corosync, pcs), and PUBSNUMBER for
-		simple textual changes (corrections, translations, etc.).
-		Changing the revision is only necessary when releasing a new
-		version of Pacemaker or publishing the documents to the Web.
-	-->
-	<edition>2</edition>
-	<pubsnumber>0</pubsnumber>
-	<abstract>
-		<para>
-			This document has guidelines and tips for developers
-			interested in editing Pacemaker source code and
-			submitting changes for inclusion in the project.
-		</para>
-	</abstract>
-	<corpauthor>
-		<inlinemediaobject>
-			<imageobject>
-				<imagedata fileref="Common_Content/images/title_logo.svg" format="SVG" />
-			</imageobject>
-		</inlinemediaobject>
-	</corpauthor>
-	<xi:include href="Common_Content/Legal_Notice.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
-	<xi:include href="Author_Group.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
+        <title>Pacemaker Development</title>
+        <subtitle>Working with the Pacemaker Code Base</subtitle>
+        <!--
+                EDITION-PUBSNUMBER should match REVNUMBER in Revision_History.xml.
+                Increment EDITION when the syntax of the documented software
+                changes (OS, pacemaker, corosync, pcs), and PUBSNUMBER for
+                simple textual changes (corrections, translations, etc.).
+                Changing the revision is only necessary when releasing a new
+                version of Pacemaker or publishing the documents to the Web.
+        -->
+        <edition>2</edition>
+        <pubsnumber>2</pubsnumber>
+        <abstract>
+                <para>
+                        This document has guidelines and tips for developers
+                        interested in editing Pacemaker source code and
+                        submitting changes for inclusion in the project.
+                </para>
+        </abstract>
+        <corpauthor>
+                <inlinemediaobject>
+                        <imageobject>
+                                <imagedata fileref="Common_Content/images/title_logo.svg" format="SVG" />
+                        </imageobject>
+                </inlinemediaobject>
+        </corpauthor>
+        <xi:include href="Common_Content/Legal_Notice.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
+        <xi:include href="Author_Group.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
 </bookinfo>
-
diff --git a/doc/Pacemaker_Development/en-US/Ch-Coding.txt b/doc/Pacemaker_Development/en-US/Ch-Coding.txt
index c0bfde984c..6a54a7db0a 100644
--- a/doc/Pacemaker_Development/en-US/Ch-Coding.txt
+++ b/doc/Pacemaker_Development/en-US/Ch-Coding.txt
@@ -1,199 +1,296 @@
 :compat-mode: legacy
 = C Coding Guidelines =
 
 ////
 We prefer [[ch-NAME]], but older versions of asciidoc don't deal well
 with that construct for chapter headings
 ////
 anchor:ch-c-coding[Chapter 2, C Coding Guidelines]
 
-== C Boilerplate ==
+== Style Guidelines ==
+
+Pacemaker is a large, distributed project accepting contributions from
+developers with a wide range of skill levels and organizational affiliations,
+and maintained by multiple people over long periods of time. The guidelines in
+this section are not technically better than alternative approaches, but make
+project management easier.
+
+Many of these simply ensure stylistic consistency, which makes reading,
+writing, and reviewing code easier.
+
+=== C Boilerplate ===
 
 indexterm:[C,boilerplate]
 indexterm:[licensing,C boilerplate]
 
-Every C file should start like this:
+Every C file should start with a short copyright notice listing the original
+author, like:
 
 ====
 [source,C]
 ----
 /*
  * Copyright <YYYY[-YYYY]> Andrew Beekhof <andrew@beekhof.net>
  * 
  * This source code is licensed under <LICENSE> WITHOUT ANY WARRANTY.
  */
 ----
 ====
 
-+<YYYY>+ is the year the code was 'originally' created.
+The first +<YYYY>+ is the year the code was 'originally' created.
 footnote:[
 See the U.S. Copyright Office's https://www.copyright.gov/comp3/["Compendium
 of U.S. Copyright Office Practices"], particularly "Chapter 2200: Notice of
 Copyright", sections 2205.1(A) and 2205.1(F), or
 https://techwhirl.com/updating-copyright-notices/["Updating Copyright
 Notices"] for a more readable summary.
 ]
 If the code is modified in later years, add +-YYYY+ with the most recent year
 of modification.
 
 +<LICENSE>+ should follow the policy set forth in the
 https://github.com/ClusterLabs/pacemaker/blob/master/COPYING[+COPYING+] file,
 generally one of "GNU General Public License version 2 or later (GPLv2+)"
 or "GNU Lesser General Public License version 2.1 or later (LGPLv2.1+)".
 
-== Formatting ==
+Header files should additionally protect against multiple inclusion by defining
+a unique symbol.
+
+====
+[source,C]
+----
+#ifndef MY_HEADER_NAME__H
+#  define MY_HEADER_NAME__H
+
+// header code here
+
+#endif // MY_HEADER_NAME__H
+----
+====
 
-=== Whitespace ===
+Public API header files should additionally declare "C" compatibility for
+inclusion by C++, and give a Doxygen file description. For example:
+
+====
+[source,C]
+----
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * \file
+ * \brief My brief description here
+ * \ingroup core
+ */
+
+// header code here
+
+#ifdef __cplusplus
+}
+#endif
+----
+====
+
+=== Line Formatting ===
 
 indexterm:[C,whitespace]
 
 - Indentation must be 4 spaces, no tabs.
 - Do not leave trailing whitespace.
-
-=== Line Length ===
-
 - Lines should be no longer than 80 characters unless limiting line length
   significantly impacts readability.
 
 === Pointers ===
 
 indexterm:[C,pointers]
 
 - The +*+ goes by the variable name, not the type:
 
 ====
 [source,C]
 ----
 char *foo;
 ----
 ====
 
 - Use a space before the +*+ and after the closing parenthesis in a cast:
 
 ====
 [source,C]
 ----
 char *foo = (char *) bar;
 ----
 ====
 
-=== Functions ===
+=== Function Definitions ===
 
 indexterm:[C,functions]
 
 - In the function definition, put the return type on its own line, and place
-  the opening brace by itself on a line:
-
-====
-[source,C]
-----
-static int
-foo(void)
-{
-----
-====
-
+  the opening brace by itself on a line.
 - For functions with enough arguments that they must break to the next line,
-  align arguments with the first argument:
+  align arguments with the first argument.
+- When a function argument is a function itself, use the pointer form.
 
 ====
 [source,C]
 ----
 static int
 function_name(int bar, const char *a, const char *b,
-              const char *c, const char *d)
+              const char *c, void (*d)())
 {
 ----
 ====
 
 - If a function name gets really long, start the arguments on their own line
   with 8 spaces of indentation:
 
 ====
 [source,C]
 ----
 static int
 really_really_long_function_name_this_is_getting_silly_now(
         int bar, const char *a, const char *b,
         const char *c, const char *d)
 {
 ----
 ====
 
 === Control Statements (if, else, while, for, switch) ===
 
 - The keyword is followed by one space, then left parenthesis without space,
   condition, right parenthesis, space, opening bracket on the same line.
   +else+ and +else if+ are on the same line with the ending brace and opening
-  brace, separated by a space:
+  brace, separated by a space.
+- Always use braces around control statement blocks, even if they only contain
+  one line. This makes code review diffs smaller if a line gets added in the
+  future, and avoids any chance of bad indenting making a line incorrectly
+  appear to be part of the block.
+- Do not put assignments in +if+ or +while+ conditionals. This ensures that the
+  developer's intent is always clear, making code reviews easier and reducing
+  the chance of using assignment where comparison is intended.
 
 ====
 [source,C]
 ----
-if (condition1) {
+a = f();
+if (a < 0) {
     statement1;
-} else if (condition2) {
+} else if (some_other_condition) {
     statement2;
 } else {
     statement3;
 }
 ----
 ====
 
 - In a +switch+ statement, +case+ is indented one level, and the body of each
   +case+ is indented by another level. The opening brace is on the same line as
   +switch+.
 
 ====
 [source,C]
 ----
 switch (expression) {
     case 0:
         command1;
         break;
     case 1:
         command2;
         break;
     default:
         command3;
 }
 ----
 ====
 
 === Operators ===
 
 indexterm:[C,operators]
 
-- Operators have spaces from both sides. Do not rely on operator precedence;
-  use parentheses when mixing operators with different priority.
+- Operators have spaces on both sides.
+- Do not rely on operator precedence; use parentheses when mixing operators
+  with different priority.
 - No space is used after opening parenthesis and before closing parenthesis.
 
 ====
 [source,C]
 ----
 x = a + b - (c * d);
 ----
 ====
 
-== Naming Conventions ==
+== Best Practices ==
+
+The guidelines in this section offer technical advantages.
+
+=== New Struct and Enum Members ===
+
+In the public APIs, always add new struct members to the end of the struct.
+This allows us to maintain backward API/ABI compatibility (as long as the
+application being linked allocates structs via API functions).
+
+This generally applies to enum values as well, as the compiler will define
+enum values to 0, 1, etc., in the order given, so inserting a value in the
+middle will change the numerical values of all later values, making them
+backward-incompatible. However, if enum numerical values are explicitly
+specified rather than left to the compiler, new values can be added anywhere.
+
+=== Documentation ===
+
+All public API header files, functions, structs, enums, etc.,
+should be documented with Doxygen comment blocks, as Pacemaker's
+http://clusterlabs.org/pacemaker/doxygen/[online API documentation]
+is automatically generated via Doxygen. It is helpful to document
+private symbols in the same way, with an +\internal+ tag in the
+Doxygen comment.
+
+=== Symbol Naming ===
 
 indexterm:[C,naming]
 
+- All file and function names should be unique across the entire project,
+  to allow for individual tracing via +PCMK_trace_files+ and
+  +PCMK_trace_functions+, as well as making detail logs easier to follow.
 - Any exposed symbols in libraries (non-+static+ function names, type names,
   etc.) must begin with a prefix appropriate to the library, for example,
-  +crm_+, +pe_+, +st_+, +lrm_+.
+  +crm_+, +pe_+, +st_+, +lrm_+. This reduces the chance of naming collisions
+  with software linked against the library.
+- Time intervals are sometimes represented in Pacemaker code as user-defined
+  text specifications (e.g. "10s"), other times as an integer number of
+  seconds or milliseconds, and still other times as a string representation
+  of an integer number. Variables for these should be named with an indication
+  of which is being used (e.g. +interval_spec+, +interval_ms+, or
+  +interval_ms_s+ instead of +interval+).
+
+=== Memory Allocation ===
+
+Always use calloc() rather than malloc(). It has no additional cost on modern
+operating systems, and reduces the severity of uninitialized memory usage bugs.
+
+=== Logging ===
+
+- When format strings are used for derived data types whose implementation may
+  vary across platforms (+pid_t+, +time_t+, etc.), the safest approach is to
+  use +%lld+ in the format string, and cast the value to +(long long)+.
+
+=== Regular Expressions ===
+
+- Use +REG_NOSUB+ with +regcomp()+ whenever possible, for efficiency.
+- Be sure to use +regfree()+ appropriately.
 
-== vim Settings ==
+=== vim Settings ===
 
 indexterm:[vim]
 
 Developers who use +vim+ to edit source code can add the following settings to
 their +~/.vimrc+ file to follow Pacemaker C coding guidelines:
 
 ----
 " follow Pacemaker coding guidelines when editing C source code files
 filetype plugin indent on
 au FileType c   setlocal expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80
 autocmd BufNewFile,BufRead *.h set filetype=c
 let c_space_errors = 1
 ----
diff --git a/doc/Pacemaker_Development/en-US/Ch-FAQ.txt b/doc/Pacemaker_Development/en-US/Ch-FAQ.txt
index e7b30eb17f..78f14e45a5 100644
--- a/doc/Pacemaker_Development/en-US/Ch-FAQ.txt
+++ b/doc/Pacemaker_Development/en-US/Ch-FAQ.txt
@@ -1,113 +1,119 @@
 :compat-mode: legacy
 = Frequently Asked Questions =
 
 [qanda]
 Who is this document intended for?::
  Anyone who wishes to read and/or edit the Pacemaker source code.
  Casual contributors should feel free to read just this FAQ, and
  consult other chapters as needed.
 
 Where is the source code for Pacemaker?::
  indexterm:[downloads]
  indexterm:[source code]
  indexterm:[git,GitHub]
  The https://github.com/ClusterLabs/pacemaker[source code for Pacemaker] is
  kept on https://github.com/[GitHub], as are all software projects under the
  https://github.com/ClusterLabs[ClusterLabs] umbrella. Pacemaker uses
  https://git-scm.com/[Git] for source code management. If you are a Git newbie,
  the http://schacon.github.io/git/gittutorial.html[gittutorial(7) man page]
  is an excellent starting point. If you're familiar with using Git from the
  command line, you can create a local copy of the Pacemaker source code with:
- `git clone https://github.com/ClusterLabs/pacemaker.git
+ `git clone https://github.com/ClusterLabs/pacemaker.git`
 
 What are the different Git branches and repositories used for?::
  indexterm:[branches]
  * The https://github.com/ClusterLabs/pacemaker/tree/master[master branch]
    is the primary branch used for development.
- * The https://github.com/ClusterLabs/pacemaker/tree/1.1[1.1 branch] contains
+ * The https://github.com/ClusterLabs/pacemaker/tree/2.0[2.0 branch] contains
    the latest official release, and normally does not receive any changes.
    During the release cycle, it will contain release candidates for the
    next official release, and will receive only bug fixes.
+ * The https://github.com/ClusterLabs/pacemaker/tree/1.1[1.1 branch] is similar
+   to both the master and 2.0 branches, but for the 1.1 release series.
+   The 1.1 branch receives only backports of certain bug fixes and
+   backward-compatible features from the master branch. During the release
+   cycle, it will contain release candidates for the next official 1.1 release.
  * The https://github.com/ClusterLabs/pacemaker-1.0[1.0 repository] is a
    frozen snapshot of the 1.0 release series, and is no longer developed.
  * Messages will be posted to the
    http://clusterlabs.org/mailman/listinfo/developers[developers@clusterlabs.org]
    mailing list during the release cycle, with instructions about which
    branches to use when submitting requests.
 
 How do I build from the source code?::
  See https://github.com/ClusterLabs/pacemaker/blob/master/INSTALL.md[INSTALL.md]
  in the main checkout directory.
 
 What coding style should I follow?::
  You'll be mostly fine if you simply follow the example of existing code.
  When unsure, see the relevant chapter of this document for language-specific
  recommendations. Pacemaker has grown and evolved organically over many years,
  so you will see much code that doesn't conform to the current guidelines. We
  discourage making changes solely to bring code into conformance, as any change
  requires developer time for review and opens the possibility of adding bugs.
  However, new code should follow the guidelines, and it is fine to bring lines
  of older code into conformance when modifying that code for other reasons.
 
 How should I format my Git commit messages?::
  indexterm:[git,commit messages]
  See existing examples in the git log. The first line should look like
- +change-type: affected-code: explanation+ where +change-type+ can be
+ +change-type: affected-code: explanation+ where +change-type+ should be
  +Fix+ or +Bug+ for most bug fixes, +Feature+ for new features, +Log+ for
  changes to log messages or handling, +Doc+ for changes to documentation or
  comments, or +Test+ for changes in CTS and regression tests. You will
  sometimes see +Low+, +Med+ (or +Mid+) and +High+ used instead for bug fixes,
  to indicate the severity. The important thing is that only commits with
  +Feature+, +Fix+, +Bug+, or +High+ will automatically be included in the
  change log for the next release. The +affected-code+ is the name of the
- component(s) being changed, for example, +pacemaker-controld+ or
+ component(s) being changed, for example, +controller+ or
  +libcrmcommon+ (it's more free-form, so don't sweat getting it exact). The
  +explanation+ briefly describes the change. The git project recommends the
  entire summary line stay under 50 characters, but more is fine if needed for
  clarity. Except for the most simple and obvious of changes, the summary should
  be followed by a blank line and then a longer explanation of 'why' the change
  was made.
 
 How can I test my changes?::
  Most importantly, Pacemaker has regression tests for most major components;
  these will automatically be run for any pull requests submitted through
  GitHub. Additionally, Pacemaker's Cluster Test Suite (CTS) can be used to set
  up a test cluster and run a wide variety of complex tests. This document will
  have more detail on testing in the future.
 
 What is Pacemaker's license?::
  indexterm:[licensing]
  Except where noted otherwise in the file itself, the source code for all
  Pacemaker programs is licensed under version 2 or later of the GNU General
  Public License (https://www.gnu.org/licenses/gpl-2.0.html[GPLv2+]), its
  headers and libraries under version 2.1 or later of the less restrictive
  GNU Lesser General Public License
  (https://www.gnu.org/licenses/lgpl-2.1.html[LGPLv2.1+]),
  its documentation under version 4.0 or later of the
  Creative Commons Attribution-ShareAlike International Public License
- (https://creativecommons.org/licenses/by-sa/4.0/legalcode[CC-BY-SA]),
+ (https://creativecommons.org/licenses/by-sa/4.0/legalcode[CC-BY-SA-4.0]),
  and its init scripts under the
  https://opensource.org/licenses/BSD-3-Clause[Revised BSD] license. If you find
  any deviations from this policy, or wish to inquire about alternate licensing
- arrangements, please e-mail mailto:andrew@beekhof.net[andrew@beekhof.net].
+ arrangements, please e-mail the
+ mailto:developers@clusterlabs.org[developers@ClusterLabs.org] mailing list.
  Licensing issues are also discussed on the
- http://clusterlabs.org/wiki/License[ClusterLabs wiki].
+ https://wiki.clusterlabs.org/wiki/License[ClusterLabs wiki].
 
 How can I contribute my changes to the project?::
  Contributions of bug fixes or new features are very much appreciated!
  Patches can be submitted as
  https://help.github.com/articles/using-pull-requests/[pull requests]
  via GitHub (the preferred method, due to its excellent
  https://github.com/features/[features]), or e-mailed to the
  http://clusterlabs.org/mailman/listinfo/developers[developers@clusterlabs.org]
  mailing list as an attachment in a format Git can import.
 
 What if I still have questions?::
  indexterm:[mailing lists]
  Ask on the
  http://clusterlabs.org/mailman/listinfo/developers[developers@clusterlabs.org]
  mailing list for development-related questions, or on the
  http://clusterlabs.org/mailman/listinfo/users[users@clusterlabs.org]
  mailing list for general questions about using Pacemaker.
  Developers often also hang out on http://freenode.net/[freenode's]
  #clusterlabs IRC channel.
diff --git a/doc/Pacemaker_Development/en-US/Revision_History.xml b/doc/Pacemaker_Development/en-US/Revision_History.xml
index 594b5df66d..6769c833ea 100644
--- a/doc/Pacemaker_Development/en-US/Revision_History.xml
+++ b/doc/Pacemaker_Development/en-US/Revision_History.xml
@@ -1,65 +1,77 @@
 <?xml version='1.0' encoding='utf-8' ?>
 <!DOCTYPE appendix PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
 <!ENTITY % BOOK_ENTITIES SYSTEM "Pacemaker_Development.ent">
 %BOOK_ENTITIES;
 ]>
 <appendix id="appe-Pacemaker_Development-Revision_History">
   <!-- see comment in Book_Info.xml for revision numbering -->
   <title>Revision History</title>
   <simpara>
     <revhistory>
 
       <revision>
         <revnumber>1-0</revnumber>
         <date>Tue Jul 26 2016</date>
         <author>
           <firstname>Ken</firstname><surname>Gaillot</surname>
           <email>kgaillot@redhat.com</email>
         </author>
         <revdescription>
           <simplelist><member>Convert coding guidelines and developer FAQ
                               to Publican document</member></simplelist>
         </revdescription>
       </revision>
 
       <revision>
         <revnumber>1-1</revnumber>
         <date>Mon Aug 29 2016</date>
         <author>
           <firstname>Ken</firstname><surname>Gaillot</surname>
           <email>kgaillot@redhat.com</email>
         </author>
         <revdescription>
           <simplelist><member>Add Python coding guidelines, and
                               more about licensing</member></simplelist>
         </revdescription>
       </revision>
 
       <revision>
         <revnumber>2-0</revnumber>
         <date>Fri Jan 12 2018</date>
         <author>
           <firstname>Ken</firstname><surname>Gaillot</surname>
           <email>kgaillot@redhat.com</email>
         </author>
         <revdescription>
           <simplelist><member>Drop support for Python 2.6</member></simplelist>
         </revdescription>
       </revision>
 
       <revision>
         <revnumber>2-1</revnumber>
         <date>Tue Sep 18 2018</date>
         <author>
           <firstname>Jan</firstname><surname>Pokorný</surname>
           <email>poki@redhat.com</email>
         </author>
         <revdescription>
           <simplelist><member>Start documenting notable evolutionary
                               points</member></simplelist>
         </revdescription>
       </revision>
 
+      <revision>
+        <revnumber>2-2</revnumber>
+        <date>Fri Dec 7 2018</date>
+        <author>
+          <firstname>Ken</firstname><surname>Gaillot</surname>
+          <email>kgaillot@redhat.com</email>
+        </author>
+        <revdescription>
+          <simplelist><member>Update FAQ and C guidelines</member></simplelist>
+        </revdescription>
+      </revision>
+
     </revhistory>
   </simpara>
 </appendix>
diff --git a/doc/Pacemaker_Explained/en-US/Ap-FAQ.txt b/doc/Pacemaker_Explained/en-US/Ap-FAQ.txt
deleted file mode 100644
index b89bf4af04..0000000000
--- a/doc/Pacemaker_Explained/en-US/Ap-FAQ.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-:compat-mode: legacy
-[appendix]
-
-[[ap-faq]]
-== FAQ ==
-
-
-[qanda]
-Why is the Project Called Pacemaker?::
- indexterm:[Pacemaker]
- First of all, the reason it's not called the CRM is because of the abundance
- of terms footnote:[http://en.wikipedia.org/wiki/CRM] that are commonly
- abbreviated to those three letters. The Pacemaker name came from Kham,
- footnote:[http://khamsouk.souvanlasy.com/] a good friend of Pacemaker
- developer Andrew Beekhof's, and was originally used by a Java GUI that Beekhof
- was prototyping in early 2007. Alas, other commitments prevented the GUI from
- progressing much and, when it came time to choose a name for this project,
- Lars Marowsky-Bree suggested it was an even better fit for an independent CRM.
- The idea stems from the analogy between the role of this software and that of
- the little device that keeps the human heart pumping.  Pacemaker monitors the
- cluster and intervenes when necessary to ensure the smooth operation of the
- services it provides.
- There were a number of other names (and acronyms) tossed around, but suffice to
- say "Pacemaker" was the best.
-
-Why was the Pacemaker Project Created?::
-
- Pacemaker was spun off from an earlier project called
- http://linux-ha.org/[Heartbeat], which combined a cluster layer and a cluster
- resource manager. The CRM was made into its own project, Pacemaker, in order to:
-
- * support both the Corosync and Heartbeat cluster stacks equally (Heartbeat
-   support was dropped in Pacemaker 2.0, as the project had faded out by then)
- * decouple the release cycles of the cluster layer and the cluster resource
-   manager at very different stages of their life-cycles
- * foster clearer package boundaries, thus leading to better and more stable interfaces
-
-What Messaging Layers are Supported?::
- indexterm:[Messaging Layers]
-
- * http://www.corosync.org/[Corosync] version 2 and greater
- * Historically, Pacemaker 1 also supported Corosync version 1 (with either
-   CMAN or a pacemaker plugin) and Heartbeat. Support for these legacy stacks
-   was dropped with Pacemaker 2.0.
-  
-Where Can I Get Pre-built Packages?::
-
- Most major Linux distributions have pacemaker packages in their standard
- package repositories. See the http://clusterlabs.org/wiki/Install[Install wiki
- page] for details.
-
-What Versions of Pacemaker Are Supported?::
-
- Some Linux distributions (such as Red Hat Enterprise Linux and SUSE Linux
- Enterprise) offer technical support for their customers; contact them
- for details of such support.
- For help within the community (mailing lists, IRC, etc.) from Pacemaker developers
- and users, refer to the http://clusterlabs.org/wiki/Releases[Releases wiki page]
- for an up-to-date list of versions considered to be supported by the project.
- When seeking assistance, please try to ensure you have one of these versions.
diff --git a/doc/Pacemaker_Explained/en-US/Author_Group.xml b/doc/Pacemaker_Explained/en-US/Author_Group.xml
index 7f3c297cee..08fc1f72ed 100644
--- a/doc/Pacemaker_Explained/en-US/Author_Group.xml
+++ b/doc/Pacemaker_Explained/en-US/Author_Group.xml
@@ -1,53 +1,59 @@
 <?xml version='1.0' encoding='utf-8' ?>
 <!DOCTYPE authorgroup PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
 ]>
 <authorgroup>
   <author>
     <firstname>Andrew</firstname><surname>Beekhof</surname>
     <affiliation><orgname>Red Hat</orgname></affiliation>
     <contrib>Primary author</contrib>
     <email>andrew@beekhof.net</email>
   </author>
+  <othercredit>
+    <firstname>Ken</firstname><surname>Gaillot</surname>
+    <affiliation><orgname>Red Hat</orgname></affiliation>
+    <contrib>Co-author</contrib>
+    <email>kgaillot@redhat.com</email>
+  </othercredit>
   <othercredit>
     <firstname>Philipp</firstname><surname>Marek</surname>
     <affiliation><orgname>LINBit</orgname></affiliation>
     <contrib>Style and formatting updates. Indexing.</contrib>
     <email>philipp.marek@linbit.com</email>
   </othercredit>
   <othercredit>
     <firstname>Tanja</firstname><surname>Roth</surname>
     <affiliation><orgname>SUSE</orgname></affiliation>
     <contrib>Utilization chapter</contrib>
     <contrib>Resource Templates chapter</contrib>
     <contrib>Multi-Site Clusters chapter</contrib>
     <email>taroth@suse.com</email>
   </othercredit>
   <othercredit>
     <firstname>Lars</firstname><surname>Marowsky-Bree</surname>
     <affiliation><orgname>SUSE</orgname></affiliation>
     <contrib>Multi-Site Clusters chapter</contrib>
     <email>lmb@suse.com</email>
   </othercredit>
   <othercredit>
     <firstname>Yan</firstname><surname>Gao</surname>
     <affiliation><orgname>SUSE</orgname></affiliation>
     <contrib>Utilization chapter</contrib>
     <contrib>Resource Templates chapter</contrib>
     <contrib>Multi-Site Clusters chapter</contrib>
     <email>ygao@suse.com</email>
   </othercredit>
   <othercredit>
     <firstname>Thomas</firstname><surname>Schraitle</surname>
     <affiliation><orgname>SUSE</orgname></affiliation>
     <contrib>Utilization chapter</contrib>
     <contrib>Resource Templates chapter</contrib>
     <contrib>Multi-Site Clusters chapter</contrib>
     <email>toms@suse.com</email>
   </othercredit>
   <othercredit>
     <firstname>Dejan</firstname><surname>Muhamedagic</surname>
     <affiliation><orgname>SUSE</orgname></affiliation>
     <contrib>Resource Templates chapter</contrib>
     <email>dmuhamedagic@suse.com</email>
   </othercredit>
 </authorgroup>
diff --git a/doc/Pacemaker_Explained/en-US/Book_Info.xml b/doc/Pacemaker_Explained/en-US/Book_Info.xml
index da196e34ae..3da61de4f7 100644
--- a/doc/Pacemaker_Explained/en-US/Book_Info.xml
+++ b/doc/Pacemaker_Explained/en-US/Book_Info.xml
@@ -1,35 +1,35 @@
 <?xml version='1.0' encoding='utf-8' ?>
 <!DOCTYPE bookinfo PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
 ]>
 <bookinfo>
   <title>Configuration Explained</title>
   <subtitle>An A-Z guide to Pacemaker's Configuration Options</subtitle>
   <productname>Pacemaker</productname>
   <productnumber>2.0</productnumber>
   <!--
-	EDITION-PUBSNUMBER should match REVNUMBER in Revision_History.xml.
-	Increment EDITION when the syntax of the documented software
-	changes (pacemaker), and PUBSNUMBER for
-	simple textual changes (corrections, translations, etc.).
+        EDITION-PUBSNUMBER should match REVNUMBER in Revision_History.xml.
+        Increment EDITION when the syntax of the documented software
+        changes (pacemaker), and PUBSNUMBER for
+        simple textual changes (corrections, translations, etc.).
   -->
-  <edition>11</edition>
+  <edition>12</edition>
   <pubsnumber>0</pubsnumber>
   <abstract>
     <para>
       The purpose of this document is to definitively explain the concepts used to configure Pacemaker.
       To achieve this, it will focus exclusively on the XML syntax used to configure Pacemaker's
       Cluster Information Base (CIB).
     </para>
   </abstract>
   <corpauthor>
     <inlinemediaobject>
       <imageobject>
         <imagedata fileref="Common_Content/images/title_logo.svg" format="SVG"/>
       </imageobject>
     </inlinemediaobject>
   </corpauthor>
   <xi:include href="Common_Content/Legal_Notice.xml" xmlns:xi="http://www.w3.org/2001/XInclude">
   </xi:include>
   <xi:include href="Author_Group.xml" xmlns:xi="http://www.w3.org/2001/XInclude">
   </xi:include>
 </bookinfo>
diff --git a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Options.txt b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Options.txt
index d0aba3914f..dd74e59735 100644
--- a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Options.txt
+++ b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Options.txt
@@ -1,729 +1,732 @@
 :compat-mode: legacy
 = Advanced Configuration =
 
 [[s-recurring-start]]
 == Specifying When Recurring Actions are Performed ==
 
 
 By default, recurring actions are scheduled relative to when the
 resource started.  So if your resource was last started at 14:32 and
 you have a backup set to be performed every 24 hours, then the backup
 will always run in the middle of the business day -- hardly
 desirable.
 
 To specify a date and time that the operation should be relative to, set
 the operation's +interval-origin+.  The cluster uses this point to
 calculate the correct +start-delay+ such that the operation will occur
 at _origin + (interval * N)_.
 
 So, if the operation's interval is 24h, its interval-origin is set to
 02:00 and it is currently 14:32, then the cluster would initiate
 the operation with a start delay of 11 hours and 28 minutes.  If the
 resource is moved to another node before 2am, then the operation is
 cancelled.
 
 The value specified for +interval+ and +interval-origin+ can be any
 date/time conforming to the
 http://en.wikipedia.org/wiki/ISO_8601[ISO8601 standard].  By way of
 example, to specify an operation that would run on the first Monday of
 2009 and every Monday after that, you would add:
 
 .Specifying a Base for Recurring Action Intervals
 =====
 [source,XML]
 <op id="my-weekly-action" name="custom-action" interval="P7D" interval-origin="2009-W01-1"/> 
 =====
 
 [[s-failure-handling]]
 == Handling Resource Failure ==
 
 By default, Pacemaker will attempt to recover failed resources by restarting
 them. However, failure recovery is highly configurable.
 
 === Failure Counts ===
 
 Pacemaker tracks resource failures for each combination of node, resource, and
 operation (start, stop, monitor, etc.).
 
 You can query the fail count for a particular node, resource, and/or operation
 using the `crm_failcount` command. For example, to see how many times the
 10-second monitor for +myrsc+ has failed on +node1+, run:
 
 ----
 # crm_failcount --query -r myrsc -N node1 -n monitor -I 10s
 ----
 
 If you omit the node, `crm_failcount` will use the local node. If you omit the
 operation and interval, `crm_failcount` will display the sum of the fail counts
 for all operations on the resource.
 
 You can use `crm_resource --cleanup` or `crm_failcount --delete` to clear
 fail counts. For example, to clear the above monitor failures, run:
 
 ----
 # crm_resource --cleanup -r myrsc -N node1 -n monitor -I 10s
 ----
 
 If you omit the resource, `crm_resource --cleanup` will clear failures for all
 resources. If you omit the node, it will clear failures on all nodes. If you
 omit the operation and interval, it will clear the failures for all operations
 on the resource.
 
 [NOTE]
 ====
 Even when cleaning up only a single operation, all failed operations will
 disappear from the status display. This allows us to trigger a re-check of the
 resource's current status.
 ====
 
 Higher-level tools may provide other commands for querying and clearing
 fail counts.
 
 The `crm_mon` tool shows the current cluster status, including any failed
 operations. To see the current fail counts for any failed resources, call
 `crm_mon` with the `--failcounts` option. This shows the fail counts per
 resource (that is, the sum of any operation fail counts for the resource).
 
 === Failure Response ===
 
 Normally, if a running resource fails, pacemaker will try to stop it and start
 it again. Pacemaker will choose the best location to start it each time, which
 may be the same node that it failed on.
 
 However, if a resource fails repeatedly, it is possible that there is an
 underlying problem on that node, and you might desire trying a different node
 in such a case. Pacemaker allows you to set your preference via the
 +migration-threshold+ resource meta-attribute.
 footnote:[
 The naming of this option was perhaps unfortunate as it is easily
 confused with live migration, the process of moving a resource from
 one node to another without stopping it.  Xen virtual guests are the
 most common example of resources that can be migrated in this manner.
 ]
 
 If you define +migration-threshold=pass:[<replaceable>N</replaceable>]+ for a
 resource, it will be banned from the original node after 'N' failures.
 
 [NOTE]
 ====
 The +migration-threshold+ is per 'resource', even though fail counts are
 tracked per 'operation'. The operation fail counts are added together
 to compare against the +migration-threshold+.
 ====
 
 By default, fail counts remain until manually cleared by an administrator
 using `crm_resource --cleanup` or `crm_failcount --delete` (hopefully after
 first fixing the failure's cause). It is possible to have fail counts expire
 automatically by setting the +failure-timeout+ resource meta-attribute.
 
 [IMPORTANT]
 ====
 A successful operation does not clear past failures. If a recurring monitor
 operation fails once, succeeds many times, then fails again days later, its
 fail count is 2. Fail counts are cleared only by manual intervention or
 failure timeout.
 ====
 
 For example, a setting of +migration-threshold=2+ and +failure-timeout=60s+
 would cause the resource to move to a new node after 2 failures, and
 allow it to move back (depending on stickiness and constraint scores) after one
 minute.
 
 [NOTE]
 ====
 +failure-timeout+ is measured since the most recent failure. That is, older
 failures do not individually time out and lower the fail count. Instead, all
 failures are timed out simultaneously (and the fail count is reset to 0) if
 there is no new failure for the timeout period.
 ====
 
 There are two exceptions to the migration threshold concept:
 when a resource either fails to start or fails to stop.
 
 If the cluster property +start-failure-is-fatal+ is set to +true+ (which is the
 default), start failures cause the fail count to be set to +INFINITY+ and thus
 always cause the resource to move immediately.
 
 Stop failures are slightly different and crucial.  If a resource fails
 to stop and STONITH is enabled, then the cluster will fence the node
 in order to be able to start the resource elsewhere.  If STONITH is
 not enabled, then the cluster has no way to continue and will not try
 to start the resource elsewhere, but will try to stop it again after
 the failure timeout.
 
 [IMPORTANT]
 Please read <<s-rules-recheck>> to understand how timeouts work
 before configuring a +failure-timeout+.
 
 == Moving Resources ==
 indexterm:[Moving,Resources] 
 indexterm:[Resource,Moving]
 
 === Moving Resources Manually ===
 
 There are primarily two occasions when you would want to move a
 resource from its current location: when the whole node is under
 maintenance, and when a single resource needs to be moved.
 
 ==== Standby Mode ====
 
 Since everything eventually comes down to a score, you could create
 constraints for every resource to prevent them from running on one
 node.  While pacemaker configuration can seem convoluted at times, not even
 we would require this of administrators.
 
 Instead, one can set a special node attribute which tells the cluster
 "don't let anything run here".  There is even a helpful tool to help
 query and set it, called `crm_standby`.  To check the standby status
 of the current machine, run:
 
 ----
 # crm_standby -G
 ----
 
 A value of +on+ indicates that the node is _not_ able to host any
 resources, while a value of +off+ says that it _can_.
 
 You can also check the status of other nodes in the cluster by
 specifying the `--node` option:
 
 ----
 # crm_standby -G --node sles-2
 ----
 
 To change the current node's standby status, use `-v` instead of `-G`:
 
 ----
 # crm_standby -v on
 ----
 
 Again, you can change another host's value by supplying a hostname with `--node`.
 
+A cluster node in standby mode will not run resources, but still contributes to
+quorum, and may fence or be fenced by nodes.
+
 ==== Moving One Resource ====
 
 When only one resource is required to move, we could do this by creating
 location constraints.  However, once again we provide a user-friendly
 shortcut as part of the `crm_resource` command, which creates and
 modifies the extra constraints for you.  If +Email+ were running on
 +sles-1+ and you wanted it moved to a specific location, the command
 would look something like:
         
 ----
 # crm_resource -M -r Email -H sles-2
 ----
 
 Behind the scenes, the tool will create the following location constraint:
 
 [source,XML]
 <rsc_location rsc="Email" node="sles-2" score="INFINITY"/>
 
 It is important to note that subsequent invocations of `crm_resource
 -M` are not cumulative. So, if you ran these commands
 
 ----
 # crm_resource -M -r Email -H sles-2
 # crm_resource -M -r Email -H sles-3
 ----
 
 then it is as if you had never performed the first command.
 
 To allow the resource to move back again, use:
 
 ----
 # crm_resource -U -r Email
 ----
 
 Note the use of the word _allow_.  The resource can move back to its
 original location but, depending on +resource-stickiness+, it might
 stay where it is.  To be absolutely certain that it moves back to
 +sles-1+, move it there before issuing the call to `crm_resource -U`:
         
 ----
 # crm_resource -M -r Email -H sles-1
 # crm_resource -U -r Email
 ----
 
 Alternatively, if you only care that the resource should be moved from
 its current location, try:
 
 ----
 # crm_resource -B -r Email
 ----
 
 Which will instead create a negative constraint, like
 
 [source,XML]
 <rsc_location rsc="Email" node="sles-1" score="-INFINITY"/>
 
 This will achieve the desired effect, but will also have long-term
 consequences.  As the tool will warn you, the creation of a
 +-INFINITY+ constraint will prevent the resource from running on that
 node until `crm_resource -U` is used.  This includes the situation
 where every other cluster node is no longer available!
 
 In some cases, such as when +resource-stickiness+ is set to
 +INFINITY+, it is possible that you will end up with the problem
 described in <<node-score-equal>>.  The tool can detect
 some of these cases and deals with them by creating both
 positive and negative constraints. E.g.
 
 +Email+ prefers +sles-1+ with a score of +-INFINITY+
 
 +Email+ prefers +sles-2+ with a score of +INFINITY+
 
 which has the same long-term consequences as discussed earlier.
 
 === Moving Resources Due to Connectivity Changes ===
 
 You can configure the cluster to move resources when external connectivity is
 lost in two steps.
 
 ==== Tell Pacemaker to Monitor Connectivity ====
 
 First, add an *ocf:pacemaker:ping* resource to the cluster.  The
 *ping* resource uses the system utility of the same name to test whether a
 list of machines (specified by DNS hostname or IPv4/IPv6 address) are
 reachable and uses the results to maintain a node attribute called +pingd+
 by default.
 footnote:[
 The attribute name is customizable, in order to allow multiple ping groups to be defined.
 ]
 
 [NOTE]
 ===========
 Older versions of Pacemaker used a different agent *ocf:pacemaker:pingd* which
 is now deprecated in favor of *ping*. If your version of Pacemaker does not
 contain the *ping* resource agent, download the latest version from
 https://github.com/ClusterLabs/pacemaker/tree/master/extra/resources/ping
 ===========
 
 Normally, the ping resource should run on all cluster nodes, which means that
 you'll need to create a clone.  A template for this can be found below
 along with a description of the most interesting parameters.
           
 .Common Options for a 'ping' Resource
 [width="95%",cols="1m,<4",options="header",align="center"]
 |=========================================================
 
 |Field
 |Description
 
 |dampen
 |The time to wait (dampening) for further changes to occur. Use this
  to prevent a resource from bouncing around the cluster when cluster
  nodes notice the loss of connectivity at slightly different times.
  indexterm:[dampen,Ping Resource Option]
  indexterm:[Ping Resource,Option,dampen]
 
 |multiplier
 |The number of connected ping nodes gets multiplied by this value to
  get a score. Useful when there are multiple ping nodes configured.
  indexterm:[multiplier,Ping Resource Option]
  indexterm:[Ping Resource,Option,multiplier]
 
 |host_list
 |The machines to contact in order to determine the current
  connectivity status. Allowed values include resolvable DNS host
  names, IPv4 and IPv6 addresses.
  indexterm:[host_list,Ping Resource Option]
  indexterm:[Ping Resource,Option,host_list]
 
 |=========================================================
 
 .An example ping cluster resource that checks node connectivity once every minute
 =====
 [source,XML]
 ------------
 <clone id="Connected">
    <primitive id="ping" provider="pacemaker" class="ocf" type="ping">
     <instance_attributes id="ping-attrs">
       <nvpair id="pingd-dampen" name="dampen" value="5s"/>
       <nvpair id="pingd-multiplier" name="multiplier" value="1000"/>
       <nvpair id="pingd-hosts" name="host_list" value="my.gateway.com www.bigcorp.com"/>
     </instance_attributes>
     <operations>
       <op id="ping-monitor-60s" interval="60s" name="monitor"/>
     </operations>
    </primitive>
 </clone>
 ------------
 =====
 
 [IMPORTANT]
 ===========
 You're only half done.  The next section deals with telling Pacemaker
 how to deal with the connectivity status that +ocf:pacemaker:ping+ is
 recording.
 ===========
 
 ==== Tell Pacemaker How to Interpret the Connectivity Data ====
 
 [IMPORTANT]
 ======
 Before attempting the following, make sure you understand
 <<ch-rules>>.
 ======
 
 There are a number of ways to use the connectivity data.
 
 The most common setup is for people to have a single ping
 target (e.g. the service network's default gateway), to prevent the cluster
 from running a resource on any unconnected node.
 
 .Don't run a resource on unconnected nodes
 =====
 [source,XML]
 -------
 <rsc_location id="WebServer-no-connectivity" rsc="Webserver">
    <rule id="ping-exclude-rule" score="-INFINITY" >
     <expression id="ping-exclude" attribute="pingd" operation="not_defined"/>
    </rule>
 </rsc_location>
 -------
 =====
 
 A more complex setup is to have a number of ping targets configured.
 You can require the cluster to only run resources on nodes that can
 connect to all (or a minimum subset) of them.
 
 .Run only on nodes connected to three or more ping targets.
 =====
 [source,XML]
 -------
 <primitive id="ping" provider="pacemaker" class="ocf" type="ping">
 ... <!-- omitting some configuration to highlight important parts -->
       <nvpair id="pingd-multiplier" name="multiplier" value="1000"/>
 ...
 </primitive>
 ...
 <rsc_location id="WebServer-connectivity" rsc="Webserver">
    <rule id="ping-prefer-rule" score="-INFINITY" >
       <expression id="ping-prefer" attribute="pingd" operation="lt" value="3000"/>
    </rule>
 </rsc_location>
 -------
 =====
 
 Alternatively, you can tell the cluster only to _prefer_ nodes with the best
 connectivity.  Just be sure to set +multiplier+ to a value higher than
 that of +resource-stickiness+ (and don't set either of them to
 +INFINITY+).
 
 .Prefer the node with the most connected ping nodes
 =====
 [source,XML]
 -------
 <rsc_location id="WebServer-connectivity" rsc="Webserver">
    <rule id="ping-prefer-rule" score-attribute="pingd" >
     <expression id="ping-prefer" attribute="pingd" operation="defined"/>
    </rule>
 </rsc_location>
 -------
 =====
 
 It is perhaps easier to think of this in terms of the simple
 constraints that the cluster translates it into.  For example, if
 *sles-1* is connected to all five ping nodes but *sles-2* is only
 connected to two, then it would be as if you instead had the following
 constraints in your configuration:
 
 .How the cluster translates the above location constraint
 =====
 [source,XML]
 -------
 <rsc_location id="ping-1" rsc="Webserver" node="sles-1" score="5000"/>
 <rsc_location id="ping-2" rsc="Webserver" node="sles-2" score="2000"/>
 -------
 =====
 
 The advantage is that you don't have to manually update any
 constraints whenever your network connectivity changes.
 
 You can also combine the concepts above into something even more
 complex.  The example below shows how you can prefer the node with the
 most connected ping nodes provided they have connectivity to at least
 three (again assuming that +multiplier+ is set to 1000).
 
 .A more complex example of choosing a location based on connectivity
 =====
 [source,XML]
 -------
 <rsc_location id="WebServer-connectivity" rsc="Webserver">
    <rule id="ping-exclude-rule" score="-INFINITY" >
     <expression id="ping-exclude" attribute="pingd" operation="lt" value="3000"/>
    </rule>
    <rule id="ping-prefer-rule" score-attribute="pingd" >
     <expression id="ping-prefer" attribute="pingd" operation="defined"/>
    </rule>
 </rsc_location>
 -------
 =====
 
 [[s-migrating-resources]]
 === Migrating Resources ===
 
 Normally, when the cluster needs to move a resource, it fully restarts
 the resource (i.e. stops the resource on the current node
 and starts it on the new node).
 
 However, some types of resources, such as Xen virtual guests, are able to move to
 another location without loss of state (often referred to as live migration
 or hot migration). In Pacemaker, this is called resource migration.
 Pacemaker can be configured to migrate a resource when moving it,
 rather than restarting it.
 
 Not all resources are able to migrate (see the Migration Checklist
 below), and those that can won't do so in all situations.
 Conceptually, there are two requirements from which the other
 prerequisites follow:
 
 * The resource must be active and healthy at the old location; and
 * everything required for the resource to run must be available on
   both the old and new locations.
 
 The cluster is able to accommodate both 'push' and 'pull' migration models
 by requiring the resource agent to support two special actions:
 +migrate_to+ (performed on the current location) and +migrate_from+
 (performed on the destination).
 
 In push migration, the process on the current location transfers the
 resource to the new location where it is later activated.  In this
 scenario, most of the work would be done in the +migrate_to+ action
 and, if anything, the activation would occur during +migrate_from+.
 
 Conversely for pull, the +migrate_to+ action is practically empty and
 +migrate_from+ does most of the work, extracting the relevant resource
 state from the old location and activating it.
 
 There is no wrong or right way for a resource agent to implement migration,
 as long as it works.
 
 .Migration Checklist
 * The resource may not be a clone.
 * The resource must use an OCF style agent.
 * The resource must not be in a failed or degraded state.
 * The resource agent must support +migrate_to+ and
   +migrate_from+ actions, and advertise them in its metadata.
 * The resource must have the +allow-migrate+ meta-attribute set to
   +true+ (which is not the default).
 
 If an otherwise migratable resource depends on another resource
 via an ordering constraint, there are special situations in which it will be
 restarted rather than migrated.
 
 For example, if the resource depends on a clone, and at the time the resource
 needs to be moved, the clone has instances that are stopping and instances
 that are starting, then the resource will be restarted. The scheduler is not
 yet able to model this situation correctly and so takes the safer (if less
 optimal) path.
 
 Also, if a migratable resource depends on a non-migratable resource, and both
 need to be moved, the migratable resource will be restarted.
 
 [[s-node-health]]
 == Tracking Node Health ==
 
 A node may be functioning adequately as far as cluster membership is concerned,
 and yet be "unhealthy" in some respect that makes it an undesirable location
 for resources. For example, a disk drive may be reporting SMART errors, or the
 CPU may be highly loaded.
 
 Pacemaker offers a way to automatically move resources off unhealthy nodes.
 
 === Node Health Attributes ===
 
 Pacemaker will treat any node attribute whose name starts with +#health+ as an
 indicator of node health. Node health attributes may have one of the following
 values:
 
 .Allowed Values for Node Health Attributes
 [width="95%",cols="1,<3",options="header",align="center"]
 |=========================================================
 
 |Value
 |Intended significance
 
 |+red+
 |This indicator is unhealthy
  indexterm:[Node health,red]
 
 |+yellow+
 |This indicator is becoming unhealthy
  indexterm:[Node health,yellow]
 
 |+green+
 |This indicator is healthy
  indexterm:[Node health,green]
 
 |'integer'
 |A numeric score to apply to all resources on this node
  (0 or positive is healthy, negative is unhealthy)
  indexterm:[Node health,score]
 
 |=========================================================
 
 === Node Health Strategy ===
 
 Pacemaker assigns a node health score to each node, as the sum of the values of
 all its node health attributes. This score will be used as a location
 constraint applied to this node for all resources.
 
 The +node-health-strategy+ cluster option controls how Pacemaker responds to
 changes in node health attributes, and how it translates +red+, +yellow+, and
 +green+ to scores.
 
 Allowed values are:
 
 .Node Health Strategies
 [width="95%",cols="1m,<3",options="header",align="center"]
 |=========================================================
 
 |Value
 |Effect
 
 |none
 |Do not track node health attributes at all.
  indexterm:[Node health,none]
 
 |migrate-on-red
 |Assign the value of +-INFINITY+ to +red+, and 0 to +yellow+ and +green+.
  This will cause all resources to move off the node if any attribute is +red+.
  indexterm:[Node health,migrate-on-red]
 
 |only-green
 |Assign the value of +-INFINITY+ to +red+ and +yellow+, and 0 to +green+.
  This will cause all resources to move off the node if any attribute is +red+
  or +yellow+.
  indexterm:[Node health,only-green]
 
 |progressive
 |Assign the value of the +node-health-red+ cluster option to +red+, the value
  of +node-health-yellow+ to +yellow+, and the value of +node-health-green+ to
  +green+. Each node is additionally assigned a score of +node-health-base+
  (this allows resources to start even if some attributes are +yellow+). This
  strategy gives the administrator finer control over how important each value
  is.
  indexterm:[Node health,progressive]
 
 |custom
 |Track node health attributes using the same values as +progressive+ for
  +red+, +yellow+, and +green+, but do not take them into account.
  The administrator is expected to implement a policy by defining rules
  (see <<ch-rules>>) referencing node health attributes.
  indexterm:[Node health,custom]
 
 |=========================================================
 
 === Measuring Node Health ===
 
 Since Pacemaker calculates node health based on node attributes,
 any method that sets node attributes may be used to measure node
 health. The most common ways are resource agents or separate daemons.
 
 Pacemaker provides examples that can be used directly or as a basis for
 custom code. The +ocf:pacemaker:HealthCPU+ and +ocf:pacemaker:HealthSMART+
 resource agents set node health attributes based on CPU and disk parameters.
 The +ipmiservicelogd+ daemon sets node health attributes based on IPMI
 values (the +ocf:pacemaker:SystemHealth+ resource agent can be used to manage
 the daemon as a cluster resource).
 
 == Reloading Services After a Definition Change ==
 
 The cluster automatically detects changes to the definition of
 services it manages.  The normal response is to stop the
 service (using the old definition) and start it again (with the new
 definition).  This works well, but some services are smarter and can
 be told to use a new set of options without restarting.
 
 To take advantage of this capability, the resource agent must:
 
 . Accept the +reload+ operation and perform any required actions.
   _The actions here depend completely on your application!_
 +
 .The DRBD agent's logic for supporting +reload+
 =====
 [source,Bash]
 -------
 case $1 in
     start)
         drbd_start
         ;;
     stop)
         drbd_stop
         ;;
     reload)
         drbd_reload
         ;;
     monitor)
         drbd_monitor
         ;;
     *)
         drbd_usage
         exit $OCF_ERR_UNIMPLEMENTED
         ;;
 esac
 exit $?
 -------
 =====
 . Advertise the +reload+ operation in the +actions+ section of its metadata.
 +
 .The DRBD Agent Advertising Support for the +reload+ Operation
 =====
 [source,XML]
 -------
 <?xml version="1.0"?>
   <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
   <resource-agent name="drbd">
     <version>1.1</version>
     
     <longdesc lang="en">
       Master/Slave OCF Resource Agent for DRBD
     </longdesc>
     
     ...
     
     <actions>
       <action name="start"   timeout="240" />
       <action name="reload"  timeout="240" />
       <action name="promote" timeout="90" />
       <action name="demote"  timeout="90" />
       <action name="notify"  timeout="90" />
       <action name="stop"    timeout="100" />
       <action name="meta-data"    timeout="5" />
       <action name="validate-all" timeout="30" />
     </actions>
   </resource-agent>
 -------
 =====
 . Advertise one or more parameters that can take effect using +reload+.
 +
 Any parameter with the +unique+ attribute set to 0 is eligible to be used in this way.
 +
 .Parameter that can be changed using reload
 =====
 [source,XML]
 -------
 <parameter name="drbdconf" unique="0">
     <longdesc lang="en">Full path to the drbd.conf file.</longdesc>
     <shortdesc lang="en">Path to drbd.conf</shortdesc>
     <content type="string" default="${OCF_RESKEY_drbdconf_default}"/>
 </parameter>
 -------
 =====
 
 Once these requirements are satisfied, the cluster will automatically
 know to reload the resource (instead of restarting) when a non-unique
 field changes.
       
 [NOTE]
 ======
 Metadata will not be re-read unless the resource needs to be started. This may
 mean that the resource will be restarted the first time, even though you
 changed a parameter with +unique=0+.
 ======
 
 [NOTE]
 ======
 If both a unique and non-unique field are changed simultaneously, the
 resource will still be restarted.
 ======
diff --git a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt
index 9a358aed2f..339d2b7bd7 100644
--- a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt
+++ b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt
@@ -1,1455 +1,1468 @@
 :compat-mode: legacy
 = Advanced Resource Types =
 
 [[group-resources]]
 == Groups - A Syntactic Shortcut ==
 indexterm:[Group Resources]
 indexterm:[Resource,Groups]
 
 
 One of the most common elements of a cluster is a set of resources
 that need to be located together, start sequentially, and stop in the
 reverse order.  To simplify this configuration, we support the concept
 of groups.
 
 .A group of two primitive resources
 ======
 [source,XML]
 -------
 <group id="shortcut">
    <primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat">
     <instance_attributes id="params-public-ip">
        <nvpair id="public-ip-addr" name="ip" value="192.0.2.2"/>
     </instance_attributes>
    </primitive>
    <primitive id="Email" class="lsb" type="exim"/>
 </group> 
 -------
 ======
 
 
 Although the example above contains only two resources, there is no
 limit to the number of resources a group can contain.  The example is
 also sufficient to explain the fundamental properties of a group:
 
 * Resources are started in the order they appear in (+Public-IP+
   first, then +Email+)
 * Resources are stopped in the reverse of the order in which they appear
   (+Email+ first, then +Public-IP+)
 
 If a resource in the group can't run anywhere, then nothing after that
 is allowed to run either.
 
 * If +Public-IP+ can't run anywhere, neither can +Email+;
 * but if +Email+ can't run anywhere, this does not affect +Public-IP+
   in any way
 
 The group above is logically equivalent to writing:
 
 .How the cluster sees a group resource
 ======
 [source,XML]
 -------
 <configuration>
    <resources>
     <primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat">
      <instance_attributes id="params-public-ip">
         <nvpair id="public-ip-addr" name="ip" value="192.0.2.2"/>
      </instance_attributes>
     </primitive>
     <primitive id="Email" class="lsb" type="exim"/>
    </resources>
    <constraints>
       <rsc_colocation id="xxx" rsc="Email" with-rsc="Public-IP" score="INFINITY"/>
       <rsc_order id="yyy" first="Public-IP" then="Email"/>
    </constraints>
 </configuration> 
 -------
 ======
 
 Obviously as the group grows bigger, the reduced configuration effort
 can become significant.
 
 Another (typical) example of a group is a DRBD volume, the filesystem
 mount, an IP address, and an application that uses them.
 
 === Group Properties ===
 .Properties of a Group Resource
 [width="95%",cols="3m,<5",options="header",align="center"]
 |=========================================================
 
 |Field
 |Description
 
 |id
 |A unique name for the group
  indexterm:[id,Group Resource Property]
  indexterm:[Resource,Group Property,id]
 
 |=========================================================
 
 === Group Options ===
 
 Groups inherit the +priority+, +target-role+, and +is-managed+ properties
 from primitive resources. See <<s-resource-options>> for information about
 those properties.
 
 === Group Instance Attributes ===
 
 Groups have no instance attributes. However, any that are set for the group
 object will be inherited by the group's children.
 
 === Group Contents ===
 
 Groups may only contain a collection of cluster resources (see
 <<primitive-resource>>).  To refer to a child of a group resource, just use
 the child's +id+ instead of the group's.
 
 === Group Constraints ===
 
 Although it is possible to reference a group's children in
 constraints, it is usually preferable to reference the group itself.
 
 .Some constraints involving groups
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_location id="group-prefers-node1" rsc="shortcut" node="node1" score="500"/>
     <rsc_colocation id="webserver-with-group" rsc="Webserver" with-rsc="shortcut"/>
     <rsc_order id="start-group-then-webserver" first="shortcut" then="Webserver"/>
 </constraints> 
 -------
 ======
 
 === Group Stickiness ===
 indexterm:[resource-stickiness,Groups]
 
 Stickiness, the measure of how much a resource wants to stay where it
 is, is additive in groups.  Every active resource of the group will
 contribute its stickiness value to the group's total.  So if the
 default +resource-stickiness+ is 100, and a group has seven members,
 five of which are active, then the group as a whole will prefer its
 current location with a score of 500.
 
 [[s-resource-clone]]
 == Clones - Resources That Can Have Multiple Active Instances ==
 indexterm:[Clone Resources]
 indexterm:[Resource,Clones]
 
 'Clone' resources are resources that can have more than one copy active at the
 same time. This allows you, for example, to run a copy of a daemon on every
 node. You can clone any primitive or group resource.
 footnote:[
 Of course, the service must support running multiple instances.
 ]
 
 === Anonymous versus Unique Clones ===
 
 A clone resource is configured to be either 'anonymous' or 'globally unique'.
 
 Anonymous clones are the simplest. These behave completely identically
 everywhere they are running. Because of this, there can be only one instance of
 an anonymous clone active per node.
       
 The instances of globally unique clones are distinct entities. All instances
 are launched identically, but one instance of the clone is not identical to any
 other instance, whether running on the same node or a different node. As an
 example, a cloned IP address can use special kernel functionality such that
 each instance handles a subset of requests for the same IP address.
 
 [[s-resource-promotable]]
 === Promotable clones ===
 
 indexterm:[Promotable Clone Resources]
 indexterm:[Resource,Promotable]
 
 If a clone is 'promotable', its instances can perform a special role that
 Pacemaker will manage via the +promote+ and +demote+ actions of the resource
 agent.
 
 Services that support such a special role have various terms for the special
 role and the default role: primary and secondary, master and replica,
 controller and worker, etc. Pacemaker uses the terms 'master' and 'slave',
 footnote:[
 These are historical terms that will eventually be replaced, but the extensive
 use of them and the need for backward compatibility makes it a long process.
 You may see examples using a +master+ tag instead of a +clone+ tag with the
 +promotable+ meta-attribute set to +true+; the +master+ tag is supported, but
 deprecated, and will be removed in a future version. You may also see such
 services referred to as 'multi-state' or 'stateful'; these mean the same thing
 as 'promotable'.
 ]
 but is agnostic to what the service calls them or what they do.
 
 All that Pacemaker cares about is that an instance comes up in the default role
 when started, and the resource agent supports the +promote+ and +demote+ actions
 to manage entering and exiting the special role.
 
 === Clone Properties ===
 
 .Properties of a Clone Resource
 [width="95%",cols="3m,<5",options="header",align="center"]
 |=========================================================
 
 |Field
 |Description
 
 |id
 |A unique name for the clone
  indexterm:[id,Clone Property]
  indexterm:[Clone,Property,id]
 
 |=========================================================
 
 === Clone Options ===
 
 <<s-resource-options,Options>> inherited from primitive resources:
 +priority, target-role, is-managed+
 
 .Clone-specific configuration options
 [width="95%",cols="1m,1,<3",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |globally-unique
 |false
 |If +true+, each clone instance performs a distinct function
  indexterm:[globally-unique,Clone Option]
  indexterm:[Clone,Option,globally-unique]
   
 |clone-max
 |number of nodes in cluster
 |The maximum number of clone instances that can be started across the entire
  cluster
  indexterm:[clone-max,Clone Option]
  indexterm:[Clone,Option,clone-max]
 
 |clone-node-max
 |1
 |If +globally-unique+ is +true+, the maximum number of clone instances that can
  be started on a single node
  indexterm:[clone-node-max,Clone Option]
  indexterm:[Clone,Option,clone-node-max]
   
 |clone-min
 |0
 |Require at least this number of clone instances to be runnable before allowing
  resources depending on the clone to be runnable. A value of 0 means require
  all clone instances to be runnable.
  indexterm:[clone-min,Clone Option]
  indexterm:[Clone,Option,clone-min]
 
 |notify
 |false
 |Call the resource agent's +notify+ action for all active instances, before and
  after starting or stopping any clone instance. The resource agent must support
  this action. Allowed values: +false+, +true+
  indexterm:[notify,Clone Option]
  indexterm:[Clone,Option,notify]
 
 |ordered
 |false
 |If +true+, clone instances must be started sequentially instead of in parallel.
  Allowed values: +false+, +true+
  indexterm:[ordered,Clone Option]
  indexterm:[Clone,Option,ordered]
 
 |interleave
 |false
 |When this clone is ordered relative to another clone, if this option is
  +false+ (the default), the ordering is relative to 'all' instances of the
  other clone, whereas if this option is +true+, the ordering is relative only
  to instances on the same node.
  Allowed values: +false+, +true+
  indexterm:[interleave,Clone Option]
  indexterm:[Clone,Option,interleave]
 
 |promotable
 |false
 |If +true+, clone instances can perform a special role that Pacemaker will
  manage via the resource agent's +promote+ and +demote+ actions. The resource
  agent must support these actions.
  Allowed values: +false+, +true+
  indexterm:[promotable,Clone Option]
  indexterm:[Clone,Option,promotable]
 
 |promoted-max
 |1
 |If +promotable+ is +true+, the number of instances that can be promoted at one
  time across the entire cluster
  indexterm:[promoted-max,Clone Option]
  indexterm:[Clone,Option,promoted-max]
 
 |promoted-node-max
 |1
 |If +promotable+ is +true+ and +globally-unique+ is +false+, the number of
  clone instances that can be promoted at one time on a single node
  indexterm:[promoted-node-max,Clone Option]
  indexterm:[Clone,Option,promoted-node-max]
 
 |=========================================================
 
 For backward compatibility, +master-max+ and +master-node-max+ are accepted as
 aliases for +promoted-max+ and +promoted-node-max+, but are deprecated since
 2.0.0, and support for them will be removed in a future version.
 
 === Clone Contents ===
 
 Clones must contain exactly one primitive or group resource.
 
 .A clone that runs a web server on all nodes
 ====
 [source,XML]
 ----
 <clone id="apache-clone">
     <primitive id="apache" class="lsb" type="apache">
         <operations>
            <op id="apache-monitor" name="monitor" interval="30"/>
         </operations>
     </primitive>
 </clone> 
 ----
 ====
 
 [WARNING]
 You should never reference the name of a clone's child (the primitive or group
 resource being cloned). If you think you need to do this, you probably need to
 re-evaluate your design.
 
 === Clone Instance Attributes ===
 
 Clones have no instance attributes; however, any that are set here will be
 inherited by the clone's child.
 
 === Clone Constraints ===
 
 In most cases, a clone will have a single instance on each active cluster
 node.  If this is not the case, you can indicate which nodes the
 cluster should preferentially assign copies to with resource location
 constraints.  These constraints are written no differently from those
 for primitive resources except that the clone's +id+ is used.
 
 .Some constraints involving clones
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_location id="clone-prefers-node1" rsc="apache-clone" node="node1" score="500"/>
     <rsc_colocation id="stats-with-clone" rsc="apache-stats" with-rsc="apache-clone"/>
     <rsc_order id="start-clone-then-stats" first="apache-clone" then="apache-stats"/>
 </constraints> 
 -------
 ======
 
 Ordering constraints behave slightly differently for clones.  In the
 example above, +apache-stats+ will wait until all copies of +apache-clone+
 that need to be started have done so before being started itself.
 Only if _no_ copies can be started will +apache-stats+ be prevented
 from being active.  Additionally, the clone will wait for
 +apache-stats+ to be stopped before stopping itself.
 
 Colocation of a primitive or group resource with a clone means that
 the resource can run on any node with an active instance of the clone.
 The cluster will choose an instance based on where the clone is running and
 the resource's own location preferences.
 
 Colocation between clones is also possible.  If one clone +A+ is colocated
 with another clone +B+, the set of allowed locations for +A+ is limited to
 nodes on which +B+ is (or will be) active.  Placement is then performed
 normally.
 
 ==== Promotable Clone Constraints ====
 
 For promotable clone resources, the +first-action+ and/or +then-action+ fields
 for ordering constraints may be set to +promote+ or +demote+ to constrain the
 master role, and colocation constraints may contain +rsc-role+ and/or
 +with-rsc-role+ fields.
           
 .Additional colocation constraint options for promotable clone resources
 [width="95%",cols="1m,1,<3",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |rsc-role
 |Started
 |An additional attribute of colocation constraints that specifies the
  role that +rsc+ must be in.  Allowed values: +Started+, +Master+,
  +Slave+.
  indexterm:[rsc-role,Ordering Constraints]
  indexterm:[Constraints,Ordering,rsc-role]
 
 |with-rsc-role
 |Started
 |An additional attribute of colocation constraints that specifies the
  role that +with-rsc+ must be in.  Allowed values: +Started+,
  +Master+, +Slave+.
  indexterm:[with-rsc-role,Ordering Constraints]
  indexterm:[Constraints,Ordering,with-rsc-role]
 
 |=========================================================
 
 .Constraints involving promotable clone resources       
 ======
 [source,XML]
 -------
 <constraints>
    <rsc_location id="db-prefers-node1" rsc="database" node="node1" score="500"/>
    <rsc_colocation id="backup-with-db-slave" rsc="backup"
      with-rsc="database" with-rsc-role="Slave"/>
    <rsc_colocation id="myapp-with-db-master" rsc="myApp"
      with-rsc="database" with-rsc-role="Master"/>
    <rsc_order id="start-db-before-backup" first="database" then="backup"/>
    <rsc_order id="promote-db-then-app" first="database" first-action="promote"
      then="myApp" then-action="start"/>
 </constraints> 
 -------
 ======
 
 In the example above, +myApp+ will wait until one of the database
 copies has been started and promoted to master before being started
 itself on the same node.  Only if no copies can be promoted will +myApp+ be
 prevented from being active.  Additionally, the cluster will wait for
 +myApp+ to be stopped before demoting the database.
 
 Colocation of a primitive or group resource with a promotable clone
 resource means that it can run on any node with an active instance of
 the promotable clone resource that has the specified role (+master+ or
 +slave+).  In the example above, the cluster will choose a location based on
 where database is running as a +master+, and if there are multiple
 +master+ instances it will also factor in +myApp+'s own location
 preferences when deciding which location to choose.
 
 Colocation with regular clones and other promotable clone resources is also
 possible.  In such cases, the set of allowed locations for the +rsc+
 clone is (after role filtering) limited to nodes on which the
 +with-rsc+ promotable clone resource is (or will be) in the specified role.
 Placement is then performed as normal.
 
 ==== Using Promotable Clone Resources in Colocation Sets ====
 
 .Additional colocation set options relevant to promotable clone resources
 [width="95%",cols="1m,1,<6",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |role
 |Started
 |The role that 'all members' of the set must be in.  Allowed values: +Started+, +Master+,
  +Slave+.
  indexterm:[role,Ordering Constraints]
  indexterm:[Constraints,Ordering,role]
 
 |=========================================================
 
 In the following example, +B+'s master must be located on the same node as +A+'s master.
 Additionally resources +C+ and +D+ must be located on the same node as +A+'s
 and +B+'s masters.
 
 .Colocate C and D with A's and B's master instances
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_colocation id="coloc-1" score="INFINITY" >
       <resource_set id="colocated-set-example-1" sequential="true" role="Master">
         <resource_ref id="A"/>
         <resource_ref id="B"/>
       </resource_set>
       <resource_set id="colocated-set-example-2" sequential="true">
         <resource_ref id="C"/>
         <resource_ref id="D"/>
       </resource_set>
     </rsc_colocation>
 </constraints>
 -------
 ======
 
 ==== Using Promotable Clone Resources in Ordered Sets ====
 
 .Additional ordered set options relevant to promotable clone resources
 [width="95%",cols="1m,1,<3",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |action
 |value of +first-action+
 |An additional attribute of ordering constraint sets that specifies the
  action that applies to 'all members' of the set.  Allowed
  values: +start+, +stop+, +promote+, +demote+.
  indexterm:[action,Ordering Constraints]
  indexterm:[Constraints,Ordering,action]
 
 |=========================================================
 
 .Start C and D after first promoting A and B
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_order id="order-1" score="INFINITY" >
       <resource_set id="ordered-set-1" sequential="true" action="promote">
         <resource_ref id="A"/>
         <resource_ref id="B"/>
       </resource_set>
       <resource_set id="ordered-set-2" sequential="true" action="start">
         <resource_ref id="C"/>
         <resource_ref id="D"/>
       </resource_set>
     </rsc_order>
 </constraints>
 -------
 ======
 
 In the above example, +B+ cannot be promoted to a master role until +A+ has
 been promoted. Additionally, resources +C+ and +D+ must wait until +A+ and +B+
 have been promoted before they can start.
 
 
 [[s-clone-stickiness]]
 === Clone Stickiness ===
 
 indexterm:[resource-stickiness,Clones]
 
 To achieve a stable allocation pattern, clones are slightly sticky by
 default.  If no value for +resource-stickiness+ is provided, the clone
 will use a value of 1.  Being a small value, it causes minimal
 disturbance to the score calculations of other resources but is enough
 to prevent Pacemaker from needlessly moving copies around the cluster.
 
 [NOTE]
 ====
 For globally unique clones, this may result in multiple instances of the
 clone staying on a single node, even after another eligible node becomes
 active (for example, after being put into standby mode then made active again).
 If you do not want this behavior, specify a +resource-stickiness+ of 0
 for the clone temporarily and let the cluster adjust, then set it back
 to 1 if you want the default behavior to apply again.
 ====
 
+[IMPORTANT]
+====
+If +resource-stickiness+ is set in the +rsc_defaults+ section, it will
+apply to clone instances as well. This means an explicit +resource-stickiness+
+of 0 in +rsc_defaults+ works differently from the implicit default used when
++resource-stickiness+ is not specified.
+====
+
 === Clone Resource Agent Requirements ===
 
 Any resource can be used as an anonymous clone, as it requires no
 additional support from the resource agent.  Whether it makes sense to
 do so depends on your resource and its resource agent.
 
 ==== Resource Agent Requirements for Globally Unique Clones ====
 
 Globally unique clones require additional support in the resource agent. In
 particular, it must only respond with +$\{OCF_SUCCESS}+ if the node has that
 exact instance active. All other probes for instances of the clone should
 result in +$\{OCF_NOT_RUNNING}+ (or one of the other OCF error codes if
 they are failed).
 
 Individual instances of a clone are identified by appending a colon and a
 numerical offset, e.g. +apache:2+.
 
 Resource agents can find out how many copies there are by examining
 the +OCF_RESKEY_CRM_meta_clone_max+ environment variable and which
 instance it is by examining +OCF_RESKEY_CRM_meta_clone+.
 
 The resource agent must not make any assumptions (based on
 +OCF_RESKEY_CRM_meta_clone+) about which numerical instances are active.  In
 particular, the list of active copies will not always be an unbroken
 sequence, nor always start at 0.
 
 ==== Resource Agent Requirements for Promotable Clones ====
 
 Promotable clone resources require two extra actions, +demote+ and +promote+,
 which are responsible for changing the state of the resource. Like +start+ and
 +stop+, they should return +$\{OCF_SUCCESS}+ if they completed successfully or
 a relevant error code if they did not.
 
 The states can mean whatever you wish, but when the resource is
 started, it must come up in the mode called +slave+.  From there the
 cluster will decide which instances to promote to +master+.
 
 In addition to the clone requirements for monitor actions, agents must
 also _accurately_ report which state they are in.  The cluster relies
 on the agent to report its status (including role) accurately and does
 not indicate to the agent what role it currently believes it to be in.
 
 .Role implications of OCF return codes
 [width="95%",cols="1,<1",options="header",align="center"]
 |=========================================================
 
 |Monitor Return Code
 |Description
 
 |OCF_NOT_RUNNING
 |Stopped
  indexterm:[Return Code,OCF_NOT_RUNNING]
  
 |OCF_SUCCESS
 |Running (Slave)
  indexterm:[Return Code,OCF_SUCCESS]
  
 |OCF_RUNNING_MASTER
 |Running (Master)
  indexterm:[Return Code,OCF_RUNNING_MASTER]
 
 |OCF_FAILED_MASTER
 |Failed (Master)
  indexterm:[Return Code,OCF_FAILED_MASTER]
  
 |Other
 |Failed (Slave)
 
 |=========================================================
 
 ==== Clone Notifications ====
 
 If the clone has the +notify+ meta-attribute set to +true+, and the resource
 agent supports the +notify+ action, Pacemaker will call the action when
 appropriate, passing a number of extra variables which, when combined with
 additional context, can be used to calculate the current state of the cluster
 and what is about to happen to it.
 
 .Environment variables supplied with Clone notify actions
 [width="95%",cols="5,<3",options="header",align="center"]
 |=========================================================
 
 |Variable
 |Description
 
 |OCF_RESKEY_CRM_meta_notify_type
 |Allowed values: +pre+, +post+
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,type]
  indexterm:[type,Notification Environment Variable]
 
 |OCF_RESKEY_CRM_meta_notify_operation
 |Allowed values: +start+, +stop+
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,operation]
  indexterm:[operation,Notification Environment Variable]
 
 |OCF_RESKEY_CRM_meta_notify_start_resource
 |Resources to be started
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,start_resource]
  indexterm:[start_resource,Notification Environment Variable]
 
 |OCF_RESKEY_CRM_meta_notify_stop_resource
 |Resources to be stopped
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,stop_resource]
  indexterm:[stop_resource,Notification Environment Variable]
 
 |OCF_RESKEY_CRM_meta_notify_active_resource
 |Resources that are running
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,active_resource]
  indexterm:[active_resource,Notification Environment Variable]
 
 |OCF_RESKEY_CRM_meta_notify_inactive_resource
 |Resources that are not running
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,inactive_resource]
  indexterm:[inactive_resource,Notification Environment Variable]
 
 |OCF_RESKEY_CRM_meta_notify_start_uname
 |Nodes on which resources will be started
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,start_uname]
  indexterm:[start_uname,Notification Environment Variable]
 
 |OCF_RESKEY_CRM_meta_notify_stop_uname
 |Nodes on which resources will be stopped
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,stop_uname]
  indexterm:[stop_uname,Notification Environment Variable]
 
 |OCF_RESKEY_CRM_meta_notify_active_uname
 |Nodes on which resources are running
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,active_uname]
  indexterm:[active_uname,Notification Environment Variable]
 
 |=========================================================
 
 The variables come in pairs, such as
 +OCF_RESKEY_CRM_meta_notify_start_resource+ and
 +OCF_RESKEY_CRM_meta_notify_start_uname+ and should be treated as an
 array of whitespace-separated elements.
 
 +OCF_RESKEY_CRM_meta_notify_inactive_resource+ is an exception as the
 matching +uname+ variable does not exist since inactive resources
 are not running on any node.
 
 Thus in order to indicate that +clone:0+ will be started on +sles-1+,
 +clone:2+ will be started on +sles-3+, and +clone:3+ will be started
 on +sles-2+, the cluster would set:
 
 .Notification variables
 ======
 [source,Bash]
 -------
 OCF_RESKEY_CRM_meta_notify_start_resource="clone:0 clone:2 clone:3"
 OCF_RESKEY_CRM_meta_notify_start_uname="sles-1 sles-3 sles-2"
 -------
 ======
 
+[NOTE]
+====
+Pacemaker will log but otherwise ignore failures of notify actions.
+====
+
 ==== Interpretation of Notification Variables ====
 
 .Pre-notification (stop):
 
 * Active resources: +$OCF_RESKEY_CRM_meta_notify_active_resource+
 * Inactive resources: +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
 * Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
 * Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 
 
 .Post-notification (stop) / Pre-notification (start):
 
 * Active resources:
 ** +$OCF_RESKEY_CRM_meta_notify_active_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 * Inactive resources:
 ** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
 ** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ 
 * Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
 * Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 
 
 .Post-notification (start):
 
 * Active resources:
 ** +$OCF_RESKEY_CRM_meta_notify_active_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 ** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+
 * Inactive resources:
 ** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
 ** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_start_resource+
 * Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
 * Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 
 ==== Extra Notifications for Promotable Clones ====
 
 .Extra environment variables supplied for promotable clones
 [width="95%",cols="5,<3",options="header",align="center"]
 |=========================================================
 
 |Variable
 |Description
 
 |OCF_RESKEY_CRM_meta_notify_master_resource
 |Resources that are running in +Master+ mode
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,master_resource]
  indexterm:[master_resource,Notification Environment Variable]
 
 |OCF_RESKEY_CRM_meta_notify_slave_resource
 |Resources that are running in +Slave+ mode
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,slave_resource]
  indexterm:[slave_resource,Notification Environment Variable]
    
 |OCF_RESKEY_CRM_meta_notify_promote_resource
 |Resources to be promoted
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,promote_resource]
  indexterm:[promote_resource,Notification Environment Variable]
    
 |OCF_RESKEY_CRM_meta_notify_demote_resource
 |Resources to be demoted
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,demote_resource]
  indexterm:[demote_resource,Notification Environment Variable]
 
 |OCF_RESKEY_CRM_meta_notify_promote_uname
 |Nodes on which resources will be promoted
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,promote_uname]
  indexterm:[promote_uname,Notification Environment Variable]
 
 |OCF_RESKEY_CRM_meta_notify_demote_uname
 |Nodes on which resources will be demoted
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,demote_uname]
  indexterm:[demote_uname,Notification Environment Variable]
 
 |OCF_RESKEY_CRM_meta_notify_master_uname
 |Nodes on which resources are running in +Master+ mode
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,master_uname]
  indexterm:[master_uname,Notification Environment Variable]
 
 |OCF_RESKEY_CRM_meta_notify_slave_uname
 |Nodes on which resources are running in +Slave+ mode
  indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,slave_uname]
  indexterm:[slave_uname,Notification Environment Variable]
 
 |=========================================================
 
 ==== Interpretation of Promotable Notification Variables ====
 
 .Pre-notification (demote):
 
 * +Active+ resources: +$OCF_RESKEY_CRM_meta_notify_active_resource+
 * +Master+ resources: +$OCF_RESKEY_CRM_meta_notify_master_resource+
 * +Slave+ resources: +$OCF_RESKEY_CRM_meta_notify_slave_resource+
 * Inactive resources: +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
 * Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
 * Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
 * Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
 * Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 
 
 .Post-notification (demote) / Pre-notification (stop):
 
 * +Active+ resources: +$OCF_RESKEY_CRM_meta_notify_active_resource+
 * +Master+ resources:
 ** +$OCF_RESKEY_CRM_meta_notify_master_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+ 
 * +Slave+ resources: +$OCF_RESKEY_CRM_meta_notify_slave_resource+
 * Inactive resources: +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
 * Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
 * Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
 * Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
 * Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 * Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
 
 
 .Post-notification (stop) / Pre-notification (start):
 
 * +Active+ resources:
 ** +$OCF_RESKEY_CRM_meta_notify_active_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ 
 * +Master+ resources:
 ** +$OCF_RESKEY_CRM_meta_notify_master_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+ 
 * +Slave+ resources:
 ** +$OCF_RESKEY_CRM_meta_notify_slave_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ 
 * Inactive resources:
 ** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
 ** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ 
 * Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
 * Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
 * Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
 * Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 * Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
 * Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 
 
 .Post-notification (start) / Pre-notification (promote):
 
 * +Active+ resources:
 ** +$OCF_RESKEY_CRM_meta_notify_active_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 ** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+ 
 * +Master+ resources:
 ** +$OCF_RESKEY_CRM_meta_notify_master_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+ 
 * +Slave+ resources:
 ** +$OCF_RESKEY_CRM_meta_notify_slave_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 ** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+ 
 * Inactive resources:
 ** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
 ** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_start_resource+           
 * Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
 * Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
 * Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
 * Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 * Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
 * Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
 * Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 
 .Post-notification (promote):
 
 * +Active+ resources:
 ** +$OCF_RESKEY_CRM_meta_notify_active_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 ** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+ 
 * +Master+ resources:
 ** +$OCF_RESKEY_CRM_meta_notify_master_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+
 ** plus +$OCF_RESKEY_CRM_meta_notify_promote_resource+
 * +Slave+ resources:
 ** +$OCF_RESKEY_CRM_meta_notify_slave_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 ** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_promote_resource+ 
 * Inactive resources:
 ** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
 ** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 ** minus +$OCF_RESKEY_CRM_meta_notify_start_resource+ 
 * Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
 * Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
 * Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
 * Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 * Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
 * Resources that were promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
 * Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
 * Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
 
 === Monitoring Promotable Clone Resources ===
 
 The usual monitor actions are insufficient to monitor a promotable clone
 resource, because Pacemaker needs to verify not only that the resource is
 active, but also that its actual role matches its intended one.
 
 Define two monitoring actions: the usual one will cover the slave role,
 and an additional one with +role="Master"+ will cover the master role.
 
 .Monitoring both states of a promotable clone resource
 ======
 [source,XML]
 -------
 <clone id="myMasterRsc">
    <meta_attributes id="myMasterRsc-meta">
        <nvpair name="promotable" value="true"/>
    </meta_attributes>
    <primitive id="myRsc" class="ocf" type="myApp" provider="myCorp">
     <operations>
      <op id="public-ip-slave-check" name="monitor" interval="60"/>
      <op id="public-ip-master-check" name="monitor" interval="61" role="Master"/>
     </operations>
    </primitive>
 </clone> 
 -------
 ======
 
 [IMPORTANT]
 ===========
 It is crucial that _every_ monitor operation has a different interval!
 Pacemaker currently differentiates between operations
 only by resource and interval; so if (for example) a promotable clone resource
 had the same monitor interval for both roles, Pacemaker would ignore the
 role when checking the status -- which would cause unexpected return
 codes, and therefore unnecessary complications.
 ===========
 
 [[s-promotion-scores]]
 === Determining Which Instance is Promoted ===
 
 Pacemaker can choose a promotable clone instance to be promoted in one of two
 ways:
 
 * Promotion scores: These are node attributes set via the `crm_master` utility,
   which generally would be called by the resource agent's start action if it
   supports promotable clones. This tool automatically detects both the resource
   and host, and should be used to set a preference for being promoted. Based on
   this, +promoted-max+, and +promoted-node-max+, the instance(s) with the
   highest preference will be promoted.
 
 * Constraints: Location constraints can indicate which nodes are most preferred
   as masters.
 
 .Explicitly preferring node1 to be promoted to master
 ======
 [source,XML]
 -------
 <rsc_location id="master-location" rsc="myMasterRsc">
     <rule id="master-rule" score="100" role="Master">
       <expression id="master-exp" attribute="#uname" operation="eq" value="node1"/>
     </rule>
 </rsc_location> 
 -------
 ======
 
 [[s-resource-bundle]]
 == Bundles - Isolated Environments ==
 indexterm:[bundle]
 indexterm:[Resource,bundle]
 indexterm:[Docker,bundle]
 indexterm:[rkt,bundle]
 
 Pacemaker supports a special syntax for launching a
 https://en.wikipedia.org/wiki/Operating-system-level_virtualization[container]
 with any infrastructure it requires: the 'bundle'.
 
 Pacemaker bundles support https://www.docker.com/[Docker] and
 https://coreos.com/rkt/[rkt] container technologies.
 footnote:[Docker is a trademark of Docker, Inc. No endorsement by or
 association with Docker, Inc. is implied.]
 
 .A bundle for a containerized web server
 ====
 [source,XML]
 ----
 <bundle id="httpd-bundle">
    <docker image="pcmk:http" replicas="3"/>
    <network ip-range-start="192.168.122.131"
             host-netmask="24"
             host-interface="eth0">
       <port-mapping id="httpd-port" port="80"/>
    </network>
    <storage>
       <storage-mapping id="httpd-syslog"
                        source-dir="/dev/log"
                        target-dir="/dev/log"
                        options="rw"/>
       <storage-mapping id="httpd-root"
                        source-dir="/srv/html"
                        target-dir="/var/www/html"
                        options="rw"/>
       <storage-mapping id="httpd-logs"
                        source-dir-root="/var/log/pacemaker/bundles"
                        target-dir="/etc/httpd/logs"
                        options="rw"/>
    </storage>
    <primitive class="ocf" id="httpd" provider="heartbeat" type="apache"/>
 </bundle>
 ----
 ====
 
 === Bundle Properties ===
 
 .Properties of a Bundle
 [width="95%",cols="3m,<5",options="header",align="center"]
 |=========================================================
 
 |Field
 |Description
 
 |id
 |A unique name for the bundle (required)
  indexterm:[id,bundle]
  indexterm:[bundle,Property,id]
 
 |description
 |Arbitrary text (not used by Pacemaker)
  indexterm:[description,bundle]
  indexterm:[bundle,Property,description]
 
 |=========================================================
 
 A bundle must contain exactly one +<docker>+ or +<rkt>+ element.
 
 === Docker Properties ===
 
 Before configuring a Docker bundle in Pacemaker, the user must install Docker
 and supply a fully configured Docker image on every node allowed to run the
 bundle.
 
 Pacemaker will create an implicit +ocf:heartbeat:docker+ resource to manage
 a bundle's Docker container. The user must ensure that resource agent is
 installed on every node allowed to run the bundle.
 
 .Properties of a Bundle's Docker Element
 [width="95%",cols="3m,4,<5",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |image
 |
 |Docker image tag (required)
  indexterm:[image,Docker]
  indexterm:[Docker,Property,image]
 
 |replicas
 |Value of +promoted-max+ if that is positive, else 1
 |A positive integer specifying the number of container instances to launch
  indexterm:[replicas,Docker]
  indexterm:[Docker,Property,replicas]
 
 |replicas-per-host
 |1
 |A positive integer specifying the number of container instances allowed to run
  on a single node
  indexterm:[replicas-per-host,Docker]
  indexterm:[Docker,Property,replicas-per-host]
 
 |promoted-max
 |0
 |A non-negative integer that, if positive, indicates that the containerized
  service should be treated as a promotable service, with this many replicas
  allowed to run the service in the master role
  indexterm:[promoted-max,Docker]
  indexterm:[Docker,Property,promoted-max]
 
 |network
 |
 |If specified, this will be passed to +docker run+ as the
  https://docs.docker.com/engine/reference/run/#network-settings[network setting]
  for the Docker container.
  indexterm:[network,Docker]
  indexterm:[Docker,Property,network]
 
 |run-command
 |`/usr/sbin/pacemaker-remoted` if bundle contains a +primitive+, otherwise none
 |This command will be run inside the container when launching it ("PID 1"). If
  the bundle contains a +primitive+, this command 'must' start pacemaker-remoted
  (but could, for example, be a script that does other stuff, too). If the
  container image has a pre-2.0.0 version of Pacemaker, set this to
  +/usr/sbin/pacemaker_remoted+ (note the underbar instead of dash).
  indexterm:[run-command,Docker]
  indexterm:[Docker,Property,run-command]
 
 |options
 |
 |Extra command-line options to pass to `docker run`
  indexterm:[options,Docker]
  indexterm:[Docker,Property,options]
 
 |=========================================================
 
 For backward compatibility, +masters+ is accepted as an alias for
 +promoted-max+, but is deprecated since 2.0.0, and support for it will be
 removed in a future version.
 
 === rkt Properties ===
 
 Before configuring a rkt bundle in Pacemaker, the user must install rkt
 and supply a fully configured container image on every node allowed to run the
 bundle.
 
 Pacemaker will create an implicit +ocf:heartbeat:rkt+ resource to manage
 a bundle's rkt container. The user must ensure that resource agent is
 installed on every node allowed to run the bundle.
 
 .Properties of a Bundle's rkt Element
 [width="95%",cols="3m,4,<5",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |image
 |
 |Container image tag (required)
  indexterm:[image,rkt]
  indexterm:[rkt,Property,image]
 
 |replicas
 |Value of +promoted-max+ if that is positive, else 1
 |A positive integer specifying the number of container instances to launch
  indexterm:[replicas,rkt]
  indexterm:[rkt,Property,replicas]
 
 |replicas-per-host
 |1
 |A positive integer specifying the number of container instances allowed to run
  on a single node
  indexterm:[replicas-per-host,rkt]
  indexterm:[rkt,Property,replicas-per-host]
 
 |promoted-max
 |0
 |A non-negative integer that, if positive, indicates that the containerized
  service should be treated as a promotable service, with this many replicas
  allowed to run the service in the master role
  indexterm:[promoted-max,rkt]
  indexterm:[rkt,Property,promoted-max]
 
 |network
 |
 |If specified, this will be passed to +rkt run+ as the
  network setting for the rkt container.
  indexterm:[network,rkt]
  indexterm:[rkt,Property,network]
 
 |run-command
 |`/usr/sbin/pacemaker-remoted` if bundle contains a +primitive+, otherwise none
 |This command will be run inside the container when launching it ("PID 1"). If
  the bundle contains a +primitive+, this command 'must' start pacemaker-remoted
  (but could, for example, be a script that does other stuff, too). If the
  container image has a pre-2.0.0 version of Pacemaker, set this to
  +/usr/sbin/pacemaker_remoted+ (note the underbar instead of dash).
  indexterm:[run-command,rkt]
  indexterm:[rkt,Property,run-command]
 
 |options
 |
 |Extra command-line options to pass to `rkt run`
  indexterm:[options,rkt]
  indexterm:[rkt,Property,options]
 
 |=========================================================
 
 For backward compatibility, +masters+ is accepted as an alias for
 +promoted-max+, but is deprecated since 2.0.0, and support for it will be
 removed in a future version.
 
 === Bundle Network Properties ===
 
 A bundle may optionally contain one +<network>+ element.
 indexterm:[bundle,network]
 
 .Properties of a Bundle's Network Element
 [width="95%",cols="2m,1,<4",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |add-host
 |TRUE
 |If TRUE, and +ip-range-start+ is used, Pacemaker will automatically ensure
  that +/etc/hosts+ inside the containers has entries for each
  <<s-resource-bundle-note-replica-names,replica name>> and its assigned IP.
  indexterm:[add-host,network]
  indexterm:[network,Property,add-host]
 
 |ip-range-start
 |
 |If specified, Pacemaker will create an implicit +ocf:heartbeat:IPaddr2+
  resource for each container instance, starting with this IP address,
  using up to +replicas+ sequential addresses. These addresses can be used
  from the host's network to reach the service inside the container, though
  they are not visible within the container itself. Only IPv4 addresses are
  currently supported.
  indexterm:[ip-range-start,network]
  indexterm:[network,Property,ip-range-start]
 
 |host-netmask
 |32
 |If +ip-range-start+ is specified, the IP addresses are created with this
  CIDR netmask (as a number of bits).
  indexterm:[host-netmask,network]
  indexterm:[network,Property,host-netmask]
 
 |host-interface
 |
 |If +ip-range-start+ is specified, the IP addresses are created on this
  host interface (by default, it will be determined from the IP address).
  indexterm:[host-interface,network]
  indexterm:[network,Property,host-interface]
 
 |control-port
 |3121
 |If the bundle contains a +primitive+, the cluster will use this integer TCP
  port for communication with Pacemaker Remote inside the container. Changing
  this is useful when the container is unable to listen on the default port,
  for example, when the container uses the host's network rather than
  +ip-range-start+ (in which case +replicas-per-host+ must be 1), or when the
  bundle may run on a Pacemaker Remote node that is already listening on the
  default port. Any PCMK_remote_port environment variable set on the host or in
  the container is ignored for bundle connections.
  indexterm:[control-port,network]
  indexterm:[network,Property,control-port]
 
 |=========================================================
 
 [[s-resource-bundle-note-replica-names]]
 [NOTE]
 ====
 Replicas are named by the bundle id plus a dash and an integer counter starting
 with zero. For example, if a bundle named +httpd-bundle+ has +replicas=2+, its
 containers will be named +httpd-bundle-0+ and +httpd-bundle-1+.
 ====
 
 Additionally, a +<network>+ element may optionally contain one or more
 +<port-mapping>+ elements.
 indexterm:[bundle,network,port-mapping]
 
 .Properties of a Bundle's Port-Mapping Element
 [width="95%",cols="2m,1,<4",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |id
 |
 |A unique name for the port mapping (required)
  indexterm:[id,port-mapping]
  indexterm:[port-mapping,Property,id]
 
 |port
 |
 |If this is specified, connections to this TCP port number on the host network
  (on the container's assigned IP address, if +ip-range-start+ is specified)
  will be forwarded to the container network. Exactly one of +port+ or +range+
  must be specified in a +port-mapping+.
  indexterm:[port,port-mapping]
  indexterm:[port-mapping,Property,port]
 
 |internal-port
 |value of +port+
 |If +port+ and this are specified, connections to +port+ on the host's network
  will be forwarded to this port on the container network.
  indexterm:[internal-port,port-mapping]
  indexterm:[port-mapping,Property,internal-port]
 
 |range
 |
 |If this is specified, connections to these TCP port numbers (expressed as
  'first_port'-'last_port') on the host network (on the container's assigned IP
  address, if +ip-range-start+ is specified) will be forwarded to the same ports
  in the container network. Exactly one of +port+ or +range+ must be specified
  in a +port-mapping+.
  indexterm:[range,port-mapping]
  indexterm:[port-mapping,Property,range]
 
 |=========================================================
 
 [NOTE]
 ====
 If the bundle contains a +primitive+, Pacemaker will automatically map the
 +control-port+, so it is not necessary to specify that port in a
 +port-mapping+.
 ====
 
 === Bundle Storage Properties ===
 
 A bundle may optionally contain one +<storage>+ element. A +<storage>+ element
 has no properties of its own, but may contain one or more +<storage-mapping>+
 elements.
 indexterm:[bundle,storage,storage-mapping]
 
 .Properties of a Bundle's Storage-Mapping Element
 [width="95%",cols="2m,1,<4",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |id
 |
 |A unique name for the storage mapping (required)
  indexterm:[id,storage-mapping]
  indexterm:[storage-mapping,Property,id]
 
 |source-dir
 |
 |The absolute path on the host's filesystem that will be mapped into the
  container. Exactly one of +source-dir+ and +source-dir-root+ must be specified
  in a +storage-mapping+.
  indexterm:[source-dir,storage-mapping]
  indexterm:[storage-mapping,Property,source-dir]
 
 |source-dir-root
 |
 |The start of a path on the host's filesystem that will be mapped into the
  container, using a different subdirectory on the host for each container
  instance. The subdirectory will be named the same as the
  <<s-resource-bundle-note-replica-names,replica name>>.
  Exactly one of +source-dir+ and +source-dir-root+ must be specified in a
  +storage-mapping+.
  indexterm:[source-dir-root,storage-mapping]
  indexterm:[storage-mapping,Property,source-dir-root]
 
 |target-dir
 |
 |The path name within the container where the host storage will be mapped
  (required)
  indexterm:[target-dir,storage-mapping]
  indexterm:[storage-mapping,Property,target-dir]
 
 |options
 |
 |File system mount options to use when mapping the storage
  indexterm:[options,storage-mapping]
  indexterm:[storage-mapping,Property,options]
 
 |=========================================================
 
 [NOTE]
 ====
 Pacemaker does not define the behavior if the source directory does not already
 exist on the host. However, it is expected that the container technology and/or
 its resource agent will create the source directory in that case.
 ====
 
 [NOTE]
 ====
 If the bundle contains a +primitive+,
 Pacemaker will automatically map the equivalent of
 +source-dir=/etc/pacemaker/authkey target-dir=/etc/pacemaker/authkey+
 and +source-dir-root=/var/log/pacemaker/bundles target-dir=/var/log+ into the
 container, so it is not necessary to specify those paths in a
 +storage-mapping+.
 ====
 
 [IMPORTANT]
 ====
 The +PCMK_authkey_location+ environment variable must not be set to anything
 other than the default of `/etc/pacemaker/authkey` on any node in the cluster.
 ====
 
 === Bundle Primitive ===
 
 A bundle may optionally contain one +<primitive>+ resource
 (see <<s-resource-primitive>>). The primitive may have operations,
 instance attributes and meta-attributes defined, as usual.
 
 If a bundle contains a primitive resource, the container image must include
 the Pacemaker Remote daemon, and at least one of +ip-range-start+ or
 +control-port+ must be configured in the bundle. Pacemaker will create an
 implicit +ocf:pacemaker:remote+ resource for the connection, launch
 Pacemaker Remote within the container, and monitor and manage the primitive
 resource via Pacemaker Remote.
 
 If the bundle has more than one container instance (replica), the primitive
 resource will function as an implicit clone (see <<s-resource-clone>>) --
 a promotable clone if the bundle has +masters+ greater than zero
 (see <<s-resource-promotable>>).
  
 [IMPORTANT]
 ====
 Containers in bundles with a +primitive+ must have an accessible networking
 environment, so that Pacemaker on the cluster nodes can contact
 Pacemaker Remote inside the container. For example, the Docker option
 `--net=none` should not be used with a +primitive+. The default (using a
 distinct network space inside the container) works in combination with
 +ip-range-start+. If the Docker option `--net=host` is used (making the
 container share the host's network space), a unique +control-port+ should be
 specified for each bundle. Any firewall must allow access to the
 +control-port+.
 ====
 
 [[s-bundle-attributes]]
 === Bundle Node Attributes ===
 
 If the bundle has a +primitive+, the primitive's resource agent may want to set
 node attributes such as <<s-promotion-scores,promotion scores>>. However, with
 containers, it is not apparent which node should get the attribute.
 
 If the container uses shared storage that is the same no matter which node the
 container is hosted on, then it is appropriate to use the promotion score on the
 bundle node itself.
 
 On the other hand, if the container uses storage exported from the underlying host,
 then it may be more appropriate to use the promotion score on the underlying host.
 
 Since this depends on the particular situation, the
 +container-attribute-target+ resource meta-attribute allows the user to specify
 which approach to use. If it is set to +host+, then user-defined node attributes
 will be checked on the underlying host. If it is anything else, the local node
 (in this case the bundle node) is used as usual.
 
 This only applies to user-defined attributes; the cluster will always check the
 local node for cluster-defined attributes such as +#uname+.
 
 If +container-attribute-target+ is +host+, the cluster will pass additional
 environment variables to the primitive's resource agent that allow it to set
 node attributes appropriately: +CRM_meta_container_attribute_target+ (identical
 to the meta-attribute value) and +CRM_meta_physical_host+ (the name of the
 underlying host).
 
 [NOTE]
 ====
 When called by a resource agent, the attrd_updater and crm_attribute commands
 will automatically check those environment variables and set attributes
 appropriately.
 ====
 
 === Bundle Meta-Attributes ===
 
 Any meta-attribute set on a bundle will be inherited by the bundle's
 primitive and any resources implicitly created by Pacemaker for the bundle.
 
 This includes options such as +priority+, +target-role+, and +is-managed+. See
 <<s-resource-options>> for more information.
 
 === Limitations of Bundles ===
 
 Restarting Pacemaker while a bundle is unmanaged or the cluster is in
 maintenance mode may cause the bundle to fail.
 
 Bundles may not be explicitly cloned or included in groups. This includes the
 bundle's primitive and any resources implicitly created by Pacemaker for the
 bundle. (If +replicas+ is greater than 1, the bundle will behave like a clone
 implicitly.)
 
 Bundles do not have instance attributes, utilization attributes, or operations,
 though a bundle's primitive may have them.
 
 A bundle with a primitive can run on a Pacemaker Remote node only if the bundle
 uses a distinct +control-port+.
diff --git a/doc/Pacemaker_Explained/en-US/Ch-Constraints.txt b/doc/Pacemaker_Explained/en-US/Ch-Constraints.txt
index 32b959e03f..3f76d8a475 100644
--- a/doc/Pacemaker_Explained/en-US/Ch-Constraints.txt
+++ b/doc/Pacemaker_Explained/en-US/Ch-Constraints.txt
@@ -1,882 +1,887 @@
 :compat-mode: legacy
 = Resource Constraints =
 
+//// 
+We prefer [[ch-constraints]], but older versions of asciidoc don't deal well
+with that construct for chapter headings
+////
+anchor:ch-constraints[Chapter 7, Resource Constraints]
 indexterm:[Resource,Constraints]
 
 == Scores ==
 
 Scores of all kinds are integral to how the cluster works.
 Practically everything from moving a resource to deciding which
 resource to stop in a degraded cluster is achieved by manipulating
 scores in some way.
 
 Scores are calculated per resource and node. Any node with a
 negative score for a resource can't run that resource. The cluster
 places a resource on the node with the highest score for it.
 
 === Infinity Math ===
 
 Pacemaker implements +INFINITY+ (or equivalently, ++INFINITY+) internally as a
 score of 1,000,000. Addition and subtraction with it follow these three basic
 rules:
 
 * Any value + +INFINITY+ = +INFINITY+
 * Any value - +INFINITY+ = +-INFINITY+
 * +INFINITY+ - +INFINITY+ = +-INFINITY+
 
 [NOTE]
 ======
 What if you want to use a score higher than 1,000,000? Typically this possibility
 arises when someone wants to base the score on some external metric that might
 go above 1,000,000.
 
 The short answer is you can't.
 
 The long answer is it is sometimes possible to work around this limitation
 creatively. You may be able to set the score to some computed value based on
 the external metric rather than use the metric directly. For nodes, you can
 store the metric as a node attribute, and query the attribute when computing
 the score (possibly as part of a custom resource agent).
 ======
 
 == Deciding Which Nodes a Resource Can Run On ==
 
 indexterm:[Location Constraints]
 indexterm:[Resource,Constraints,Location]
 'Location constraints' tell the cluster which nodes a resource can run on.
 
 There are two alternative strategies. One way is to say that, by default,
 resources can run anywhere, and then the location constraints specify nodes
 that are not allowed (an 'opt-out' cluster). The other way is to start with
 nothing able to run anywhere, and use location constraints to selectively
 enable allowed nodes (an 'opt-in' cluster).
 
 Whether you should choose opt-in or opt-out depends on your
 personal preference and the make-up of your cluster.  If most of your
 resources can run on most of the nodes, then an opt-out arrangement is
 likely to result in a simpler configuration.  On the other hand, if
 most resources can only run on a small subset of nodes, an opt-in
 configuration might be simpler.
 
 === Location Properties ===
 
 .Properties of a rsc_location Constraint
 [width="95%",cols="2m,1,<5",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |id
 |
 |A unique name for the constraint
 indexterm:[id,Location Constraints]
 indexterm:[Constraints,Location,id]
 
 |rsc
 |
 |The name of the resource to which this constraint applies
 indexterm:[rsc,Location Constraints]
 indexterm:[Constraints,Location,rsc]
 
 |rsc-pattern
 |
 |An extended regular expression (as defined in
  http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04[POSIX])
  matching the names of resources to which this constraint
  applies, if +rsc+ is not specified; if the regular expression contains
  submatches and the constraint is governed by a rule (see <<ch-rules>>), the
  submatches can be referenced as +%0+ through +%9+ in the rule's
  +score-attribute+ or a rule expression's +attribute+
 indexterm:[rsc-pattern,Location Constraints]
 indexterm:[Constraints,Location,rsc-pattern]
 
 |node
 |
 |A node's name
 indexterm:[node,Location Constraints]
 indexterm:[Constraints,Location,node]
 
 |score
 |
 |Positive values indicate a preference for running the affected resource(s) on
  this node -- the higher the value, the stronger the preference. Negative values
  indicate the resource(s) should avoid this node (a value of +-INFINITY+
  changes "should" to "must").
 indexterm:[score,Location Constraints]
 indexterm:[Constraints,Location,score]
 
 |resource-discovery
 |always
 a|Whether Pacemaker should perform resource discovery (that is, check whether
  the resource is already running) for this resource on this node. This should
  normally be left as the default, so that rogue instances of a service can be
  stopped when they are running where they are not supposed to be. However,
  there are two situations where disabling resource discovery is a good idea:
  when a service is not installed on a node, discovery might return an error
  (properly written OCF agents will not, so this is usually only seen with other
  agent types); and when Pacemaker Remote is used to scale a cluster to hundreds
  of nodes, limiting resource discovery to allowed nodes can significantly boost
  performance.
 
 * +always:+ Always perform resource discovery for the specified resource on this node.
 * +never:+ Never perform resource discovery for the specified resource on this node.
   This option should generally be used with a -INFINITY score, although that is not strictly
   required.
 * +exclusive:+ Perform resource discovery for the specified resource only on
   this node (and other nodes similarly marked as +exclusive+). Multiple location
   constraints using +exclusive+ discovery for the same resource across
   different nodes create a subset of nodes resource-discovery is exclusive to.
   If a resource is marked for +exclusive+ discovery on one or more nodes, that
   resource is only allowed to be placed within that subset of nodes.
 
 indexterm:[Resource Discovery,Location Constraints]
 indexterm:[Constraints,Location,Resource Discovery]
 
 |=========================================================
 
 [WARNING]
 =========
 Setting resource-discovery to +never+ or +exclusive+ removes Pacemaker's
 ability to detect and stop unwanted instances of a service running
 where it's not supposed to be. It is up to the system administrator (you!)
 to make sure that the service can 'never' be active on nodes without
 resource-discovery (such as by leaving the relevant software uninstalled).
 =========
 
 === Asymmetrical "Opt-In" Clusters ===
 indexterm:[Asymmetrical Opt-In Clusters]
 indexterm:[Cluster Type,Asymmetrical Opt-In]
 
 To create an opt-in cluster, start by preventing resources from
 running anywhere by default:
 
 ----
 # crm_attribute --name symmetric-cluster --update false
 ----
 
 Then start enabling nodes.  The following fragment says that the web
 server prefers *sles-1*, the database prefers *sles-2* and both can
 fail over to *sles-3* if their most preferred node fails.
 
 .Opt-in location constraints for two resources
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_location id="loc-1" rsc="Webserver" node="sles-1" score="200"/>
     <rsc_location id="loc-2" rsc="Webserver" node="sles-3" score="0"/>
     <rsc_location id="loc-3" rsc="Database" node="sles-2" score="200"/>
     <rsc_location id="loc-4" rsc="Database" node="sles-3" score="0"/>
 </constraints>
 -------
 ======
 
 === Symmetrical "Opt-Out" Clusters ===
 indexterm:[Symmetrical Opt-Out Clusters]
 indexterm:[Cluster Type,Symmetrical Opt-Out]
 
 To create an opt-out cluster, start by allowing resources to run
 anywhere by default:
 
 ----
 # crm_attribute --name symmetric-cluster --update true
 ----
 
 Then start disabling nodes.  The following fragment is the equivalent
 of the above opt-in configuration.
 
 .Opt-out location constraints for two resources
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_location id="loc-1" rsc="Webserver" node="sles-1" score="200"/>
     <rsc_location id="loc-2-dont-run" rsc="Webserver" node="sles-2" score="-INFINITY"/>
     <rsc_location id="loc-3-dont-run" rsc="Database" node="sles-1" score="-INFINITY"/>
     <rsc_location id="loc-4" rsc="Database" node="sles-2" score="200"/>
 </constraints>
 -------
 ======
 
 [[node-score-equal]]
 === What if Two Nodes Have the Same Score ===
 
 If two nodes have the same score, then the cluster will choose one.
 This choice may seem random and may not be what was intended; however,
 the cluster was not given enough information to know any better.
 
 .Constraints where a resource prefers two nodes equally
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_location id="loc-1" rsc="Webserver" node="sles-1" score="INFINITY"/>
     <rsc_location id="loc-2" rsc="Webserver" node="sles-2" score="INFINITY"/>
     <rsc_location id="loc-3" rsc="Database" node="sles-1" score="500"/>
     <rsc_location id="loc-4" rsc="Database" node="sles-2" score="300"/>
     <rsc_location id="loc-5" rsc="Database" node="sles-2" score="200"/>
 </constraints>
 -------
 ======
 
 In the example above, assuming no other constraints and an inactive
 cluster, +Webserver+ would probably be placed on +sles-1+ and +Database+ on
 +sles-2+.  It would likely have placed +Webserver+ based on the node's
 uname and +Database+ based on the desire to spread the resource load
 evenly across the cluster.  However other factors can also be involved
 in more complex configurations.
 
 [[s-resource-ordering]]
 == Specifying the Order in which Resources Should Start/Stop ==
 
 indexterm:[Resource,Constraints,Ordering]
 indexterm:[Resource,Start Order]
 indexterm:[Ordering Constraints]
 
 'Ordering constraints' tell the cluster the order in which resources should
 start.
 
 [IMPORTANT]
 ====
 Ordering constraints affect 'only' the ordering of resources;
 they do 'not' require that the resources be placed on the
 same node. If you want resources to be started on the same node
 'and' in a specific order, you need both an ordering constraint 'and'
 a colocation constraint (see <<s-resource-colocation>>), or
 alternatively, a group (see <<group-resources>>).
 ====
 
 === Ordering Properties ===
 
 .Properties of a rsc_order Constraint
 [width="95%",cols="1m,1,<4",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |id
 |
 |A unique name for the constraint
 indexterm:[id,Ordering Constraints]
 indexterm:[Constraints,Ordering,id]
 
 |first
 |
 |Name of the resource that the +then+ resource depends on
 indexterm:[first,Ordering Constraints]
 indexterm:[Constraints,Ordering,first]
 
 |then
 |
 |Name of the dependent resource
 indexterm:[then,Ordering Constraints]
 indexterm:[Constraints,Ordering,then]
 
 |first-action
 |start
 |The action that the +first+ resource must complete before +then-action+
  can be initiated for the +then+ resource.  Allowed values: +start+,
  +stop+, +promote+, +demote+.
  indexterm:[first-action,Ordering Constraints]
  indexterm:[Constraints,Ordering,first-action]
 
 |then-action
 |value of +first-action+
 |The action that the +then+ resource can execute only after the
  +first-action+ on the +first+ resource has completed.  Allowed
  values: +start+, +stop+, +promote+, +demote+.
  indexterm:[then-action,Ordering Constraints]
  indexterm:[Constraints,Ordering,then-action]
 
 |kind
 |Mandatory
 a|How to enforce the constraint. Allowed values:
 
 * +Optional:+ Just a suggestion. Only applies if both resources are
   executing the specified actions. Any change in state by the +first+ resource
   will have no effect on the +then+ resource.
 * +Mandatory:+ Always. If +first+ does not perform +first-action+, +then+ will
   not be allowed to perform +then-action+. If +first+ is restarted, +then+
   (if running) will be stopped beforehand and started afterward.
 * +Serialize:+ Ensure that no two stop/start actions occur concurrently
   for the resources. +First+ and +then+ can start in either order,
   but one must complete starting before the other can be started. A typical use
   case is when resource start-up puts a high load on the host.
 
 indexterm:[kind,Ordering Constraints]
 indexterm:[Constraints,Ordering,kind]
 
 |symmetrical
 |TRUE for +Mandatory+ and +Optional+ kinds. FALSE for +Serialize+ kind.
 |If true, the reverse of the constraint applies for the opposite action (for
  example, if B starts after A starts, then B stops before A stops).
  +Serialize+ orders cannot be symmetrical.
 indexterm:[symmetrical,Ordering Constraints]
 indexterm:[Constraints,Ordering,symmetrical]
 
 |=========================================================
 
 +Promote+ and +demote+ apply to the master role of
 <<s-resource-promotable,promotable>> resources.
 
 === Optional and mandatory ordering ===
 
 Here is an example of ordering constraints where +Database+ 'must' start before
 +Webserver+, and +IP+ 'should' start before +Webserver+ if they both need to be
 started:
 
 .Optional and mandatory ordering constraints
 ======
 [source,XML]
 -------
 <constraints>
 <rsc_order id="order-1" first="IP" then="Webserver" kind="Optional"/>
 <rsc_order id="order-2" first="Database" then="Webserver" kind="Mandatory" />
 </constraints>
 -------
 ======
 
 Because the above example lets +symmetrical+ default to TRUE, 
 +Webserver+ must be stopped before +Database+ can be stopped,
 and +Webserver+ should be stopped before +IP+
 if they both need to be stopped.
 
 [[s-resource-colocation]]
 == Placing Resources Relative to other Resources ==
 
 indexterm:[Resource,Constraints,Colocation]
 indexterm:[Resource,Location Relative to other Resources]
 'Colocation constraints' tell the cluster that the location of one resource
 depends on the location of another one.
 
 Colocation has an important side-effect: it affects the order in which
 resources are assigned to a node. Think about it: You can't place A relative to
 B unless you know where B is.
 footnote:[
 While the human brain is sophisticated enough to read the constraint
 in any order and choose the correct one depending on the situation,
 the cluster is not quite so smart. Yet.
 ]
 
 So when you are creating colocation constraints, it is important to
 consider whether you should colocate A with B, or B with A.
 
 Another thing to keep in mind is that, assuming A is colocated with
 B, the cluster will take into account A's preferences when
 deciding which node to choose for B.
 
 For a detailed look at exactly how this occurs, see
 http://clusterlabs.org/doc/Colocation_Explained.pdf[Colocation Explained].
 
 [IMPORTANT]
 ====
 Colocation constraints affect 'only' the placement of resources; they do 'not'
 require that the resources be started in a particular order. If you want
 resources to be started on the same node 'and' in a specific order, you need
 both an ordering constraint (see <<s-resource-ordering>>) 'and' a colocation
 constraint, or alternatively, a group (see <<group-resources>>).
 ====
 
 === Colocation Properties ===
 
 .Properties of a rsc_colocation Constraint
 [width="95%",cols="1m,1,<4",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |id
 |
 |A unique name for the constraint (required).
  indexterm:[id,Colocation Constraints]
  indexterm:[Constraints,Colocation,id]
 
 |rsc
 |
 |The name of a resource that should be located relative to +with-rsc+ (required).
  indexterm:[rsc,Colocation Constraints]
  indexterm:[Constraints,Colocation,rsc]
 
 |with-rsc
 |
 |The name of the resource used as the colocation target. The cluster will
  decide where to put this resource first and then decide where to put +rsc+ (required).
  indexterm:[with-rsc,Colocation Constraints]
  indexterm:[Constraints,Colocation,with-rsc]
 
 |node-attribute
 |#uname
 |The node attribute that must be the same on the node running +rsc+ and the
  node running +with-rsc+ for the constraint to be satisfied. (For details,
  see <<s-coloc-attribute>>.)
  indexterm:[node-attribute,Colocation Constraints]
  indexterm:[Constraints,Colocation,node-attribute]
 
 |score
 |
 |Positive values indicate the resources should run on the same
  node. Negative values indicate the resources should run on
  different nodes. Values of \+/- +INFINITY+ change "should" to "must".
  indexterm:[score,Colocation Constraints]
  indexterm:[Constraints,Colocation,score]
 
 |=========================================================
 
 === Mandatory Placement ===
 
 Mandatory placement occurs when the constraint's score is
 ++INFINITY+ or +-INFINITY+.  In such cases, if the constraint can't be
 satisfied, then the +rsc+ resource is not permitted to run.  For
 +score=INFINITY+, this includes cases where the +with-rsc+ resource is
 not active.
 
 If you need resource +A+ to always run on the same machine as
 resource +B+, you would add the following constraint:
 
 .Mandatory colocation constraint for two resources
 ====
 [source,XML]
 <rsc_colocation id="colocate" rsc="A" with-rsc="B" score="INFINITY"/>
 ====
 
 Remember, because +INFINITY+ was used, if +B+ can't run on any
 of the cluster nodes (for whatever reason) then +A+ will not
 be allowed to run. Whether +A+ is running or not has no effect on +B+.
 
 Alternatively, you may want the opposite -- that +A+ 'cannot'
 run on the same machine as +B+.  In this case, use
 +score="-INFINITY"+.
 
 .Mandatory anti-colocation constraint for two resources
 ====
 [source,XML]
 <rsc_colocation id="anti-colocate" rsc="A" with-rsc="B" score="-INFINITY"/>
 ====
 
 Again, by specifying +-INFINITY+, the constraint is binding.  So if the
 only place left to run is where +B+ already is, then
 +A+ may not run anywhere.
 
 As with +INFINITY+, +B+ can run even if +A+ is stopped.
 However, in this case +A+ also can run if +B+ is stopped, because it still
 meets the constraint of +A+ and +B+ not running on the same node.
 
 === Advisory Placement ===
 
 If mandatory placement is about "must" and "must not", then advisory
 placement is the "I'd prefer if" alternative.  For constraints with
 scores greater than +-INFINITY+ and less than +INFINITY+, the cluster
 will try to accommodate your wishes but may ignore them if the
 alternative is to stop some of the cluster resources.
 
 As in life, where if enough people prefer something it effectively
 becomes mandatory, advisory colocation constraints can combine with
 other elements of the configuration to behave as if they were
 mandatory.
 
 .Advisory colocation constraint for two resources
 ====
 [source,XML]
 <rsc_colocation id="colocate-maybe" rsc="A" with-rsc="B" score="500"/>
 ====
 
 [[s-coloc-attribute]]
 === Colocation by Node Attribute ===
 
 The +node-attribute+ property of a colocation constraint allows you to express
 the requirement, "these resources must be on similar nodes".
 
 As an example, imagine that you have two Storage Area Networks (SANs) that are
 not controlled by the cluster, and each node is connected to one or the other.
 You may have two resources +r1+ and +r2+ such that +r2+ needs to use the same
 SAN as +r1+, but doesn't necessarily have to be on the same exact node.
 In such a case, you could define a <<s-node-attributes,node attribute>> named
 +san+, with the value +san1+ or +san2+ on each node as appropriate. Then, you
 could colocate +r2+ with +r1+ using +node-attribute+ set to +san+.
 
 [[s-resource-sets]]
 == Resource Sets ==
 
 'Resource sets' allow multiple resources to be affected by a single constraint.
 
 .A set of 3 resources
 ====
 [source,XML]
 ----
 <resource_set id="resource-set-example">
    <resource_ref id="A"/>
    <resource_ref id="B"/>
    <resource_ref id="C"/>
 </resource_set>
 ----
 ====
 
 Resource sets are valid inside +rsc_location+,
 +rsc_order+ (see <<s-resource-sets-ordering>>),
 +rsc_colocation+ (see <<s-resource-sets-colocation>>),
 and +rsc_ticket+ (see <<s-ticket-constraints>>) constraints.
 
 A resource set has a number of properties that can be set,
 though not all have an effect in all contexts.
 
 .Properties of a resource_set
 [width="95%",cols="2m,1,<5",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |id
 |
 |A unique name for the set
 indexterm:[id,Resource Sets]
 indexterm:[Constraints,Resource Sets,id]
 
 |sequential
 |true
 |Whether the members of the set must be acted on in order.
  Meaningful within +rsc_order+ and +rsc_colocation+.
 indexterm:[sequential,Resource Sets]
 indexterm:[Constraints,Resource Sets,sequential]
 
 |require-all
 |true
 |Whether all members of the set must be active before continuing.
  With the current implementation, the cluster may continue even if only one
  member of the set is started, but if more than one member of the set is
  starting at the same time, the cluster will still wait until all of those have
  started before continuing (this may change in future versions).
  Meaningful within +rsc_order+.
 indexterm:[require-all,Resource Sets]
 indexterm:[Constraints,Resource Sets,require-all]
 
 |role
 |
 |Limit the effect of the constraint to the specified role.
  Meaningful within +rsc_location+, +rsc_colocation+ and +rsc_ticket+.
 indexterm:[role,Resource Sets]
 indexterm:[Constraints,Resource Sets,role]
 
 |action
 |
 |Limit the effect of the constraint to the specified action.
  Meaningful within +rsc_order+.
 indexterm:[action,Resource Sets]
 indexterm:[Constraints,Resource Sets,action]
 
 |score
 |
 |'Advanced use only.' Use a specific score for this set within the constraint.
 indexterm:[score,Resource Sets]
 indexterm:[Constraints,Resource Sets,score]
 
 |=========================================================
   
 [[s-resource-sets-ordering]]
 == Ordering Sets of Resources ==
 
 A common situation is for an administrator to create a chain of
 ordered resources, such as:
 
 .A chain of ordered resources
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_order id="order-1" first="A" then="B" />
     <rsc_order id="order-2" first="B" then="C" />
     <rsc_order id="order-3" first="C" then="D" />
 </constraints>
 -------
 ======
 
 .Visual representation of the four resources' start order for the above constraints
 image::images/resource-set.png["Ordered set",width="16cm",height="2.5cm",align="center"]
 
 === Ordered Set ===
 
 To simplify this situation, resource sets (see <<s-resource-sets>>) can be used
 within ordering constraints:
 
 .A chain of ordered resources expressed as a set
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_order id="order-1">
       <resource_set id="ordered-set-example" sequential="true">
         <resource_ref id="A"/>
         <resource_ref id="B"/>
         <resource_ref id="C"/>
         <resource_ref id="D"/>
       </resource_set>
     </rsc_order>
 </constraints>
 -------
 ======
 
 While the set-based format is not less verbose, it is significantly
 easier to get right and maintain.
 
 [IMPORTANT]
 =========
 If you use a higher-level tool, pay attention to how it exposes this
 functionality. Depending on the tool, creating a set +A B+ may be equivalent to
 +A then B+, or +B then A+.
 =========
 
 === Ordering Multiple Sets ===
 
 The syntax can be expanded to allow sets of resources to be ordered relative to
 each other, where the members of each individual set may be ordered or
 unordered (controlled by the +sequential+ property). In the example below, +A+
 and +B+ can both start in parallel, as can +C+ and +D+, however +C+ and +D+ can
 only start once _both_ +A+ _and_ +B+ are active.
 
 .Ordered sets of unordered resources
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_order id="order-1">
       <resource_set id="ordered-set-1" sequential="false">
         <resource_ref id="A"/>
         <resource_ref id="B"/>
       </resource_set>
       <resource_set id="ordered-set-2" sequential="false">
         <resource_ref id="C"/>
         <resource_ref id="D"/>
       </resource_set>
     </rsc_order>
 </constraints>
 -------
 ======
 
 .Visual representation of the start order for two ordered sets of unordered resources
 image::images/two-sets.png["Two ordered sets",width="13cm",height="7.5cm",align="center"]
 
 Of course either set -- or both sets -- of resources can also be
 internally ordered (by setting +sequential="true"+) and there is no
 limit to the number of sets that can be specified.
 
 .Advanced use of set ordering - Three ordered sets, two of which are internally unordered
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_order id="order-1">
       <resource_set id="ordered-set-1" sequential="false">
         <resource_ref id="A"/>
         <resource_ref id="B"/>
       </resource_set>
       <resource_set id="ordered-set-2" sequential="true">
         <resource_ref id="C"/>
         <resource_ref id="D"/>
       </resource_set>
       <resource_set id="ordered-set-3" sequential="false">
         <resource_ref id="E"/>
         <resource_ref id="F"/>
       </resource_set>
     </rsc_order>
 </constraints>
 -------
 ======
 
 .Visual representation of the start order for the three sets defined above
 image::images/three-sets.png["Three ordered sets",width="16cm",height="7.5cm",align="center"]
 
 [IMPORTANT]
 ====
 An ordered set with +sequential=false+ makes sense only if there is another
 set in the constraint. Otherwise, the constraint has no effect.
 ====
 
 === Resource Set OR Logic ===
 
 The unordered set logic discussed so far has all been "AND" logic.
 To illustrate this, take the figure of three resource sets in the previous section.
 Those sets can be expressed as +(A and B) then \(C) then (D) then (E and F)+.
 
 Say for example we want to change the first set, +(A and B)+, to use "OR" logic
 so the sets look like this: +(A or B) then \(C) then (D) then (E and F)+.
 This functionality can be achieved through the use of the +require-all+
 option.  This option defaults to TRUE which is why the
 "AND" logic is used by default.  Setting +require-all=false+ means only one
 resource in the set needs to be started before continuing on to the next set.
 
 .Resource Set "OR" logic: Three ordered sets, where the first set is internally unordered with "OR" logic
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_order id="order-1">
       <resource_set id="ordered-set-1" sequential="false" require-all="false">
         <resource_ref id="A"/>
         <resource_ref id="B"/>
       </resource_set>
       <resource_set id="ordered-set-2" sequential="true">
         <resource_ref id="C"/>
         <resource_ref id="D"/>
       </resource_set>
       <resource_set id="ordered-set-3" sequential="false">
         <resource_ref id="E"/>
         <resource_ref id="F"/>
       </resource_set>
     </rsc_order>
 </constraints>
 -------
 ======
 
 [IMPORTANT]
 ====
 An ordered set with +require-all=false+ makes sense only in conjunction with
 +sequential=false+. Think of it like this: +sequential=false+ modifies the set
 to be an unordered set using "AND" logic by default, and adding
 +require-all=false+ flips the unordered set's "AND" logic to "OR" logic.
 ====
 
 [[s-resource-sets-colocation]]
 == Colocating Sets of Resources ==
 
 Another common situation is for an administrator to create a set of
 colocated resources.
 
 One way to do this would be to define a resource group (see
 <<group-resources>>), but that cannot always accurately express the desired
 state.
 
 Another way would be to define each relationship as an individual constraint,
 but that causes a constraint explosion as the number of resources and
 combinations grow. An example of this approach:
 
 .Chain of colocated resources
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_colocation id="coloc-1" rsc="D" with-rsc="C" score="INFINITY"/>
     <rsc_colocation id="coloc-2" rsc="C" with-rsc="B" score="INFINITY"/>
     <rsc_colocation id="coloc-3" rsc="B" with-rsc="A" score="INFINITY"/>
 </constraints>
 -------
 ======
 
 To make things easier, resource sets (see <<s-resource-sets>>) can be used
 within colocation constraints. As with the chained version, a
 resource that can't be active prevents any resource that must be
 colocated with it from being active.  For example, if +B+ is not
 able to run, then both +C+ and by inference +D+ must also remain
 stopped. Here is an example +resource_set+:
 
 .Equivalent colocation chain expressed using +resource_set+
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_colocation id="coloc-1" score="INFINITY" >
       <resource_set id="colocated-set-example" sequential="true">
         <resource_ref id="A"/>
         <resource_ref id="B"/>
         <resource_ref id="C"/>
         <resource_ref id="D"/>
       </resource_set>
     </rsc_colocation>
 </constraints>
 -------
 ======
 
 [IMPORTANT]
 =========
 If you use a higher-level tool, pay attention to how it exposes this
 functionality. Depending on the tool, creating a set +A B+ may be equivalent to
 +A with B+, or +B with A+.
 =========
 
 This notation can also be used to tell the cluster that sets of resources must
 be colocated relative to each other, where the individual members of each set
 may or may not depend on each other being active (controlled by the
 +sequential+ property).
 
 In this example, +A+, +B+, and +C+ will each be colocated with +D+.
 +D+ must be active, but any of +A+, +B+, or +C+ may be inactive without
 affecting any other resources.
 
 .Using colocated sets to specify a common peer
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_colocation id="coloc-1" score="INFINITY" >
       <resource_set id="colocated-set-1" sequential="false">
         <resource_ref id="A"/>
         <resource_ref id="B"/>
         <resource_ref id="C"/>
       </resource_set>
       <resource_set id="colocated-set-2" sequential="true">
         <resource_ref id="D"/>
       </resource_set>
     </rsc_colocation>
 </constraints>
 -------
 ======
 
 [IMPORTANT]
 ====
 A colocated set with +sequential=false+ makes sense only if there is another
 set in the constraint. Otherwise, the constraint has no effect.
 ====
 
 There is no inherent limit to the number and size of the sets used.
 The only thing that matters is that in order for any member of one set
 in the constraint to be active, all members of sets listed after it must also
 be active (and naturally on the same node); and if a set has +sequential="true"+,
 then in order for one member of that set to be active, all members listed
 before it must also be active.
 
 If desired, you can restrict the dependency to instances of promotable clone
 resources that are in a specific role, using the set's +role+ property.
 
 .Colocation chain in which the members of the middle set have no interdependencies, and the last listed set (which the cluster places first) is restricted to instances in master status.
 ======
 [source,XML]
 -------
 <constraints>
     <rsc_colocation id="coloc-1" score="INFINITY" >
       <resource_set id="colocated-set-1" sequential="true">
         <resource_ref id="B"/>
         <resource_ref id="A"/>
       </resource_set>
       <resource_set id="colocated-set-2" sequential="false">
         <resource_ref id="C"/>
         <resource_ref id="D"/>
         <resource_ref id="E"/>
       </resource_set>
       <resource_set id="colocated-set-3" sequential="true" role="Master">
         <resource_ref id="G"/>
         <resource_ref id="F"/>
       </resource_set>
     </rsc_colocation>
 </constraints>
 -------
 ======
 
 .Visual representation of the above example (resources to the left are placed first)
 image::images/three-sets-complex.png["Colocation chain",width="16cm",height="9cm",align="center"]
 
 [NOTE]
 ====
 Pay close attention to the order in which resources and sets are listed.
 While the colocation dependency for members of any one set is last-to-first,
 the colocation dependency for multiple sets is first-to-last. In the above
 example, +B+ is colocated with +A+, but +colocated-set-1+ is
 colocated with +colocated-set-2+.
 
 Unlike ordered sets, colocated sets do not use the +require-all+ option.
 ====
diff --git a/doc/Pacemaker_Explained/en-US/Ch-Multi-site-Clusters.txt b/doc/Pacemaker_Explained/en-US/Ch-Multi-site-Clusters.txt
index 1ae131fbf2..0d8f289281 100644
--- a/doc/Pacemaker_Explained/en-US/Ch-Multi-site-Clusters.txt
+++ b/doc/Pacemaker_Explained/en-US/Ch-Multi-site-Clusters.txt
@@ -1,334 +1,340 @@
 :compat-mode: legacy
 = Multi-Site Clusters and Tickets =
 
 Apart from local clusters, Pacemaker also supports multi-site clusters.
 That means you can have multiple, geographically dispersed sites, each with a
 local cluster. Failover between these clusters can be coordinated
 manually by the administrator, or automatically by a higher-level entity called
 a 'Cluster Ticket Registry (CTR)'.
 
 == Challenges for Multi-Site Clusters ==
 
 Typically, multi-site environments are too far apart to support
 synchronous communication and data replication between the sites.
 That leads to significant challenges:
 
 - How do we make sure that a cluster site is up and running?
 
 - How do we make sure that resources are only started once?
 
 - How do we make sure that quorum can be reached between the different
 sites and a split-brain scenario avoided?
 
 - How do we manage failover between sites?
 
 - How do we deal with high latency in case of resources that need to be
 stopped? 
 
 In the following sections, learn how to meet these challenges.
 
 == Conceptual Overview ==
 
 Multi-site clusters can be considered as “overlay” clusters where
 each cluster site corresponds to a cluster node in a traditional cluster.
 The overlay cluster can be managed by a CTR in order to
 guarantee that any cluster resource will be active
 on no more than one cluster site. This is achieved by using
 'tickets' that are treated as a failover domain between cluster
 sites, in case a site should be down.
 
 The following sections explain the individual components and mechanisms
 that were introduced for multi-site clusters in more detail.
 
 === Ticket ===
 
 Tickets are, essentially, cluster-wide attributes. A ticket grants the
 right to run certain resources on a specific cluster site. Resources can
 be bound to a certain ticket by +rsc_ticket+ constraints. Only if the
 ticket is available at a site can the respective resources be started there.
 Vice versa, if the ticket is revoked, the resources depending on that
 ticket must be stopped.
 
 The ticket thus is similar to a 'site quorum', i.e. the permission to
 manage/own resources associated with that site. (One can also think of the
 current +have-quorum+ flag as a special, cluster-wide ticket that is granted in
 case of node majority.)
 
 Tickets can be granted and revoked either manually by administrators
 (which could be the default for classic enterprise clusters), or via
 the automated CTR mechanism described below.
 
 A ticket can only be owned by one site at a time. Initially, none
 of the sites has a ticket. Each ticket must be granted once by the cluster
 administrator. 
 
 The presence or absence of tickets for a site is stored in the CIB as a
 cluster status. With regard to a certain ticket, there are only two states
 for a site: +true+ (the site has the ticket) or +false+ (the site does
 not have the ticket). The absence of a certain ticket (during the initial
 state of the multi-site cluster) is the same as the value +false+.
 
 === Dead Man Dependency ===
 
 A site can only activate resources safely if it can be sure that the
 other site has deactivated them. However after a ticket is revoked, it can
 take a long time until all resources depending on that ticket are stopped
 "cleanly", especially in case of cascaded resources. To cut that process
 short, the concept of a 'Dead Man Dependency' was introduced.
 
 If a dead man dependency is in force and a ticket is revoked from a site, the
 nodes that are hosting dependent resources are fenced. This considerably speeds
 up the recovery process of the cluster and makes sure that resources can be
 migrated more quickly.
 
 This can be configured by specifying a +loss-policy="fence"+ in
 +rsc_ticket+ constraints.
 
 === Cluster Ticket Registry ===
 
 A CTR is a coordinated group of network daemons that automatically handles
 granting, revoking, and timing out tickets (instead of the administrator
 revoking the ticket somewhere, waiting for everything to stop, and then
 granting it on the desired site).
 
 Pacemaker does not implement its own CTR, but interoperates with external
 software designed for that purpose (similar to how resource and fencing agents
 are not directly part of pacemaker).
 
 Participating clusters run the CTR daemons, which connect to each other, exchange
 information about their connectivity, and vote on which sites get which
 tickets.
 
 A ticket is granted to a site only once the CTR is sure that the ticket
 has been relinquished by the previous owner, implemented via a timer in most
 scenarios. If a site loses connection to its peers, its tickets time out and
 recovery occurs. After the connection timeout plus the recovery timeout has
 passed, the other sites are allowed to re-acquire the ticket and start the
 resources again.
 
 This can also be thought of as a "quorum server", except that it is not
 a single quorum ticket, but several.
 
 === Configuration Replication ===
 
 As usual, the CIB is synchronized within each cluster, but it is 'not' synchronized
 across cluster sites of a multi-site cluster. You have to configure the resources
 that will be highly available across the multi-site cluster for every site
 accordingly.
 
 
 [[s-ticket-constraints]]
 == Configuring Ticket Dependencies ==
 
 The `rsc_ticket` constraint lets you specify the resources depending on a certain
 ticket. Together with the constraint, you can set a `loss-policy` that defines
 what should happen to the respective resources if the ticket is revoked. 
 
 The attribute `loss-policy` can have the following values:
 
 * +fence:+ Fence the nodes that are running the relevant resources.
 
 * +stop:+ Stop the relevant resources.
 
 * +freeze:+ Do nothing to the relevant resources.
 
 * +demote:+ Demote relevant resources that are running in master mode to slave mode. 
 
 
 .Constraint that fences node if +ticketA+ is revoked
 ====
 [source,XML]
 -------
 <rsc_ticket id="rsc1-req-ticketA" rsc="rsc1" ticket="ticketA" loss-policy="fence"/>
 -------
 ====
 
 The example above creates a constraint with the ID +rsc1-req-ticketA+. It
 defines that the resource +rsc1+ depends on +ticketA+ and that the node running
 the resource should be fenced if +ticketA+ is revoked.
 
 If resource +rsc1+ were a promotable resource (i.e. it could run in master or
 slave mode), you might want to configure that only master mode
 depends on +ticketA+. With the following configuration, +rsc1+ will be
 demoted to slave mode if +ticketA+ is revoked:
 
 .Constraint that demotes +rsc1+ if +ticketA+ is revoked
 ====
 [source,XML]
 -------
 <rsc_ticket id="rsc1-req-ticketA" rsc="rsc1" rsc-role="Master" ticket="ticketA" loss-policy="demote"/>
 -------
 ====
 
 You can create multiple `rsc_ticket` constraints to let multiple resources
 depend on the same ticket. However, `rsc_ticket` also supports resource sets
 (see <<s-resource-sets>>),
 so one can easily list all the resources in one `rsc_ticket` constraint instead.
 
 .Ticket constraint for multiple resources
 ====
 [source,XML]
 -------
 <rsc_ticket id="resources-dep-ticketA" ticket="ticketA" loss-policy="fence">
   <resource_set id="resources-dep-ticketA-0" role="Started">
     <resource_ref id="rsc1"/>
     <resource_ref id="group1"/>
     <resource_ref id="clone1"/>
   </resource_set>
   <resource_set id="resources-dep-ticketA-1" role="Master">
     <resource_ref id="ms1"/>
   </resource_set>
 </rsc_ticket>
 -------
 ====
 
 In the example above, there are two resource sets, so we can list resources
 with different roles in a single +rsc_ticket+ constraint. There's no dependency
 between the two resource sets, and there's no dependency among the
 resources within a resource set. Each of the resources just depends on
 +ticketA+.
 
 Referencing resource templates in +rsc_ticket+ constraints, and even
 referencing them within resource sets, is also supported. 
 
 If you want other resources to depend on further tickets, create as many
 constraints as necessary with +rsc_ticket+.
 
 
 == Managing Multi-Site Clusters ==
 
 === Granting and Revoking Tickets Manually ===
 
 You can grant tickets to sites or revoke them from sites manually.
 If you want to re-distribute a ticket, you should wait for
 the dependent resources to stop cleanly at the previous site before you
 grant the ticket to the new site.
 
 Use the `crm_ticket` command line tool to grant and revoke tickets. 
 
+////
+These commands will actually just print a message telling the user that they
+require '--force'. That is probably a good exercise rather than letting novice
+users cut and paste '--force' here.
+////
+
 To grant a ticket to this site:
 -------
 # crm_ticket --ticket ticketA --grant
 -------
 
 To revoke a ticket from this site:
 -------
 # crm_ticket --ticket ticketA --revoke
 -------
 
 [IMPORTANT]
 ====
 If you are managing tickets manually, use the `crm_ticket` command with
 great care, because it cannot check whether the same ticket is already
 granted elsewhere. 
 ====
 
 
 === Granting and Revoking Tickets via a Cluster Ticket Registry ===
 
 We will use https://github.com/ClusterLabs/booth[Booth] here as an example of
 software that can be used with pacemaker as a Cluster Ticket Registry.  Booth
 implements the
 http://en.wikipedia.org/wiki/Raft_%28computer_science%29[Raft]
 algorithm to guarantee the distributed consensus among different
 cluster sites, and manages the ticket distribution (and thus the failover
 process between sites).
 
 Each of the participating clusters and 'arbitrators' runs the Booth daemon
 `boothd`.
 
 An 'arbitrator' is the multi-site equivalent of a quorum-only node in a local
 cluster. If you have a setup with an even number of sites,
 you need an additional instance to reach consensus about decisions such
 as failover of resources across sites. In this case, add one or more
 arbitrators running at additional sites. Arbitrators are single machines
 that run a booth instance in a special mode. An arbitrator is especially
 important for a two-site scenario, otherwise there is no way for one site
 to distinguish between a network failure between it and the other site, and
 a failure of the other site.
 
 The most common multi-site scenario is probably a multi-site cluster with two
 sites and a single arbitrator on a third site. However, technically, there are
 no limitations with regard to the number of sites and the number of
 arbitrators involved.
 
 `Boothd` at each site connects to its peers running at the other sites and
 exchanges connectivity details. Once a ticket is granted to a site, the
 booth mechanism will manage the ticket automatically: If the site which
 holds the ticket is out of service, the booth daemons will vote which
 of the other sites will get the ticket. To protect against brief
 connection failures, sites that lose the vote (either explicitly or
 implicitly by being disconnected from the voting body) need to
 relinquish the ticket after a time-out. This ensures that a
 ticket will only be re-distributed after it has been relinquished by the
 previous site.  The resources that depend on that ticket will fail over
 to the new site holding the ticket. The nodes that have run the 
 resources before will be treated according to the `loss-policy` you set
 within the `rsc_ticket` constraint.
 
 Before Booth can manage a certain ticket within the multi-site cluster,
 you initially need to grant it to a site manually via the `booth` command-line
 tool. After you have initially granted a ticket to a site, `boothd`
 will take over and manage the ticket automatically.  
 
 [IMPORTANT]
 ====
 The `booth` command-line tool can be used to grant, list, or
 revoke tickets and can be run on any machine where `boothd` is running. 
 If you are managing tickets via Booth, use only `booth` for manual
 intervention, not `crm_ticket`. That ensures the same ticket
 will only be owned by one cluster site at a time.
 ====
 
 ==== Booth Requirements ====
 
 * All clusters that will be part of the multi-site cluster must be based on
   Pacemaker.
 
 * Booth must be installed on all cluster nodes and on all arbitrators that will
   be part of the multi-site cluster. 
 
 * Nodes belonging to the same cluster site should be synchronized via NTP. However,
   time synchronization is not required between the individual cluster sites.
 
 === General Management of Tickets ===
 
 Display the information of tickets:
 -------
 # crm_ticket --info
 -------
 
 Or you can monitor them with:
 -------
 # crm_mon --tickets
 -------
 
 Display the +rsc_ticket+ constraints that apply to a ticket:
 -------
 # crm_ticket --ticket ticketA --constraints
 -------
 
 When you want to do maintenance or manual switch-over of a ticket,
 revoking the ticket would trigger the loss policies. If
 +loss-policy="fence"+, the dependent resources could not be gracefully
 stopped/demoted, and other unrelated resources could even be affected. 
 
 The proper way is making the ticket 'standby' first with:
 -------
 # crm_ticket --ticket ticketA --standby
 -------
 
 Then the dependent resources will be stopped or demoted gracefully without
 triggering the loss policies.
 
 If you have finished the maintenance and want to activate the ticket again,
 you can run:
 -------
 # crm_ticket --ticket ticketA --activate
 -------
 
 == For more information ==
 
 * https://www.suse.com/documentation/sle-ha-geo-12/art_ha_geo_quick/data/art_ha_geo_quick.html[SUSE's Geo Clustering quick start]
 
 * https://github.com/ClusterLabs/booth[Booth]
diff --git a/doc/Pacemaker_Explained/en-US/Ch-Resources.txt b/doc/Pacemaker_Explained/en-US/Ch-Resources.txt
index 61710b64b9..4fa69cd5b1 100644
--- a/doc/Pacemaker_Explained/en-US/Ch-Resources.txt
+++ b/doc/Pacemaker_Explained/en-US/Ch-Resources.txt
@@ -1,887 +1,897 @@
 :compat-mode: legacy
 = Cluster Resources =
 
 [[s-resource-primitive]]
 == What is a Cluster Resource? ==
 
 indexterm:[Resource]
 
 A resource is a service made highly available by a cluster.
 The simplest type of resource, a 'primitive' resource, is described
 in this chapter. More complex forms, such as groups and clones,
 are described in later chapters.
 
 Every primitive resource has a 'resource agent'. A resource agent is an
 external program that abstracts the service it provides and presents a
 consistent view to the cluster.
 
 This allows the cluster to be agnostic about the resources it manages.
 The cluster doesn't need to understand how the resource works because
 it relies on the resource agent to do the right thing when given a
 `start`, `stop` or `monitor` command. For this reason, it is crucial that
 resource agents are well-tested.
 
 Typically, resource agents come in the form of shell scripts. However,
 they can be written using any technology (such as C, Python or Perl)
 that the author is comfortable with.
 
 [[s-resource-supported]]
 == Resource Classes ==
 
 indexterm:[Resource,class]
 
 Pacemaker supports several classes of agents:
 
 * OCF
 * LSB
 * Upstart
 * Systemd
 * Service
 * Fencing
 * Nagios Plugins
 
 === Open Cluster Framework ===
 
 indexterm:[Resource,OCF]
 indexterm:[OCF,Resources]
 indexterm:[Open Cluster Framework,Resources]
 
 The OCF standard
 footnote:[See
 http://www.opencf.org/cgi-bin/viewcvs.cgi/specs/ra/resource-agent-api.txt?rev=HEAD
  -- at least as it relates to resource agents.  The Pacemaker implementation has
 been somewhat extended from the OCF specs, but none of those changes are
 incompatible with the original OCF specification.]
 is basically an extension of the Linux Standard Base conventions for
 init scripts to:
 
 * support parameters,
 * make them self-describing, and
 * make them extensible
 
 OCF specs have strict definitions of the exit codes that actions must return.
 footnote:[
 The resource-agents source code includes the `ocf-tester` script, which
 can be useful in this regard.
 ]
 
 The cluster follows these specifications exactly, and giving the wrong
 exit code will cause the cluster to behave in ways you will likely
 find puzzling and annoying.  In particular, the cluster needs to
 distinguish a completely stopped resource from one which is in some
 erroneous and indeterminate state.
 
 Parameters are passed to the resource agent as environment variables, with the
 special prefix +OCF_RESKEY_+.  So, a parameter which the user thinks
 of as +ip+ will be passed to the resource agent as +OCF_RESKEY_ip+.  The
 number and purpose of the parameters is left to the resource agent; however,
 the resource agent should use the `meta-data` command to advertise any that it
 supports.
 
 The OCF class is the most preferred as it is an industry standard,
 highly flexible (allowing parameters to be passed to agents in a
 non-positional manner) and self-describing.
 
 For more information, see the
 http://www.linux-ha.org/wiki/OCF_Resource_Agents[reference] and
 the 'Resource Agents' chapter of 'Pacemaker Administration'.
 
 === Linux Standard Base ===
 indexterm:[Resource,LSB]
 indexterm:[LSB,Resources]
 indexterm:[Linux Standard Base,Resources]
 
-'LSB' resource agents are rather known as 'init scripts' (service startup
-scripts), located in +/etc/init.d+.
+'LSB' resource agents are more commonly known as 'init scripts'. If a full path
+is not given, they are assumed to be located in +/etc/init.d+.
 
-Commonly, they are provided by the OS distribution and, in order to be used
-with the cluster, they must conform to the LSB Spec.
+Commonly, they are provided by the OS distribution. In order to be used
+with a Pacemaker cluster, they must conform to the LSB specification.
 footnote:[
 See
 http://refspecs.linux-foundation.org/LSB_3.0.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
 for the LSB Spec as it relates to init scripts.
 ]
 
 [WARNING]
 ====
 Many distributions or particular software packages claim LSB compliance
 but ship with broken init scripts.  For details on how to check whether
 your init script is LSB-compatible, see the 'Resource Agents' chapter of
 'Pacemaker Administration'. Common problematic violations of the LSB
 standard include:
 
 * Not implementing the +status+ operation at all
 * Not observing the correct exit status codes for
   +start+/+stop+/+status+ actions
 * Starting a started resource returns an error
 * Stopping a stopped resource returns an error
-
-Since the LSB standard is pragmatic enough so as _not_ to elaborate
-on clean and reliable (busy-waiting-free) service dependency chains beyond
-symbolic system facilities names to order against (one of the strongest
-guarantees set forth is with _syslog_ in particular, denoting that,
-when satisfied, it's actually _operational_ -- something not demanded
-universally with the standard) and because explicit dependency-based
-ordering is crucial for stacked HA applications, additionally this
-imminent setback, possibly rooted deeper in the lack of synchronization
-after initial forking in daemons themselves (something that currently
-spoils also Pacemaker's own user-facing ones) and hence nothing init
-scripts alone could be blamed for, stands out:
-
-* Insufficient causality discreetness on either service start-up (for
-  the dependency chains, it's rather essential the service is also
-  _operational_, with the minimal viable interpretation being that
-  subsequent +status+ returns success but preferably in the strict
-  sense, once the respective init script invocation finishes with
-  success) or shutdown (ditto with no child processes left behind)
-footnote:[
-There's an inherent difference between _started_ and _ready_ state
-of the service at hand, see discussion at
-https://jdebp.eu/FGA/unix-daemon-readiness-protocol-problems.html
-also showing how suitably prepared <<s-resource-supported-systemd,systemd
-resources>> may possibly improve on this through a native arrangement scheme.
-]
 ====
 
 [IMPORTANT]
 ====
 Remember to make sure the computer is _not_ configured to start any
 services at boot time -- that should be controlled by the cluster.
 ====
 
 [[s-resource-supported-systemd]]
 === Systemd ===
 indexterm:[Resource,Systemd]
 indexterm:[Systemd,Resources]
 
 Some newer distributions have replaced the old
 http://en.wikipedia.org/wiki/Init#SysV-style["SysV"] style of
 initialization daemons and scripts with an alternative called
 http://www.freedesktop.org/wiki/Software/systemd[Systemd].
 
 Pacemaker is able to manage these services _if they are present_.
 
 Instead of init scripts, systemd has 'unit files'.  Generally, the
 services (unit files) are provided by the OS distribution, but there
 are online guides for converting from init scripts.
 footnote:[For example,
 http://0pointer.de/blog/projects/systemd-for-admins-3.html]
 
 [IMPORTANT]
 ====
 Remember to make sure the computer is _not_ configured to start any
 services at boot time -- that should be controlled by the cluster.
 ====
 
 === Upstart ===
 indexterm:[Resource,Upstart]
 indexterm:[Upstart,Resources]
 
 Some newer distributions have replaced the old
 http://en.wikipedia.org/wiki/Init#SysV-style["SysV"] style of
 initialization daemons (and scripts) with an alternative called
 http://upstart.ubuntu.com/[Upstart].
 
 Pacemaker is able to manage these services _if they are present_.
 
 Instead of init scripts, upstart has 'jobs'.  Generally, the
 services (jobs) are provided by the OS distribution.
 
 [IMPORTANT]
 ====
 Remember to make sure the computer is _not_ configured to start any
 services at boot time -- that should be controlled by the cluster.
 ====
 
 === System Services ===
 indexterm:[Resource,System Services]
 indexterm:[System Service,Resources]
 
 Since there are various types of system services (+systemd+,
 +upstart+, and +lsb+), Pacemaker supports a special +service+ alias which
 intelligently figures out which one applies to a given cluster node.
 
 This is particularly useful when the cluster contains a mix of
 +systemd+, +upstart+, and +lsb+.
 
 In order, Pacemaker will try to find the named service as:
 
 . an LSB init script
 . a Systemd unit file
 . an Upstart job
 
 === STONITH ===
 indexterm:[Resource,STONITH]
 indexterm:[STONITH,Resources]
 
 The STONITH class is used exclusively for fencing-related resources.  This is
 discussed later in <<ch-stonith>>.
 
 === Nagios Plugins ===
 indexterm:[Resource,Nagios Plugins]
 indexterm:[Nagios Plugins,Resources]
 
 Nagios Plugins
 footnote:[The project has two independent forks, hosted at
 https://www.nagios-plugins.org/ and https://www.monitoring-plugins.org/. Output
 from both projects' plugins is similar, so plugins from either project can be
 used with pacemaker.]
 allow us to monitor services on remote hosts.
 
 Pacemaker is able to do remote monitoring with the plugins _if they are
 present_.
 
 A common use case is to configure them as resources belonging to a resource
 container (usually a virtual machine), and the container will be restarted
 if any of them has failed. Another use is to configure them as ordinary
 resources to be used for monitoring hosts or services via the network.
 
 The supported parameters are the same as the long options of the plugin.
 
 [[primitive-resource]]
 == Resource Properties ==
 
 These values tell the cluster which resource agent to use for the resource,
 where to find that resource agent and what standards it conforms to.
 
 .Properties of a Primitive Resource
 [width="95%",cols="1m,<6",options="header",align="center"]
 |=========================================================
 
 |Field
 |Description
 
 |id
 |Your name for the resource
  indexterm:[id,Resource]
  indexterm:[Resource,Property,id]
 
 |class
 
 |The standard the resource agent conforms to. Allowed values:
 +lsb+, +nagios+, +ocf+, +service+, +stonith+, +systemd+, +upstart+
  indexterm:[class,Resource]
  indexterm:[Resource,Property,class]
 
 |type
 |The name of the Resource Agent you wish to use. E.g. +IPaddr+ or +Filesystem+
  indexterm:[type,Resource]
  indexterm:[Resource,Property,type]
 
 |provider
 |The OCF spec allows multiple vendors to supply the same
  resource agent. To use the OCF resource agents supplied by
  the Heartbeat project, you would specify +heartbeat+ here.
  indexterm:[provider,Resource]
  indexterm:[Resource,Property,provider]
 
 |=========================================================
 
 The XML definition of a resource can be queried with the `crm_resource` tool.
 For example:
 
 ----
 # crm_resource --resource Email --query-xml
 ----
 
 might produce:
 
 .A system resource definition
 =====
 [source,XML]
 <primitive id="Email" class="service" type="exim"/>
 =====
 
 [NOTE]
 =====
 One of the main drawbacks of system service resources (LSB, systemd or
 Upstart) is that they do not allow any parameters!
 =====
 
 ////
 See https://tools.ietf.org/html/rfc5737 for choice of example IP address
 ////
 
 .An OCF resource definition
 =====
 [source,XML]
 -------
 <primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat">
    <instance_attributes id="Public-IP-params">
       <nvpair id="Public-IP-ip" name="ip" value="192.0.2.2"/>
    </instance_attributes>
 </primitive>
 -------
 =====
 
 [[s-resource-options]]
 == Resource Options ==
 
 Resources have two types of options: 'meta-attributes' and 'instance attributes'.
 Meta-attributes apply to any type of resource, while instance attributes
 are specific to each resource agent.
 
 === Resource Meta-Attributes ===
 
 Meta-attributes are used by the cluster to decide how a resource should
 behave and can be easily set using the `--meta` option of the
 `crm_resource` command.
 
 .Meta-attributes of a Primitive Resource
 [width="95%",cols="2m,2,<5",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |priority
 |0
 |If not all resources can be active, the cluster will stop lower
 priority resources in order to keep higher priority ones active.
 indexterm:[priority,Resource Option]
 indexterm:[Resource,Option,priority]
 
 |target-role
 |Started
 a|What state should the cluster attempt to keep this resource in? Allowed values:
 
 * +Stopped:+ Force the resource to be stopped
 * +Started:+ Allow the resource to be started (and in the case of
   <<s-resource-promotable,promotable clone resources>>, promoted to master if
   appropriate)
 * +Slave:+ Allow the resource to be started, but only in Slave mode if
   the resource is <<s-resource-promotable,promotable>>
 * +Master:+ Equivalent to +Started+
 indexterm:[target-role,Resource Option]
 indexterm:[Resource,Option,target-role]
 
 |is-managed
 |TRUE
 |Is the cluster allowed to start and stop the resource?  Allowed
  values: +true+, +false+
  indexterm:[is-managed,Resource Option]
  indexterm:[Resource,Option,is-managed]
 
 |resource-stickiness
-|value of +resource-stickiness+ in the +rsc_defaults+ section
-|How much does the resource prefer to stay where it is?
+|1 for individual clone instances, 0 for all other resources
+|A score that will be added to the current node when a resource is already
+ active. This allows running resources to stay where they are, even if
+ they would be placed elsewhere if they were being started from a stopped
+ state.
  indexterm:[resource-stickiness,Resource Option]
  indexterm:[Resource,Option,resource-stickiness]
 
 |requires
 |+quorum+ for resources with a +class+ of +stonith+,
  otherwise +unfencing+ if unfencing is active in the cluster,
  otherwise +fencing+ if +stonith-enabled+ is true, otherwise +quorum+
 a|Conditions under which the resource can be started.
 Allowed values:
 
 * +nothing:+ can always be started
 * +quorum:+ The cluster can only start this resource if a majority of
   the configured nodes are active
 * +fencing:+ The cluster can only start this resource if a majority
   of the configured nodes are active _and_ any failed or unknown nodes
   have been <<ch-stonith,fenced>>
 * +unfencing:+
   The cluster can only start this resource if a majority
   of the configured nodes are active _and_ any failed or unknown nodes
   have been fenced _and_ only on nodes that have been
   <<s-unfencing,unfenced>>
 
 indexterm:[requires,Resource Option]
 indexterm:[Resource,Option,requires]
 
 |migration-threshold
 |INFINITY
 |How many failures may occur for this resource on a node, before this
  node is marked ineligible to host this resource. A value of 0 indicates that
  this feature is disabled (the node will never be marked ineligible); by
  contrast, the cluster treats INFINITY (the default) as a very large but
  finite number. This option has an effect only if the failed operation
  specifies +on-fail+ as +restart+ (the default), and additionally for
  failed +start+ operations, if the cluster property +start-failure-is-fatal+
  is +false+.
  indexterm:[migration-threshold,Resource Option]
  indexterm:[Resource,Option,migration-threshold]
 
 |failure-timeout
 |0
 |How many seconds to wait before acting as if the failure had not
  occurred, and potentially allowing the resource back to the node on
  which it failed. A value of 0 indicates that this feature is disabled.
  As with any time-based actions, this is not guaranteed to be checked more
  frequently than the value of +cluster-recheck-interval+ (see
  <<s-cluster-options>>).
  indexterm:[failure-timeout,Resource Option]
  indexterm:[Resource,Option,failure-timeout]
 
 |multiple-active
 |stop_start
 a|What should the cluster do if it ever finds the resource active on
  more than one node? Allowed values:
 
 * +block:+ mark the resource as unmanaged
 * +stop_only:+ stop all active instances and leave them that way
 * +stop_start:+ stop all active instances and start the resource in
   one location only
 
 indexterm:[multiple-active,Resource Option]
 indexterm:[Resource,Option,multiple-active]
 
 |allow-migrate
 |TRUE for ocf:pacemaker:remote resources, FALSE otherwise
 |Whether the cluster should try to "live migrate" this resource when it needs
 to be moved (see <<s-migrating-resources>>)
 
 |container-attribute-target
 |
 |Specific to bundle resources; see <<s-bundle-attributes>>
 
 |remote-node
 |
 |The name of the Pacemaker Remote guest node this resource is associated with,
  if any. If specified, this both enables the resource as a guest node and
  defines the unique name used to identify the guest node. The guest must be
  configured to run the Pacemaker Remote daemon when it is started. +WARNING:+
  This value cannot overlap with any resource or node IDs.
 
 |remote-port
 |3121
 |If +remote-node+ is specified, the port on the guest used for its
  Pacemaker Remote connection. The Pacemaker Remote daemon on the guest must be
  configured to listen on this port.
 
 |remote-addr
 |value of +remote-node+
 |If +remote-node+ is specified, the IP address or hostname used to connect to
  the guest via Pacemaker Remote. The Pacemaker Remote daemon on the guest
  must be configured to accept connections on this address.
 
 |remote-connect-timeout
 |60s
 |If +remote-node+ is specified, how long before a pending guest connection will
  time out.
 
 |=========================================================
 
 As an example of setting resource options, if you performed the following
 commands on an LSB Email resource:
 
 -------
 # crm_resource --meta --resource Email --set-parameter priority --parameter-value 100
 # crm_resource -m -r Email -p multiple-active -v block
 -------
 
 the resulting resource definition might be:
 
 .An LSB resource with cluster options
 =====
 [source,XML]
 -------
 <primitive id="Email" class="lsb" type="exim">
   <meta_attributes id="Email-meta_attributes">
     <nvpair id="Email-meta_attributes-priority" name="priority" value="100"/>
     <nvpair id="Email-meta_attributes-multiple-active" name="multiple-active" value="block"/>
   </meta_attributes>
 </primitive>
 -------
 =====
 
+In addition to the cluster-defined meta-attributes described above, you may
+also configure arbitrary meta-attributes of your own choosing. Most commonly,
+this would be done for use in <<ch-rules,rules>>. For example, an IT department
+might define a custom meta-attribute to indicate which company department each
+resource is intended for. To reduce the chance of name collisions with
+cluster-defined meta-attributes added in the future, it is recommended to use
+a unique, organization-specific prefix for such attributes.
+
 [[s-resource-defaults]]
 === Setting Global Defaults for Resource Meta-Attributes ===
 
 To set a default value for a resource option, add it to the
 +rsc_defaults+ section with `crm_attribute`. For example,
 
 ----
 # crm_attribute --type rsc_defaults --name is-managed --update false
 ----
 
 would prevent the cluster from starting or stopping any of the
 resources in the configuration (unless of course the individual
 resources were specifically enabled by having their +is-managed+ set to
 +true+).
 
 === Resource Instance Attributes ===
 
 The resource agents of some resource classes (lsb, systemd and upstart 'not' among them)
 can be given parameters which determine how they behave and which instance
 of a service they control.
 
 If your resource agent supports parameters, you can add them with the
 `crm_resource` command. For example,
 
 ----
 # crm_resource --resource Public-IP --set-parameter ip --parameter-value 192.0.2.2
 ----
 
 would create an entry in the resource like this:
 
 .An example OCF resource with instance attributes
 =====
 [source,XML]
 -------
 <primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat">
    <instance_attributes id="params-public-ip">
       <nvpair id="public-ip-addr" name="ip" value="192.0.2.2"/>
    </instance_attributes>
 </primitive>
 -------
 =====
 
 For an OCF resource, the result would be an environment variable
 called +OCF_RESKEY_ip+ with a value of +192.0.2.2+.
 
 The list of instance attributes supported by an OCF resource agent can be
 found by calling the resource agent with the `meta-data` command.
 The output contains an XML description of all the supported
 attributes, their purpose and default values.
 
 .Displaying the metadata for the Dummy resource agent template
 =====
 ----
 # export OCF_ROOT=/usr/lib/ocf
 # $OCF_ROOT/resource.d/pacemaker/Dummy meta-data
 ----
 [source,XML]
 -------
 <?xml version="1.0"?>
 <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
 <resource-agent name="Dummy" version="1.0">
 <version>1.0</version>
 
 <longdesc lang="en">
 This is a Dummy Resource Agent. It does absolutely nothing except 
 keep track of whether its running or not.
 Its purpose in life is for testing and to serve as a template for RA writers.
 
 NB: Please pay attention to the timeouts specified in the actions
 section below. They should be meaningful for the kind of resource
 the agent manages. They should be the minimum advised timeouts,
 but they shouldn't/cannot cover _all_ possible resource
 instances. So, try to be neither overly generous nor too stingy,
 but moderate. The minimum timeouts should never be below 10 seconds.
 </longdesc>
 <shortdesc lang="en">Example stateless resource agent</shortdesc>
 
 <parameters>
 <parameter name="state" unique="1">
 <longdesc lang="en">
 Location to store the resource state in.
 </longdesc>
 <shortdesc lang="en">State file</shortdesc>
 <content type="string" default="/var/run/Dummy-default.state" />
 </parameter>
 
 <parameter name="fake" unique="0">
 <longdesc lang="en">
 Fake attribute that can be changed to cause a reload
 </longdesc>
 <shortdesc lang="en">Fake attribute that can be changed to cause a reload</shortdesc>
 <content type="string" default="dummy" />
 </parameter>
 
 <parameter name="op_sleep" unique="1">
 <longdesc lang="en">
 Number of seconds to sleep during operations.  This can be used to test how
 the cluster reacts to operation timeouts.
 </longdesc>
 <shortdesc lang="en">Operation sleep duration in seconds.</shortdesc>
 <content type="string" default="0" />
 </parameter>
 
 </parameters>
 
 <actions>
 <action name="start"        timeout="20" />
 <action name="stop"         timeout="20" />
 <action name="monitor"      timeout="20" interval="10" depth="0"/>
 <action name="reload"       timeout="20" />
 <action name="migrate_to"   timeout="20" />
 <action name="migrate_from" timeout="20" />
 <action name="validate-all" timeout="20" />
 <action name="meta-data"    timeout="5" />
 </actions>
 </resource-agent>
 -------
 =====
 
 == Resource Operations ==
 
 indexterm:[Resource,Action]
 
 'Operations' are actions the cluster can perform on a resource by calling the
 resource agent. Resource agents must support certain common operations such as
-start, stop and monitor, and may implement any others.
+start, stop, and monitor, and may implement any others.
 
-Some operations are generated by the cluster itself, for example, stopping and
-starting resources as needed.
+Operations may be explicitly configured for two purposes: to override defaults
+for options (such as timeout) that the cluster will use whenever it initiates
+the operation, and to run an operation on a recurring basis (for example, to
+monitor the resource for failure).
 
-You can configure operations in the cluster configuration. As an example, by
-default the cluster will 'not' ensure your resources stay healthy once they are
-started. footnote:[Currently, anyway. Automatic monitoring operations may be
-added in a future version of Pacemaker.] To instruct the cluster to do this,
-you need to add a +monitor+ operation to the resource's definition.
-
-.An OCF resource with a recurring health check
+.An OCF resource with a non-default start timeout
 =====
 [source,XML]
 -------
 <primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat">
   <operations>
-     <op id="public-ip-check" name="monitor" interval="60s"/>
+     <op id="Public-IP-start" name="start" timeout="60s"/>
   </operations>
   <instance_attributes id="params-public-ip">
      <nvpair id="public-ip-addr" name="ip" value="192.0.2.2"/>
   </instance_attributes>
 </primitive>
 -------
 =====
 
+Pacemaker identifies operations by a combination of name and interval, so this
+combination must be unique for each resource. That is, you should not configure
+two operations for the same resource with the same name and interval.
+
 .Properties of an Operation
 [width="95%",cols="2m,3,<6",options="header",align="center"]
 |=========================================================
 
 |Field
 |Default
 |Description
 
 |id
 |
 |A unique name for the operation.
  indexterm:[id,Action Property]
  indexterm:[Action,Property,id]
 
 |name
 |
 |The action to perform. This can be any action supported by the agent; common
  values include +monitor+, +start+, and +stop+.
  indexterm:[name,Action Property]
  indexterm:[Action,Property,name]
 
 |interval
 |0
-|How frequently (in seconds) to perform the operation. A value of 0 means never.
- A positive value defines a 'recurring action', which is typically used with
- <<s-resource-monitoring,monitor>>.
+|How frequently (in seconds) to perform the operation. A value of 0 means "when
+ needed". A positive value defines a 'recurring action', which is typically
+ used with <<s-resource-monitoring,monitor>>.
  indexterm:[interval,Action Property]
  indexterm:[Action,Property,interval]
 
 |timeout
 |
 |How long to wait before declaring the action has failed
  indexterm:[timeout,Action Property]
  indexterm:[Action,Property,timeout]
 
 |on-fail
 |restart '(except for +stop+ operations, which default to' fence 'when
  STONITH is enabled and' block 'otherwise)'
 a|The action to take if this action ever fails. Allowed values:
 
 * +ignore:+ Pretend the resource did not fail.
 * +block:+ Don't perform any further operations on the resource.
 * +stop:+ Stop the resource and do not start it elsewhere.
 * +restart:+ Stop the resource and start it again (possibly on a different node).
 * +fence:+ STONITH the node on which the resource failed.
 * +standby:+ Move _all_ resources away from the node on which the resource failed.
 
 indexterm:[on-fail,Action Property]
 indexterm:[Action,Property,on-fail]
 
 |enabled
 |TRUE
 |If +false+, ignore this operation definition.  This is typically used to pause
  a particular recurring +monitor+ operation; for instance, it can complement
  the respective resource being unmanaged (+is-managed=false+), as this alone
  will <<s-monitoring-unmanaged,not block any configured monitoring>>.
  Disabling the operation does not suppress all actions of the given type.
  Allowed values: +true+, +false+.
  indexterm:[enabled,Action Property]
  indexterm:[Action,Property,enabled]
 
 |record-pending
 |FALSE
 |If +true+, the intention to perform the operation is recorded so that
  GUIs and CLI tools can indicate that an operation is in progress.
- This is best set as an _operation default_ (see next section).
+ This is best set as an _operation default_ (see <<s-operation-defaults>>).
  Allowed values: +true+, +false+.
  indexterm:[record-pending,Action Property]
  indexterm:[Action,Property,record-pending]
 
 |role
 |
 |Run the operation only on node(s) that the cluster thinks should be in
  the specified role. This only makes sense for recurring +monitor+ operations.
  Allowed (case-sensitive) values: +Stopped+, +Started+, and in the
  case of <<s-resource-promotable,promotable clone resources>>, +Slave+ and +Master+.
  indexterm:[role,Action Property]
  indexterm:[Action,Property,role]
 
 |=========================================================
 
 [[s-resource-monitoring]]
 === Monitoring Resources for Failure ===
 
 When Pacemaker first starts a resource, it runs one-time +monitor+ operations
 (referred to as 'probes') to ensure the resource is running where it's
 supposed to be, and not running where it's not supposed to be. (This behavior
 can be affected by the +resource-discovery+ location constraint property.)
 
-Other than those initial probes, Pacemaker will not (by default) check that
-the resource continues to stay healthy. As in the example above, you must
-configure +monitor+ operations explicitly to perform these checks.
+Other than those initial probes, Pacemaker will 'not' (by default) check that
+the resource continues to stay healthy.
+footnote:[Currently, anyway. Automatic monitoring operations may be
+added in a future version of Pacemaker.]
+You must configure +monitor+ operations explicitly to perform these checks.
+
+.An OCF resource with a recurring health check
+=====
+[source,XML]
+-------
+<primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat">
+  <operations>
+     <op id="Public-IP-start" name="start" timeout="60s"/>
+     <op id="Public-IP-monitor" name="monitor" interval="60s"/>
+  </operations>
+  <instance_attributes id="params-public-ip">
+     <nvpair id="public-ip-addr" name="ip" value="192.0.2.2"/>
+  </instance_attributes>
+</primitive>
+-------
+=====
 
 By default, a +monitor+ operation will ensure that the resource is running
 where it is supposed to. The operation's +role+ property can be used for further
 checking.
 
 For example, if a resource has one +monitor+ operation with
 +interval=10 role=Started+ and a second +monitor+ operation with
 +interval=11 role=Stopped+, the cluster will run the first monitor on any nodes
 it thinks 'should' be running the resource, and the second monitor on any nodes
 that it thinks 'should not' be running the resource (for the truly paranoid,
 who want to know when an administrator manually starts a service by mistake).
 
+[NOTE]
+====
+Currently, monitors with +role=Stopped+ are not implemented for
+<<s-resource-clone,clone>> resources.
+====
+
 [[s-monitoring-unmanaged]]
 === Monitoring Resources When Administration is Disabled ===
 
 Recurring +monitor+ operations behave differently under various administrative
 settings:
 
 * When a resource is unmanaged (by setting +is-managed=false+): No monitors
   will be stopped.
 +
 If the unmanaged resource is stopped on a node where the cluster thinks it
 should be running, the cluster will detect and report that it is not, but it
 will not consider the monitor failed, and will not try to start the resource
 until it is managed again.
 +
 Starting the unmanaged resource on a different node is strongly discouraged
 and will at least cause the cluster to consider the resource failed, and
 may require the resource's +target-role+ to be set to +Stopped+ then +Started+
 to be recovered.
 
 * When a node is put into standby: All resources will be moved away from the
   node, and all +monitor+ operations will be stopped on the node, except those
-  specifying +role+ as +Stopped+. Such rather atypical monitoring will
-  consequently be started on the node if appropriate.
+  specifying +role+ as +Stopped+ (which will be newly initiated if
+  appropriate).
 
 * When the cluster is put into maintenance mode: All resources will be marked
-  as unmanaged. All monitor operations will be stopped, except those with
-  specifying +role+ as +Stopped+. As with single unmanaged resources, starting
+  as unmanaged. All monitor operations will be stopped, except those
+  specifying +role+ as +Stopped+ (which will be newly initiated if
+  appropriate). As with single unmanaged resources, starting
   a resource on a node other than where the cluster expects it to be will
   cause problems.
 
 [[s-operation-defaults]]
 === Setting Global Defaults for Operations ===
 
 You can change the global default values for operation properties
 in a given cluster. These are defined in an +op_defaults+ section 
 of the CIB's +configuration+ section, and can be set with `crm_attribute`.
 For example,
 
 ----
 # crm_attribute --type op_defaults --name timeout --update 20s
 ----
 
 would default each operation's +timeout+ to 20 seconds.  If an
 operation's definition also includes a value for +timeout+, then that
 value would be used for that operation instead.
 
 === When Implicit Operations Take a Long Time ===
 
 The cluster will always perform a number of implicit operations: +start+,
 +stop+ and a non-recurring +monitor+ operation used at startup to check
 whether the resource is already active.  If one of these is taking too long,
 then you can create an entry for them and specify a longer timeout.
 
 .An OCF resource with custom timeouts for its implicit actions
 =====
 [source,XML]
 -------
 <primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat">
   <operations>
      <op id="public-ip-startup" name="monitor" interval="0" timeout="90s"/>
      <op id="public-ip-start" name="start" interval="0" timeout="180s"/>
      <op id="public-ip-stop" name="stop" interval="0" timeout="15min"/>
   </operations>
   <instance_attributes id="params-public-ip">
      <nvpair id="public-ip-addr" name="ip" value="192.0.2.2"/>
   </instance_attributes>
 </primitive>
 -------
 =====
 
 === Multiple Monitor Operations ===
 
 Provided no two operations (for a single resource) have the same name
 and interval, you can have as many +monitor+ operations as you like.
 In this way, you can do a superficial health check every minute and
 progressively more intense ones at higher intervals.
 
 To tell the resource agent what kind of check to perform, you need to
 provide each monitor with a different value for a common parameter.
 The OCF standard creates a special parameter called +OCF_CHECK_LEVEL+
 for this purpose and dictates that it is "made available to the
 resource agent without the normal +OCF_RESKEY+ prefix".
 
 Whatever name you choose, you can specify it by adding an
 +instance_attributes+ block to the +op+ tag. It is up to each
 resource agent to look for the parameter and decide how to use it.
 
 .An OCF resource with two recurring health checks, performing different levels of checks specified via +OCF_CHECK_LEVEL+.
 =====
 [source,XML]
 -------
 <primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat">
    <operations>
       <op id="public-ip-health-60" name="monitor" interval="60">
          <instance_attributes id="params-public-ip-depth-60">
             <nvpair id="public-ip-depth-60" name="OCF_CHECK_LEVEL" value="10"/>
          </instance_attributes>
       </op>
       <op id="public-ip-health-300" name="monitor" interval="300">
          <instance_attributes id="params-public-ip-depth-300">
             <nvpair id="public-ip-depth-300" name="OCF_CHECK_LEVEL" value="20"/>
          </instance_attributes>
      </op>
    </operations>
    <instance_attributes id="params-public-ip">
        <nvpair id="public-ip-level" name="ip" value="192.0.2.2"/>
    </instance_attributes>
 </primitive>
 -------
 =====
 
 === Disabling a Monitor Operation ===
 
 The easiest way to stop a recurring monitor is to just delete it.
 However, there can be times when you only want to disable it
 temporarily.  In such cases, simply add +enabled=false+ to the
 operation's definition.
 
 .Example of an OCF resource with a disabled health check
 =====
 [source,XML]
 -------
 <primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat">
    <operations>
       <op id="public-ip-check" name="monitor" interval="60s" enabled="false"/>
    </operations>
    <instance_attributes id="params-public-ip">
       <nvpair id="public-ip-addr" name="ip" value="192.0.2.2"/>
    </instance_attributes>
 </primitive>
 -------
 =====
 
 This can be achieved from the command line by executing:
 
 ----
 # cibadmin --modify --xml-text '<op id="public-ip-check" enabled="false"/>'
 ----
 
 Once you've done whatever you needed to do, you can then re-enable it with:
 ----
 # cibadmin --modify --xml-text '<op id="public-ip-check" enabled="true"/>'
 ----
diff --git a/doc/Pacemaker_Explained/en-US/Ch-Reusing-Configuration.txt b/doc/Pacemaker_Explained/en-US/Ch-Reusing-Configuration.txt
index c0d1883b04..a7569d1412 100644
--- a/doc/Pacemaker_Explained/en-US/Ch-Reusing-Configuration.txt
+++ b/doc/Pacemaker_Explained/en-US/Ch-Reusing-Configuration.txt
@@ -1,373 +1,376 @@
 :compat-mode: legacy
 = Reusing Parts of the Configuration =
 
 Pacemaker provides multiple ways to simplify the configuration XML by reusing
 parts of it in multiple places.
 
 Besides simplifying the XML, this also allows you to manipulate multiple
 configuration elements with a single reference.
 
 == Reusing Resource Definitions ==
 
 If you want to create lots of resources with similar configurations, defining a
 'resource template' simplifies the task. Once defined, it can be referenced in
 primitives or in certain types of constraints.
 
 === Configuring Resources with Templates ===
 
 The primitives referencing the template will inherit all meta-attributes,
 instance attributes, utilization attributes and operations defined
 in the template. And you can define specific attributes and operations for any
 of the primitives. If any of these are defined in both the template and the
 primitive, the values defined in the primitive will take precedence over the
 ones defined in the template.
 
 Hence, resource templates help to reduce the amount of configuration work.
 If any changes are needed, they can be done to the template definition and
 will take effect globally in all resource definitions referencing that
 template.
 
 Resource templates have a syntax similar to that of primitives.
 
 .Resource template for a migratable Xen virtual machine
 ====
 [source,XML]
 ----
 <template id="vm-template" class="ocf" provider="heartbeat" type="Xen">
   <meta_attributes id="vm-template-meta_attributes">
     <nvpair id="vm-template-meta_attributes-allow-migrate" name="allow-migrate" value="true"/>
   </meta_attributes>
   <utilization id="vm-template-utilization">
     <nvpair id="vm-template-utilization-memory" name="memory" value="512"/>
   </utilization>
   <operations>
     <op id="vm-template-monitor-15s" interval="15s" name="monitor" timeout="60s"/>
     <op id="vm-template-start-0" interval="0" name="start" timeout="60s"/>
   </operations>
 </template>
 ----
 ====
 
 Once you define a resource template, you can use it in primitives by specifying the
 +template+ property.
 
 .Xen primitive resource using a resource template
 ====
 [source,XML]
 ----
 <primitive id="vm1" template="vm-template">
   <instance_attributes id="vm1-instance_attributes">
     <nvpair id="vm1-instance_attributes-name" name="name" value="vm1"/>
     <nvpair id="vm1-instance_attributes-xmfile" name="xmfile" value="/etc/xen/shared-vm/vm1"/>
   </instance_attributes>
 </primitive>
 ----
 ====
 
 In the example above, the new primitive +vm1+ will inherit everything from +vm-template+. For
 example, the equivalent of the above two examples would be:
 
 .Equivalent Xen primitive resource not using a resource template
 ====
 [source,XML]
 ----
 <primitive id="vm1" class="ocf" provider="heartbeat" type="Xen">
   <meta_attributes id="vm-template-meta_attributes">
     <nvpair id="vm-template-meta_attributes-allow-migrate" name="allow-migrate" value="true"/>
   </meta_attributes>
   <utilization id="vm-template-utilization">
     <nvpair id="vm-template-utilization-memory" name="memory" value="512"/>
   </utilization>
   <operations>
     <op id="vm-template-monitor-15s" interval="15s" name="monitor" timeout="60s"/>
     <op id="vm-template-start-0" interval="0" name="start" timeout="60s"/>
   </operations>
   <instance_attributes id="vm1-instance_attributes">
     <nvpair id="vm1-instance_attributes-name" name="name" value="vm1"/>
     <nvpair id="vm1-instance_attributes-xmfile" name="xmfile" value="/etc/xen/shared-vm/vm1"/>
   </instance_attributes>
 </primitive>
 ----
 ====
 
 If you want to overwrite some attributes or operations, add them to the
 particular primitive's definition.
 
 .Xen resource overriding template values
 ====
 [source,XML]
 ----
 <primitive id="vm2" template="vm-template">
   <meta_attributes id="vm2-meta_attributes">
     <nvpair id="vm2-meta_attributes-allow-migrate" name="allow-migrate" value="false"/>
   </meta_attributes>
   <utilization id="vm2-utilization">
     <nvpair id="vm2-utilization-memory" name="memory" value="1024"/>
   </utilization>
   <instance_attributes id="vm2-instance_attributes">
     <nvpair id="vm2-instance_attributes-name" name="name" value="vm2"/>
     <nvpair id="vm2-instance_attributes-xmfile" name="xmfile" value="/etc/xen/shared-vm/vm2"/>
   </instance_attributes>
   <operations>
     <op id="vm2-monitor-30s" interval="30s" name="monitor" timeout="120s"/>
     <op id="vm2-stop-0" interval="0" name="stop" timeout="60s"/>
   </operations>
 </primitive>
 ----
 ====
 
 In the example above, the new primitive +vm2+ has special
 attribute values. Its +monitor+ operation has a longer +timeout+ and +interval+, and
 the primitive has an additional +stop+ operation.
 
 To see the resulting definition of a resource, run:
 
 ----
 # crm_resource --query-xml --resource vm2
 ----
 
 To see the raw definition of a resource in the CIB, run:
 
 ----
 # crm_resource --query-xml-raw --resource vm2
 ----
 
 === Using Templates in Constraints ===
 
 A resource template can be referenced in the following types of constraints:
 
 - +order+ constraints (see <<s-resource-ordering>>)
 - +colocation+ constraints (see <<s-resource-colocation>>)
 - +rsc_ticket+ constraints (for multi-site clusters as described in <<s-ticket-constraints>>)
 
 Resource templates referenced in constraints stand for all primitives which are
 derived from that template. This means that the constraint applies to all primitive
 resources referencing the resource template. Referencing resource templates in
 constraints is an alternative to resource sets and can simplify the cluster
 configuration considerably.
 
 For example, given the example templates earlier in this chapter:
 
 [source,XML]
 <rsc_colocation id="vm-template-colo-base-rsc" rsc="vm-template" rsc-role="Started" with-rsc="base-rsc" score="INFINITY"/>
 
 would colocate all VMs with +base-rsc+ and is the equivalent of the following constraint configuration:
 
 [source,XML]
 ----
 <rsc_colocation id="vm-colo-base-rsc" score="INFINITY">
   <resource_set id="vm-colo-base-rsc-0" sequential="false" role="Started">
     <resource_ref id="vm1"/>
     <resource_ref id="vm2"/>
   </resource_set>
   <resource_set id="vm-colo-base-rsc-1">
     <resource_ref id="base-rsc"/>
   </resource_set>
 </rsc_colocation>
 ----
 
 [NOTE]
 ======
 In a colocation constraint, only one template may be referenced from either
 `rsc` or `with-rsc`; the other reference must be a regular resource.
 ======
 
 === Using Templates in Resource Sets ===
 
 Resource templates can also be referenced in resource sets.
 
 For example, given the example templates earlier in this section, then:
 
 [source,XML]
 ----
 <rsc_order id="order1" score="INFINITY">
   <resource_set id="order1-0">
     <resource_ref id="base-rsc"/>
     <resource_ref id="vm-template"/>
     <resource_ref id="top-rsc"/>
   </resource_set>
 </rsc_order>
 ----
 
 is the equivalent of the following constraint using a sequential resource set:
 
 [source,XML]
 ----
 <rsc_order id="order1" score="INFINITY">
   <resource_set id="order1-0">
     <resource_ref id="base-rsc"/>
     <resource_ref id="vm1"/>
     <resource_ref id="vm2"/>
     <resource_ref id="top-rsc"/>
   </resource_set>
 </rsc_order>
 ----
 
 Or, if the resources referencing the template can run in parallel, then:
 
 [source,XML]
 ----
 <rsc_order id="order2" score="INFINITY">
   <resource_set id="order2-0">
     <resource_ref id="base-rsc"/>
   </resource_set>
   <resource_set id="order2-1" sequential="false">
     <resource_ref id="vm-template"/>
   </resource_set>
   <resource_set id="order2-2">
     <resource_ref id="top-rsc"/>
   </resource_set>
 </rsc_order>
 ----
 
 is the equivalent of the following constraint configuration:
 
 [source,XML]
 ----
 <rsc_order id="order2" score="INFINITY">
   <resource_set id="order2-0">
     <resource_ref id="base-rsc"/>
   </resource_set>
   <resource_set id="order2-1" sequential="false">
     <resource_ref id="vm1"/>
     <resource_ref id="vm2"/>
   </resource_set>
   <resource_set id="order2-2">
     <resource_ref id="top-rsc"/>
   </resource_set>
 </rsc_order>
 ----
 
 [[s-reusing-config-elements]]
 == Reusing Rules, Options and Sets of Operations ==
 
 Sometimes a number of constraints need to use the same set of rules,
 and resources need to set the same options and parameters.  To
 simplify this situation, you can refer to an existing object using an
 +id-ref+ instead of an +id+.
 
 So if for one resource you have
 
 [source,XML]
 ------
 <rsc_location id="WebServer-connectivity" rsc="Webserver">
    <rule id="ping-prefer-rule" score-attribute="pingd" >
     <expression id="ping-prefer" attribute="pingd" operation="defined"/>
    </rule>
 </rsc_location>
 ------
 
 Then, rather than duplicating the rule for all your other resources, you can instead specify:
 
 .Referencing rules from other constraints
 =====
 [source,XML]
 -------
 <rsc_location id="WebDB-connectivity" rsc="WebDB">
       <rule id-ref="ping-prefer-rule"/>
 </rsc_location>
 -------
 =====
 
 [IMPORTANT]
 ===========
 The cluster will insist that the +rule+ exists somewhere.  Attempting
 to add a reference to a non-existing rule will cause a validation
 failure, as will attempting to remove a +rule+ that is referenced
 elsewhere.
 ===========
 
 The same principle applies for +meta_attributes+ and
 +instance_attributes+ as illustrated in the example below:
 
 .Referencing attributes, options, and operations from other resources
 =====
 [source,XML]
 -------
 <primitive id="mySpecialRsc" class="ocf" type="Special" provider="me">
    <instance_attributes id="mySpecialRsc-attrs" score="1" >
      <nvpair id="default-interface" name="interface" value="eth0"/>
      <nvpair id="default-port" name="port" value="9999"/>
    </instance_attributes>
    <meta_attributes id="mySpecialRsc-options">
      <nvpair id="failure-timeout" name="failure-timeout" value="5m"/>
      <nvpair id="migration-threshold" name="migration-threshold" value="1"/>
      <nvpair id="stickiness" name="resource-stickiness" value="0"/>
    </meta_attributes>
    <operations id="health-checks">
      <op id="health-check-60s" name="monitor" interval="60s"/>
      <op id="health-check-30min" name="monitor" interval="30min"/>
    </operations>
 </primitive>
 <primitive id="myOtherRsc" class="ocf" type="Other" provider="me">
    <instance_attributes id-ref="mySpecialRsc-attrs"/>
    <meta_attributes id-ref="mySpecialRsc-options"/>
    <operations id-ref="health-checks"/>
 </primitive>
 -------
 =====
 
++id-ref+ can similarly be used with +resource_set+ (in any constraint type),
++nvpair+, and +operations+.
+
 == Tagging Configuration Elements ==
 
 Pacemaker allows you to 'tag' any configuration element that has an XML ID.
 
 The main purpose of tagging is to support higher-level user interface tools;
 Pacemaker itself only uses tags within constraints. Therefore, what you can
 do with tags mostly depends on the tools you use.
 
 === Configuring Tags ===
 
 A tag is simply a named list of XML IDs.
 
 .Tag referencing three resources
 ====
 [source,XML]
 ----
 <tags>
   <tag id="all-vms">
     <obj_ref id="vm1"/>
     <obj_ref id="vm2"/>
     <obj_ref id="vm3"/>
   </tag>
 </tags>
 ----
 ====
 
 What you can do with this new tag depends on what your higher-level tools
 support. For example, a tool might allow you to enable or disable all of
 the tagged resources at once, or show the status of just the tagged
 resources.
 
 A single configuration element can be listed in any number of tags.
 
 === Using Tags in Constraints and Resource Sets ===
 
 Pacemaker itself only uses tags in constraints. If you supply a tag name
 instead of a resource name in any constraint, the constraint will apply to
 all resources listed in that tag.
 
 .Constraint using a tag
 ====
 [source,XML]
 ----
 <rsc_order id="order1" first="storage" then="all-vms" kind="Mandatory" />
 ----
 ====
 
 In the example above, assuming the +all-vms+ tag is defined as in the previous
 example, the constraint will behave the same as:
 
 .Equivalent constraints without tags
 ====
 [source,XML]
 ----
 <rsc_order id="order1-1" first="storage" then="vm1" kind="Mandatory" />
 <rsc_order id="order1-2" first="storage" then="vm2" kind="Mandatory" />
 <rsc_order id="order1-3" first="storage" then="vm3" kind="Mandatory" />
 ----
 ====
 
 A tag may be used directly in the constraint, or indirectly by being
 listed in a <<s-resource-sets,resource set>> used in the constraint.
 When used in a resource set, an expanded tag will honor the set's
 +sequential+ property.
diff --git a/doc/Pacemaker_Explained/en-US/Pacemaker_Explained.xml b/doc/Pacemaker_Explained/en-US/Pacemaker_Explained.xml
index 9ad6e3561a..31bb67675d 100644
--- a/doc/Pacemaker_Explained/en-US/Pacemaker_Explained.xml
+++ b/doc/Pacemaker_Explained/en-US/Pacemaker_Explained.xml
@@ -1,39 +1,26 @@
 <?xml version='1.0' encoding='utf-8' ?>
 <!DOCTYPE Book PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
 ]>
 <book>
   <xi:include href="Book_Info.xml" xmlns:xi="http://www.w3.org/2001/XInclude"></xi:include>
   <xi:include href="Preface.xml" xmlns:xi="http://www.w3.org/2001/XInclude"></xi:include>
 
   <xi:include href="Ch-Intro.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
   <xi:include href="Ch-Options.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
   <xi:include href="Ch-Nodes.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
   <xi:include href="Ch-Resources.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
   <xi:include href="Ch-Constraints.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
   <xi:include href="Ch-Alerts.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
   <xi:include href="Ch-Rules.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
   <xi:include href="Ch-Advanced-Options.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
   <xi:include href="Ch-Advanced-Resources.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
   <xi:include href="Ch-Reusing-Configuration.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
   <xi:include href="Ch-Utilization.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
   <xi:include href="Ch-Stonith.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
   <xi:include href="Ch-Status.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
   <xi:include href="Ch-Multi-site-Clusters.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
-  <xi:include href="Ap-FAQ.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
   <xi:include href="Ap-Samples.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
-  <appendix id="ap-further-reading">
-    <title>Further Reading</title>
-    <itemizedlist spacing="compact">
-      <listitem><para>Project Website: <ulink url="http://www.clusterlabs.org/"/></para></listitem>
-      <listitem><para>Project Documentation: <ulink url="http://www.clusterlabs.org/wiki/Documentation"/></para></listitem>
-      <listitem>
-        <para>SUSE High Availibility Guide: <ulink url="http://www.suse.com/documentation/sle_ha/book_sleha/data/book_sleha.html"/></para>
-      </listitem>
-      <listitem><para>Corosync Configuration: <ulink url="http://www.corosync.org/"/></para></listitem>
-    </itemizedlist>
-  </appendix>
-
   <xi:include href="Revision_History.xml" xmlns:xi="http://www.w3.org/2001/XInclude"></xi:include>
   <index></index>
 </book>
diff --git a/doc/Pacemaker_Explained/en-US/Revision_History.xml b/doc/Pacemaker_Explained/en-US/Revision_History.xml
index 839f62c750..3da97ac9b8 100644
--- a/doc/Pacemaker_Explained/en-US/Revision_History.xml
+++ b/doc/Pacemaker_Explained/en-US/Revision_History.xml
@@ -1,144 +1,159 @@
 <?xml version='1.0' encoding='utf-8' ?>
 <!DOCTYPE appendix PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
 ]>
 <appendix>
   <!-- see comment in Book_Info.xml for revision numbering -->
   <title>Revision History</title>
   <simpara>
     <revhistory>
       <revision>
         <revnumber>1-0</revnumber>
         <date>19 Oct 2009</date>
         <author><firstname>Andrew</firstname><surname>Beekhof</surname><email>andrew@beekhof.net</email></author>
         <revdescription><simplelist><member>Import from Pages.app</member></simplelist></revdescription>
       </revision>
       <revision>
         <revnumber>2-0</revnumber>
         <date>26 Oct 2009</date>
         <author><firstname>Andrew</firstname><surname>Beekhof</surname><email>andrew@beekhof.net</email></author>
         <revdescription><simplelist><member>Cleanup and reformatting of docbook xml complete</member></simplelist></revdescription>
       </revision>
       <revision>
         <revnumber>3-0</revnumber>
         <date>Tue Nov 12 2009</date>
         <author><firstname>Andrew</firstname><surname>Beekhof</surname><email>andrew@beekhof.net</email></author>
         <revdescription>
           <simplelist>
             <member>Split book into chapters and pass validation</member>
             <member>Re-organize book for use with <ulink url="https://fedorahosted.org/publican/">Publican</ulink></member>
           </simplelist>
         </revdescription>
       </revision>
       <revision>
         <revnumber>4-0</revnumber>
         <date>Mon Oct 8 2012</date>
         <author><firstname>Andrew</firstname><surname>Beekhof</surname><email>andrew@beekhof.net</email></author>
         <revdescription>
           <simplelist>
             <member>
-	      Converted to <ulink url="http://www.methods.co.nz/asciidoc">asciidoc</ulink>
-	      (which is converted to docbook for use with 
-	      <ulink url="https://fedorahosted.org/publican/">Publican</ulink>)
-	    </member>
+              Converted to <ulink url="http://www.methods.co.nz/asciidoc">asciidoc</ulink>
+              (which is converted to docbook for use with 
+              <ulink url="https://fedorahosted.org/publican/">Publican</ulink>)
+            </member>
           </simplelist>
         </revdescription>
       </revision>
       <revision>
         <revnumber>5-0</revnumber>
         <date>Mon Feb 23 2015</date>
         <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
         <revdescription>
           <simplelist>
             <member>
-	      Update for clarity, stylistic consistency and current command-line syntax
-	    </member>
+              Update for clarity, stylistic consistency and current command-line syntax
+            </member>
           </simplelist>
         </revdescription>
       </revision>
       <revision>
         <revnumber>6-0</revnumber>
         <date>Tue Dec 8 2015</date>
         <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
         <revdescription>
           <simplelist>
             <member>
-	      Update for Pacemaker 1.1.14
-	    </member>
+              Update for Pacemaker 1.1.14
+            </member>
           </simplelist>
         </revdescription>
       </revision>
       <revision>
         <revnumber>7-0</revnumber>
         <date>Tue May 3 2016</date>
         <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
         <revdescription>
           <simplelist>
             <member>
-	      Update for Pacemaker 1.1.15
-	    </member>
+              Update for Pacemaker 1.1.15
+            </member>
           </simplelist>
         </revdescription>
       </revision>
       <revision>
         <revnumber>7-1</revnumber>
         <date>Fri Oct 28 2016</date>
         <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
         <revdescription>
           <simplelist>
             <member>
-	      Overhaul upgrade documentation, and document node health strategies
-	    </member>
+              Overhaul upgrade documentation, and document node health strategies
+            </member>
           </simplelist>
         </revdescription>
       </revision>
       <revision>
         <revnumber>8-0</revnumber>
         <date>Tue Oct 25 2016</date>
         <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
         <revdescription>
           <simplelist>
             <member>
-	      Update for Pacemaker 1.1.16
-	    </member>
+              Update for Pacemaker 1.1.16
+            </member>
           </simplelist>
         </revdescription>
       </revision>
       <revision>
         <revnumber>9-0</revnumber>
         <date>Tue Jul 11 2017</date>
         <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
         <revdescription>
           <simplelist>
             <member>
-	      Update for Pacemaker 1.1.17
-	    </member>
+              Update for Pacemaker 1.1.17
+            </member>
           </simplelist>
         </revdescription>
       </revision>
       <revision>
         <revnumber>10-0</revnumber>
         <date>Fri Oct 6 2017</date>
         <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
         <revdescription>
           <simplelist>
             <member>
-	      Update for Pacemaker 1.1.18
-	    </member>
+              Update for Pacemaker 1.1.18
+            </member>
           </simplelist>
         </revdescription>
       </revision>
       <revision>
         <revnumber>11-0</revnumber>
         <date>Fri Jan 12 2018</date>
         <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
         <revdescription>
           <simplelist>
             <member>
-	      Update for Pacemaker 2.0.0
-	    </member>
+              Update for Pacemaker 2.0.0
+            </member>
+          </simplelist>
+        </revdescription>
+      </revision>
+      <revision>
+        <revnumber>12-0</revnumber>
+        <date>Fri Dec 7 2018</date>
+        <author><firstname>Ken</firstname><surname>Gaillot</surname><email>kgaillot@redhat.com</email></author>
+        <author><firstname>Reid</firstname><surname>Wahl</surname><email>nwahl@redhat.com</email></author>
+        <author><firstname>Jan</firstname><surname>Pokorný</surname><email>jpokorny@redhat.com</email></author>
+        <revdescription>
+          <simplelist>
+            <member>
+              Update for Pacemaker 2.0.1, remove "Further Reading" and "FAQ" sections,
+              and add minor clarifications and reformatting
+            </member>
           </simplelist>
         </revdescription>
       </revision>
     </revhistory>
   </simpara>
 </appendix>
diff --git a/doc/shared/en-US/pacemaker-intro.txt b/doc/shared/en-US/pacemaker-intro.txt
index 02335c9a1d..9626448e0a 100644
--- a/doc/shared/en-US/pacemaker-intro.txt
+++ b/doc/shared/en-US/pacemaker-intro.txt
@@ -1,185 +1,186 @@
 :compat-mode: legacy
 == What Is 'Pacemaker'? ==
 
 *Pacemaker* is a high-availability 'cluster resource manager' -- software that
-runs on a set of hosts (a 'cluster' of 'nodes') in order to minimize downtime of
-desired services ('resources').
+runs on a set of hosts (a 'cluster' of 'nodes') in order to preserve integrity
+and minimize downtime of desired services ('resources').
 footnote:[
 'Cluster' is sometimes used in other contexts to refer to hosts grouped
 together for other purposes, such as high-performance computing (HPC), but
 Pacemaker is not intended for those purposes.
 ]
+It is maintained by the https://www.ClusterLabs.org/[ClusterLabs] community.
 
 Pacemaker's key features include:
 
  * Detection of and recovery from node- and service-level failures
  * Ability to ensure data integrity by fencing faulty nodes
  * Support for one or more nodes per cluster
  * Support for multiple resource interface standards (anything that can be
    scripted can be clustered)
  * Support (but no requirement) for shared storage
  * Support for practically any redundancy configuration (active/passive, N+1,
    etc.)
  * Automatically replicated configuration that can be updated from any node
  * Ability to specify cluster-wide relationships between services,
    such as ordering, colocation and anti-colocation
  * Support for advanced service types, such as 'clones' (services that need to
    be active on multiple nodes), 'stateful resources' (clones that can run in
    one of two modes), and containerized services
  * Unified, scriptable cluster management tools
 
 .Fencing
 [NOTE]
 ====
 'Fencing', also known as 'STONITH' (an acronym for Shoot The Other Node In The
 Head), is the ability to ensure that it is not possible for a node to be
 running a service. This is accomplished via 'fence devices' such as
 intelligent power switches that cut power to the target, or intelligent
 network switches that cut the target's access to the local network.
 
 Pacemaker represents fence devices as a special class of resource.
 
 A cluster cannot safely recover from certain failure conditions, such as an
 unresponsive node, without fencing.
 ====
 
 == Cluster Architecture ==
 
 At a high level, a cluster can be viewed as having these parts (which together
 are often referred to as the 'cluster stack'):
 
  * *Resources:* These are the reason for the cluster's being -- the services
    that need to be kept highly available.
 
  * *Resource agents:* These are scripts or operating system components that
    start, stop, and monitor resources, given a set of resource parameters.
    These provide a uniform interface between Pacemaker and the managed
    services.
 
  * *Fence agents:* These are scripts that execute node fencing actions,
    given a target and fence device parameters.
 
  * *Cluster membership layer:* This component provides reliable
    messaging, membership, and quorum information about the cluster.
    Currently, Pacemaker supports http://www.corosync.org/[Corosync]
    as this layer.
 
  * *Cluster resource manager:* Pacemaker provides the brain that processes
    and reacts to events that occur in the cluster. These events may include
    nodes joining or leaving the cluster; resource events caused by failures,
    maintenance, or scheduled activities; and other administrative actions.
    To achieve the desired availability, Pacemaker may start and stop resources
    and fence nodes.
 
  * *Cluster tools:* These provide an interface for users to interact with the
    cluster. Various command-line and graphical (GUI) interfaces are available.
 
 Most managed services are not, themselves, cluster-aware. However, many popular
 open-source cluster filesystems make use of a common 'Distributed Lock
 Manager' (DLM), which makes direct use of Corosync for its messaging and
 membership capabilities and Pacemaker for the ability to fence nodes.
 
 .Example Cluster Stack
 image::images/pcmk-stack.png["Example cluster stack",width="10cm",height="7.5cm",align="center"]
 
 == Pacemaker Architecture ==
 
 Pacemaker itself is composed of multiple daemons that work together:
 
  * pacemakerd
  * pacemaker-attrd
  * pacemaker-based
  * pacemaker-controld
  * pacemaker-execd
  * pacemaker-fenced
  * pacemaker-schedulerd
 
 .Internal Components
 image::images/pcmk-internals.png["Pacemaker software components",align="center",scaledwidth="65%"]
 
 The Pacemaker master process (pacemakerd) spawns all the other daemons, and
 respawns them if they unexpectedly exit.
 
 The 'Cluster Information Base' (CIB) is an
 https://en.wikipedia.org/wiki/XML[XML] representation of the cluster's
 configuration and the state of all nodes and resources. The 'CIB manager'
 (pacemaker-based) keeps the CIB synchronized across the cluster, and handles
 requests to modify it.
 
 The attribute manager (pacemaker-attrd) maintains a database of attributes for
 all nodes, keeps it synchronized across the cluster, and handles requests to
 modify them. These attributes are usually recorded in the CIB.
 
 Given a snapshot of the CIB as input, the 'scheduler' (pacemaker-schedulerd)
 determines what actions are necessary to achieve the desired state of the
 cluster.
 
 The 'local executor' (pacemaker-execd) handles requests to execute
 resource agents on the local cluster node, and returns the result.
 
 The 'fencer' (pacemaker-fenced) handles requests to fence nodes. Given a target
 node, the fencer decides which cluster node(s) should execute which fencing
 device(s), calls the necessary fencing agents (either directly, or via
 requests to the fencer peers on other nodes), and returns the result.
 
 The 'controller' (pacemaker-controld) is Pacemaker's coordinator,
 maintaining a consistent view of the cluster membership and orchestrating all
 the other components.
 
 Pacemaker centralizes cluster decision-making by electing one of the controller
 instances as the 'Designated Controller' ('DC'). Should the elected DC
 process (or the node it is on) fail, a new one is quickly established.
 The DC responds to cluster events by taking a current snapshot of the CIB,
 feeding it to the scheduler, then asking the executors (either directly on
 the local node, or via requests to controller peers on other nodes) and
 the fencer to execute any necessary actions.
 
 .Old daemon names
 [NOTE]
 ====
 The Pacemaker daemons were renamed in version 2.0. You may still find
 references to the old names, especially in documentation targeted to version
 1.1.
 
 [width="95%",cols="1,2",options="header",align="center"]
 |=========================================================
 | Old name | New name
 | attrd | pacemaker-attrd
 | cib | pacemaker-based
 | crmd | pacemaker-controld
 | lrmd | pacemaker-execd
 | stonithd | pacemaker-fenced
 | pacemaker_remoted | pacemaker-remoted
 |=========================================================
 
 ====
 
 == Node Redundancy Designs ==
 
 Pacemaker supports practically any
 https://en.wikipedia.org/wiki/High-availability_cluster#Node_configurations[node
 redundancy configuration] including 'Active/Active', 'Active/Passive', 'N+1',
 'N+M', 'N-to-1' and 'N-to-N'.
 
 Active/passive clusters with two (or more) nodes using Pacemaker and
 https://en.wikipedia.org/wiki/Distributed_Replicated_Block_Device[DRBD] are
 a cost-effective high-availability solution for many situations. One of the
 nodes provides the desired services, and if it fails, the other node takes
 over.
 
 .Active/Passive Redundancy
 image::images/pcmk-active-passive.png["Active/Passive Redundancy",width="10cm",height="7.5cm",align="center"]
 
 Pacemaker also supports multiple nodes in a shared-failover design,
 reducing hardware costs by allowing several active/passive clusters to be
 combined and share a common backup node.
 
 .Shared Failover
 image::images/pcmk-shared-failover.png["Shared Failover",width="10cm",height="7.5cm",align="center"]
 
 When shared storage is available, every node can potentially be used for
 failover. Pacemaker can even run multiple copies of services to spread out the
 workload.
 
 .N to N Redundancy
 image::images/pcmk-active-active.png["N to N Redundancy",width="10cm",height="7.5cm",align="center"]
diff --git a/extra/buildbot.helper b/extra/buildbot.helper
deleted file mode 100755
index 459f1755a7..0000000000
--- a/extra/buildbot.helper
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-set -x
-self=`basename $0`
-
-if [ x$1 = xinstall ]; then
-    # Basic test phase
-    mock --configdir=$PWD --root=mock --resultdir=./mock -v --install nano lcov psmisc sudo valgrind ./mock/*.rpm
-elif [ x$1 = xdownloads ]; then
-    # Extra test phase
-    mock --configdir=$PWD --root=mock --resultdir=./mock -v --install ./downloads/*.rpm nano sudo valgrind lcov
-    
-elif [ x$1 = xlint ]; then
-    rpmlint -i -f rpmlintrc ./mock/*.rpm
-else
-    echo "Unknown sub-command: $1"
-    exit 1
-fi