diff --git a/cts/CIB.py b/cts/CIB.py index 23543eecd7..b53bdd0825 100644 --- a/cts/CIB.py +++ b/cts/CIB.py @@ -1,480 +1,531 @@ '''CTS: Cluster Testing System: CIB generator ''' __copyright__ = ''' Author: Andrew Beekhof Copyright (C) 2008 Andrew Beekhof ''' import os, string, warnings from cts.CTSvars import * class CibBase: def __init__(self, Factory, tag, _id, **kwargs): self.tag = tag self.name = _id self.kwargs = kwargs self.children = [] self.Factory = Factory def __repr__(self): return "%s-%s" % (self.tag, self.name) def add_child(self, child): self.children.append(child) def __setitem__(self, key, value): if value: self.kwargs[key] = value else: self.kwargs.pop(key, None) from cib_xml import * class ConfigBase: cts_cib = None version = "unknown" feature_set = "unknown" Factory = None def __init__(self, CM, factory, tmpfile=None): self.CM = CM self.Factory = factory if not tmpfile: warnings.filterwarnings("ignore") tmpfile = os.tmpnam() warnings.resetwarnings() self.Factory.tmpfile = tmpfile def version(self): return self.version def NextIP(self): ip = self.CM.Env["IPBase"] if ":" in ip: (prefix, sep, suffix) = ip.rpartition(":") suffix = str(hex(int(suffix, 16)+1)).lstrip("0x") else: (prefix, sep, suffix) = ip.rpartition(".") suffix = str(int(suffix)+1) ip = prefix + sep + suffix self.CM.Env["IPBase"] = ip return ip.strip() class CIB11(ConfigBase): feature_set = "3.0" version = "pacemaker-1.1" counter = 1 def _show(self, command=""): output = "" (rc, result) = self.Factory.rsh(self.Factory.target, "HOME=/root CIB_file="+self.Factory.tmpfile+" cibadmin -Ql "+command, None, ) for line in result: output += line self.Factory.debug("Generated Config: "+line) return output def NewIP(self, name=None, standard="ocf"): if self.CM.Env["IPagent"] == "IPaddr2": ip = self.NextIP() if not name: if ":" in ip: (prefix, sep, suffix) = ip.rpartition(":") name = "r"+suffix else: name = "r"+ip r = Resource(self.Factory, name, self.CM.Env["IPagent"], standard) r["ip"] = ip if 
":" in ip: r["cidr_netmask"] = "64" r["nic"] = "eth0" else: r["cidr_netmask"] = "32" else: if not name: name = "r%s%d" % (self.CM.Env["IPagent"], self.counter) self.counter = self.counter + 1 r = Resource(self.Factory, name, self.CM.Env["IPagent"], standard) r.add_op("monitor", "5s") return r + def get_node_id(self, node_name): + """ Check the cluster configuration for a node ID. """ + + # We can't account for every possible configuration, + # so we only return a node ID if: + # * The node is specified in /etc/corosync/corosync.conf + # with "ring0_addr:" equal to node_name and "nodeid:" + # explicitly specified. + # * Or, the node is specified in /etc/cluster/cluster.conf + # with name="node_name" nodeid="X" + # In all other cases, we return 0. + node_id = 0 + + # awkward command: use } as record separator + # so each corosync.conf "object" is one record; + # match the "node {" record that has "ring0_addr: node_name"; + # then print the substring of that record after "nodeid:" + (rc, output) = self.Factory.rsh(self.Factory.target, + r"""awk -v RS="}" """ + r"""'/^(\s*nodelist\s*{)?\s*node\s*{.*ring0_addr:\s*%s(\s+|$)/""" + r"""{gsub(/.*nodeid:\s*/,"");gsub(/\s+.*$/,"");print}'""" + r""" /etc/corosync/corosync.conf""" % node_name, None) + if rc == 0 and len(output) == 1: + try: + node_id = int(output[0]) + except ValueError: + node_id = 0 + + # another awkward command: use < or > as record separator + # so each cluster.conf XML tag is one record; + # match the clusternode record that has name="node_name"; + # then print the substring of that record for nodeid="X" + if node_id == 0: + (rc, output) = self.Factory.rsh(self.Factory.target, + r"""awk -v RS="[<>]" """ + r"""'/^clusternode\s+.*name="%s".*/""" + r"""{gsub(/.*nodeid="/,"");gsub(/".*/,"");print}'""" + r""" /etc/cluster/cluster.conf""" % node_name, None) + if rc == 0 and len(output) == 1: + try: + node_id = int(output[0]) + except ValueError: + node_id = 0 + + return node_id + def install(self, target): old = 
self.Factory.tmpfile # Force a rebuild self.cts_cib = None self.Factory.tmpfile = CTSvars.CRM_CONFIG_DIR+"/cib.xml" self.contents(target) self.Factory.rsh(self.Factory.target, "chown "+CTSvars.CRM_DAEMON_USER+" "+self.Factory.tmpfile) self.Factory.tmpfile = old def contents(self, target=None): # fencing resource if self.cts_cib: return self.cts_cib if target: self.Factory.target = target self.Factory.rsh(self.Factory.target, "HOME=/root cibadmin --empty %s > %s" % (self.version, self.Factory.tmpfile)) #cib_base = self.cib_template % (self.feature_set, self.version, ''' remote-tls-port='9898' remote-clear-port='9999' ''') self.num_nodes = len(self.CM.Env["nodes"]) no_quorum = "stop" if self.num_nodes < 3: no_quorum = "ignore" self.Factory.log("Cluster only has %d nodes, configuring: no-quorum-policy=ignore" % self.num_nodes) # We don't need a nodes section unless we add attributes stn = None # Fencing resource # Define first so that the shell doesn't reject every update if self.CM.Env["DoFencing"]: st = Resource(self.Factory, "Fencing", ""+self.CM.Env["stonith-type"], "stonith") # Set a threshold for unreliable stonith devices such as the vmware one st.add_meta("migration-threshold", "5") st.add_op("monitor", "120s", timeout="120s") st.add_op("stop", "0", timeout="60s") st.add_op("start", "0", timeout="60s") entries = string.split(self.CM.Env["stonith-params"], ',') for entry in entries: (name, value) = string.split(entry, '=') if name == "hostlist" and value == "all": value = string.join(self.CM.Env["nodes"], " ") elif name == "pcmk_host_list" and value == "all": value = string.join(self.CM.Env["nodes"], " ") st[name] = value st.commit() # Test advanced fencing logic if True: stf_nodes = [] stt_nodes = [] - attr_nodes = [] + attr_nodes = {} # Create the levels stl = FencingTopology(self.Factory) for node in self.CM.Env["nodes"]: # Randomly assign node to a fencing method ftype = self.CM.Env.RandomGen.choice(["levels-and", "levels-or ", "broadcast "]) # For 
levels-and, randomly choose targeting by node name or attribute by = "" if ftype == "levels-and": if self.CM.Env.RandomGen.choice([True, False]): - attr_nodes.append(node) - by = " (by attribute)" + node_id = self.get_node_id(node) + if node_id == 0: + # We couldn't find a node ID, so revert to targeting by name + by = " (by name)" + else: + attr_nodes[node] = node_id + by = " (by attribute)" else: by = " (by name)" self.CM.log(" - Using %s fencing for node: %s%s" % (ftype, node, by)) # for baremetal remote node tests (is this really necessary?) stt_nodes.append("remote_%s" % node) if ftype == "levels-and": if node not in attr_nodes: stl.level(1, node, "FencingPass,Fencing") stt_nodes.append(node) elif ftype == "levels-or ": stl.level(1, node, "FencingFail") stl.level(2, node, "Fencing") stf_nodes.append(node) # If any levels-and nodes were targeted by attribute, # create the attributes and a level for the attribute. - if len(attr_nodes): + if attr_nodes: stn = Nodes(self.Factory) - for node in attr_nodes: - stn[node] = { "cts-fencing" : "levels-and" } + for (node_name, node_id) in attr_nodes.items(): + stn.add_node(node_name, node_id, { "cts-fencing" : "levels-and" }) stl.level(1, "cts-fencing=levels-and", "FencingPass,Fencing") # Create a Dummy agent that always passes for levels-and if len(stt_nodes): self.CM.install_helper("fence_dummy", destdir="/usr/sbin", sourcedir=CTSvars.Fencing_home) stt = Resource(self.Factory, "FencingPass", "fence_dummy", "stonith") stt["pcmk_host_list"] = string.join(stt_nodes, " ") # Wait this many seconds before doing anything, handy for letting disks get flushed too stt["random_sleep_range"] = "30" stt["mode"] = "pass" stt.commit() # Create a Dummy agent that always fails for levels-or if len(stf_nodes): self.CM.install_helper("fence_dummy", destdir="/usr/sbin", sourcedir=CTSvars.Fencing_home) stf = Resource(self.Factory, "FencingFail", "fence_dummy", "stonith") stf["pcmk_host_list"] = string.join(stf_nodes, " ") # Wait this many 
seconds before doing anything, handy for letting disks get flushed too stf["random_sleep_range"] = "30" stf["mode"] = "fail" stf.commit() # Now commit the levels themselves stl.commit() o = Option(self.Factory, "stonith-enabled", self.CM.Env["DoFencing"]) o["start-failure-is-fatal"] = "false" o["pe-input-series-max"] = "5000" o["default-action-timeout"] = "90s" o["shutdown-escalation"] = "5min" o["batch-limit"] = "10" o["dc-deadtime"] = "5s" o["no-quorum-policy"] = no_quorum o["expected-quorum-votes"] = self.num_nodes o["notification-agent"] = "/var/lib/pacemaker/notify.sh" o["notification-recipient"] = "/var/lib/pacemaker/notify.log" if self.CM.Env["DoBSC"] == 1: o["ident-string"] = "Linux-HA TEST configuration file - REMOVEME!!" o.commit() # Commit the nodes section if we defined one if stn is not None: stn.commit() # Add resources? if self.CM.Env["CIBResource"] == 1: self.add_resources() if self.CM.cluster_monitor == 1: mon = Resource(self.Factory, "cluster_mon", "ocf", "ClusterMon", "pacemaker") mon.add_op("start", "0", requires="nothing") mon.add_op("monitor", "5s", requires="nothing") mon["update"] = "10" mon["extra_options"] = "-r -n" mon["user"] = "abeekhof" mon["htmlfile"] = "/suse/abeekhof/Export/cluster.html" mon.commit() #self._create('''location prefer-dc cluster_mon rule -INFINITY: \#is_dc eq false''') # generate cib self.cts_cib = self._show() if self.Factory.tmpfile != CTSvars.CRM_CONFIG_DIR+"/cib.xml": self.Factory.rsh(self.Factory.target, "rm -f "+self.Factory.tmpfile) return self.cts_cib def add_resources(self): # Per-node resources for node in self.CM.Env["nodes"]: name = "rsc_"+node r = self.NewIP(name) r.prefer(node, "100") r.commit() # Migrator # Make this slightly sticky (since we have no other location constraints) to avoid relocation during Reattach m = Resource(self.Factory, "migrator","Dummy", "ocf", "pacemaker") m["passwd"] = "whatever" m.add_meta("resource-stickiness","1") m.add_meta("allow-migrate", "1") m.add_op("monitor", "P10S") 
m.commit() # Ping the test master p = Resource(self.Factory, "ping-1","ping", "ocf", "pacemaker") p.add_op("monitor", "60s") p["host_list"] = self.CM.Env["cts-master"] p["name"] = "connected" p["debug"] = "true" c = Clone(self.Factory, "Connectivity", p) c["globally-unique"] = "false" c.commit() #master slave resource s = Resource(self.Factory, "stateful-1", "Stateful", "ocf", "pacemaker") s.add_op("monitor", "15s", timeout="60s") s.add_op("monitor", "16s", timeout="60s", role="Master") ms = Master(self.Factory, "master-1", s) ms["clone-max"] = self.num_nodes ms["master-max"] = 1 ms["clone-node-max"] = 1 ms["master-node-max"] = 1 # Require conectivity to run the master r = Rule(self.Factory, "connected", "-INFINITY", op="or") r.add_child(Expression(self.Factory, "m1-connected-1", "connected", "lt", "1")) r.add_child(Expression(self.Factory, "m1-connected-2", "connected", "not_defined", None)) ms.prefer("connected", rule=r) ms.commit() # Group Resource g = Group(self.Factory, "group-1") g.add_child(self.NewIP()) if self.CM.Env["have_systemd"]: # It would be better to put the python in a separate file, so we # could loop "while True" rather than sleep for 24 hours. We can't # put a loop in a single-line python command; only simple commands # may be separated by semicolon in python. 
dummy_service_file = """ [Unit] Description=Dummy resource that takes a while to start [Service] Type=notify ExecStart=/usr/bin/python -c 'import time, systemd.daemon; time.sleep(10); systemd.daemon.notify("READY=1"); time.sleep(86400)' ExecStop=/bin/sleep 10 ExecStop=/bin/kill -s KILL \$MAINPID """ os.system("cat <<-END >/tmp/DummySD.service\n%s\nEND" % (dummy_service_file)) self.CM.install_helper("DummySD.service", destdir="/usr/lib/systemd/system/", sourcedir="/tmp") sysd = Resource(self.Factory, "petulant", "DummySD", "service") sysd.add_op("monitor", "P10S") g.add_child(sysd) else: g.add_child(self.NewIP()) g.add_child(self.NewIP()) # Group with the master g.after("master-1", first="promote", then="start") g.colocate("master-1", "INFINITY", withrole="Master") g.commit() # LSB resource lsb_agent = self.CM.install_helper("LSBDummy") lsb = Resource(self.Factory, "lsb-dummy",lsb_agent, "lsb") lsb.add_op("monitor", "5s") # LSB with group lsb.after("group-1") lsb.colocate("group-1") lsb.commit() class CIB12(CIB11): feature_set = "3.0" version = "pacemaker-1.2" class CIB20(CIB11): feature_set = "3.0" version = "pacemaker-2.0" #class HASI(CIB10): # def add_resources(self): # # DLM resource # self._create('''primitive dlm ocf:pacemaker:controld op monitor interval=120s''') # self._create('''clone dlm-clone dlm meta globally-unique=false interleave=true''') # O2CB resource # self._create('''primitive o2cb ocf:ocfs2:o2cb op monitor interval=120s''') # self._create('''clone o2cb-clone o2cb meta globally-unique=false interleave=true''') # self._create('''colocation o2cb-with-dlm INFINITY: o2cb-clone dlm-clone''') # self._create('''order start-o2cb-after-dlm mandatory: dlm-clone o2cb-clone''') class ConfigFactory: def __init__(self, CM): self.CM = CM self.rsh = self.CM.rsh self.register("pacemaker11", CIB11, CM, self) self.register("pacemaker12", CIB12, CM, self) self.register("pacemaker20", CIB20, CM, self) # self.register("hae", HASI, CM, self) self.target = 
self.CM.Env["nodes"][0] self.tmpfile = None def log(self, args): self.CM.log("cib: %s" % args) def debug(self, args): self.CM.debug("cib: %s" % args) def register(self, methodName, constructor, *args, **kargs): """register a constructor""" _args = [constructor] _args.extend(args) setattr(self, methodName, ConfigFactoryItem(*_args, **kargs)) def unregister(self, methodName): """unregister a constructor""" delattr(self, methodName) def createConfig(self, name="pacemaker-1.0"): if name == "pacemaker-1.0": name = "pacemaker10"; elif name == "pacemaker-1.1": name = "pacemaker11"; elif name == "pacemaker-1.2": name = "pacemaker12"; elif name == "pacemaker-2.0": name = "pacemaker20"; elif name == "hasi": name = "hae"; if hasattr(self, name): return getattr(self, name)() else: self.CM.log("Configuration variant '%s' is unknown. Defaulting to latest config" % name) return self.pacemaker12() class ConfigFactoryItem: def __init__(self, function, *args, **kargs): self._function = function self._args = args self._kargs = kargs def __call__(self, *args, **kargs): """call function""" _args = list(self._args) _args.extend(args) _kargs = self._kargs.copy() _kargs.update(kargs) return self._function(*_args,**_kargs) if __name__ == '__main__': """ Unit test (pass cluster node names as command line arguments) """ import CTS import CM_ais import sys if len(sys.argv) < 2: print("Usage: %s ..." 
% sys.argv[0]) sys.exit(1) args = [ "--nodes", " ".join(sys.argv[1:]), "--clobber-cib", "--populate-resources", "--stack", "corosync", "--test-ip-base", "fe80::1234:56:7890:1000", "--stonith", "rhcs", "--stonith-args", "pcmk_arg_map=domain:uname" ] env = CTS.CtsLab(args) cm = CM_ais.crm_mcp(env) CibFactory = ConfigFactory(cm) cib = CibFactory.createConfig("pacemaker-1.1") print(cib.contents()) diff --git a/cts/CTS.py b/cts/CTS.py index 634348a553..4c7448cbea 100644 --- a/cts/CTS.py +++ b/cts/CTS.py @@ -1,1001 +1,1002 @@ '''CTS: Cluster Testing System: Main module Classes related to testing high-availability clusters... ''' __copyright__ = ''' Copyright (C) 2000, 2001 Alan Robertson Licensed under the GNU GPL. ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. import string, sys, time, re, os, traceback from UserDict import UserDict from cts.CTSvars import * from cts.logging import LogFactory from cts.watcher import LogWatcher from cts.remote import RemoteFactory from cts.environment import EnvFactory from cts.patterns import PatternSelector has_log_stats = {} log_stats_bin = CTSvars.CRM_DAEMON_DIR + "/cts_log_stats.sh" log_stats = """ #!/bin/bash # Tool for generating system load reports while CTS runs trap "" 1 f=$1; shift action=$1; shift base=`basename $0` if [ ! 
-e $f ]; then echo "Time, Load 1, Load 5, Load 15, Test Marker" > $f fi function killpid() { if [ -e $f.pid ]; then kill -9 `cat $f.pid` rm -f $f.pid fi } function status() { if [ -e $f.pid ]; then kill -0 `cat $f.pid` return $? else return 1 fi } function start() { # Is it already running? if status then return fi echo Active as $$ echo $$ > $f.pid while [ 1 = 1 ]; do uptime | sed s/up.*:/,/ | tr '\\n' ',' >> $f #top -b -c -n1 | grep -e usr/libexec/pacemaker | grep -v -e grep -e python | head -n 1 | sed s@/usr/libexec/pacemaker/@@ | awk '{print " 0, "$9", "$10", "$12}' | tr '\\n' ',' >> $f echo 0 >> $f sleep 5 done } case $action in start) start ;; start-bg|bg) # Use c --ssh -- ./stats.sh file start-bg nohup $0 $f start >/dev/null 2>&1 > $f echo " $*" >> $f start ;; *) echo "Unknown action: $action." ;; esac """ class CtsLab: '''This class defines the Lab Environment for the Cluster Test System. It defines those things which are expected to change from test environment to test environment for the same cluster manager. It is where you define the set of nodes that are in your test lab what kind of reset mechanism you use, etc. This class is derived from a UserDict because we hold many different parameters of different kinds, and this provides provide a uniform and extensible interface useful for any kind of communication between the user/administrator/tester and CTS. At this point in time, it is the intent of this class to model static configuration and/or environmental data about the environment which doesn't change as the tests proceed. Well-known names (keys) are an important concept in this class. The HasMinimalKeys member function knows the minimal set of well-known names for the class. The following names are standard (well-known) at this time: nodes An array of the nodes in the cluster reset A ResetMechanism object logger An array of objects that log strings... 
CMclass The type of ClusterManager we are running (This is a class object, not a class instance) RandSeed Random seed. It is a triple of bytes. (optional) The CTS code ignores names it doesn't know about/need. The individual tests have access to this information, and it is perfectly acceptable to provide hints, tweaks, fine-tuning directions or other information to the tests through this mechanism. ''' def __init__(self, args=None): self.Env = EnvFactory().getInstance(args) self.Scenario = None self.logger = LogFactory() self.rsh = RemoteFactory().getInstance() def dump(self): self.Env.dump() def has_key(self, key): return key in self.Env.keys() def __getitem__(self, key): return self.Env[key] def __setitem__(self, key, value): self.Env[key] = value def run(self, Scenario, Iterations): if not Scenario: self.logger.log("No scenario was defined") return 1 self.logger.log("Cluster nodes: ") for node in self.Env["nodes"]: self.logger.log(" * %s" % (node)) if not Scenario.SetUp(): return 1 try : Scenario.run(Iterations) except : self.logger.log("Exception by %s" % sys.exc_info()[0]) self.logger.traceback(traceback) Scenario.summarize() Scenario.TearDown() return 1 #ClusterManager.oprofileSave(Iterations) Scenario.TearDown() Scenario.summarize() if Scenario.Stats["failure"] > 0: return Scenario.Stats["failure"] elif Scenario.Stats["success"] != Iterations: self.logger.log("No failure count but success != requested iterations") return 1 return 0 def __CheckNode(self, node): "Raise a ValueError if the given node isn't valid" if not self.IsValidNode(node): raise ValueError("Invalid node [%s] in CheckNode" % node) class NodeStatus: def __init__(self, env): self.Env = env def IsNodeBooted(self, node): '''Return TRUE if the given node is booted (responds to pings)''' if self.Env["docker"]: return RemoteFactory().getInstance()("localhost", "docker inspect --format {{.State.Running}} %s | grep -q true" % node, silent=True) == 0 return RemoteFactory().getInstance()("localhost", 
"ping -nq -c1 -w1 %s" % node, silent=True) == 0 def IsSshdUp(self, node): rc = RemoteFactory().getInstance()(node, "true", silent=True) return rc == 0 def WaitForNodeToComeUp(self, node, Timeout=300): '''Return TRUE when given node comes up, or None/FALSE if timeout''' timeout = Timeout anytimeouts = 0 while timeout > 0: if self.IsNodeBooted(node) and self.IsSshdUp(node): if anytimeouts: # Fudge to wait for the system to finish coming up time.sleep(30) LogFactory().debug("Node %s now up" % node) return 1 time.sleep(30) if (not anytimeouts): LogFactory().debug("Waiting for node %s to come up" % node) anytimeouts = 1 timeout = timeout - 1 LogFactory().log("%s did not come up within %d tries" % (node, Timeout)) answer = raw_input('Continue? [nY]') if answer and answer == "n": raise ValueError("%s did not come up within %d tries" % (node, Timeout)) def WaitForAllNodesToComeUp(self, nodes, timeout=300): '''Return TRUE when all nodes come up, or FALSE if timeout''' for node in nodes: if not self.WaitForNodeToComeUp(node, timeout): return None return 1 class ClusterManager(UserDict): '''The Cluster Manager class. This is an subclass of the Python dictionary class. (this is because it contains lots of {name,value} pairs, not because it's behavior is that terribly similar to a dictionary in other ways.) This is an abstract class which class implements high-level operations on the cluster and/or its cluster managers. Actual cluster managers classes are subclassed from this type. One of the things we do is track the state we think every node should be in. 
''' def __InitialConditions(self): #if os.geteuid() != 0: # raise ValueError("Must Be Root!") None def _finalConditions(self): for key in list(self.keys()): if self[key] == None: raise ValueError("Improper derivation: self[" + key + "] must be overridden by subclass.") def __init__(self, Environment, randseed=None): self.Env = EnvFactory().getInstance() self.templates = PatternSelector(self.Env["Name"]) self.__InitialConditions() self.logger = LogFactory() self.clear_cache = 0 self.TestLoggingLevel=0 self.data = {} self.name = self.Env["Name"] self.rsh = RemoteFactory().getInstance() self.ShouldBeStatus={} self.ns = NodeStatus(self.Env) self.OurNode = string.lower(os.uname()[1]) self.__instance_errorstoignore = [] def __getitem__(self, key): if key == "Name": return self.name print("FIXME: Getting %s from %s" % (key, repr(self))) if key in self.data: return self.data[key] return self.templates.get_patterns(self.Env["Name"], key) def __setitem__(self, key, value): print("FIXME: Setting %s=%s on %s" % (key, value, repr(self))) self.data[key] = value def key_for_node(self, node): return node def instance_errorstoignore_clear(self): '''Allows the test scenario to reset instance errors to ignore on each iteration.''' self.__instance_errorstoignore = [] def instance_errorstoignore(self): '''Return list of errors which are 'normal' for a specific test instance''' return self.__instance_errorstoignore def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [] def log(self, args): self.logger.log(args) def debug(self, args): self.logger.debug(args) def prepare(self): '''Finish the Initialization process. 
Prepare to test...''' print(repr(self)+"prepare") for node in self.Env["nodes"]: if self.StataCM(node): self.ShouldBeStatus[node] = "up" else: self.ShouldBeStatus[node] = "down" self.unisolate_node(node) def upcount(self): '''How many nodes are up?''' count = 0 for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == "up": count = count + 1 return count def install_helper(self, filename, destdir=None, nodes=None, sourcedir=None): if sourcedir == None: sourcedir = CTSvars.CTS_home file_with_path = "%s/%s" % (sourcedir, filename) if not nodes: nodes = self.Env["nodes"] if not destdir: destdir = CTSvars.CTS_home self.debug("Installing %s to %s on %s" % (filename, destdir, repr(self.Env["nodes"]))) for node in nodes: self.rsh(node, "mkdir -p %s" % destdir) self.rsh.cp(file_with_path, "root@%s:%s/%s" % (node, destdir, filename)) return file_with_path def install_config(self, node): return None def clear_all_caches(self): if self.clear_cache: for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == "down": self.debug("Removing cache file on: "+node) self.rsh(node, "rm -f "+CTSvars.HA_VARLIBHBDIR+"/hostcache") else: self.debug("NOT Removing cache file on: "+node) def prepare_fencing_watcher(self, name): # If we don't have quorum now but get it as a result of starting this node, # then a bunch of nodes might get fenced upnode = None if self.HasQuorum(None): self.debug("Have quorum") return None if not self.templates["Pat:Fencing_start"]: print("No start pattern") return None if not self.templates["Pat:Fencing_ok"]: print("No ok pattern") return None stonith = None stonithPats = [] for peer in self.Env["nodes"]: if self.ShouldBeStatus[peer] != "up": stonithPats.append(self.templates["Pat:Fencing_ok"] % peer) stonithPats.append(self.templates["Pat:Fencing_start"] % peer) elif self.Env["Stack"] == "corosync (cman)": # There is a delay between gaining quorum and CMAN starting fencing # This can mean that even nodes that are fully up get fenced # There is no use 
fighting it, just look for everyone so that CTS doesn't get confused stonithPats.append(self.templates["Pat:Fencing_ok"] % peer) stonithPats.append(self.templates["Pat:Fencing_start"] % peer) stonith = LogWatcher(self.Env["LogFileName"], stonithPats, "StartupFencing", 0, hosts=self.Env["nodes"], kind=self.Env["LogWatcher"]) stonith.setwatch() return stonith def fencing_cleanup(self, node, stonith): peer_list = [] peer_state = {} self.debug("Looking for nodes that were fenced as a result of %s starting" % node) # If we just started a node, we may now have quorum (and permission to fence) if not stonith: self.debug("Nothing to do") return peer_list q = self.HasQuorum(None) if not q and len(self.Env["nodes"]) > 2: # We didn't gain quorum - we shouldn't have shot anyone self.debug("Quorum: %d Len: %d" % (q, len(self.Env["nodes"]))) return peer_list for n in self.Env["nodes"]: peer_state[n] = "unknown" # Now see if any states need to be updated self.debug("looking for: " + repr(stonith.regexes)) shot = stonith.look(0) while shot: line = repr(shot) self.debug("Found: " + line) del stonith.regexes[stonith.whichmatch] # Extract node name for n in self.Env["nodes"]: if re.search(self.templates["Pat:Fencing_ok"] % n, shot): peer = n peer_state[peer] = "complete" self.__instance_errorstoignore.append(self.templates["Pat:Fencing_ok"] % peer) elif peer_state[n] != "complete" and re.search(self.templates["Pat:Fencing_start"] % n, shot): # TODO: Correctly detect multiple fencing operations for the same host peer = n peer_state[peer] = "in-progress" self.__instance_errorstoignore.append(self.templates["Pat:Fencing_start"] % peer) if not peer: self.logger.log("ERROR: Unknown stonith match: %s" % line) elif not peer in peer_list: self.debug("Found peer: " + peer) peer_list.append(peer) # Get the next one shot = stonith.look(60) for peer in peer_list: self.debug(" Peer %s was fenced as a result of %s starting: %s" % (peer, node, peer_state[peer])) if self.Env["at-boot"]: 
self.ShouldBeStatus[peer] = "up" else: self.ShouldBeStatus[peer] = "down" if peer_state[peer] == "in-progress": # Wait for any in-progress operations to complete shot = stonith.look(60) while len(stonith.regexes) and shot: line = repr(shot) self.debug("Found: " + line) del stonith.regexes[stonith.whichmatch] shot = stonith.look(60) # Now make sure the node is alive too self.ns.WaitForNodeToComeUp(peer, self.Env["DeadTime"]) # Poll until it comes up if self.Env["at-boot"]: if not self.StataCM(peer): time.sleep(self.Env["StartTime"]) if not self.StataCM(peer): self.logger.log("ERROR: Peer %s failed to restart after being fenced" % peer) return None return peer_list def StartaCM(self, node, verbose=False): '''Start up the cluster manager on a given node''' if verbose: self.logger.log("Starting %s on node %s" % (self.templates["Name"], node)) else: self.debug("Starting %s on node %s" % (self.templates["Name"], node)) ret = 1 if not node in self.ShouldBeStatus: self.ShouldBeStatus[node] = "down" if self.ShouldBeStatus[node] != "down": return 1 patterns = [] # Technically we should always be able to notice ourselves starting patterns.append(self.templates["Pat:Local_started"] % node) if self.upcount() == 0: patterns.append(self.templates["Pat:Master_started"] % node) else: patterns.append(self.templates["Pat:Slave_started"] % node) watch = LogWatcher( self.Env["LogFileName"], patterns, "StartaCM", self.Env["StartTime"]+10, hosts=self.Env["nodes"], kind=self.Env["LogWatcher"]) self.install_config(node) self.ShouldBeStatus[node] = "any" if self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]): self.logger.log ("%s was already started" % (node)) return 1 # Clear out the host cache so autojoin can be exercised if self.clear_cache: self.debug("Removing cache file on: "+node) self.rsh(node, "rm -f "+CTSvars.HA_VARLIBHBDIR+"/hostcache") if not(self.Env["valgrind-tests"]): startCmd = self.templates["StartCmd"] else: if self.Env["valgrind-prefix"]: prefix = 
self.Env["valgrind-prefix"] else: prefix = "cts" startCmd = """G_SLICE=always-malloc HA_VALGRIND_ENABLED='%s' VALGRIND_OPTS='%s --log-file=/tmp/%s-%s.valgrind' %s""" % ( self.Env["valgrind-procs"], self.Env["valgrind-opts"], prefix, """%p""", self.templates["StartCmd"]) stonith = self.prepare_fencing_watcher(node) watch.setwatch() if self.rsh(node, startCmd) != 0: self.logger.log ("Warn: Start command failed on node %s" % (node)) self.fencing_cleanup(node, stonith) return None self.ShouldBeStatus[node] = "up" watch_result = watch.lookforall() if watch.unmatched: for regex in watch.unmatched: self.logger.log ("Warn: Startup pattern not found: %s" % (regex)) if watch_result and self.cluster_stable(self.Env["DeadTime"]): #self.debug("Found match: "+ repr(watch_result)) self.fencing_cleanup(node, stonith) return 1 elif self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]): self.fencing_cleanup(node, stonith) return 1 self.logger.log ("Warn: Start failed for node %s" % (node)) return None def StartaCMnoBlock(self, node, verbose=False): '''Start up the cluster manager on a given node with none-block mode''' if verbose: self.logger.log("Starting %s on node %s" % (self["Name"], node)) else: self.debug("Starting %s on node %s" % (self["Name"], node)) # Clear out the host cache so autojoin can be exercised if self.clear_cache: self.debug("Removing cache file on: "+node) self.rsh(node, "rm -f "+CTSvars.HA_VARLIBHBDIR+"/hostcache") self.install_config(node) if not(self.Env["valgrind-tests"]): startCmd = self.templates["StartCmd"] else: if self.Env["valgrind-prefix"]: prefix = self.Env["valgrind-prefix"] else: prefix = "cts" startCmd = """G_SLICE=always-malloc HA_VALGRIND_ENABLED='%s' VALGRIND_OPTS='%s --log-file=/tmp/%s-%s.valgrind' %s""" % ( self.Env["valgrind-procs"], self.Env["valgrind-opts"], prefix, """%p""", self.templates["StartCmd"]) self.rsh(node, startCmd, synchronous=0) self.ShouldBeStatus[node] = "up" return 1 def StopaCM(self, node, verbose=False, 
force=False): '''Stop the cluster manager on a given node''' if verbose: self.logger.log("Stopping %s on node %s" % (self["Name"], node)) else: self.debug("Stopping %s on node %s" % (self["Name"], node)) if self.ShouldBeStatus[node] != "up" and force == False: return 1 if self.rsh(node, self.templates["StopCmd"]) == 0: # Make sure we can continue even if corosync leaks # fdata-* is the old name #self.rsh(node, "rm -f /dev/shm/qb-* /dev/shm/fdata-*") self.ShouldBeStatus[node] = "down" self.cluster_stable(self.Env["DeadTime"]) return 1 else: self.logger.log ("ERROR: Could not stop %s on node %s" % (self["Name"], node)) return None def StopaCMnoBlock(self, node): '''Stop the cluster manager on a given node with none-block mode''' self.debug("Stopping %s on node %s" % (self["Name"], node)) self.rsh(node, self.templates["StopCmd"], synchronous=0) self.ShouldBeStatus[node] = "down" return 1 def cluster_stable(self, timeout = None): time.sleep(self.Env["StableTime"]) return 1 def node_stable(self, node): return 1 def RereadCM(self, node): '''Force the cluster manager on a given node to reread its config This may be a no-op on certain cluster managers. 
def StataCM(self, node):
    '''Report the status of the cluster manager on a given node.

    Runs the "StatusCmd" template (formatted with the node name) on the
    node and treats the manager as up unless the output contains the
    word 'stopped'.  If the observed state contradicts the recorded
    expectation in ShouldBeStatus, the discrepancy is logged; the
    expectation is then overwritten with the observed state.

    Returns True when the manager is considered up, False otherwise.
    '''
    # The third positional argument is passed through to rsh unchanged
    # (presumably it selects "return stdout" -- confirm against rsh).
    out = self.rsh(node, self.templates["StatusCmd"] % node, 1)

    # Substring test instead of string.find(): the module-level function
    # only exists in Python 2, the `in` operator works everywhere.
    is_up = 'stopped' not in out
    actual = "up" if is_up else "down"

    # A node we have never tracked has no expectation to contradict
    try:
        expected = self.ShouldBeStatus[node]
    except KeyError:
        expected = None

    if (actual, expected) in (("up", "down"), ("down", "up")):
        self.logger.log(
            "Node status for %s is %s but we think it should be %s"
            % (node, actual, expected))

    self.ShouldBeStatus[node] = actual
    return is_up
def stopall(self, nodelist=None, verbose=False, force=False):
    '''Stop the cluster managers on every node in the cluster.
    We can do it on a subset of the cluster if nodelist is not None.

    A node is stopped when it is expected to be "up", or unconditionally
    when force is True.  Returns 1 if every attempted stop succeeded and
    0 if any StopaCM call failed.
    '''
    if not nodelist:
        nodelist = self.Env["nodes"]

    ret = 1
    # Iterate the requested subset; looping over self.Env["nodes"] here
    # would silently ignore the nodelist argument.
    for node in nodelist:
        if self.ShouldBeStatus[node] == "up" or force == True:
            if not self.StopaCM(node, verbose=verbose, force=force):
                ret = 0
    return ret

def rereadall(self, nodelist=None):
    '''Force the cluster managers on every node in the cluster
    to reread their config files.  We can do it on a subset of the
    cluster if nodelist is not None.

    Only nodes expected to be "up" are asked to reread.
    '''
    if not nodelist:
        nodelist = self.Env["nodes"]

    # Honour the requested subset, same as stopall
    for node in nodelist:
        if self.ShouldBeStatus[node] == "up":
            self.RereadCM(node)
def isolate_node(self, target, nodes=None):
    '''Cut communication between `target` and each of the other nodes.

    Runs the "BreakCommCmd" template on `target` once per peer (all
    configured nodes when `nodes` is empty), skipping `target` itself.
    Returns None as soon as one command exits non-zero, otherwise 1.
    '''
    peers = nodes if nodes else self.Env["nodes"]

    for peer in peers:
        if peer == target:
            continue

        status = self.rsh(target, self.templates["BreakCommCmd"] % self.key_for_node(peer))
        if status == 0:
            self.debug("Communication cut between %s and %s" % (target, peer))
            continue

        # Give up on the first failure; remaining peers are left untouched
        self.logger.log("Could not break the communication between %s and %s: %d" % (target, peer, status))
        return None

    return 1
other not. # So the caller needs to tell us which we are checking # If no value for node_list is specified... assume all nodes raise ValueError("Abstract Class member (HasQuorum)") def Components(self): raise ValueError("Abstract Class member (Components)") def oprofileStart(self, node=None): if not node: for n in self.Env["oprofile"]: self.oprofileStart(n) elif node in self.Env["oprofile"]: self.debug("Enabling oprofile on %s" % node) self.rsh(node, "opcontrol --init") self.rsh(node, "opcontrol --setup --no-vmlinux --separate=lib --callgraph=20 --image=all") self.rsh(node, "opcontrol --start") self.rsh(node, "opcontrol --reset") def oprofileSave(self, test, node=None): if not node: for n in self.Env["oprofile"]: self.oprofileSave(test, n) elif node in self.Env["oprofile"]: self.rsh(node, "opcontrol --dump") self.rsh(node, "opcontrol --save=cts.%d" % test) # Read back with: opreport -l session:cts.0 image:/usr/lib/heartbeat/c* if None: self.rsh(node, "opcontrol --reset") else: self.oprofileStop(node) self.oprofileStart(node) def oprofileStop(self, node=None): if not node: for n in self.Env["oprofile"]: self.oprofileStop(n) elif node in self.Env["oprofile"]: self.debug("Stopping oprofile on %s" % node) self.rsh(node, "opcontrol --reset") self.rsh(node, "opcontrol --shutdown 2>&1 > /dev/null") def StatsExtract(self): if not self.Env["stats"]: return for host in self.Env["nodes"]: log_stats_file = "%s/cts-stats.csv" % CTSvars.CRM_DAEMON_DIR if host in has_log_stats: self.rsh(host, '''bash %s %s stop''' % (log_stats_bin, log_stats_file)) (rc, lines) = self.rsh(host, '''cat %s''' % log_stats_file, stdout=2) self.rsh(host, '''bash %s %s delete''' % (log_stats_bin, log_stats_file)) fname = "cts-stats-%d-nodes-%s.csv" % (len(self.Env["nodes"]), host) print("Extracted stats: %s" % fname) fd = open(fname, "a") fd.writelines(lines) fd.close() def StatsMark(self, testnum): '''Mark the test number in the stats log''' global has_log_stats if not self.Env["stats"]: return for 
host in self.Env["nodes"]: log_stats_file = "%s/cts-stats.csv" % CTSvars.CRM_DAEMON_DIR if not host in has_log_stats: global log_stats global log_stats_bin script=log_stats #script = re.sub("\\\\", "\\\\", script) script = re.sub('\"', '\\\"', script) script = re.sub("'", "\'", script) script = re.sub("`", "\`", script) script = re.sub("\$", "\\\$", script) self.debug("Installing %s on %s" % (log_stats_bin, host)) self.rsh(host, '''echo "%s" > %s''' % (script, log_stats_bin), silent=True) self.rsh(host, '''bash %s %s delete''' % (log_stats_bin, log_stats_file)) has_log_stats[host] = 1 # Now mark it self.rsh(host, '''bash %s %s mark %s''' % (log_stats_bin, log_stats_file, testnum), synchronous=0) class Resource: ''' This is an HA resource (not a resource group). A resource group is just an ordered list of Resource objects. ''' def __init__(self, cm, rsctype=None, instance=None): self.CM = cm self.ResourceType = rsctype self.Instance = instance self.needs_quorum = 1 def Type(self): return self.ResourceType def Instance(self, nodename): return self.Instance def IsRunningOn(self, nodename): ''' This member function returns true if our resource is running on the given node in the cluster. It is analagous to the "status" operation on SystemV init scripts and heartbeat scripts. FailSafe calls it the "exclusive" operation. ''' raise ValueError("Abstract Class member (IsRunningOn)") return None def IsWorkingCorrectly(self, nodename): ''' This member function returns true if our resource is operating correctly on the given node in the cluster. Heartbeat does not require this operation, but it might be called the Monitor operation, which is what FailSafe calls it. For remotely monitorable resources (like IP addresses), they *should* be monitored remotely for testing. ''' raise ValueError("Abstract Class member (IsWorkingCorrectly)") return None def Start(self, nodename): ''' This member function starts or activates the resource. 
class Component:
    '''Base class for cluster components that can be killed on a node.'''

    def kill(self, node):
        '''Kill the component on `node`; the base implementation does nothing.'''
        return None


class Process(Component):
    '''A cluster daemon that is killed by name with "killall -9".'''

    def __init__(self, cm, name, process=None, dc_only=0, pats=None,
                 dc_pats=None, badnews_ignore=None, common_ignore=None,
                 triggersreboot=0):
        '''Record the process description.

        cm              -- object providing rsh() and log(), used by kill()
        name            -- component name (stored as a string)
        process         -- executable name to kill; defaults to `name`
        dc_only         -- stored as given
        pats, dc_pats   -- pattern lists, copied into fresh lists
        badnews_ignore  -- patterns to ignore; common_ignore is appended
        common_ignore   -- extra ignore patterns shared between components
        triggersreboot  -- stored as given
        '''
        self.name = str(name)
        self.dc_only = dc_only
        # Fresh lists per instance: the previous mutable [] defaults were
        # shared by every Process created without these arguments.
        self.pats = list(pats) if pats else []
        self.dc_pats = list(dc_pats) if dc_pats else []
        self.CM = cm
        # Build a new list rather than extend() in place, which used to
        # grow the shared default (and the caller's own list) on every call.
        self.badnews_ignore = list(badnews_ignore or []) + list(common_ignore or [])
        self.triggersreboot = triggersreboot

        if process:
            self.proc = str(process)
        else:
            self.proc = str(name)
        self.KillCmd = "killall -9 " + self.proc

    def kill(self, node):
        '''Run the kill command on `node`; return 1 on success, None on failure.'''
        if self.CM.rsh(node, self.KillCmd) != 0:
            self.CM.log ("ERROR: Kill %s failed on node %s" % (self.name,node))
            return None
        return 1
class FencingTopology(XmlBase):
    '''Builds the "fencing-topology" element and its "fencing-level" children.'''

    def __init__(self, Factory):
        XmlBase.__init__(self, Factory, "fencing-topology", None)

    def level(self, index, target, devices):
        '''Add one fencing level.

        index   -- level number; formatted with %d into the XML ID
        target  -- level target; also passed through as the target attribute
        devices -- passed through as the devices attribute
        '''
        # Generate XML ID (sanitizing target-by-attribute levels).
        # str.replace() is used instead of string.replace(), which only
        # exists in the Python 2 string module.
        xml_id = "cts-%s.%d" % (target.replace("=", "-"), index)
        self.add_child(XmlBase(self.Factory, "fencing-level", xml_id, target=target, index=index, devices=devices))

    def commit(self):
        '''Write the topology out with a cibadmin "create" in the configuration scope.'''
        self._run("create", self.show(), "configuration", "--allow-create")
self.param[name] = value def add_meta(self, name, value): self.meta[name] = value def prefer(self, node, score="INFINITY", rule=None): if not rule: rule = Rule(self.Factory, "prefer-%s-r" % node, score, expr=Expression(self.Factory, "prefer-%s-e" % node, "#uname", "eq", node)) self.scores[node] = rule def after(self, resource, kind="Mandatory", first="start", then="start", **kwargs): kargs = kwargs.copy() kargs["kind"] = kind if then: kargs["first-action"] = "start" kargs["then-action"] = then if first: kargs["first-action"] = first self.needs[resource] = kargs def colocate(self, resource, score="INFINITY", role=None, withrole=None, **kwargs): kargs = kwargs.copy() kargs["score"] = score if role: kargs["rsc-role"] = role if withrole: kargs["with-rsc-role"] = withrole self.coloc[resource] = kargs def constraints(self): text = "" for k in list(self.scores.keys()): text += '''''' % (k, self.name) text += self.scores[k].show() text += '''''' for k in list(self.needs.keys()): text += '''''' for k in list(self.coloc.keys()): text += '''''' text += "" return text def show(self): text = '''''' if len(self.meta) > 0: text += '''''' % self.name for p in list(self.meta.keys()): text += '''''' % (self.name, p, p, self.meta[p]) text += '''''' if len(self.param) > 0: text += '''''' % self.name for p in list(self.param.keys()): text += '''''' % (self.name, p, p, self.param[p]) text += '''''' if len(self.op) > 0: text += '''''' for o in self.op: key = o.name o.name = "%s-%s" % (self.name, key) text += o.show() o.name = key text += '''''' text += '''''' return text def commit(self): self._run("create", self.show(), "resources") self._run("modify", self.constraints()) class Group(Resource): def __init__(self, Factory, name): Resource.__init__(self, Factory, name, None, None) self.tag = "group" def __setitem__(self, key, value): self.add_meta(key, value) def show(self): text = '''<%s id="%s">''' % (self.tag, self.name) if len(self.meta) > 0: text += '''''' % self.name for p in 
class Clone(Group):
    '''A Group variant tagged "clone" that accepts at most one child resource.'''

    def __init__(self, Factory, name, child=None):
        Group.__init__(self, Factory, name)
        self.tag = "clone"
        if child:
            self.add_child(child)

    def add_child(self, resource):
        '''Attach `resource` as the only child; later additions are logged and dropped.'''
        if self.children:
            self.Factory.log("Clones can only have a single child. Ignoring %s" % resource.name)
            return
        self.children.append(resource)
diff --git a/doc/Pacemaker_Remote/en-US/Ch-Alternatives.txt b/doc/Pacemaker_Remote/en-US/Ch-Alternatives.txt new file mode 100644 index 0000000000..d2fd9f42fd --- /dev/null +++ b/doc/Pacemaker_Remote/en-US/Ch-Alternatives.txt @@ -0,0 +1,77 @@ += Alternative Configurations = + +These alternative configurations may be appropriate in limited cases, such as a +test cluster, but are not the best method in most situations. They are +presented here for completeness and as an example of pacemaker's flexibility +to suit your needs. + +== Virtual Machines as Cluster Nodes == + +The preferred use of virtual machines in a pacemaker cluster is as a +cluster resource, whether opaque or as a guest node. However, it is +possible to run the full cluster stack on a virtual node instead. + +This is commonly used to set up test environments; a single physical host +(that does not participate in the cluster) runs two or more virtual machines, +all running the full cluster stack. This can be used to simulate a +larger cluster for testing purposes. + +In a production environment, fencing becomes more complicated, especially +if the underlying hosts run any services besides the clustered VMs. +If the VMs are not guaranteed a minimum amount of host resources, +CPU and I/O contention can cause timing issues for cluster components. + +Another situation where this approach is sometimes used is when +the cluster owner leases the VMs from a provider and does not have +direct access to the underlying host. The main concerns in this case +are proper fencing (usually via a custom resource agent that communicates +with the provider's APIs) and maintaining a static IP address between reboots, +as well as resource contention issues. + +== Virtual Machines as Remote Nodes == + +Virtual machines may be configured following the process for remote nodes +rather than guest nodes (i.e., using an *ocf:pacemaker:remote* resource +rather than letting the cluster manage the VM directly). 
+This is mainly useful in testing, to use a single physical host to simulate a +larger cluster involving remote nodes. Pacemaker's Cluster Test Suite (CTS) +uses this approach to test remote node functionality. + +== Containers as Guest Nodes == + +Containers,footnote:[https://en.wikipedia.org/wiki/Operating-system-level_virtualization] +and in particular Linux containers (LXC) and Docker, have become a popular +method of isolating services in a resource-efficient manner. + +The preferred means of integrating containers into Pacemaker is as a +cluster resource, whether opaque or using Pacemaker's built-in +resource isolation support.footnote:[Documentation for this support is planned +but not yet available.] + +However, it is possible to run `pacemaker_remote` inside a container, +following the process for guest nodes. This is not recommended but can +be useful, for example, in testing scenarios, to simulate a large number of +guest nodes. + +The configuration process is very similar to that described for guest nodes +using virtual machines. Key differences: + +* The underlying host must install the libvirt driver for the desired container + technology -- for example, the +libvirt-daemon-lxc+ package to get the + http://libvirt.org/drvlxc.html[libvirt-lxc] driver for LXC containers. + +* Libvirt XML definitions must be generated for the containers. The + +pacemaker-cts+ package includes a helpful script for this purpose, + +/usr/share/pacemaker/tests/cts/lxc_autogen.sh+. Run it with the + `--help` option for details on how to use it. Of course, you can create + XML definitions manually, following the appropriate libvirt driver + documentation. + +* To share the authentication key, either share the host's +/etc/pacemaker+ + directory with the container, or copy the key into the container's + filesystem.
+ +* The *VirtualDomain* resource for a container will need + *force_stop="true"* and an appropriate hypervisor option, + for example *hypervisor="lxc:///"* for LXC containers. diff --git a/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt b/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt index d0fd14b02f..c187b2536f 100644 --- a/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt +++ b/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt @@ -1,230 +1,306 @@ -= Baremetal Walk-through = += Remote Node Walk-through = -+What this tutorial is:+ This tutorial is an in-depth walk-through of how to get pacemaker to integrate a baremetal remote-node into the cluster as a node capable of running cluster resources. +*What this tutorial is:* An in-depth walk-through of how to get Pacemaker to +integrate a remote node into the cluster as a node capable of running cluster +resources. -+What this tutorial is not:+ This tutorial is not a realistic deployment scenario. The steps shown here are meant to get users familiar with the concept of remote-nodes as quickly as possible. +*What this tutorial is not:* A realistic deployment scenario. The steps shown +here are meant to get users familiar with the concept of remote nodes as +quickly as possible. -This tutorial requires three machines. Two machines to act as cluster-nodes and a third to act as the baremetal remote-node. +This tutorial requires three machines: two to act as cluster nodes, and +a third to act as the remote node. -This tutorial was tested using Fedora 20 on both the cluster-nodes and baremetal remote-node. Anything that is capable of running pacemaker v1.1.11 or greater will do though. An installation guide for installing Fedora 20 can be found here, http://docs.fedoraproject.org/en-US/Fedora/20/html/Installation_Guide/. +== Configure Remote Node == -Fedora 20 (or similar distro) host preparation steps. 
+=== Configure Firewall on Remote Node === -== SElinux and Firewall Considerations == -In order to simply this tutorial we will disable selinux and the firewall on all the nodes. -+WARNING:+ These actions will open a significant security threat to machines exposed to the outside world. -[source,C] +Allow cluster-related services through the local firewall: +---- +# firewall-cmd --permanent --add-service=high-availability +success +# firewall-cmd --reload +success +---- + +[NOTE] +====== +If you are using iptables directly, or some other firewall solution besides +firewalld, simply open the following ports, which can be used by various +clustering components: TCP ports 2224, 3121, and 21064, and UDP port 5405. + +If you run into any problems during testing, you might want to disable +the firewall and SELinux entirely until you have everything working. +This may create significant security issues and should not be performed on +machines that will be exposed to the outside world, but may be appropriate +during development and testing on a protected host. + +To disable security measures: ---- # setenforce 0 # sed -i.bak "s/SELINUX=enforcing/SELINUX=permissive/g" /etc/selinux/config -# firewall-cmd --add-port 3121/tcp --permanent -# systemctl disable iptables.service -# systemctl disable ip6tables.service -# rm '/etc/systemd/system/basic.target.wants/iptables.service' -# rm '/etc/systemd/system/basic.target.wants/ip6tables.service' -# systemctl stop iptables.service -# systemctl stop ip6tables.service +# systemctl disable firewalld.service +# systemctl stop firewalld.service +# iptables --flush ---- +====== -== Setup Pacemaker Remote on Baremetal remote-node == +=== Configure pacemaker_remote on Remote Node === -On the baremetal remote-node machine run these commands to generate an authkey and copy it to the /etc/pacemaker folder. - -[source,C] +Install the pacemaker_remote daemon on the remote node. 
---- -# mkdir /etc/pacemaker -# dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1 +# yum install -y pacemaker-remote resource-agents pcs ---- -Make sure to distribute this key to both of the cluster-nodes as well. All the nodes must have the same /etc/pacemaker/authkey installed for the communication to work correctly. +Create a location for the shared authentication key: +---- +# mkdir -p --mode=0750 /etc/pacemaker +# chgrp haclient /etc/pacemaker +---- -Now install and start the pacemaker_remote daemon on the baremetal remote-node. +All nodes (both cluster nodes and remote nodes) must have the same +authentication key installed for the communication to work correctly. +If you already have a key on an existing node, copy it to the new +remote node. Otherwise, create a new key, for example: +---- +# dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1 +---- -[source,C] +Now start and enable the pacemaker_remote daemon on the remote node. ---- -# yum install -y pacemaker-remote resource-agents pcs # systemctl enable pacemaker_remote.service # systemctl start pacemaker_remote.service ---- Verify the start is successful. -[source,C] ---- # systemctl status pacemaker_remote +pacemaker_remote.service - Pacemaker Remote Service + Loaded: loaded (/usr/lib/systemd/system/pacemaker_remote.service; enabled) + Active: active (running) since Fri 2015-08-21 15:21:20 CDT; 20s ago + Main PID: 21273 (pacemaker_remot) + CGroup: /system.slice/pacemaker_remote.service + └─21273 /usr/sbin/pacemaker_remoted - pacemaker_remote.service - Pacemaker Remote Service - Loaded: loaded (/usr/lib/systemd/system/pacemaker_remote.service; enabled) - Active: active (running) since Thu 2013-03-14 18:24:04 EDT; 2min 8s ago - Main PID: 1233 (pacemaker_remot) - CGroup: name=systemd:/system/pacemaker_remote.service - └─1233 /usr/sbin/pacemaker_remoted - - Mar 14 18:24:04 remote1 systemd[1]: Starting Pacemaker Remote Service... 
- Mar 14 18:24:04 remote1 systemd[1]: Started Pacemaker Remote Service. - Mar 14 18:24:04 remote1 pacemaker_remoted[1233]: notice: lrmd_init_remote_tls_server: Starting a tls listener on port 3121. +Aug 21 15:21:20 remote1 systemd[1]: Starting Pacemaker Remote Service... +Aug 21 15:21:20 remote1 systemd[1]: Started Pacemaker Remote Service. +Aug 21 15:21:20 remote1 pacemaker_remoted[21273]: notice: crm_add_logfile: Additional logging available in /var/log/pacemaker.log +Aug 21 15:21:20 remote1 pacemaker_remoted[21273]: notice: lrmd_init_remote_tls_server: Starting a tls listener on port 3121. +Aug 21 15:21:20 remote1 pacemaker_remoted[21273]: notice: bind_and_listen: Listening on address :: ---- -== Verify cluster-node Connection to baremetal-node == +== Verify Connection to Remote Node == -Before moving forward it's worth going ahead and verifying the cluster-nodes can contact the baremetal node on port 3121. Here's a trick you can use. Connect using telnet from each of the cluster-nodes. The connection will get destroyed, but how it is destroyed tells you whether it worked or not. +Before moving forward, it's worth verifying that the cluster nodes +can contact the remote node on port 3121. Here's a trick you can use. +Connect using ssh from each of the cluster nodes. The connection will get +destroyed, but how it is destroyed tells you whether it worked or not. -First add the baremetal remote-node's hostname (we're using remote1 in this tutorial) to the cluster-nodes' /etc/hosts files if you haven't already. This is required unless you have dns setup in a way where remote1's address can be discovered. +First, add the remote node's hostname (we're using *remote1* in this tutorial) +to the cluster nodes' +/etc/hosts+ files if you haven't already. This +is required unless you have DNS set up in a way where remote1's address can be +discovered. 
-Execute the following on each cluster-node, replacing the ip address with the actual ip address of the baremetal remote-node. -[source,C] +Execute the following on each cluster node, replacing the IP address with the +actual IP address of the remote node. ---- # cat << END >> /etc/hosts -192.168.122.10 remote1 +192.168.122.10 remote1 END ---- -If running the telnet command on one of the cluster-nodes results in this output before disconnecting, the connection works. -[source,C] +If running the ssh command on one of the cluster nodes results in this +output before disconnecting, the connection works. ---- -# telnet remote1 3121 - Trying 192.168.122.10... - Connected to remote1. - Escape character is '^]'. - Connection closed by foreign host. +# ssh -p 3121 remote1 +ssh_exchange_identification: read: Connection reset by peer ---- If you see this, the connection is not working. -[source,C] ---- -# telnet remote1 3121 -Trying 192.168.122.10... -telnet: connect to address 192.168.122.10: No route to host +# ssh -p 3121 remote1 +ssh: connect to host remote1 port 3121: No route to host ---- -Once you can successfully connect to the baremetal remote-node from the both cluster-nodes, move on to setting up pacemaker on the cluster-nodes. +Once you can successfully connect to the remote node from both +cluster nodes, move on to setting up Pacemaker on the cluster nodes. + +== Configure Cluster Nodes == + +=== Configure Firewall on Cluster Nodes === -== Install cluster-node Software == +On each cluster node, allow cluster-related services through the local +firewall, following the same procedure as in <<_configure_firewall_on_remote_node>>. -On the two cluster-nodes install the following packages. +=== Install Pacemaker on Cluster Nodes === + +On the two cluster nodes, install the following packages.
-[source,C] ---- # yum install -y pacemaker corosync pcs resource-agents ---- -== Setup Corosync on cluster-nodes == - -Corosync handles pacemaker's cluster membership and messaging. The corosync config file is located in /etc/corosync/corosync.conf. That config file must be initialized with information about the two cluster-nodes before pacemaker can start. +=== Copy Authentication Key to Cluster Nodes === -To initialize the corosync config file, execute the following pcs command on both nodes filling in the information in <> with your nodes' information. -[source,C] +Create a location for the shared authentication key, +and copy it from any existing node: ---- -# pcs cluster setup --local mycluster +# mkdir -p --mode=0750 /etc/pacemaker +# chgrp haclient /etc/pacemaker +# scp remote1:/etc/pacemaker/authkey /etc/pacemaker/authkey ---- -A recent syntax change in pcs may cause the above command to fail. If so try this alternative. -[source,C] +=== Configure Corosync on Cluster Nodes === + +Corosync handles Pacemaker's cluster membership and messaging. The corosync +config file is located in +/etc/corosync/corosync.conf+. That config file must be +initialized with information about the two cluster nodes before pacemaker can +start. + +To initialize the corosync config file, execute the following pcs command on +both nodes, filling in the information in <> with your nodes' information. ---- # pcs cluster setup --force --local --name mycluster <node1 ip or hostname> <node2 ip or hostname> ---- -== Start Pacemaker on cluster-nodes == +=== Start Pacemaker on Cluster Nodes === Start the cluster stack on both cluster nodes using the following command. -[source,C] ---- # pcs cluster start ---- Verify corosync membership -[source,C] ----- +.... # pcs status corosync - Membership information +---------------------- Nodeid Votes Name -1795270848 1 node1 (local) ----- + 1 1 node1 (local) +.... -Verify pacemaker status. At first the 'pcs cluster status' output will look like this. +Verify Pacemaker status.
At first, the `pcs cluster status` output will look +like this. -[source,C] ---- # pcs status - - Last updated: Thu Mar 14 12:26:00 2013 - Last change: Thu Mar 14 12:25:55 2013 via crmd on example-host - Stack: corosync - Current DC: - Version: 1.1.11 - 1 Nodes configured, unknown expected votes - 0 Resources configured. +Cluster name: mycluster +Last updated: Fri Aug 21 16:14:05 2015 +Last change: Fri Aug 21 14:02:14 2015 +Stack: corosync +Current DC: NONE +Version: 1.1.12-a14efad +1 Nodes configured, unknown expected votes +0 Resources configured ---- -After about a minute you should see your two cluster-nodes come online. +After about a minute, you should see your two cluster nodes come online. -[source,C] ---- # pcs status +Cluster name: mycluster +Last updated: Fri Aug 21 16:16:32 2015 +Last change: Fri Aug 21 14:02:14 2015 +Stack: corosync +Current DC: node1 (1) - partition with quorum +Version: 1.1.12-a14efad +2 Nodes configured +0 Resources configured - Last updated: Thu Mar 14 12:28:23 2013 - Last change: Thu Mar 14 12:25:55 2013 via crmd on node1 - Stack: corosync - Current DC: node1 (1795270848) - partition with quorum - Version: 1.1.11 - 2 Nodes configured, unknown expected votes - 0 Resources configured. - - Online: [ node1 node2 ] +Online: [ node1 node2 ] ---- For the sake of this tutorial, we are going to disable stonith to avoid having to cover fencing device configuration. -[source,C] ---- # pcs property set stonith-enabled=false ---- -== Integrate Baremetal remote-node into Cluster == +== Integrate Remote Node into Cluster == -Integrating a baremetal remote-node into the cluster is achieved through the creation of a remote-node connection resource. The remote-node connection resource both establishes the connection to the remote-node and defines that the remote-node exists. Note that this resource is actually internal to Pacemaker's crmd component. 
A metadata file for this resource can be found in the /usr/lib/ocf/resource.d/pacemaker/remote file that describes what options are available, but there is no actual ocf:pacemaker:remote resource agent script that performs any work. +Integrating a remote node into the cluster is achieved through the +creation of a remote node connection resource. The remote node connection +resource both establishes the connection to the remote node and defines that +the remote node exists. Note that this resource is actually internal to +Pacemaker's crmd component. A metadata file for this resource can be found in +the +/usr/lib/ocf/resource.d/pacemaker/remote+ file that describes what options +are available, but there is no actual *ocf:pacemaker:remote* resource agent +script that performs any work. -Define the remote-node connection resource to our baremetal remote-node, remote1, using the following command. +Define the remote node connection resource to our remote node, +*remote1*, using the following command on any cluster node. -[source,C] ---- # pcs resource create remote1 ocf:pacemaker:remote ---- -That's it. After a moment you should see the remote-node come online. +That's it. After a moment you should see the remote node come online. 
-[source,C] ---- -Last updated: Fri Oct 18 18:47:21 2013 -Last change: Fri Oct 18 18:46:14 2013 via cibadmin on node1 +Cluster name: mycluster +Last updated: Fri Aug 21 17:13:09 2015 +Last change: Fri Aug 21 17:02:02 2015 Stack: corosync -Current DC: node1 (1) - partition with quorum -Version: 1.1.11 +Current DC: node1 (1) - partition with quorum +Version: 1.1.12-a14efad 3 Nodes configured 1 Resources configured + Online: [ node1 node2 ] RemoteOnline: [ remote1 ] -remote1 (ocf::pacemaker:remote): Started node1 +Full list of resources: + + remote1 (ocf::pacemaker:remote): Started node1 + +PCSD Status: + node1: Online + node2: Online + +Daemon Status: + corosync: active/disabled + pacemaker: active/disabled + pcsd: active/enabled ---- -== Starting Resources on baremetal remote-node == +== Starting Resources on Remote Node == + +Once the remote node is integrated into the cluster, starting resources on a +remote node is the exact same as on cluster nodes. Refer to the +http://clusterlabs.org/doc/['Clusters from Scratch'] document for examples of +resource creation. -+"Warning: Never involve a remote-node connection resource in a resource group, colocation, or order constraint"+ +[WARNING] +========= +Never involve a remote node connection resource in a resource group, +colocation constraint, or order constraint. +========= -Once the baremetal remote-node is integrated into the cluster, starting resources on a baremetal remote-node is the exact same as the cluster nodes. Refer to the Clusters from Scratch document for examples on resource creation. http://clusterlabs.org/doc/ +== Fencing Remote Nodes == -== Fencing baremetal remote-nodes == +Remote nodes are fenced the same way as cluster nodes. No special +considerations are required. Configure fencing resources for use with +remote nodes the same as you would with cluster nodes. -The cluster understands how to fence baremetal remote-nodes and can use standard fencing devices to do so. 
No special considerations are required. Note however that remote-nodes can never initiate a fencing action. Only cluster-nodes are capable of actually executing the fencing operation on another node. +Note, however, that remote nodes can never 'initiate' a fencing action. Only +cluster nodes are capable of actually executing a fencing operation against +another node. -== Accessing Cluster Tools from a Baremetal remote-node == +== Accessing Cluster Tools from a Remote Node == -Besides allowing the cluster to manage resources on a remote-node, pacemaker_remote has one other trick. +The pacemaker_remote daemon allows nearly all the pacemaker tools (crm_resource, crm_mon, crm_attribute, crm_master) to work on remote nodes natively.+ +Besides allowing the cluster to manage resources on a remote node, +pacemaker_remote has one other trick. The pacemaker_remote daemon allows +nearly all the pacemaker tools (`crm_resource`, `crm_mon`, `crm_attribute`, +`crm_master`, etc.) to work on remote nodes natively. -Try it, run +crm_mon+ or +pcs status+ on the baremetal node after pacemaker has integrated the remote-node into the cluster. These tools just work. These means resource agents such as master/slave resources which need access to tools like crm_master work seamlessly on the remote-nodes. +Try it: Run `crm_mon` on the remote node after pacemaker has +integrated it into the cluster. These tools just work. This means resource +agents such as master/slave resources which need access to tools like +`crm_master` work seamlessly on the remote nodes. +Higher-level command shells such as `pcs` may have partial support +on remote nodes, but it is recommended to run them from a cluster node.
diff --git a/doc/Pacemaker_Remote/en-US/Ch-Example.txt b/doc/Pacemaker_Remote/en-US/Ch-Example.txt index 5db250f551..9513e3da6a 100644 --- a/doc/Pacemaker_Remote/en-US/Ch-Example.txt +++ b/doc/Pacemaker_Remote/en-US/Ch-Example.txt @@ -1,107 +1,130 @@ -= KVM Remote-node Quick Example = - -If you already know how to use pacemaker, you'll likely be able to grasp this new concept of remote-nodes by reading through this quick example without having to sort through all the detailed walk-through steps. Here are the key configuration ingredients that make this possible using libvirt and KVM virtual guests. These steps strip everything down to the very basics. - -== Mile High View of Configuration Steps == - -* +Put an authkey with this path, /etc/pacemaker/authkey, on every cluster-node and virtual machine+. This secures remote communication and authentication. - -Run this command if you want to make a somewhat random authkey. - -[source,C] += Guest Node Quick Example = + +If you already know how to use Pacemaker, you'll likely be able to grasp this +new concept of guest nodes by reading through this quick example without +having to sort through all the detailed walk-through steps. Here are the key +configuration ingredients that make this possible using libvirt and KVM virtual +guests. These steps strip everything down to the very basics. +(((guest node))) +(((node,guest node))) + +== Mile-High View of Configuration Steps == + +* Give each virtual machine that will be used as a guest node a static network + address and unique hostname. + +* Put the same authentication key with the path +/etc/pacemaker/authkey+ on + every cluster node and virtual machine. This secures remote communication. 
++ +Run this command if you want to make a somewhat random key: ++ ---- dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1 ---- -* +Install pacemaker_remote packages on every virtual machine, enable pacemaker_remote on startup, and poke hole in firewall for tcp port 3121.+ - -[source,C] +* Install pacemaker_remote on every virtual machine, enabling it to start at + boot, and if a local firewall is used, allow the node to accept connections + on TCP port 3121. ++ ---- yum install pacemaker-remote resource-agents systemctl enable pacemaker_remote -# If you just want to see this work, disable iptables and ip6tables on most distros. -# You may have to put selinux in permissive mode as well for the time being. firewall-cmd --add-port 3121/tcp --permanent ---- - -* +Give each virtual machine a static network address and unique hostname+ - -* +Tell pacemaker to launch a virtual machine and that the virtual machine is a remote-node capable of running resources by using the "remote-node" meta-attribute.+ - -with pcs - -[source,C] ++ +[NOTE] +====== +If you just want to see this work, you may want to simply disable the local +firewall and put SELinux in permissive mode while testing. This creates +security risks and should not be done on a production machine exposed to the +Internet, but can be appropriate for a protected test machine. +====== + +* Create a Pacemaker resource to launch each virtual machine, using the + *remote-node* meta-attribute to let Pacemaker know this will be a + guest node capable of running resources. 
++ ---- -# pcs resource create vm-guest1 VirtualDomain hypervisor="qemu:///system" config="vm-guest1.xml" meta +remote-node=guest1+ +# pcs resource create vm-guest1 VirtualDomain hypervisor="qemu:///system" config="vm-guest1.xml" meta remote-node="guest1" ---- - -raw xml ++ +The above command will create CIB XML similar to the following: ++ [source,XML] ---- ---- -In the example above the meta-attribute 'remote-node=guest1' tells pacemaker that this resource is a remote-node with the hostname 'guest1' that is capable of being integrated into the cluster. The cluster will attempt to contact the virtual machine's pacemaker_remote service at the hostname 'guest1' after it launches. +In the example above, the meta-attribute *remote-node="guest1"* tells Pacemaker +that this resource is a guest node with the hostname *guest1*. The cluster will +attempt to contact the virtual machine's pacemaker_remote service at the +hostname *guest1* after it launches. -== What those steps just did == +[NOTE] +====== +The ID of the resource creating the virtual machine (*vm-guest1* in the above +example) 'must' be different from the virtual machine's uname (*guest1* in the +above example). Pacemaker will create an implicit internal resource for the +pacemaker_remote connection to the guest, named with the value of *remote-node*, +so that value cannot be used as the name of any other resource. +====== -Those steps just told pacemaker to launch a virtual machine called vm-guest1 and integrate that virtual machine as a remote-node called 'guest1'. +== Using a Guest Node == -Example crm_mon output after guest1 is integrated into cluster. 
+Guest nodes will show up in `crm_mon` output as normal: -[source,C] +.Example `crm_mon` output after *guest1* is integrated into cluster ---- Last updated: Wed Mar 13 13:52:39 2013 Last change: Wed Mar 13 13:25:17 2013 via crmd on node1 Stack: corosync Current DC: node1 (24815808) - partition with quorum Version: 1.1.10 2 Nodes configured, unknown expected votes 2 Resources configured. Online: [ node1 guest1] vm-guest1 (ocf::heartbeat:VirtualDomain): Started node1 ---- -Now, you could place a resource, such as a webserver on guest1. - -[source,C] +Now, you could place a resource, such as a webserver, on *guest1*: ---- # pcs resource create webserver apache params configfile=/etc/httpd/conf/httpd.conf op monitor interval=30s # pcs constraint location webserver prefers guest1 ---- -Now the crm_mon output would show a webserver launched on the guest1 remote-node. - -[source,C] +Now, the crm_mon output would show: ---- Last updated: Wed Mar 13 13:52:39 2013 Last change: Wed Mar 13 13:25:17 2013 via crmd on node1 Stack: corosync Current DC: node1 (24815808) - partition with quorum Version: 1.1.10 2 Nodes configured, unknown expected votes 2 Resources configured. Online: [ node1 guest1] vm-guest1 (ocf::heartbeat:VirtualDomain): Started node1 webserver (ocf::heartbeat:apache): Started guest1 ---- -== Accessing Cluster from Remote-node == - -It is worth noting that after 'guest1' is integrated into the cluster, all the pacemaker cli tools immediately become available to the remote node. This means things like crm_mon, crm_resource, and crm_attribute will work natively on the remote-node as long as the connection between the remote-node and cluster-node exists. This is particularly important for any master/slave resources executing on the remote-node that need access to crm_master to set the nodes transient attributes.
- +It is worth noting that after *guest1* is integrated into the cluster, nearly all the +Pacemaker command-line tools immediately become available to the guest node. +This means things like `crm_mon`, `crm_resource`, and `crm_attribute` will work +natively on the guest node, as long as the connection between the guest node +and a cluster node exists. This is particularly important for any master/slave +resources executing on the guest node that need access to `crm_master` to set +transient attributes. diff --git a/doc/Pacemaker_Remote/en-US/Ch-Future.txt b/doc/Pacemaker_Remote/en-US/Ch-Future.txt deleted file mode 100644 index 43d136c4e8..0000000000 --- a/doc/Pacemaker_Remote/en-US/Ch-Future.txt +++ /dev/null @@ -1,17 +0,0 @@ -= Future Features = - -Basic KVM and Linux container integration was the first phase of development for pacemaker_remote and was completed for Pacemaker v1.1.10. Here are some planned features that expand upon this initial functionality. - -== Libvirt Sandbox Support == - -Once the libvirt-sandbox project is integrated with pacemaker_remote, we will gain the ability to preform per-resource linux container isolation with very little performance impact. This functionality will allow resources living on a single node to be isolated from one another. At that point CPU and memory limits could be set per-resource dynamically just using the cluster config. - -== Bare-metal Support == -+"This feature has already been introduced into Pacemaker's master github branch and is scheduled for Pacemaker v1.1.11"+ - -The pacemaker_remote daemon already has the ability to run on bare-metal hardware nodes, but the policy engine logic for integrating bare-metal nodes is not complete. There are some complications involved with understanding a bare-metal node's state that virtual nodes don't have. Once this logic is complete, pacemaker will be able to integrate bare-metal nodes in the same way virtual remote-nodes currently are. 
Some special considerations for fencing will need to be addressed. - -== KVM Migration Support == -+"This feature has already been introduced into Pacemaker's master github branch and is scheduled for Pacemaker v1.1.12"+ - -Pacemaker's policy engine is limited in its ability to perform live migrations of KVM resources when resource dependencies are involved. This limitation affects how resources living within a KVM remote-node are handled when a live migration takes place. Currently when a live migration is performed on a KVM remote-node, all the resources within that remote-node have to be stopped before the migration takes place and started once again after migration has finished. This policy engine limitation is fully explained in this bug report, http://bugs.clusterlabs.org/show_bug.cgi?id=5055#c3 diff --git a/doc/Pacemaker_Remote/en-US/Ch-Intro.txt b/doc/Pacemaker_Remote/en-US/Ch-Intro.txt index 777bb976a1..438ecd2aa4 100644 --- a/doc/Pacemaker_Remote/en-US/Ch-Intro.txt +++ b/doc/Pacemaker_Remote/en-US/Ch-Intro.txt @@ -1,85 +1,193 @@ -= Extending High Availability Cluster into Virtual Nodes = += Scaling a Pacemaker Cluster = == Overview == -The recent addition of the +pacemaker_remote+ service supported by +Pacemaker version 1.1.10 and greater+ allows nodes not running the cluster stack (pacemaker+corosync) to integrate into the cluster and have the cluster manage their resources just as if they were a real cluster node. This means that pacemaker clusters are now capable of managing both launching virtual environments (KVM/LXC) as well as launching the resources that live within those virtual environments without requiring the virtual environments to run pacemaker or corosync. 
+In a basic Pacemaker high-availability +cluster,footnote:[See the http://www.clusterlabs.org/doc/[Pacemaker +documentation], especially 'Clusters From Scratch' and 'Pacemaker Explained', +for basic information about high-availability using Pacemaker] +each node runs the full cluster stack of corosync and all Pacemaker components. +This allows great flexibility but limits scalability to around 16 nodes. -== Terms == -+cluster-node+ - A node running the High Availability stack (pacemaker + corosync) - -+remote-node+ - A node running pacemaker_remote without the rest of the High Availability stack. There are two types of remote-nodes, container and baremetal. - -+container+ - A pacemaker resource that contains additional resources. For example, a KVM virtual machine resource that contains a webserver resource. - -+container remote-node+ - A virtual guest remote-node running the pacemaker_remote service. This describes a specific remote-node use case where a virtual guest resource managed by the cluster is both started by the cluster and integrated into the cluster as a remote-node. - -+baremetal+ - Term used to describe an environment that is not virtualized. - -+baremetal remote-node+ - A baremetal hardware node running pacemaker_remote. This describes a specific remote-node use case where a hardware node not running the High Availability stack is integrated into the cluster as a remote-node through the use of pacemaker_remote. - -+pacemaker_remote+ - A service daemon capable of performing remote application management within guest nodes (baremetal, kvm, and lxc) in both pacemaker cluster environments and standalone (non-cluster) environments. This service is an enhanced version of pacemaker's local resource manage daemon (LRMD) that is capable of managing and monitoring LSB, OCF, upstart, and systemd resources on a guest remotely. It also allows for most of pacemaker's cli tools (crm_mon, crm_resource, crm_master, crm_attribute, ect..) 
to work natively on remote-nodes. - -+LXC+ - A Linux Container defined by the libvirt-lxc Linux container driver. http://libvirt.org/drvlxc.html - -== Version Info == - -This feature is in ongoing development. - -+Pacemaker v1.1.10+ - -* Initial pacemaker_remote daemon and integration support. -* Only supports pacemaker in KVM/LXC environments. -* pacemaker_remote daemon unit test suite. -* Known bugs include (These are likely resolved if you have received an 1.1.10.x point release): Errors when setting remote-node attributes, Failures when stopping orphaned (deleted from cib while running) remote-nodes, Fixes remote-node usage in asymmetric clusters. - -+Currently in Master github branch and scheduled for Pacemaker v1.1.11+ - -* Baremetal remote-node support. -* Improvements to scaling remote-node integration. Performance testing here included 16 cluster nodes running 64 remote-nodes living in LXC containers. As part of this testing, several performance enhancements were introduced into the integration code. -* CTS tests. RemoteLXC and RemoteBaremetal. These two CTS tests allow us to perform automated verification of pacemaker_remote integration. -* Fixes for known bugs in 1.1.10 release. - -== Virtual Machine Use Case == -The use of pacemaker_remote in virtual machines solves a deployment scenario that has traditionally been difficult to solve. +To allow for scalability to dozens or even hundreds of nodes, Pacemaker +allows nodes not running the full cluster stack to integrate into the cluster +and have the cluster manage their resources as if they were a cluster node. -+"I want a pacemaker cluster to manage virtual machine resources, but I also want pacemaker to be able to manage the resources that live within those virtual machines."+ - -In the past, users desiring this deployment had to make a decision. 
They would either have to sacrifice the ability of monitoring resources residing within virtual guests by running the cluster stack on the baremetal nodes, or run another cluster instance on the virtual guests where they potentially run into corosync scalability issues. There is a third scenario where the virtual guests run the cluster stack and join the same network as the baremetal nodes, but that can quickly hit issues with scalability as well. - -With the pacemaker_remote service we have a new option. - -* The baremetal cluster-nodes run the cluster stack (pacemaker+corosync). -* The virtual remote-nodes run the pacemaker_remote service (nearly zero configuration required on the virtual machine side) -* The cluster stack on the cluster-nodes launch the virtual machines and immediately connect to the pacemaker_remote service, allowing the virtual machines to integrate into the cluster just as if they were a real cluster-node. - -The key difference here between the virtual machine remote-nodes and the cluster-nodes is that the remote-nodes are not running the cluster stack. This means the remote nodes will never become the DC, and they do not take place in quorum. On the other hand this also means that remote-nodes are not bound to the scalability limits associated with the cluster stack either. +No 16 node corosync member limits+ to deal with. That isn't to say remote-nodes can scale indefinitely, but it is known that remote-nodes scale horizontally much further than cluster-nodes. Other than the quorum limitation, these remote-nodes behave just like cluster nodes in respects to resource management. The cluster is fully capable of managing and monitoring resources on each remote-node. You can build constraints against remote-nodes, put them in standby, or whatever else you'd expect to be able to do with normal cluster-nodes. They even show up in the crm_mon output as you would expect cluster-nodes to. 
- -To solidify the concept, below is an example deployment that is very similar to an actual deployment we test in our developer environment to verify remote-node scalability. - -* 16 cluster-nodes running corosync+pacemaker stack. -* 64 pacemaker managed virtual machine resources running pacemaker_remote configured as remote-nodes. -* 64 pacemaker managed webserver and database resources configured to run on the 64 remote-nodes. - -With this deployment you would have 64 webservers and databases running on 64 virtual machines on 16 hardware nodes all of which are managed and monitored by the same pacemaker deployment. It is known that pacemaker_remote can scale to these lengths and possibly much further depending on the specific scenario. - -== Baremetal remote-node Use Case == - -+"I want my traditional High Availability cluster to scale beyond the limits imposed by the corosync messaging layer."+ - -Ultimately the primary advantage of baremetal remote-nodes over traditional nodes running the Corosync+Pacemaker stack is scalability. There are likely some other use cases related to geographically distributed HA clusters that baremetal remote-nodes may serve a purpose in, but those use cases are not well understood at this point. The only limitations baremetal remote-nodes have that cluster-nodes do not is the ability to take place in cluster quorum, and the ability to execute fencing agents via stonith. That is not to say however that fencing of a baremetal node works any differently than that of a normal cluster-node. The Pacemaker policy engine understands how to fence baremetal remote-nodes. As long as a fencing device exists, the cluster is capable of ensuring baremetal nodes are fenced in the exact same way as normal cluster-nodes are fenced. 
- -== Linux Container Use Case == - -+I want to isolate and limit the system resources (cpu, memory, filesystem) a cluster resource can consume without using virtual machines.+ +== Terms == -Using pacemaker_remote with Linux containers (libvirt-lxc) opens up some interesting possibilities for isolating resources in the cluster without the use of a hypervisor. We now have the ability to both define a contained environment with cpu and memory utilization limits and then assign resources to that contained environment all managed from within pacemaker. The LXC Walk-through section of this document outlines how pacemaker_remote can be used to bring Linux containers into the cluster as remote-nodes capable of executing resources. +cluster node:: + A node running the full high-availability stack of corosync and all + Pacemaker components. Cluster nodes may run cluster resources, run + all Pacemaker command-line tools (`crm_mon`, `crm_resource` and so on), + execute fencing actions, count toward cluster quorum, and serve as the + cluster's Designated Controller (DC). +(((cluster node))) +(((node,cluster node))) + +pacemaker_remote:: + A small service daemon that allows a host to be used as a Pacemaker node + without running the full cluster stack. Nodes running pacemaker_remote + may run cluster resources and most command-line tools, but cannot perform + other functions of full cluster nodes such as fencing execution, quorum + voting or DC eligibility. The pacemaker_remote daemon is an enhanced + version of Pacemaker's local resource management daemon (LRMD). +(((pacemaker_remote))) + +remote node:: + A physical host running pacemaker_remote. Remote nodes have a special + resource that manages communication with the cluster. This is sometimes + referred to as the 'baremetal' case. +(((remote node))) +(((node,remote node))) + +guest node:: + A virtual host running pacemaker_remote. 
Guest nodes differ from remote + nodes mainly in that the guest node is itself a resource that the cluster + manages. +(((guest node))) +(((node,guest node))) + +[NOTE] +====== +'Remote' in this document refers to the node not being a part of the underlying +corosync cluster. It has nothing to do with physical proximity. Remote nodes +and guest nodes are subject to the same latency requirements as cluster nodes, +which means they are typically in the same data center. +====== + +[NOTE] +====== +It is important to distinguish the various roles a virtual machine can serve +in Pacemaker clusters: + +* A virtual machine can run the full cluster stack, in which case it is a + cluster node and is not itself managed by the cluster. +* A virtual machine can be managed by the cluster as a resource, without the + cluster having any awareness of the services running inside the virtual + machine. The virtual machine is 'opaque' to the cluster. +* A virtual machine can be a cluster resource, and run pacemaker_remote + to make it a guest node, allowing the cluster to manage services + inside it. The virtual machine is 'transparent' to the cluster. +====== + +== Support in Pacemaker Versions == + +It is recommended to run Pacemaker 1.1.12 or later when using pacemaker_remote +due to important bug fixes.
An overview of changes in pacemaker_remote +capability by version: + +.1.1.13 +* Support for maintenance mode +* Remote nodes can recover without being fenced when the cluster node + hosting their connection fails +* Running pacemaker_remote within LXC environments is deprecated due to + newly added Pacemaker support for isolated resources +* Bug fixes + +.1.1.12 +* Support for permanent node attributes +* Support for migration +* Bug fixes + +.1.1.11 +* Support for IPv6 +* Support for remote nodes +* Support for transient node attributes +* Support for clusters with mixed endian architectures +* Bug fixes + +.1.1.10 +* Bug fixes + +.1.1.9 +* Initial version to include pacemaker_remote +* Limited to guest nodes in KVM/LXC environments using only IPv4; + all nodes' architectures must have same endianness + +== Guest Nodes == +(((guest node))) +(((node,guest node))) + +*"I want a Pacemaker cluster to manage virtual machine resources, but I also +want Pacemaker to be able to manage the resources that live within those +virtual machines."* + +Without pacemaker_remote, the possibilities for implementing the above use case +have significant limitations: + +* The cluster stack could be run on the physical hosts only, which loses the + ability to monitor resources within the guests. +* A separate cluster could be on the virtual guests, which quickly hits + scalability issues. +* The cluster stack could be run on the guests using the same cluster as the + physical hosts, which also hits scalability issues and complicates fencing. + +With pacemaker_remote: + +* The physical hosts are cluster nodes (running the full cluster stack). +* The virtual machines are guest nodes (running the pacemaker_remote service). + Nearly zero configuration is required on the virtual machine. +* The cluster stack on the cluster nodes launches the virtual machines and + immediately connects to the pacemaker_remote service on them, allowing the + virtual machines to integrate into the cluster. 
+ +The key difference here between the guest nodes and the cluster nodes is that +the guest nodes do not run the cluster stack. This means they will never become +the DC, initiate fencing actions or participate in quorum voting. + +On the other hand, this also means that they are not bound to the scalability +limits associated with the cluster stack (no 16-node corosync member limits to +deal with). That isn't to say that guest nodes can scale indefinitely, but it +is known that guest nodes scale horizontally much further than cluster nodes. + +Other than the quorum limitation, these guest nodes behave just like cluster +nodes with respect to resource management. The cluster is fully capable of +managing and monitoring resources on each guest node. You can build constraints +against guest nodes, put them in standby, or do whatever else you'd expect to +be able to do with cluster nodes. They even show up in `crm_mon` output as +nodes. + +To solidify the concept, below is an example that is very similar to an actual +deployment we test in our developer environment to verify guest node scalability: + +* 16 cluster nodes running the full corosync + pacemaker stack +* 64 Pacemaker-managed virtual machine resources running pacemaker_remote configured as guest nodes +* 64 Pacemaker-managed webserver and database resources configured to run on the 64 guest nodes + +With this deployment, you would have 64 webservers and databases running on 64 +virtual machines on 16 hardware nodes, all of which are managed and monitored by +the same Pacemaker deployment. It is known that pacemaker_remote can scale to +these lengths and possibly much further depending on the specific scenario. + + +== Remote Nodes == +(((remote node))) +(((node,remote node))) + +*"I want my traditional high-availability cluster to scale beyond the limits +imposed by the corosync messaging layer."* + +Ultimately, the primary advantage of remote nodes over cluster nodes is +scalability. 
There are likely some other use cases related to geographically +distributed HA clusters that remote nodes may serve a purpose in, but those use +cases are not well understood at this point. + +Like guest nodes, remote nodes will never become the DC, initiate +fencing actions or participate in quorum voting. + +That is not to say, however, that fencing of a remote node works any +differently than that of a cluster node. The Pacemaker policy engine +understands how to fence remote nodes. As long as a fencing device exists, the +cluster is capable of ensuring remote nodes are fenced in the exact same way as +cluster nodes. == Expanding the Cluster Stack == -=== Traditional HA Stack === - -image::images/pcmk-ha-cluster-stack.png["The Traditional Pacemaker Corosync HA Stack.",width="17cm",height="9cm",align="center"] +With pacemaker_remote, the traditional view of the high-availability stack can +be expanded to include a new layer: -=== Remote-Node Enabled HA Stack Using Virtual guest nodes === +.Traditional HA Stack +image::images/pcmk-ha-cluster-stack.png["Traditional Pacemaker+Corosync Stack",width="17cm",height="9cm",align="center"] -image::images/pcmk-ha-remote-stack.png["Placing Pacemaker Remote into the Traditional HA Stack.",width="20cm",height="10cm",align="center"] +.HA Stack With Guest Nodes +image::images/pcmk-ha-remote-stack.png["Pacemaker+Corosync Stack With pacemaker_remote",width="20cm",height="10cm",align="center"] diff --git a/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt b/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt index 7b150aa5ef..328a52eb0f 100644 --- a/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt +++ b/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt @@ -1,467 +1,477 @@ -= KVM Walk-through = += Guest Node Walk-through = -+What this tutorial is:+ This tutorial is an in-depth walk-through of how to get pacemaker to manage a KVM guest instance and integrate that guest into the cluster as a remote-node. 
+*What this tutorial is:* An in-depth walk-through of how to get Pacemaker to +manage a KVM guest instance and integrate that guest into the cluster as a +guest node. -+What this tutorial is not:+ This tutorial is not a realistic deployment scenario. The steps shown here are meant to get users familiar with the concept of remote-nodes as quickly as possible. +*What this tutorial is not:* A realistic deployment scenario. The steps shown +here are meant to get users familiar with the concept of guest nodes as quickly +as possible. -== Step 1: Setup the Host == - -This tutorial was created using Fedora 20 on the host and guest nodes. Anything that is capable of running libvirt and pacemaker v1.1.10 or greater will do though. An installation guide for installing Fedora 20 can be found here, http://docs.fedoraproject.org/en-US/Fedora/20/html/Installation_Guide/. - -Fedora 20 (or similar distro) host preparation steps. +== Configure the Physical Host == === SElinux and Firewall === -In order to simply this tutorial we will disable the selinux and the firewall on the host. -+WARNING:+ These actions will open a significant security threat to machines exposed to the outside world. -[source,C] + +In order to simplify this tutorial, we will disable SELinux and the local +firewall on the host. This may create significant security issues and should +not be performed on machines that will be exposed to the outside world, but may +be appropriate during development and testing on a protected host. 
---- # setenforce 0 # sed -i.bak "s/SELINUX=enforcing/SELINUX=permissive/g" /etc/selinux/config -# systemctl disable iptables.service -# systemctl disable ip6tables.service -# rm '/etc/systemd/system/basic.target.wants/iptables.service' -# rm '/etc/systemd/system/basic.target.wants/ip6tables.service' -# systemctl stop iptables.service -# systemctl stop ip6tables.service +# systemctl disable firewalld.service +# systemctl stop firewalld.service +# iptables --flush ---- === Install Cluster Software === -[source,C] ---- # yum install -y pacemaker corosync pcs resource-agents ---- -=== Setup Corosync === +=== Configure Corosync === -Corosync handles pacemaker's cluster membership and messaging. The corosync config file is located in /etc/corosync/corosync.conf. That config file must be initialized with information about the cluster-nodes before pacemaker can start. +Corosync handles pacemaker's cluster membership and messaging. The corosync +config file is located in /etc/corosync/corosync.conf. That config file must be +initialized with information about the cluster nodes before pacemaker can +start. To initialize the corosync config file, execute the following pcs command on both nodes filling in the information in <> with your nodes' information. -[source,C] ----- -# pcs cluster setup --local mycluster ----- - -A recent syntax change in pcs may cause the above command to fail. If so try this alternative. -[source,C] ---- # pcs cluster setup --force --local --name mycluster ---- === Verify Cluster Software === Start the cluster - -[source,C] ---- # pcs cluster start ---- Verify corosync membership - -[source,C] ---- # pcs status corosync Membership information Nodeid Votes Name 1795270848 1 example-host (local) ---- -Verify pacemaker status. At first the 'pcs cluster status' output will look like this. - -[source,C] +Verify pacemaker status. 
At first, the output will look like this: ---- # pcs status Last updated: Thu Mar 14 12:26:00 2013 Last change: Thu Mar 14 12:25:55 2013 via crmd on example-host Stack: corosync Current DC: Version: 1.1.10 1 Nodes configured, unknown expected votes 0 Resources configured. ---- After about a minute you should see your host as a single node in the cluster. -[source,C] ---- # pcs status Last updated: Thu Mar 14 12:28:23 2013 Last change: Thu Mar 14 12:25:55 2013 via crmd on example-host Stack: corosync Current DC: example-host (1795270848) - partition WITHOUT quorum Version: 1.1.8-9b13ea1 1 Nodes configured, unknown expected votes 0 Resources configured. Online: [ example-host ] ---- Go ahead and stop the cluster for now after verifying everything is in order. - -[source,C] ---- # pcs cluster stop ---- === Install Virtualization Software === -[source,C] ---- # yum install -y kvm libvirt qemu-system qemu-kvm bridge-utils virt-manager # systemctl enable libvirtd.service ---- reboot the host -== Step2: Create the KVM guest == +[NOTE] +====== +While KVM is used in this example, any virtualization platform with a Pacemaker +resource agent can be used to create a guest node. The resource agent needs +only to support usual commands (start, stop, etc.); Pacemaker implements the +*remote-node* meta-attribute, independent of the agent. +====== + +== Configure the KVM guest == -I am not going to outline the installation steps required to create a kvm guest. There are plenty of tutorials available elsewhere that do that. I recommend using a Fedora 18 or greater distro as your guest as that is what I am testing this tutorial with. +=== Create Guest === -=== Setup Guest Network === +I am not going to outline the installation steps required to create a KVM +guest. There are plenty of tutorials available elsewhere that do that. + +=== Configure Guest Network === Run the commands below to set up a static ip address (192.168.122.10) and hostname (guest1). 
-[source,C] ---- export remote_hostname=guest1 export remote_ip=192.168.122.10 export remote_gateway=192.168.122.1 yum remove -y NetworkManager rm -f /etc/hostname cat << END >> /etc/hostname $remote_hostname END hostname $remote_hostname cat << END >> /etc/sysconfig/network HOSTNAME=$remote_hostname GATEWAY=$remote_gateway END sed -i.bak "s/.*BOOTPROTO=.*/BOOTPROTO=none/g" /etc/sysconfig/network-scripts/ifcfg-eth0 cat << END >> /etc/sysconfig/network-scripts/ifcfg-eth0 IPADDR0=$remote_ip PREFIX0=24 GATEWAY0=$remote_gateway DNS1=$remote_gateway END systemctl restart network systemctl enable network.service systemctl enable sshd systemctl start sshd echo "checking connectivity" ping www.google.com ---- To simplify the tutorial we'll go ahead and disable selinux on the guest. We'll also need to poke a hole through the firewall on port 3121 (the default port for pacemaker_remote) so the host can contact the guest. -[source,C] ---- # setenforce 0 # sed -i.bak "s/SELINUX=enforcing/SELINUX=permissive/g" /etc/selinux/config # firewall-cmd --add-port 3121/tcp --permanent ---- -If you still encounter connection issues just disable iptables and ipv6tables on the guest like we did on the host to guarantee you'll be able to contact the guest from the host. +If you still encounter connection issues, just disable firewalld on the guest +like we did on the host, to guarantee you'll be able to contact the guest from +the host. At this point you should be able to ssh into the guest from the host. -=== Setup Pacemaker Remote === +=== Configure pacemaker_remote === -On the +HOST+ machine run these commands to generate an authkey and copy it to the /etc/pacemaker folder on both the host and guest. +On the 'host' machine, run these commands to generate an authkey and copy it to +the /etc/pacemaker folder on both the host and guest. 
-[source,C] ---- -# mkdir /etc/pacemaker +# mkdir -p --mode=0750 /etc/pacemaker +# chgrp haclient /etc/pacemaker # dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1 # scp -r /etc/pacemaker root@192.168.122.10:/etc/ ---- -Now on the +GUEST+ install pacemaker-remote package and enable the daemon to run at startup. In the commands below you will notice the 'pacemaker' and 'pacemaker_remote' packages are being installed. The 'pacemaker' package is not required. The only reason it is being installed for this tutorial is because it contains the a 'Dummy' resource agent we will be using later on to test the remote-node. +Now on the 'guest', install the pacemaker-remote package, and enable the daemon +to run at startup. In the commands below, you will notice the pacemaker +package is also installed. It is not required; the only reason it is being +installed for this tutorial is because it contains the Dummy resource agent +that we will use later for testing. -[source,C] ---- # yum install -y pacemaker pacemaker-remote resource-agents # systemctl enable pacemaker_remote.service ---- Now start pacemaker_remote on the guest and verify the start was successful. -[source,C] ---- # systemctl start pacemaker_remote.service # systemctl status pacemaker_remote pacemaker_remote.service - Pacemaker Remote Service Loaded: loaded (/usr/lib/systemd/system/pacemaker_remote.service; enabled) Active: active (running) since Thu 2013-03-14 18:24:04 EDT; 2min 8s ago Main PID: 1233 (pacemaker_remot) CGroup: name=systemd:/system/pacemaker_remote.service └─1233 /usr/sbin/pacemaker_remoted Mar 14 18:24:04 guest1 systemd[1]: Starting Pacemaker Remote Service... Mar 14 18:24:04 guest1 systemd[1]: Started Pacemaker Remote Service. Mar 14 18:24:04 guest1 pacemaker_remoted[1233]: notice: lrmd_init_remote_tls_server: Starting a tls listener on port 3121. 
---- === Verify Host Connection to Guest === -Before moving forward it's worth going ahead and verifying the host can contact the guest on port 3121. Here's a trick you can use. Connect using telnet from the host. The connection will get destroyed, but how it is destroyed tells you whether it worked or not. +Before moving forward, it's worth verifying that the host can contact the guest +on port 3121. Here's a trick you can use. Connect using ssh from the host. The +connection will get destroyed, but how it is destroyed tells you whether it +worked or not. First add guest1 to the host machine's /etc/hosts file if you haven't already. This is required unless you have dns setup in a way where guest1's address can be discovered. -[source,C] ---- # cat << END >> /etc/hosts 192.168.122.10 guest1 END ---- -If running the telnet command on the host results in this output before disconnecting, the connection works. -[source,C] +If running the ssh command on one of the cluster nodes results in this +output before disconnecting, the connection works. ---- -# telnet guest1 3121 - Trying 192.168.122.10... - Connected to guest1. - Escape character is '^]'. - Connection closed by foreign host. +# ssh -p 3121 guest1 +ssh_exchange_identification: read: Connection reset by peer ---- If you see this, the connection is not working. -[source,C] ---- -# telnet guest1 3121 -Trying 192.168.122.10... -telnet: connect to address 192.168.122.10: No route to host +# ssh -p 3121 guest1 +ssh: connect to host guest1 port 3121: No route to host ---- Once you can successfully connect to the guest from the host, shutdown the guest. Pacemaker will be managing the virtual machine from this point forward. -== Step3: Integrate KVM guest into Cluster. == +== Integrate Guest into Cluster == Now the fun part, integrating the virtual machine you've just created into the cluster. It is incredibly simple. === Start the Cluster === On the host, start pacemaker. 
-[source,C] ---- # pcs cluster start ---- -Wait for the host to become the DC. The output of 'pcs status' should look similar to this after about a minute. +Wait for the host to become the DC. The output of `pcs status` should look +similar to this after about a minute. -[source,C] ---- Last updated: Thu Mar 14 16:41:22 2013 Last change: Thu Mar 14 16:41:08 2013 via crmd on example-host Stack: corosync Current DC: example-host (1795270848) - partition WITHOUT quorum Version: 1.1.10 1 Nodes configured, unknown expected votes 0 Resources configured. Online: [ example-host ] ---- -Now enable the cluster to work without quorum or stonith. This is required just for the sake of getting this tutorial to work with a single cluster-node. +Now enable the cluster to work without quorum or stonith. This is required +just for the sake of getting this tutorial to work with a single cluster node. -[source,C] ---- # pcs property set stonith-enabled=false # pcs property set no-quorum-policy=ignore ---- -=== Integrate KVM Guest as remote-node === +=== Integrate as Guest Node === If you didn't already do this earlier in the verify host to guest connection section, add the KVM guest's ip to the host's /etc/hosts file so we can connect by hostname. The command below will do that if you used the same ip address I used earlier. -[source,C] ---- # cat << END >> /etc/hosts 192.168.122.10 guest1 END ---- -We will use the +VirtualDomain+ resource agent for the management of the virtual machine. This agent requires the virtual machine's xml config to be dumped to a file on disk. To do this pick out the name of the virtual machine you just created from the output of this list. +We will use the *VirtualDomain* resource agent for the management of the +virtual machine. This agent requires the virtual machine's XML config to be +dumped to a file on disk. To do this, pick out the name of the virtual machine +you just created from the output of this list. -[source,C] ----- +.... 
# virsh list --all Id Name State -______________________________________________ +---------------------------------------------------- - guest1 shut off ----- +.... In my case I named it guest1. Dump the xml to a file somewhere on the host using the following command. -[source,C] ---- # virsh dumpxml guest1 > /root/guest1.xml ---- Now just register the resource with pacemaker and you're set! -[source,C] ---- # pcs resource create vm-guest1 VirtualDomain hypervisor="qemu:///system" config="/root/guest1.xml" meta remote-node=guest1 ---- -Once the 'vm-guest1' resource is started you will see 'guest1' appear in the 'pcs status' output as a node. The final 'pcs status' output should look something like this. +Once the *vm-guest1* resource is started you will see *guest1* appear in the +`pcs status` output as a node. The final `pcs status` output should look +something like this. -[source,C] ---- Last updated: Fri Mar 15 09:30:30 2013 Last change: Thu Mar 14 17:21:35 2013 via cibadmin on example-host Stack: corosync Current DC: example-host (1795270848) - partition WITHOUT quorum Version: 1.1.10 2 Nodes configured, unknown expected votes 2 Resources configured. Online: [ example-host guest1 ] Full list of resources: vm-guest1 (ocf::heartbeat:VirtualDomain): Started example-host ---- === Starting Resources on KVM Guest === -The commands below demonstrate how resources can be executed on both the remote-node and the cluster-node. +The commands below demonstrate how resources can be executed on both the +guest node and the cluster node. Create a few Dummy resources. Dummy resources are real resource agents used just for testing purposes. They actually execute on the host they are assigned to just like an apache server or database would, except their execution just means a file was created. When the resource is stopped, that the file it created is removed. 
-[source,C] ---- # pcs resource create FAKE1 ocf:pacemaker:Dummy # pcs resource create FAKE2 ocf:pacemaker:Dummy # pcs resource create FAKE3 ocf:pacemaker:Dummy # pcs resource create FAKE4 ocf:pacemaker:Dummy # pcs resource create FAKE5 ocf:pacemaker:Dummy ---- -Now check your 'pcs status' output. In the resource section you should see something like the following, where some of the resources got started on the cluster-node, and some started on the remote-node. +Now check your `pcs status` output. In the resource section, you should see +something like the following, where some of the resources started on the +cluster node, and some started on the guest node. -[source,C] ---- Full list of resources: vm-guest1 (ocf::heartbeat:VirtualDomain): Started example-host FAKE1 (ocf::pacemaker:Dummy): Started guest1 FAKE2 (ocf::pacemaker:Dummy): Started guest1 FAKE3 (ocf::pacemaker:Dummy): Started example-host FAKE4 (ocf::pacemaker:Dummy): Started guest1 FAKE5 (ocf::pacemaker:Dummy): Started example-host ---- -The remote-node, 'guest1', reacts just like any other node in the cluster. For example, pick out a resource that is running on your cluster-node. For my purposes I am picking FAKE3 from the output above. We can force FAKE3 to run on 'guest1' in the exact same way we would any other node. +The guest node, *guest1*, reacts just like any other node in the cluster. For +example, pick out a resource that is running on your cluster node. For my +purposes, I am picking FAKE3 from the output above. We can force FAKE3 to run +on *guest1* in the exact same way we would any other node. -[source,C] ---- # pcs constraint FAKE3 prefers guest1 ---- -Now looking at the bottom of the 'pcs status' output you'll see FAKE3 is on 'guest1'. +Now, looking at the bottom of the `pcs status` output you'll see FAKE3 is on +*guest1*. 
-[source,C] ---- Full list of resources: vm-guest1 (ocf::heartbeat:VirtualDomain): Started example-host FAKE1 (ocf::pacemaker:Dummy): Started guest1 FAKE2 (ocf::pacemaker:Dummy): Started guest1 FAKE3 (ocf::pacemaker:Dummy): Started guest1 FAKE4 (ocf::pacemaker:Dummy): Started example-host FAKE5 (ocf::pacemaker:Dummy): Started example-host ---- -=== Testing Remote-node Recovery and Fencing === +=== Testing Recovery and Fencing === -Pacemaker's policy engine is smart enough to know fencing remote-nodes associated with a virtual machine means shutting off/rebooting the virtual machine. No special configuration is necessary to make this happen. If you are interested in testing this functionality out, trying stopping the guest's pacemaker_remote daemon. This would be equivalent of abruptly terminating a cluster-node's corosync membership without properly shutting it down. +Pacemaker's policy engine is smart enough to know fencing guest nodes +associated with a virtual machine means shutting off/rebooting the virtual +machine. No special configuration is necessary to make this happen. If you +are interested in testing this functionality out, trying stopping the guest's +pacemaker_remote daemon. This would be equivalent of abruptly terminating a +cluster node's corosync membership without properly shutting it down. ssh into the guest and run this command. -[source,C] ---- # kill -9 `pidof pacemaker_remoted` ---- -After a few seconds or so you'll see this in your 'pcs status' output. The 'guest1' node will be show as offline as it is being recovered. +After a few seconds or so, you'll see this in your `pcs status` output. The +*guest1* node will be show as offline as it is being recovered. -[source,C] ---- Last updated: Fri Mar 15 11:00:31 2013 Last change: Fri Mar 15 09:54:16 2013 via cibadmin on example-host Stack: corosync Current DC: example-host (1795270848) - partition WITHOUT quorum Version: 1.1.10 2 Nodes configured, unknown expected votes 7 Resources configured. 
Online: [ example-host ] OFFLINE: [ guest1 ] Full list of resources: vm-guest1 (ocf::heartbeat:VirtualDomain): Started example-host FAKE1 (ocf::pacemaker:Dummy): Stopped FAKE2 (ocf::pacemaker:Dummy): Stopped FAKE3 (ocf::pacemaker:Dummy): Stopped FAKE4 (ocf::pacemaker:Dummy): Started example-host FAKE5 (ocf::pacemaker:Dummy): Started example-host Failed actions: guest1_monitor_30000 (node=example-host, call=3, rc=7, status=complete): not running ---- -Once recovery of the guest is complete, you'll see it automatically get re-integrated into the cluster. The final 'pcs status' output should look something like this. +Once recovery of the guest is complete, you'll see it automatically get +re-integrated into the cluster. The final `pcs status` output should look +something like this. -[source,C] ---- Last updated: Fri Mar 15 11:03:17 2013 Last change: Fri Mar 15 09:54:16 2013 via cibadmin on example-host Stack: corosync Current DC: example-host (1795270848) - partition WITHOUT quorum Version: 1.1.10 2 Nodes configured, unknown expected votes 7 Resources configured. Online: [ example-host guest1 ] Full list of resources: vm-guest1 (ocf::heartbeat:VirtualDomain): Started example-host FAKE1 (ocf::pacemaker:Dummy): Started guest1 FAKE2 (ocf::pacemaker:Dummy): Started guest1 FAKE3 (ocf::pacemaker:Dummy): Started guest1 FAKE4 (ocf::pacemaker:Dummy): Started example-host FAKE5 (ocf::pacemaker:Dummy): Started example-host Failed actions: guest1_monitor_30000 (node=example-host, call=3, rc=7, status=complete): not running ---- -=== Accessing Cluster Tools from Remote-node === +=== Accessing Cluster Tools from Guest Node === -Besides just allowing the cluster to manage resources on a remote-node, pacemaker_remote has one other trick. 
+The pacemaker_remote daemon allows nearly all the pacemaker tools (crm_resource, crm_mon, crm_attribute, crm_master) to work on remote nodes natively.+ +Besides allowing the cluster to manage resources on a guest node, +pacemaker_remote has one other trick. The pacemaker_remote daemon allows +nearly all the pacemaker tools (`crm_resource`, `crm_mon`, `crm_attribute`, +`crm_master`, etc.) to work on guest nodes natively. -Try it, run +crm_mon+ or +pcs status+ on the guest after pacemaker has integrated the remote-node into the cluster. These tools just work. These means resource agents such as master/slave resources which need access to tools like crm_master work seamlessly on the remote-nodes. +Try it: Run `crm_mon` on the guest after pacemaker has +integrated the guest node into the cluster. These tools just work. This +means resource agents such as master/slave resources which need access to tools +like `crm_master` work seamlessly on the guest nodes. +Higher-level command shells such as `pcs` may have partial support +on guest nodes, but it is recommended to run them from a cluster node. diff --git a/doc/Pacemaker_Remote/en-US/Ch-LXC-Tutorial.txt b/doc/Pacemaker_Remote/en-US/Ch-LXC-Tutorial.txt deleted file mode 100644 index 9b14effe09..0000000000 --- a/doc/Pacemaker_Remote/en-US/Ch-LXC-Tutorial.txt +++ /dev/null @@ -1,314 +0,0 @@ -= Linux Container (LXC) Walk-through = - -+Warning: Continued development in the VirtualDomain agent, libvirt, and the lxc_autogen script have rendered this tutorial (in its current form) obsolete.+ The high level approach of this tutorial remains accurate, but many of the specifics related to configuring the lxc environment have changed. This walk-through needs to be updated to reflect the current tested methodology. - -+What this tutorial is:+ This tutorial demonstrates how pacemaker_remote can be used with Linux containers (managed by libvirt-lxc) to run cluster resources in an isolated environment. 
- -+What this tutorial is not:+ This tutorial is not a realistic deployment scenario. The steps shown here are meant to introduce users to the concept of managing Linux container environments with Pacemaker. - -== Step 1: Setup LXC Host == - -This tutorial was tested with Fedora 18. Anything that is capable of running libvirt and pacemaker v1.1.10 or greater will do though. An installation guide for installing Fedora 18 can be found here, http://docs.fedoraproject.org/en-US/Fedora/18/html/Installation_Guide/. - -Fedora 18 (or similar distro) host preparation steps. - -=== SElinux and Firewall Rules === -In order to simply this tutorial we will disable the selinux and the firewall on the host. -WARNING: These actions pose a significant security issues to machines exposed to the outside world. Basically, just don't do this on your production system. -[source,C] ----- -# setenforce 0 -# sed -i.bak "s/SELINUX=enforcing/SELINUX=permissive/g" /etc/selinux/config -# firewall-cmd --add-port 3121/tcp --permanent - -# systemctl disable iptables.service -# systemctl disable ip6tables.service -# rm '/etc/systemd/system/basic.target.wants/iptables.service' -# rm '/etc/systemd/system/basic.target.wants/ip6tables.service' -# systemctl stop iptables.service -# systemctl stop ip6tables.service ----- - -=== Install Cluster Software on Host === - -[source,C] ----- -# yum install -y pacemaker pacemaker-remote corosync pcs resource-agents ----- - -=== Configure Corosync === - -Corosync handles pacemaker's cluster membership and messaging. The corosync config file is located in /etc/corosync/corosync.conf. That config file must be initialized with information about the cluster-nodes before pacemaker can start. - -To initialize the corosync config file, execute the following pcs command on both nodes filling in the information in <> with your nodes' information. 
-[source,C] ----- -# pcs cluster setup --local mycluster ----- - -A recent syntax change in pcs may cause the above command to fail. If so try this alternative. -[source,C] ----- -# pcs cluster setup --force --local --name mycluster ----- - -=== Verify Cluster === - -Start the cluster - -[source,C] ----- -# pcs cluster start ----- - -Verify corosync membership - -[source,C] ----- -# pcs status corosync - -Membership information - Nodeid Votes Name -1795270848 1 example-host (local) ----- - -Verify pacemaker status. At first the 'pcs cluster status' output will look like this. - -[source,C] ----- -# pcs status - - Last updated: Thu Mar 14 12:26:00 2013 - Last change: Thu Mar 14 12:25:55 2013 via crmd on example-host - Stack: corosync - Current DC: - Version: 1.1.10 - 1 Nodes configured, unknown expected votes - 0 Resources configured. ----- - -After about a minute you should see your host as a single node in the cluster. - -[source,C] ----- -# pcs status - - Last updated: Thu Mar 14 12:28:23 2013 - Last change: Thu Mar 14 12:25:55 2013 via crmd on example-host - Stack: corosync - Current DC: example-host (1795270848) - partition WITHOUT quorum - Version: 1.1.8-9b13ea1 - 1 Nodes configured, unknown expected votes - 0 Resources configured. - - Online: [ example-host ] ----- - -Go ahead and stop the cluster for now after verifying everything is in order. - -[source,C] ----- -# pcs cluster stop ----- - -== Step 2: Setup LXC Environment == - -=== Install Libvirt LXC software === - -[source,C] ----- -# yum install -y libvirt libvirt-daemon-lxc wget -# systemctl enable libvirtd ----- - -At this point, restart the host. - -=== Generate Libvirt LXC domains === - -I've attempted to simply this tutorial by creating a script to auto generate the libvirt-lxc xml domain definitions. - -Download the script to whatever directory you want the containers to live in. In this example I am using the /root/lxc/ directory. 
- -[source,C] ----- -# mkdir /root/lxc/ -# cd /root/lxc/ -# wget https://raw.github.com/davidvossel/pcmk-lxc-autogen/master/lxc-autogen -# chmod 755 lxc-autogen ----- - -Now execute the script. - -[source,C] ----- -# ./lxc-autogen ----- - -After executing the script you will see a bunch of directories and xml files are generated. Those xml files are the libvirt-lxc domain definitions, and the directories are used as some special mount points for each container. If you open up one of the xml files you'll be able to see how the cpu, memory, and filesystem resources for the container are defined. You can use the libvirt-lxc driver's documentation found here, http://libvirt.org/drvlxc.html, as a reference to help understand all the parts of the xml file. The lxc-autogen script is not complicated and is worth exploring in order to grasp how the environment is generated. - -It is worth noting that this environment is dependent on use of libvirt's default network interface. Verify the commands below look the same as your environment. The default network address 192.168.122.1 should have been generated by automatically when you installed the virtualization software. - -[source,C] ----- -# virsh net-list -Name State Autostart Persistent -________________________________________________________ -default active yes yes - -# virsh net-dumpxml default | grep -e "ip address=" - - ----- - -=== Generate the Authkey === - -Generate the authkey used to secure connections between the host and the lxc guest pacemaker_remote instances. This is sort of a funny case because the lxc guests and the host will share the same key file in the /etc/pacemaker/ directory. If in a different deployment where the lxc guests do not share the host's /etc/pacemaker directory, this key will have to be copied into each lxc guest. - -[source,C] ----- -# dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1 ----- - -== Step 3: Integrate LXC guests into Cluster. 
== - -=== Start Cluster === -On the host, start pacemaker. - -[source,C] ----- -# pcs cluster start ----- - -Wait for the host to become the DC. The output of 'pcs status' should look similar to this after about a minute. - -[source,C] ----- -Last updated: Thu Mar 14 16:41:22 2013 -Last change: Thu Mar 14 16:41:08 2013 via crmd on example-host -Stack: corosync -Current DC: example-host (1795270848) - partition WITHOUT quorum -Version: 1.1.10 -1 Nodes configured, unknown expected votes -0 Resources configured. - - -Online: [ example-host ] ----- - -Now enable the cluster to work without quorum or stonith. This is required just for the sake of getting this tutorial to work with a single cluster-node. - -[source,C] ----- -# pcs property set stonith-enabled=false -# pcs property set no-quorum-policy=ignore ----- - -=== Integrate LXC Guests as remote-nodes === - -If you ran the 'lxc-autogen' script with default parameters, 3 lxc domain definitions were created as .xml files. If you used the same directory I used for the lxc environment, the config files will be located in /root/lxc. Replace the 'config' parameters in the following pcs commands if yours should be different. - -The pcs commands below each configure a lxc guest as a remote-node in pacemaker. Behind the scenes each lxc guest is launching an instance of pacemaker_remote allowing pacemaker to integrate the lxc guests as remote-nodes. The meta-attribute 'remote-node=' used in each command is what tells pacemaker that the lxc guest is both a resource and a remote-node capable of running resources. In this case, the 'remote-node' attribute also indicates to pacemaker that it can contact each lxc's pacemaker_remote service by using the remote-node name as the hostname. If you look in the /etc/hosts/ file you will see entries for each lxc guest. These entries were auto-generated earlier by the 'lxc-autogen' script. 
- -[source,C] ----- -# pcs resource create container1 VirtualDomain force_stop="true" hypervisor="lxc:///" config="/root/lxc/lxc1.xml" meta remote-node=lxc1 -# pcs resource create container2 VirtualDomain force_stop="true" hypervisor="lxc:///" config="/root/lxc/lxc2.xml" meta remote-node=lxc2 -# pcs resource create container3 VirtualDomain force_stop="true" hypervisor="lxc:///" config="/root/lxc/lxc3.xml" meta remote-node=lxc3 ----- - - -After creating the container resources you 'pcs status' should look like this. - -[source,C] ----- -Last updated: Mon Mar 18 17:15:46 2013 -Last change: Mon Mar 18 17:15:26 2013 via cibadmin on guest1 -Stack: corosync -Current DC: example-host (175810752) - partition WITHOUT quorum -Version: 1.1.10 -4 Nodes configured, unknown expected votes -6 Resources configured. - -Online: [ example-host lxc1 lxc2 lxc3 ] - -Full list of resources: - - container3 (ocf::heartbeat:VirtualDomain): Started example-host - container1 (ocf::heartbeat:VirtualDomain): Started example-host - container2 (ocf::heartbeat:VirtualDomain): Started example-host ----- - - -=== Starting Resources on LXC Guests === - -Now that the lxc guests are integrated into the cluster, lets generate some Dummy resources to run on them. - -Dummy resources are real resource agents used just for testing purposes. They actually execute on the node they are assigned to just like an apache server or database would, except their execution just means a file was created. When the resource is stopped, that the file it created is removed. - -[source,C] ----- -# pcs resource create FAKE1 ocf:pacemaker:Dummy -# pcs resource create FAKE2 ocf:pacemaker:Dummy -# pcs resource create FAKE3 ocf:pacemaker:Dummy -# pcs resource create FAKE4 ocf:pacemaker:Dummy -# pcs resource create FAKE5 ocf:pacemaker:Dummy ----- - - -After creating the Dummy resources you will see that the resource got distributed among all the nodes. The 'pcs status' output should look similar to this. 
- -[source,C] ----- -Last updated: Mon Mar 18 17:31:54 2013 -Last change: Mon Mar 18 17:31:05 2013 via cibadmin on example-host -Stack: corosync -Current DC: example=host (175810752) - partition WITHOUT quorum -Version: 1.1.10 -4 Nodes configured, unknown expected votes -11 Resources configured. - - -Online: [ example-host lxc1 lxc2 lxc3 ] - -Full list of resources: - - container3 (ocf::heartbeat:VirtualDomain): Started example-host - container1 (ocf::heartbeat:VirtualDomain): Started example-host - container2 (ocf::heartbeat:VirtualDomain): Started example-host - FAKE1 (ocf::pacemaker:Dummy): Started lxc1 - FAKE2 (ocf::pacemaker:Dummy): Started lxc2 - FAKE3 (ocf::pacemaker:Dummy): Started lxc3 - FAKE4 (ocf::pacemaker:Dummy): Started lxc1 - FAKE5 (ocf::pacemaker:Dummy): Started lxc2 ----- - -To witness that Dummy agents are running within the lxc guests browse one of the lxc domain's filesystem folders. Each lxc guest has a custom mount point for the '/var/run/'directory, which is the location the Dummy resources write their state files to. - -[source,C] ----- -# ls lxc1-filesystem/var/run/ -Dummy-FAKE4.state Dummy-FAKE.state ----- - -If you are curious, take a look at lxc1.xml to see how the filesystem is mounted. - -=== Testing LXC Guest Failure === - -You will be able to see each pacemaker_remoted process running in each lxc guest from the host machine. - -[source,C] ----- -# ps -A | grep -e pacemaker_remote* - 9142 pts/2 00:00:00 pacemaker_remot -10148 pts/4 00:00:00 pacemaker_remot -10942 pts/6 00:00:00 pacemaker_remot ----- - -In order to see how the cluster reacts to a failed lxc guest. Try killing one of the pacemaker_remote instances. - -[source,C] ----- -# kill -9 9142 ----- - -After a few moments the lxc guest that was running that instance of pacemaker_remote will be recovered along with all the resources running within that container. 
diff --git a/doc/Pacemaker_Remote/en-US/Ch-Options.txt b/doc/Pacemaker_Remote/en-US/Ch-Options.txt index 5d68b4f9d0..abe511fd35 100644 --- a/doc/Pacemaker_Remote/en-US/Ch-Options.txt +++ b/doc/Pacemaker_Remote/en-US/Ch-Options.txt @@ -1,77 +1,115 @@ = Configuration Explained = -The walk-through examples use some of these options, but don't explain exactly what they mean or do. This section is meant to be the go-to resource for all the options available for configuring remote-nodes. +The walk-through examples use some of these options, but don't explain exactly +what they mean or do. This section is meant to be the go-to resource for all +the options available for configuring pacemaker_remote-based nodes. +(((configuration))) -== Container remote-node Resource Options == +== Resource Meta-Attributes for Guest Nodes == -When configuring a virtual machine or lxc resource to act as a remote-node, these are the metadata options available to both enable the resource as a remote-node and define the connection parameters. +When configuring a virtual machine to use as a guest node, these are the +metadata options available to enable the resource as a guest node and +define its connection parameters. -.Metadata Options for configuring KVM/LXC resources as remote-nodes -[width="95%",cols="1m,1,4<",options="header",align="center"] +.Meta-attributes for configuring VM resources as guest nodes +[width="95%",cols="2m,1,4<",options="header",align="center"] |========================================================= |Option |Default |Description -|+remote-node+ -| -|The name of the remote-node this resource defines. This both enables the resource as a remote-node and defines the unique name used to identify the remote-node. If no other parameters are set, this value will also be assumed as the hostname to connect to at port 3121. +WARNING+ This value cannot overlap with any resource or node IDs. +|remote-node +|'none' +|The node name of the guest node this resource defines. 
This both enables the +resource as a guest node and defines the unique name used to identify the +guest node. If no other parameters are set, this value will also be assumed as +the hostname to use when connecting to pacemaker_remote on the VM. This value +*must not* overlap with any resource or node IDs. -|+remote-port+ +|remote-port |3121 -|Configure a custom port to use for the guest connection to pacemaker_remote. +|The port on the virtual machine that the cluster will use to connect to +pacemaker_remote. -|+remote-addr+ -|+remote-node+ value used as hostname -|The ip address or hostname to connect to if remote-node's name is not the hostname of the guest. +|remote-addr +|'value of' +remote-node+ +|The IP address or hostname to use when connecting to pacemaker_remote on the VM. -|+remote-connect-timeout+ +|remote-connect-timeout |60s |How long before a pending guest connection will time out. |========================================================= -== Baremetal remote-node Options == +== Connection Resources for Remote Nodes == -Baremetal remote-nodes are defined by a connection resource. That connection resource has the following instance attributes that define where the baremetal remote-node is located on the network and how to communicate with that remote-node. Descriptions of these options can be retrieved using the following pcs command. +A remote node is defined by a connection resource. That connection resource +has instance attributes that define where the remote node is located on the +network and how to communicate with it. -[source,C] +Descriptions of these instance attributes can be retrieved using the following +`pcs` command: ---- # pcs resource describe remote - Resource options for: ocf:pacemaker:remote - server: Server location to connect to. This can be an ip address or hostname. - port: tcp port to connect to. +ocf:pacemaker:remote - + + + +Resource options: + server: Server location to connect to. This can be an ip address or hostname. 
+ port: tcp port to connect to. ---- -When defining a baremetal remote-node's connection resource, it is common and recommended to name the connection resource the same name as the baremeatal remote-node's hostname. By default, if no "server" option is provided, the cluster will attempt to contact the remote-node using the resource name as the hostname. +When defining a remote node's connection resource, it is common and recommended +to name the connection resource the same as the remote node's hostname. By +default, if no *server* option is provided, the cluster will attempt to contact +the remote node using the resource name as the hostname. -Example, defining a baremetal remote-node with the hostname "remote1" -[source,C] +Example defining a remote node with the hostname *remote1*: ---- # pcs resource create remote1 remote ---- -Example, defining a baremetal remote-node to connect to a specific ip and port. -[source,C] +Example defining a remote node to connect to a specific IP address and port: ---- # pcs resource create remote1 remote server=192.168.122.200 port=8938 ---- -== Host and Guest Authentication == +== Environment Variables for Daemon Start-up == -Authentication and encryption of the connection between cluster-nodes (pacemaker) to remote-nodes (pacemaker_remote) is achieved using TLS with PSK encryption/authentication on +tcp port 3121+. This means both the cluster-node and remote-node must share the same private key. By default this +key must be placed at "/etc/pacemaker/authkey" on both cluster-nodes and remote-nodes+. +Authentication and encryption of the connection between cluster nodes +and nodes running pacemaker_remote is achieved using +https://en.wikipedia.org/wiki/TLS-PSK[TLS-PSK] encryption/authentication +over TCP (port 3121 by default). This means that both the cluster node and +remote node must share the same private key. By default, this +key is placed at +/etc/pacemaker/authkey+ on each node.
-== Pacemaker and pacemaker_remote Options == - -If you need to change the default port or authkey location for either pacemaker or pacemaker_remote, there are environment variables you can set that affect both of those daemons. These environment variables can be enabled by placing them in the /etc/sysconfig/pacemaker file. -[source,C] +You can change the default port and/or key location for Pacemaker and +pacemaker_remote via environment variables. These environment variables can be +enabled by placing them in the +/etc/sysconfig/pacemaker+ file. ---- #==#==# Pacemaker Remote # Use a custom directory for finding the authkey. PCMK_authkey_location=/etc/pacemaker/authkey # # Specify a custom port for Pacemaker Remote connections PCMK_remote_port=3121 ---- +== Removing Remote Nodes and Guest Nodes == + +If the resource creating a guest node, or the *ocf:pacemaker:remote* resource +creating a connection to a remote node, is removed from the configuration, the +affected node will continue to show up in output as an offline node. + +If you want to get rid of that output, run (replacing $NODE_NAME appropriately): +---- +# crm_node --force --remove $NODE_NAME +---- + +[WARNING] +========= +Be absolutely sure that the node's resource has been deleted from the +configuration first. 
+========= diff --git a/doc/Pacemaker_Remote/en-US/Pacemaker_Remote.ent b/doc/Pacemaker_Remote/en-US/Pacemaker_Remote.ent index be6171c50d..8a9d4f932b 100644 --- a/doc/Pacemaker_Remote/en-US/Pacemaker_Remote.ent +++ b/doc/Pacemaker_Remote/en-US/Pacemaker_Remote.ent @@ -1,6 +1,6 @@ - + - - + + diff --git a/doc/Pacemaker_Remote/en-US/Pacemaker_Remote.xml b/doc/Pacemaker_Remote/en-US/Pacemaker_Remote.xml index 9a5e119481..3867314a33 100644 --- a/doc/Pacemaker_Remote/en-US/Pacemaker_Remote.xml +++ b/doc/Pacemaker_Remote/en-US/Pacemaker_Remote.xml @@ -1,18 +1,17 @@ %BOOK_ENTITIES; ]> - - + diff --git a/doc/Pacemaker_Remote/en-US/Revision_History.xml b/doc/Pacemaker_Remote/en-US/Revision_History.xml index af25ebe72a..269b549a11 100644 --- a/doc/Pacemaker_Remote/en-US/Revision_History.xml +++ b/doc/Pacemaker_Remote/en-US/Revision_History.xml @@ -1,31 +1,37 @@ %BOOK_ENTITIES; ]> Revision History 1-0 Tue Mar 19 2013 DavidVosseldavidvossel@gmail.com Import from Pages.app 2-0 Tue May 13 2013 DavidVosseldavidvossel@gmail.com Added Future Features Section 3-0 Fri Oct 18 2013 DavidVosseldavidvossel@gmail.com Added Baremetal remote-node feature documentation + + 4-0 + Tue Aug 25 2015 + KenGaillotkgaillot@redhat.com + Targeted CentOS 7.1 and Pacemaker 1.1.12+, updated for current terminology and practice + diff --git a/fencing/commands.c b/fencing/commands.c index adf655b887..77baa1ddc1 100644 --- a/fencing/commands.c +++ b/fencing/commands.c @@ -1,2472 +1,2538 @@ /* * Copyright (C) 2009 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if SUPPORT_CIBSECRETS # include #endif #include GHashTable *device_list = NULL; GHashTable *topology = NULL; GList *cmd_list = NULL; struct device_search_s { /* target of fence action */ char *host; /* requested fence action */ char *action; /* timeout to use if a device is queried dynamically for possible targets */ int per_device_timeout; /* number of registered fencing devices at time of request */ int replies_needed; /* number of device replies received so far */ int replies_received; /* whether the target is eligible to perform requested action (or off) */ bool allow_suicide; /* private data to pass to search callback function */ void *user_data; /* function to call when all replies have been received */ void (*callback) (GList * devices, void *user_data); /* devices capable of performing requested action (or off if remapping) */ GListPtr capable; }; static gboolean stonith_device_dispatch(gpointer user_data); static void st_child_done(GPid pid, int rc, const char *output, gpointer user_data); static void stonith_send_reply(xmlNode * reply, int call_options, const char *remote_peer, const char *client_id); static void search_devices_record_result(struct device_search_s *search, const char *device, gboolean can_fence); typedef struct async_command_s { int id; int pid; int fd_stdout; int options; int default_timeout; /* seconds */ int timeout; /* seconds */ int start_delay; /* milliseconds */ int delay_id; char *op; char *origin; char *client; char *client_name; char *remote_op_id; char *victim; uint32_t 
victim_nodeid; char *action; char *device; char *mode; GListPtr device_list; GListPtr device_next; void *internal_user_data; void (*done_cb) (GPid pid, int rc, const char *output, gpointer user_data); guint timer_sigterm; guint timer_sigkill; /*! If the operation timed out, this is the last signal * we sent to the process to get it to terminate */ int last_timeout_signo; } async_command_t; static xmlNode *stonith_construct_async_reply(async_command_t * cmd, const char *output, xmlNode * data, int rc); static gboolean is_action_required(const char *action, stonith_device_t *device) { return device && device->automatic_unfencing && safe_str_eq(action, "on"); } static int get_action_delay_max(stonith_device_t * device, const char * action) { const char *value = NULL; int delay_max_ms = 0; if (safe_str_neq(action, "off") && safe_str_neq(action, "reboot")) { return 0; } value = g_hash_table_lookup(device->params, STONITH_ATTR_DELAY_MAX); if (value) { delay_max_ms = crm_get_msec(value); } return delay_max_ms; } /*! * \internal * \brief Override STONITH timeout with pcmk_*_timeout if available * * \param[in] device STONITH device to use * \param[in] action STONITH action name * \param[in] default_timeout Timeout to use if device does not have * a pcmk_*_timeout parameter for action * * \return Value of pcmk_(action)_timeout if available, otherwise default_timeout * \note For consistency, it would be nice if reboot/off/on timeouts could be * set the same way as start/stop/monitor timeouts, i.e. with an * entry in the fencing resource configuration. However that * is insufficient because fencing devices may be registered directly via * the STONITH register_device() API instead of going through the CIB * (e.g. stonith_admin uses it for its -R option, and the LRMD uses it to * ensure a device is registered when a command is issued). As device * properties, pcmk_*_timeout parameters can be grabbed by stonithd when * the device is registered, whether by CIB change or API call. 
*/ static int get_action_timeout(stonith_device_t * device, const char *action, int default_timeout) { if (action && device && device->params) { char buffer[64] = { 0, }; const char *value = NULL; /* If "reboot" was requested but the device does not support it, * we will remap to "off", so check timeout for "off" instead */ if (safe_str_eq(action, "reboot") && is_not_set(device->flags, st_device_supports_reboot)) { crm_trace("%s doesn't support reboot, using timeout for off instead", device->id); action = "off"; } /* If the device config specified an action-specific timeout, use it */ snprintf(buffer, sizeof(buffer) - 1, "pcmk_%s_timeout", action); value = g_hash_table_lookup(device->params, buffer); if (value) { return atoi(value); } } return default_timeout; } static void free_async_command(async_command_t * cmd) { if (!cmd) { return; } if (cmd->delay_id) { g_source_remove(cmd->delay_id); } cmd_list = g_list_remove(cmd_list, cmd); g_list_free_full(cmd->device_list, free); free(cmd->device); free(cmd->action); free(cmd->victim); free(cmd->remote_op_id); free(cmd->client); free(cmd->client_name); free(cmd->origin); free(cmd->mode); free(cmd->op); free(cmd); } static async_command_t * create_async_command(xmlNode * msg) { async_command_t *cmd = NULL; xmlNode *op = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_ERR); const char *action = crm_element_value(op, F_STONITH_ACTION); CRM_CHECK(action != NULL, crm_log_xml_warn(msg, "NoAction"); return NULL); crm_log_xml_trace(msg, "Command"); cmd = calloc(1, sizeof(async_command_t)); crm_element_value_int(msg, F_STONITH_CALLID, &(cmd->id)); crm_element_value_int(msg, F_STONITH_CALLOPTS, &(cmd->options)); crm_element_value_int(msg, F_STONITH_TIMEOUT, &(cmd->default_timeout)); cmd->timeout = cmd->default_timeout; cmd->origin = crm_element_value_copy(msg, F_ORIG); cmd->remote_op_id = crm_element_value_copy(msg, F_STONITH_REMOTE_OP_ID); cmd->client = crm_element_value_copy(msg, F_STONITH_CLIENTID); cmd->client_name = 
crm_element_value_copy(msg, F_STONITH_CLIENTNAME); cmd->op = crm_element_value_copy(msg, F_STONITH_OPERATION); cmd->action = strdup(action); cmd->victim = crm_element_value_copy(op, F_STONITH_TARGET); cmd->mode = crm_element_value_copy(op, F_STONITH_MODE); cmd->device = crm_element_value_copy(op, F_STONITH_DEVICE); CRM_CHECK(cmd->op != NULL, crm_log_xml_warn(msg, "NoOp"); free_async_command(cmd); return NULL); CRM_CHECK(cmd->client != NULL, crm_log_xml_warn(msg, "NoClient")); cmd->done_cb = st_child_done; cmd_list = g_list_append(cmd_list, cmd); return cmd; } static gboolean stonith_device_execute(stonith_device_t * device) { int exec_rc = 0; const char *action_str = NULL; async_command_t *cmd = NULL; stonith_action_t *action = NULL; CRM_CHECK(device != NULL, return FALSE); if (device->active_pid) { crm_trace("%s is still active with pid %u", device->id, device->active_pid); return TRUE; } if (device->pending_ops) { GList *first = device->pending_ops; cmd = first->data; if (cmd && cmd->delay_id) { crm_trace ("Operation %s%s%s on %s was asked to run too early, waiting for start_delay timeout of %dms", cmd->action, cmd->victim ? " for node " : "", cmd->victim ? cmd->victim : "", device->id, cmd->start_delay); return TRUE; } device->pending_ops = g_list_remove_link(device->pending_ops, first); g_list_free_1(first); } if (cmd == NULL) { crm_trace("Nothing further to do for %s", device->id); return TRUE; } if(safe_str_eq(device->agent, STONITH_WATCHDOG_AGENT)) { if(safe_str_eq(cmd->action, "reboot")) { pcmk_panic(__FUNCTION__); return TRUE; } else if(safe_str_eq(cmd->action, "off")) { pcmk_panic(__FUNCTION__); return TRUE; } else { crm_info("Faking success for %s watchdog operation", cmd->action); cmd->done_cb(0, 0, NULL, cmd); return TRUE; } } #if SUPPORT_CIBSECRETS if (replace_secret_params(device->id, device->params) < 0) { /* replacing secrets failed! */ if (safe_str_eq(cmd->action,"stop")) { /* don't fail on stop! 
*/ crm_info("proceeding with the stop operation for %s", device->id); } else { crm_err("failed to get secrets for %s, " "considering resource not configured", device->id); exec_rc = PCMK_OCF_NOT_CONFIGURED; cmd->done_cb(0, exec_rc, NULL, cmd); return TRUE; } } #endif action_str = cmd->action; if (safe_str_eq(cmd->action, "reboot") && is_not_set(device->flags, st_device_supports_reboot)) { crm_warn("Agent '%s' does not advertise support for 'reboot', performing 'off' action instead", device->agent); action_str = "off"; } action = stonith_action_create(device->agent, action_str, cmd->victim, cmd->victim_nodeid, cmd->timeout, device->params, device->aliases); /* for async exec, exec_rc is pid if positive and error code if negative/zero */ exec_rc = stonith_action_execute_async(action, (void *)cmd, cmd->done_cb); if (exec_rc > 0) { crm_debug("Operation %s%s%s on %s now running with pid=%d, timeout=%ds", cmd->action, cmd->victim ? " for node " : "", cmd->victim ? cmd->victim : "", device->id, exec_rc, cmd->timeout); device->active_pid = exec_rc; } else { crm_warn("Operation %s%s%s on %s failed: %s (%d)", cmd->action, cmd->victim ? " for node " : "", cmd->victim ? cmd->victim : "", device->id, pcmk_strerror(exec_rc), exec_rc); cmd->done_cb(0, exec_rc, NULL, cmd); } return TRUE; } static gboolean stonith_device_dispatch(gpointer user_data) { return stonith_device_execute(user_data); } static gboolean start_delay_helper(gpointer data) { async_command_t *cmd = data; stonith_device_t *device = NULL; cmd->delay_id = 0; device = cmd->device ? 
g_hash_table_lookup(device_list, cmd->device) : NULL; if (device) { mainloop_set_trigger(device->work); } return FALSE; } static void schedule_stonith_command(async_command_t * cmd, stonith_device_t * device) { int delay_max = 0; CRM_CHECK(cmd != NULL, return); CRM_CHECK(device != NULL, return); if (cmd->device) { free(cmd->device); } if (device->include_nodeid && cmd->victim) { crm_node_t *node = crm_get_peer(0, cmd->victim); cmd->victim_nodeid = node->id; } cmd->device = strdup(device->id); cmd->timeout = get_action_timeout(device, cmd->action, cmd->default_timeout); if (cmd->remote_op_id) { crm_debug("Scheduling %s on %s for remote peer %s with op id (%s) (timeout=%ds)", cmd->action, device->id, cmd->origin, cmd->remote_op_id, cmd->timeout); } else { crm_debug("Scheduling %s on %s for %s (timeout=%ds)", cmd->action, device->id, cmd->client, cmd->timeout); } device->pending_ops = g_list_append(device->pending_ops, cmd); mainloop_set_trigger(device->work); delay_max = get_action_delay_max(device, cmd->action); if (delay_max > 0) { cmd->start_delay = rand() % delay_max; crm_notice("Delaying %s on %s for %lldms (timeout=%ds)", cmd->action, device->id, cmd->start_delay, cmd->timeout); cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd); } } void free_device(gpointer data) { GListPtr gIter = NULL; stonith_device_t *device = data; g_hash_table_destroy(device->params); g_hash_table_destroy(device->aliases); for (gIter = device->pending_ops; gIter != NULL; gIter = gIter->next) { async_command_t *cmd = gIter->data; crm_warn("Removal of device '%s' purged operation %s", device->id, cmd->action); cmd->done_cb(0, -ENODEV, NULL, cmd); free_async_command(cmd); } g_list_free(device->pending_ops); g_list_free_full(device->targets, free); mainloop_destroy_trigger(device->work); free_xml(device->agent_metadata); free(device->namespace); free(device->on_target_actions); free(device->agent); free(device->id); free(device); } static GHashTable * 
build_port_aliases(const char *hostmap, GListPtr * targets) { char *name = NULL; int last = 0, lpc = 0, max = 0, added = 0; GHashTable *aliases = g_hash_table_new_full(crm_strcase_hash, crm_strcase_equal, g_hash_destroy_str, g_hash_destroy_str); if (hostmap == NULL) { return aliases; } max = strlen(hostmap); for (; lpc <= max; lpc++) { switch (hostmap[lpc]) { /* Assignment chars */ case '=': case ':': if (lpc > last) { free(name); name = calloc(1, 1 + lpc - last); memcpy(name, hostmap + last, lpc - last); } last = lpc + 1; break; /* Delimeter chars */ /* case ',': Potentially used to specify multiple ports */ case 0: case ';': case ' ': case '\t': if (name) { char *value = NULL; value = calloc(1, 1 + lpc - last); memcpy(value, hostmap + last, lpc - last); crm_debug("Adding alias '%s'='%s'", name, value); g_hash_table_replace(aliases, name, value); if (targets) { *targets = g_list_append(*targets, strdup(value)); } value = NULL; name = NULL; added++; } else if (lpc > last) { crm_debug("Parse error at offset %d near '%s'", lpc - last, hostmap + last); } last = lpc + 1; break; } if (hostmap[lpc] == 0) { break; } } if (added == 0) { crm_info("No host mappings detected in '%s'", hostmap); } free(name); return aliases; } static void parse_host_line(const char *line, int max, GListPtr * output) { int lpc = 0; int last = 0; if (max <= 0) { return; } /* Check for any complaints about additional parameters that the device doesn't understand */ if (strstr(line, "invalid") || strstr(line, "variable")) { crm_debug("Skipping: %s", line); return; } crm_trace("Processing %d bytes: [%s]", max, line); /* Skip initial whitespace */ for (lpc = 0; lpc <= max && isspace(line[lpc]); lpc++) { last = lpc + 1; } /* Now the actual content */ for (lpc = 0; lpc <= max; lpc++) { gboolean a_space = isspace(line[lpc]); if (a_space && lpc < max && isspace(line[lpc + 1])) { /* fast-forward to the end of the spaces */ } else if (a_space || line[lpc] == ',' || line[lpc] == ';' || line[lpc] == 0) { 
int rc = 1; char *entry = NULL; if (lpc != last) { entry = calloc(1, 1 + lpc - last); rc = sscanf(line + last, "%[a-zA-Z0-9_-.]", entry); } if (entry == NULL) { /* Skip */ } else if (rc != 1) { crm_warn("Could not parse (%d %d): %s", last, lpc, line + last); } else if (safe_str_neq(entry, "on") && safe_str_neq(entry, "off")) { crm_trace("Adding '%s'", entry); *output = g_list_append(*output, entry); entry = NULL; } free(entry); last = lpc + 1; } } } static GListPtr parse_host_list(const char *hosts) { int lpc = 0; int max = 0; int last = 0; GListPtr output = NULL; if (hosts == NULL) { return output; } max = strlen(hosts); for (lpc = 0; lpc <= max; lpc++) { if (hosts[lpc] == '\n' || hosts[lpc] == 0) { char *line = NULL; int len = lpc - last; if(len > 1) { line = malloc(1 + len); } if(line) { snprintf(line, 1 + len, "%s", hosts + last); line[len] = 0; /* Because it might be '\n' */ parse_host_line(line, len, &output); free(line); } last = lpc + 1; } } crm_trace("Parsed %d entries from '%s'", g_list_length(output), hosts); return output; } GHashTable *metadata_cache = NULL; static xmlNode * get_agent_metadata(const char *agent) { xmlNode *xml = NULL; char *buffer = NULL; if(metadata_cache == NULL) { metadata_cache = g_hash_table_new_full( crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); } buffer = g_hash_table_lookup(metadata_cache, agent); if(safe_str_eq(agent, STONITH_WATCHDOG_AGENT)) { return NULL; } else if(buffer == NULL) { stonith_t *st = stonith_api_new(); int rc = st->cmds->metadata(st, st_opt_sync_call, agent, NULL, &buffer, 10); stonith_api_delete(st); if (rc || !buffer) { crm_err("Could not retrieve metadata for fencing agent %s", agent); return NULL; } g_hash_table_replace(metadata_cache, strdup(agent), buffer); } xml = string2xml(buffer); return xml; } static gboolean is_nodeid_required(xmlNode * xml) { xmlXPathObjectPtr xpath = NULL; if (stand_alone) { return FALSE; } if (!xml) { return FALSE; } xpath = xpath_search(xml, 
"//parameter[@name='nodeid']"); if (numXpathResults(xpath) <= 0) { freeXpathObject(xpath); return FALSE; } freeXpathObject(xpath); return TRUE; } static char * add_action(char *actions, const char *action) { static size_t len = 256; int offset = 0; if (actions == NULL) { actions = calloc(1, len); } else { offset = strlen(actions); } if (offset > 0) { offset += snprintf(actions+offset, len-offset, " "); } offset += snprintf(actions+offset, len-offset, "%s", action); return actions; } static void read_action_metadata(stonith_device_t *device) { xmlXPathObjectPtr xpath = NULL; int max = 0; int lpc = 0; if (device->agent_metadata == NULL) { return; } xpath = xpath_search(device->agent_metadata, "//action"); max = numXpathResults(xpath); if (max <= 0) { freeXpathObject(xpath); return; } for (lpc = 0; lpc < max; lpc++) { const char *on_target = NULL; const char *action = NULL; xmlNode *match = getXpathResult(xpath, lpc); CRM_LOG_ASSERT(match != NULL); if(match == NULL) { continue; }; on_target = crm_element_value(match, "on_target"); action = crm_element_value(match, "name"); if(safe_str_eq(action, "list")) { set_bit(device->flags, st_device_supports_list); } else if(safe_str_eq(action, "status")) { set_bit(device->flags, st_device_supports_status); } else if(safe_str_eq(action, "reboot")) { set_bit(device->flags, st_device_supports_reboot); } else if (safe_str_eq(action, "on")) { /* "automatic" means the cluster will unfence node when it joins */ const char *automatic = crm_element_value(match, "automatic"); /* "required" is a deprecated synonym for "automatic" */ const char *required = crm_element_value(match, "required"); if (crm_is_true(automatic) || crm_is_true(required)) { device->automatic_unfencing = TRUE; } } if (action && crm_is_true(on_target)) { device->on_target_actions = add_action(device->on_target_actions, action); } } freeXpathObject(xpath); } static stonith_device_t * build_device_from_xml(xmlNode * msg) { const char *value = NULL; xmlNode *dev = 
get_xpath_object("//" F_STONITH_DEVICE, msg, LOG_ERR); stonith_device_t *device = NULL; device = calloc(1, sizeof(stonith_device_t)); device->id = crm_element_value_copy(dev, XML_ATTR_ID); device->agent = crm_element_value_copy(dev, "agent"); device->namespace = crm_element_value_copy(dev, "namespace"); device->params = xml2list(dev); value = g_hash_table_lookup(device->params, STONITH_ATTR_HOSTLIST); if (value) { device->targets = parse_host_list(value); } value = g_hash_table_lookup(device->params, STONITH_ATTR_HOSTMAP); device->aliases = build_port_aliases(value, &(device->targets)); device->agent_metadata = get_agent_metadata(device->agent); read_action_metadata(device); value = g_hash_table_lookup(device->params, "nodeid"); if (!value) { device->include_nodeid = is_nodeid_required(device->agent_metadata); } value = crm_element_value(dev, "rsc_provides"); if (safe_str_eq(value, "unfencing")) { device->automatic_unfencing = TRUE; } if (is_action_required("on", device)) { crm_info("The fencing device '%s' requires unfencing", device->id); } if (device->on_target_actions) { crm_info("The fencing device '%s' requires actions (%s) to be executed on the target node", device->id, device->on_target_actions); } device->work = mainloop_add_trigger(G_PRIORITY_HIGH, stonith_device_dispatch, device); /* TODO: Hook up priority */ return device; } static const char * target_list_type(stonith_device_t * dev) { const char *check_type = NULL; check_type = g_hash_table_lookup(dev->params, STONITH_ATTR_HOSTCHECK); if (check_type == NULL) { if (g_hash_table_lookup(dev->params, STONITH_ATTR_HOSTLIST)) { check_type = "static-list"; } else if (g_hash_table_lookup(dev->params, STONITH_ATTR_HOSTMAP)) { check_type = "static-list"; } else if(is_set(dev->flags, st_device_supports_list)){ check_type = "dynamic-list"; } else if(is_set(dev->flags, st_device_supports_status)){ check_type = "status"; } else { check_type = "none"; } } return check_type; } void schedule_internal_command(const 
char *origin, stonith_device_t * device, const char *action, const char *victim, int timeout, void *internal_user_data, void (*done_cb) (GPid pid, int rc, const char *output, gpointer user_data)) { async_command_t *cmd = NULL; cmd = calloc(1, sizeof(async_command_t)); cmd->id = -1; cmd->default_timeout = timeout ? timeout : 60; cmd->timeout = cmd->default_timeout; cmd->action = strdup(action); cmd->victim = victim ? strdup(victim) : NULL; cmd->device = strdup(device->id); cmd->origin = strdup(origin); cmd->client = strdup(crm_system_name); cmd->client_name = strdup(crm_system_name); cmd->internal_user_data = internal_user_data; cmd->done_cb = done_cb; /* cmd, not internal_user_data, is passed to 'done_cb' as the userdata */ schedule_stonith_command(cmd, device); } gboolean string_in_list(GListPtr list, const char *item) { int lpc = 0; int max = g_list_length(list); for (lpc = 0; lpc < max; lpc++) { const char *value = g_list_nth_data(list, lpc); if (safe_str_eq(item, value)) { return TRUE; } else { crm_trace("%d: '%s' != '%s'", lpc, item, value); } } return FALSE; } static void status_search_cb(GPid pid, int rc, const char *output, gpointer user_data) { async_command_t *cmd = user_data; struct device_search_s *search = cmd->internal_user_data; stonith_device_t *dev = cmd->device ? 
g_hash_table_lookup(device_list, cmd->device) : NULL; gboolean can = FALSE; free_async_command(cmd); if (!dev) { search_devices_record_result(search, NULL, FALSE); return; } dev->active_pid = 0; mainloop_set_trigger(dev->work); if (rc == 1 /* unknown */ ) { crm_trace("Host %s is not known by %s", search->host, dev->id); } else if (rc == 0 /* active */ || rc == 2 /* inactive */ ) { crm_trace("Host %s is known by %s", search->host, dev->id); can = TRUE; } else { crm_notice("Unknown result when testing if %s can fence %s: rc=%d", dev->id, search->host, rc); } search_devices_record_result(search, dev->id, can); } static void dynamic_list_search_cb(GPid pid, int rc, const char *output, gpointer user_data) { async_command_t *cmd = user_data; struct device_search_s *search = cmd->internal_user_data; stonith_device_t *dev = cmd->device ? g_hash_table_lookup(device_list, cmd->device) : NULL; gboolean can_fence = FALSE; free_async_command(cmd); /* Host/alias must be in the list output to be eligible to be fenced * * Will cause problems if down'd nodes aren't listed or (for virtual nodes) * if the guest is still listed despite being moved to another machine */ if (!dev) { search_devices_record_result(search, NULL, FALSE); return; } dev->active_pid = 0; mainloop_set_trigger(dev->work); /* If we successfully got the targets earlier, don't disable. 
*/ if (rc != 0 && !dev->targets) { crm_notice("Disabling port list queries for %s (%d): %s", dev->id, rc, output); /* Fall back to status */ g_hash_table_replace(dev->params, strdup(STONITH_ATTR_HOSTCHECK), strdup("status")); g_list_free_full(dev->targets, free); dev->targets = NULL; } else if (!rc) { crm_info("Refreshing port list for %s", dev->id); g_list_free_full(dev->targets, free); dev->targets = parse_host_list(output); dev->targets_age = time(NULL); } if (dev->targets) { const char *alias = g_hash_table_lookup(dev->aliases, search->host); if (!alias) { alias = search->host; } if (string_in_list(dev->targets, alias)) { can_fence = TRUE; } } search_devices_record_result(search, dev->id, can_fence); } /*! * \internal * \brief Checks to see if an identical device already exists in the device_list */ static stonith_device_t * device_has_duplicate(stonith_device_t * device) { char *key = NULL; char *value = NULL; GHashTableIter gIter; stonith_device_t *dup = g_hash_table_lookup(device_list, device->id); if (!dup) { crm_trace("No match for %s", device->id); return NULL; } else if (safe_str_neq(dup->agent, device->agent)) { crm_trace("Different agent: %s != %s", dup->agent, device->agent); return NULL; } /* Use calculate_operation_digest() here? 
*/ g_hash_table_iter_init(&gIter, device->params); while (g_hash_table_iter_next(&gIter, (void **)&key, (void **)&value)) { if(strstr(key, "CRM_meta") == key) { continue; } else if(strcmp(key, "crm_feature_set") == 0) { continue; } else { char *other_value = g_hash_table_lookup(dup->params, key); if (!other_value || safe_str_neq(other_value, value)) { crm_trace("Different value for %s: %s != %s", key, other_value, value); return NULL; } } } crm_trace("Match"); return dup; } int stonith_device_register(xmlNode * msg, const char **desc, gboolean from_cib) { stonith_device_t *dup = NULL; stonith_device_t *device = build_device_from_xml(msg); dup = device_has_duplicate(device); if (dup) { crm_debug("Device '%s' already existed in device list (%d active devices)", device->id, g_hash_table_size(device_list)); free_device(device); device = dup; } else { stonith_device_t *old = g_hash_table_lookup(device_list, device->id); if (from_cib && old && old->api_registered) { /* If the cib is writing over an entry that is shared with a stonith client, * copy any pending ops that currently exist on the old entry to the new one. 
* Otherwise the pending ops will be reported as failures */ crm_info("Overwriting an existing entry for %s from the cib", device->id); device->pending_ops = old->pending_ops; device->api_registered = TRUE; old->pending_ops = NULL; if (device->pending_ops) { mainloop_set_trigger(device->work); } } g_hash_table_replace(device_list, device->id, device); crm_notice("Added '%s' to the device list (%d active devices)", device->id, g_hash_table_size(device_list)); } if (desc) { *desc = device->id; } if (from_cib) { device->cib_registered = TRUE; } else { device->api_registered = TRUE; } return pcmk_ok; } int stonith_device_remove(const char *id, gboolean from_cib) { stonith_device_t *device = g_hash_table_lookup(device_list, id); if (!device) { crm_info("Device '%s' not found (%d active devices)", id, g_hash_table_size(device_list)); return pcmk_ok; } if (from_cib) { device->cib_registered = FALSE; } else { device->verified = FALSE; device->api_registered = FALSE; } if (!device->cib_registered && !device->api_registered) { g_hash_table_remove(device_list, id); crm_info("Removed '%s' from the device list (%d active devices)", id, g_hash_table_size(device_list)); } return pcmk_ok; } /*! * \internal * \brief Return the number of stonith levels registered for a node * * \param[in] tp Node's topology table entry * * \return Number of non-NULL levels in topology entry * \note This function is used only for log messages. 
*/ static int count_active_levels(stonith_topology_t * tp) { int lpc = 0; int count = 0; for (lpc = 0; lpc < ST_LEVEL_MAX; lpc++) { if (tp->levels[lpc] != NULL) { count++; } } return count; } void free_topology_entry(gpointer data) { stonith_topology_t *tp = data; int lpc = 0; for (lpc = 0; lpc < ST_LEVEL_MAX; lpc++) { if (tp->levels[lpc] != NULL) { g_list_free_full(tp->levels[lpc], free); } } free(tp->target); free(tp); } +/* + * \internal + * \brief Check whether a string contains an attribute name/value separator + * + * A topology target may be specified either as a node name regular expression + * or as a node attribute name/value pair. Name/value pairs may be specified + * as either "name=value" or "name:value". If the given string contains an '=' + * or a single ':', this function will return a pointer to it. + * + * \param[in] nvpair String to check + * + * \return Pointer to separator if present, NULL otherwise + */ +static char * +find_nvpair_separator(const char *nvpair) +{ + char *sep; + + /* If we find an equals sign, return pointer to it */ + sep = strchr(nvpair, '='); + if (sep != NULL) { + return sep; + } + + /* If we find a colon, make sure no second colon follows it (nvpairs have + * exactly one, IPv6 addresses more); search from sep + 1, not sep itself. + */ + sep = strchr(nvpair, ':'); + if ((sep != NULL) && (strchr(sep + 1, ':') == NULL)) { + return sep; + } + + /* We didn't find a separator */ + return NULL; +} + /*! * \internal * \brief Register a STONITH level for a target * * Given an XML request specifying the target name, level index, and device IDs * for the level, this will create an entry for the target in the global topology * table if one does not already exist, then append the specified device IDs to * the entry's device list for the specified level.
* * \param[in] msg XML request for STONITH level registration * \param[out] desc If not NULL, will be set to string representation ("TARGET[LEVEL]") * * \return pcmk_ok on success, -EINVAL if XML does not specify valid level index */ int stonith_level_register(xmlNode * msg, char **desc) { int id = 0; xmlNode *child = NULL; - xmlNode *level = get_xpath_object("//" F_STONITH_LEVEL, msg, LOG_ERR); - const char *target = crm_element_value(level, F_STONITH_TARGET); - stonith_topology_t *tp = g_hash_table_lookup(topology, target); - - CRM_LOG_ASSERT(target != NULL); + char *target, *sep; + stonith_topology_t *tp; + /* Parse target and level index from XML, and use to return a description */ + target = crm_element_value_copy(level, F_STONITH_TARGET); crm_element_value_int(level, XML_ATTR_ID, &id); if (desc) { *desc = crm_strdup_printf("%s[%d]", target, id); } - if (id <= 0 || id >= ST_LEVEL_MAX) { - return -EINVAL; + + /* Sanity-check arguments */ + if ((target == NULL) || (id <= 0) || (id >= ST_LEVEL_MAX)) { + goto invalid_argument; } - /* Target-by-node-attribute requires the CIB, so disallow if standalone */ - if (stand_alone && target && strchr(target, '=')) { - return -EINVAL; + /* Check whether level is targeting by node attribute */ + sep = find_nvpair_separator(target); + if (sep != NULL) { + /* Target-by-attribute requires the CIB, so disallow if standalone */ + if (stand_alone) { + goto invalid_argument; + } + + /* Target-by-attribute can be specified using '=' or ':' as separator, + * but always use '=' internally to recognize it as the same target + */ + *sep = '='; } + /* Find or create topology table entry */ + tp = g_hash_table_lookup(topology, target); if (tp == NULL) { tp = calloc(1, sizeof(stonith_topology_t)); - tp->target = strdup(target); + tp->target = target; g_hash_table_replace(topology, tp->target, tp); crm_trace("Added %s to the topology (%d active entries)", target, g_hash_table_size(topology)); + } else { + free(target); } if (tp->levels[id] 
!= NULL) { - crm_info("Adding to the existing %s[%d] topology entry", target, id); + crm_info("Adding to the existing %s[%d] topology entry", + tp->target, id); } for (child = __xml_first_child(level); child != NULL; child = __xml_next(child)) { const char *device = ID(child); - crm_trace("Adding device '%s' for %s[%d]", device, target, id); + crm_trace("Adding device '%s' for %s[%d]", device, tp->target, id); tp->levels[id] = g_list_append(tp->levels[id], strdup(device)); } crm_info("Target %s has %d active fencing levels", - target, count_active_levels(tp)); + tp->target, count_active_levels(tp)); return pcmk_ok; + +invalid_argument: + free(target); + return -EINVAL; } int stonith_level_remove(xmlNode * msg, char **desc) { int id = 0; xmlNode *level = get_xpath_object("//" F_STONITH_LEVEL, msg, LOG_ERR); - const char *target = crm_element_value(level, F_STONITH_TARGET); - stonith_topology_t *tp = g_hash_table_lookup(topology, target); - - CRM_LOG_ASSERT(target != NULL); + char *target, *sep; + stonith_topology_t *tp; + /* Parse target and level index from XML, and use to return a description */ + target = crm_element_value_copy(level, F_STONITH_TARGET); + crm_element_value_int(level, XML_ATTR_ID, &id); if (desc) { *desc = crm_strdup_printf("%s[%d]", target, id); } - crm_element_value_int(level, XML_ATTR_ID, &id); + /* Sanity-check arguments; unlike registering, id==0 here means all */ + if ((target == NULL) || (id < 0) || (id >= ST_LEVEL_MAX)) { + free(target); + return -EINVAL; + } + + /* Target-by-attribute can be specified using '=' or ':' as separator, + * but always use '=' internally to recognize it as the same target + */ + sep = find_nvpair_separator(target); + if (sep != NULL) { + *sep = '='; + } + + tp = g_hash_table_lookup(topology, target); if (tp == NULL) { crm_info("Topology for %s not found (%d active entries)", target, g_hash_table_size(topology)); - return pcmk_ok; - - } else if (id < 0 || id >= ST_LEVEL_MAX) { - return -EINVAL; - } - if (id == 0 
&& g_hash_table_remove(topology, target)) { + } else if (id == 0 && g_hash_table_remove(topology, target)) { crm_info("Removed all %s related entries from the topology (%d active entries)", target, g_hash_table_size(topology)); } else if (id > 0 && tp->levels[id] != NULL) { g_list_free_full(tp->levels[id], free); tp->levels[id] = NULL; crm_info("Removed level '%d' from topology for %s (%d active levels remaining)", id, target, count_active_levels(tp)); } + free(target); return pcmk_ok; } static int stonith_device_action(xmlNode * msg, char **output) { int rc = pcmk_ok; xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, msg, LOG_ERR); const char *id = crm_element_value(dev, F_STONITH_DEVICE); async_command_t *cmd = NULL; stonith_device_t *device = NULL; if (id) { crm_trace("Looking for '%s'", id); device = g_hash_table_lookup(device_list, id); } if (device && device->api_registered == FALSE) { rc = -ENODEV; } else if (device) { cmd = create_async_command(msg); if (cmd == NULL) { return -EPROTO; } schedule_stonith_command(cmd, device); rc = -EINPROGRESS; } else { crm_info("Device %s not found", id ? id : ""); rc = -ENODEV; } return rc; } static void search_devices_record_result(struct device_search_s *search, const char *device, gboolean can_fence) { search->replies_received++; if (can_fence && device) { search->capable = g_list_append(search->capable, strdup(device)); } if (search->replies_needed == search->replies_received) { crm_debug("Finished Search. %d devices can perform action (%s) on node %s", g_list_length(search->capable), search->action ? search->action : "", search->host ? 
search->host : ""); search->callback(search->capable, search->user_data); free(search->host); free(search->action); free(search); } } /* * \internal * \brief Check whether the local host is allowed to execute a fencing action * * \param[in] device Fence device to check * \param[in] action Fence action to check * \param[in] target Hostname of fence target * \param[in] allow_suicide Whether self-fencing is allowed for this operation * * \return TRUE if local host is allowed to execute action, FALSE otherwise */ static gboolean localhost_is_eligible(const stonith_device_t *device, const char *action, const char *target, gboolean allow_suicide) { gboolean localhost_is_target = safe_str_eq(target, stonith_our_uname); if (device && action && device->on_target_actions && strstr(device->on_target_actions, action)) { if (!localhost_is_target) { crm_trace("%s operation with %s can only be executed for localhost not %s", action, device->id, target); return FALSE; } } else if (localhost_is_target && !allow_suicide) { crm_trace("%s operation does not support self-fencing", action); return FALSE; } return TRUE; } static void can_fence_host_with_device(stonith_device_t * dev, struct device_search_s *search) { gboolean can = FALSE; const char *check_type = NULL; const char *host = search->host; const char *alias = NULL; CRM_LOG_ASSERT(dev != NULL); if (dev == NULL) { goto search_report_results; } else if (host == NULL) { can = TRUE; goto search_report_results; } /* Short-circuit query if this host is not allowed to perform the action */ if (safe_str_eq(search->action, "reboot")) { /* A "reboot" *might* get remapped to "off" then "on", so short-circuit * only if all three are disallowed. If only one or two are disallowed, * we'll report that with the results. We never allow suicide for * remapped "on" operations because the host is off at that point. 
*/ if (!localhost_is_eligible(dev, "reboot", host, search->allow_suicide) && !localhost_is_eligible(dev, "off", host, search->allow_suicide) && !localhost_is_eligible(dev, "on", host, FALSE)) { goto search_report_results; } } else if (!localhost_is_eligible(dev, search->action, host, search->allow_suicide)) { goto search_report_results; } alias = g_hash_table_lookup(dev->aliases, host); if (alias == NULL) { alias = host; } check_type = target_list_type(dev); if (safe_str_eq(check_type, "none")) { can = TRUE; } else if (safe_str_eq(check_type, "static-list")) { /* Presence in the hostmap is sufficient * Only use if all hosts on which the device can be active can always fence all listed hosts */ if (string_in_list(dev->targets, host)) { can = TRUE; } else if (g_hash_table_lookup(dev->params, STONITH_ATTR_HOSTMAP) && g_hash_table_lookup(dev->aliases, host)) { can = TRUE; } } else if (safe_str_eq(check_type, "dynamic-list")) { time_t now = time(NULL); if (dev->targets == NULL || dev->targets_age + 60 < now) { crm_trace("Running %s command to see if %s can fence %s (%s)", check_type, dev?dev->id:"N/A", search->host, search->action); schedule_internal_command(__FUNCTION__, dev, "list", NULL, search->per_device_timeout, search, dynamic_list_search_cb); /* we'll respond to this search request async in the cb */ return; } if (string_in_list(dev->targets, alias)) { can = TRUE; } } else if (safe_str_eq(check_type, "status")) { crm_trace("Running %s command to see if %s can fence %s (%s)", check_type, dev?dev->id:"N/A", search->host, search->action); schedule_internal_command(__FUNCTION__, dev, "status", search->host, search->per_device_timeout, search, status_search_cb); /* we'll respond to this search request async in the cb */ return; } else { crm_err("Unknown check type: %s", check_type); } if (safe_str_eq(host, alias)) { crm_notice("%s can%s fence (%s) %s: %s", dev->id, can ? "" : " not", search->action, host, check_type); } else { crm_notice("%s can%s fence (%s) %s (aka. 
'%s'): %s", dev->id, can ? "" : " not", search->action, host, alias, check_type); } search_report_results: search_devices_record_result(search, dev ? dev->id : NULL, can); } static void search_devices(gpointer key, gpointer value, gpointer user_data) { stonith_device_t *dev = value; struct device_search_s *search = user_data; can_fence_host_with_device(dev, search); } #define DEFAULT_QUERY_TIMEOUT 20 static void get_capable_devices(const char *host, const char *action, int timeout, bool suicide, void *user_data, void (*callback) (GList * devices, void *user_data)) { struct device_search_s *search; int per_device_timeout = DEFAULT_QUERY_TIMEOUT; int devices_needing_async_query = 0; char *key = NULL; const char *check_type = NULL; GHashTableIter gIter; stonith_device_t *device = NULL; if (!g_hash_table_size(device_list)) { callback(NULL, user_data); return; } search = calloc(1, sizeof(struct device_search_s)); if (!search) { callback(NULL, user_data); return; } g_hash_table_iter_init(&gIter, device_list); while (g_hash_table_iter_next(&gIter, (void **)&key, (void **)&device)) { check_type = target_list_type(device); if (safe_str_eq(check_type, "status") || safe_str_eq(check_type, "dynamic-list")) { devices_needing_async_query++; } } /* If we have devices that require an async event in order to know what * nodes they can fence, we have to give the events a timeout. The total * query timeout is divided among those events. 
*/ if (devices_needing_async_query) { per_device_timeout = timeout / devices_needing_async_query; if (!per_device_timeout) { crm_err("STONITH timeout %ds is too low; using %ds, but consider raising to at least %ds", timeout, DEFAULT_QUERY_TIMEOUT, DEFAULT_QUERY_TIMEOUT * devices_needing_async_query); per_device_timeout = DEFAULT_QUERY_TIMEOUT; } else if (per_device_timeout < DEFAULT_QUERY_TIMEOUT) { crm_notice("STONITH timeout %ds is low for the current configuration;" " consider raising to at least %ds", timeout, DEFAULT_QUERY_TIMEOUT * devices_needing_async_query); } } search->host = host ? strdup(host) : NULL; search->action = action ? strdup(action) : NULL; search->per_device_timeout = per_device_timeout; /* We are guaranteed this many replies. Even if a device gets * unregistered some how during the async search, we will get * the correct number of replies. */ search->replies_needed = g_hash_table_size(device_list); search->allow_suicide = suicide; search->callback = callback; search->user_data = user_data; /* kick off the search */ crm_debug("Searching through %d devices to see what is capable of action (%s) for target %s", search->replies_needed, search->action ? search->action : "", search->host ? 
search->host : ""); g_hash_table_foreach(device_list, search_devices, search); } struct st_query_data { xmlNode *reply; char *remote_peer; char *client_id; char *target; char *action; int call_options; }; /* * \internal * \brief Add action-specific attributes to query reply XML * * \param[in,out] xml XML to add attributes to * \param[in] action Fence action * \param[in] device Fence device */ static void add_action_specific_attributes(xmlNode *xml, const char *action, stonith_device_t *device) { int action_specific_timeout; int delay_max; CRM_CHECK(xml && action && device, return); if (is_action_required(action, device)) { crm_trace("Action %s is required on %s", action, device->id); crm_xml_add_int(xml, F_STONITH_DEVICE_REQUIRED, 1); } action_specific_timeout = get_action_timeout(device, action, 0); if (action_specific_timeout) { crm_trace("Action %s has timeout %dms on %s", action, action_specific_timeout, device->id); crm_xml_add_int(xml, F_STONITH_ACTION_TIMEOUT, action_specific_timeout); } delay_max = get_action_delay_max(device, action); if (delay_max > 0) { crm_trace("Action %s has maximum random delay %dms on %s", action, delay_max, device->id); crm_xml_add_int(xml, F_STONITH_DELAY_MAX, delay_max / 1000); } } /* * \internal * \brief Add "disallowed" attribute to query reply XML if appropriate * * \param[in,out] xml XML to add attribute to * \param[in] action Fence action * \param[in] device Fence device * \param[in] target Fence target * \param[in] allow_suicide Whether self-fencing is allowed */ static void add_disallowed(xmlNode *xml, const char *action, stonith_device_t *device, const char *target, gboolean allow_suicide) { if (!localhost_is_eligible(device, action, target, allow_suicide)) { crm_trace("Action %s on %s is disallowed for local host", action, device->id); crm_xml_add(xml, F_STONITH_ACTION_DISALLOWED, XML_BOOLEAN_TRUE); } } /* * \internal * \brief Add child element with action-specific values to query reply XML * * \param[in,out] xml XML to 
add attribute to * \param[in] action Fence action * \param[in] device Fence device * \param[in] target Fence target * \param[in] allow_suicide Whether self-fencing is allowed */ static void add_action_reply(xmlNode *xml, const char *action, stonith_device_t *device, const char *target, gboolean allow_suicide) { xmlNode *child = create_xml_node(xml, F_STONITH_ACTION); crm_xml_add(child, XML_ATTR_ID, action); add_action_specific_attributes(child, action, device); add_disallowed(child, action, device, target, allow_suicide); } static void stonith_query_capable_device_cb(GList * devices, void *user_data) { struct st_query_data *query = user_data; int available_devices = 0; xmlNode *dev = NULL; xmlNode *list = NULL; GListPtr lpc = NULL; /* Pack the results into XML */ list = create_xml_node(NULL, __FUNCTION__); crm_xml_add(list, F_STONITH_TARGET, query->target); for (lpc = devices; lpc != NULL; lpc = lpc->next) { stonith_device_t *device = g_hash_table_lookup(device_list, lpc->data); const char *action = query->action; if (!device) { /* It is possible the device got unregistered while * determining who can fence the target */ continue; } available_devices++; dev = create_xml_node(list, F_STONITH_DEVICE); crm_xml_add(dev, XML_ATTR_ID, device->id); crm_xml_add(dev, "namespace", device->namespace); crm_xml_add(dev, "agent", device->agent); crm_xml_add_int(dev, F_STONITH_DEVICE_VERIFIED, device->verified); /* If the originating stonithd wants to reboot the node, and we have a * capable device that doesn't support "reboot", remap to "off" instead. 
*/ if (is_not_set(device->flags, st_device_supports_reboot) && safe_str_eq(query->action, "reboot")) { crm_trace("%s doesn't support reboot, using values for off instead", device->id); action = "off"; } /* Add action-specific values if available */ add_action_specific_attributes(dev, action, device); if (safe_str_eq(query->action, "reboot")) { /* A "reboot" *might* get remapped to "off" then "on", so after * sending the "reboot"-specific values in the main element, we add * sub-elements for "off" and "on" values. * * We short-circuited earlier if "reboot", "off" and "on" are all * disallowed for the local host. However if only one or two are * disallowed, we send back the results and mark which ones are * disallowed. If "reboot" is disallowed, this might cause problems * with older stonithd versions, which won't check for it. Older * versions will ignore "off" and "on", so they are not a problem. */ add_disallowed(dev, action, device, query->target, is_set(query->call_options, st_opt_allow_suicide)); add_action_reply(dev, "off", device, query->target, is_set(query->call_options, st_opt_allow_suicide)); add_action_reply(dev, "on", device, query->target, FALSE); } /* A query without a target wants device parameters */ if (query->target == NULL) { xmlNode *attrs = create_xml_node(dev, XML_TAG_ATTRS); g_hash_table_foreach(device->params, hash2field, attrs); } } crm_xml_add_int(list, F_STONITH_AVAILABLE_DEVICES, available_devices); if (query->target) { crm_debug("Found %d matching devices for '%s'", available_devices, query->target); } else { crm_debug("%d devices installed", available_devices); } if (list != NULL) { crm_log_xml_trace(list, "Add query results"); add_message_xml(query->reply, F_STONITH_CALLDATA, list); } stonith_send_reply(query->reply, query->call_options, query->remote_peer, query->client_id); free_xml(query->reply); free(query->remote_peer); free(query->client_id); free(query->target); free(query->action); free(query); free_xml(list); 
g_list_free_full(devices, free); } static void stonith_query(xmlNode * msg, const char *remote_peer, const char *client_id, int call_options) { struct st_query_data *query = NULL; const char *action = NULL; const char *target = NULL; int timeout = 0; xmlNode *dev = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_DEBUG_3); crm_element_value_int(msg, F_STONITH_TIMEOUT, &timeout); if (dev) { const char *device = crm_element_value(dev, F_STONITH_DEVICE); target = crm_element_value(dev, F_STONITH_TARGET); action = crm_element_value(dev, F_STONITH_ACTION); if (device && safe_str_eq(device, "manual_ack")) { /* No query or reply necessary */ return; } } crm_log_xml_debug(msg, "Query"); query = calloc(1, sizeof(struct st_query_data)); query->reply = stonith_construct_reply(msg, NULL, NULL, pcmk_ok); query->remote_peer = remote_peer ? strdup(remote_peer) : NULL; query->client_id = client_id ? strdup(client_id) : NULL; query->target = target ? strdup(target) : NULL; query->action = action ? strdup(action) : NULL; query->call_options = call_options; get_capable_devices(target, action, timeout, is_set(call_options, st_opt_allow_suicide), query, stonith_query_capable_device_cb); } #define ST_LOG_OUTPUT_MAX 512 static void log_operation(async_command_t * cmd, int rc, int pid, const char *next, const char *output) { if (rc == 0) { next = NULL; } if (cmd->victim != NULL) { do_crm_log(rc == 0 ? LOG_NOTICE : LOG_ERR, "Operation '%s' [%d] (call %d from %s) for host '%s' with device '%s' returned: %d (%s)%s%s", cmd->action, pid, cmd->id, cmd->client_name, cmd->victim, cmd->device, rc, pcmk_strerror(rc), next ? ". Trying: " : "", next ? next : ""); } else { do_crm_log_unlikely(rc == 0 ? LOG_DEBUG : LOG_NOTICE, "Operation '%s' [%d] for device '%s' returned: %d (%s)%s%s", cmd->action, pid, cmd->device, rc, pcmk_strerror(rc), next ? ". Trying: " : "", next ? 
next : ""); } if (output) { /* Logging the whole string confuses syslog when the string is xml */ char *prefix = crm_strdup_printf("%s:%d", cmd->device, pid); crm_log_output(rc == 0 ? LOG_DEBUG : LOG_WARNING, prefix, output); free(prefix); } } static void stonith_send_async_reply(async_command_t * cmd, const char *output, int rc, GPid pid) { xmlNode *reply = NULL; gboolean bcast = FALSE; reply = stonith_construct_async_reply(cmd, output, NULL, rc); if (safe_str_eq(cmd->action, "metadata")) { /* Too verbose to log */ crm_trace("Metadata query for %s", cmd->device); output = NULL; } else if (crm_str_eq(cmd->action, "monitor", TRUE) || crm_str_eq(cmd->action, "list", TRUE) || crm_str_eq(cmd->action, "status", TRUE)) { crm_trace("Never broadcast %s replies", cmd->action); } else if (!stand_alone && safe_str_eq(cmd->origin, cmd->victim) && safe_str_neq(cmd->action, "on")) { crm_trace("Broadcast %s reply for %s", cmd->action, cmd->victim); crm_xml_add(reply, F_SUBTYPE, "broadcast"); bcast = TRUE; } log_operation(cmd, rc, pid, NULL, output); crm_log_xml_trace(reply, "Reply"); if (bcast) { crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); } else if (cmd->origin) { crm_trace("Directed reply to %s", cmd->origin); send_cluster_message(crm_get_peer(0, cmd->origin), crm_msg_stonith_ng, reply, FALSE); } else { crm_trace("Directed local %ssync reply to %s", (cmd->options & st_opt_sync_call) ? 
"" : "a-", cmd->client_name); do_local_reply(reply, cmd->client, cmd->options & st_opt_sync_call, FALSE); } if (stand_alone) { /* Do notification with a clean data object */ xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); crm_xml_add_int(notify_data, F_STONITH_RC, rc); crm_xml_add(notify_data, F_STONITH_TARGET, cmd->victim); crm_xml_add(notify_data, F_STONITH_OPERATION, cmd->op); crm_xml_add(notify_data, F_STONITH_DELEGATE, "localhost"); crm_xml_add(notify_data, F_STONITH_DEVICE, cmd->device); crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data); } free_xml(reply); } void unfence_cb(GPid pid, int rc, const char *output, gpointer user_data) { async_command_t * cmd = user_data; stonith_device_t *dev = g_hash_table_lookup(device_list, cmd->device); log_operation(cmd, rc, pid, NULL, output); if(dev) { dev->active_pid = 0; mainloop_set_trigger(dev->work); } else { crm_trace("Device %s does not exist", cmd->device); } if(rc != 0) { crm_exit(DAEMON_RESPAWN_STOP); } } static void cancel_stonith_command(async_command_t * cmd) { stonith_device_t *device; CRM_CHECK(cmd != NULL, return); if (!cmd->device) { return; } device = g_hash_table_lookup(device_list, cmd->device); if (device) { crm_trace("Cancel scheduled %s on %s", cmd->action, device->id); device->pending_ops = g_list_remove(device->pending_ops, cmd); } } static void st_child_done(GPid pid, int rc, const char *output, gpointer user_data) { stonith_device_t *device = NULL; stonith_device_t *next_device = NULL; async_command_t *cmd = user_data; GListPtr gIter = NULL; GListPtr gIterNext = NULL; CRM_CHECK(cmd != NULL, return); /* The device is ready to do something else now */ device = g_hash_table_lookup(device_list, cmd->device); if (device) { device->active_pid = 0; if (rc == pcmk_ok && (safe_str_eq(cmd->action, "list") || safe_str_eq(cmd->action, "monitor") || 
safe_str_eq(cmd->action, "status"))) { device->verified = TRUE; } mainloop_set_trigger(device->work); } crm_debug("Operation '%s' on '%s' completed with rc=%d (%d remaining)", cmd->action, cmd->device, rc, g_list_length(cmd->device_next)); if (rc == 0) { GListPtr iter; /* see if there are any required devices left to execute for this op */ for (iter = cmd->device_next; iter != NULL; iter = iter->next) { next_device = g_hash_table_lookup(device_list, iter->data); if (next_device != NULL && is_action_required(cmd->action, next_device)) { cmd->device_next = iter->next; break; } next_device = NULL; } } else if (rc != 0 && cmd->device_next && (is_action_required(cmd->action, device) == FALSE)) { /* if this device didn't work out, see if there are any others we can try. * if the failed device was 'required', we can't pick another device. */ next_device = g_hash_table_lookup(device_list, cmd->device_next->data); cmd->device_next = cmd->device_next->next; } /* this operation requires more fencing, hooray! */ if (next_device) { log_operation(cmd, rc, pid, cmd->device, output); schedule_stonith_command(cmd, next_device); /* Prevent cmd from being freed */ cmd = NULL; goto done; } if (rc > 0) { /* Try to provide _something_ useful */ if(output == NULL) { rc = -ENODATA; } else if(strstr(output, "imed out")) { rc = -ETIMEDOUT; } else if(strstr(output, "Unrecognised action")) { rc = -EOPNOTSUPP; } else { rc = -pcmk_err_generic; } } stonith_send_async_reply(cmd, output, rc, pid); if (rc != 0) { goto done; } /* Check to see if any operations are scheduled to do the exact * same thing that just completed. If so, rather than * performing the same fencing operation twice, return the result * of this operation for all pending commands it matches. */ for (gIter = cmd_list; gIter != NULL; gIter = gIterNext) { async_command_t *cmd_other = gIter->data; gIterNext = gIter->next; if (cmd == cmd_other) { continue; } /* A pending scheduled command matches the command that just finished if. 
* 1. The client connections are different. * 2. The node victim is the same. * 3. The fencing action is the same. * 4. The device scheduled to execute the action is the same. */ if (safe_str_eq(cmd->client, cmd_other->client) || safe_str_neq(cmd->victim, cmd_other->victim) || safe_str_neq(cmd->action, cmd_other->action) || safe_str_neq(cmd->device, cmd_other->device)) { continue; } /* Duplicate merging will do the right thing for either type of remapped * reboot. If the executing stonithd remapped an unsupported reboot to * off, then cmd->action will be reboot and will be merged with any * other reboot requests. If the originating stonithd remapped a * topology reboot to off then on, we will get here once with * cmd->action "off" and once with "on", and they will be merged * separately with similar requests. */ crm_notice ("Merging stonith action %s for node %s originating from client %s with identical stonith request from client %s", cmd_other->action, cmd_other->victim, cmd_other->client_name, cmd->client_name); cmd_list = g_list_remove_link(cmd_list, gIter); stonith_send_async_reply(cmd_other, output, rc, pid); cancel_stonith_command(cmd_other); free_async_command(cmd_other); g_list_free_1(gIter); } done: free_async_command(cmd); } static gint sort_device_priority(gconstpointer a, gconstpointer b) { const stonith_device_t *dev_a = a; const stonith_device_t *dev_b = b; if (dev_a->priority > dev_b->priority) { return -1; } else if (dev_a->priority < dev_b->priority) { return 1; } return 0; } static void stonith_fence_get_devices_cb(GList * devices, void *user_data) { async_command_t *cmd = user_data; stonith_device_t *device = NULL; crm_info("Found %d matching devices for '%s'", g_list_length(devices), cmd->victim); if (g_list_length(devices) > 0) { /* Order based on priority */ devices = g_list_sort(devices, sort_device_priority); device = g_hash_table_lookup(device_list, devices->data); if (device) { cmd->device_list = devices; cmd->device_next = devices->next; 
devices = NULL; /* list owned by cmd now */ } } /* we have a device, schedule it for fencing. */ if (device) { schedule_stonith_command(cmd, device); /* in progress */ return; } /* no device found! */ stonith_send_async_reply(cmd, NULL, -ENODEV, 0); free_async_command(cmd); g_list_free_full(devices, free); } static int stonith_fence(xmlNode * msg) { const char *device_id = NULL; stonith_device_t *device = NULL; async_command_t *cmd = create_async_command(msg); xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR); if (cmd == NULL) { return -EPROTO; } device_id = crm_element_value(dev, F_STONITH_DEVICE); if (device_id) { device = g_hash_table_lookup(device_list, device_id); if (device == NULL) { crm_err("Requested device '%s' is not available", device_id); return -ENODEV; } schedule_stonith_command(cmd, device); } else { const char *host = crm_element_value(dev, F_STONITH_TARGET); if (cmd->options & st_opt_cs_nodeid) { int nodeid = crm_atoi(host, NULL); crm_node_t *node = crm_get_peer(nodeid, NULL); if (node) { host = node->uname; } } /* If we get to here, then self-fencing is implicitly allowed */ get_capable_devices(host, cmd->action, cmd->default_timeout, TRUE, cmd, stonith_fence_get_devices_cb); } return -EINPROGRESS; } xmlNode * stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, int rc) { int lpc = 0; xmlNode *reply = NULL; const char *name = NULL; const char *value = NULL; const char *names[] = { F_STONITH_OPERATION, F_STONITH_CALLID, F_STONITH_CLIENTID, F_STONITH_CLIENTNAME, F_STONITH_REMOTE_OP_ID, F_STONITH_CALLOPTS }; crm_trace("Creating a basic reply"); reply = create_xml_node(NULL, T_STONITH_REPLY); crm_xml_add(reply, "st_origin", __FUNCTION__); crm_xml_add(reply, F_TYPE, T_STONITH_NG); crm_xml_add(reply, "st_output", output); crm_xml_add_int(reply, F_STONITH_RC, rc); CRM_CHECK(request != NULL, crm_warn("Can't create a sane reply"); return reply); for (lpc = 0; lpc < DIMOF(names); lpc++) { name = names[lpc]; 
value = crm_element_value(request, name); crm_xml_add(reply, name, value); } if (data != NULL) { crm_trace("Attaching reply output"); add_message_xml(reply, F_STONITH_CALLDATA, data); } return reply; } static xmlNode * stonith_construct_async_reply(async_command_t * cmd, const char *output, xmlNode * data, int rc) { xmlNode *reply = NULL; crm_trace("Creating a basic reply"); reply = create_xml_node(NULL, T_STONITH_REPLY); crm_xml_add(reply, "st_origin", __FUNCTION__); crm_xml_add(reply, F_TYPE, T_STONITH_NG); crm_xml_add(reply, F_STONITH_OPERATION, cmd->op); crm_xml_add(reply, F_STONITH_DEVICE, cmd->device); crm_xml_add(reply, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); crm_xml_add(reply, F_STONITH_CLIENTID, cmd->client); crm_xml_add(reply, F_STONITH_CLIENTNAME, cmd->client_name); crm_xml_add(reply, F_STONITH_TARGET, cmd->victim); crm_xml_add(reply, F_STONITH_ACTION, cmd->op); crm_xml_add(reply, F_STONITH_ORIGIN, cmd->origin); crm_xml_add_int(reply, F_STONITH_CALLID, cmd->id); crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options); crm_xml_add_int(reply, F_STONITH_RC, rc); crm_xml_add(reply, "st_output", output); if (data != NULL) { crm_info("Attaching reply output"); add_message_xml(reply, F_STONITH_CALLDATA, data); } return reply; } bool fencing_peer_active(crm_node_t *peer) { if (peer == NULL) { return FALSE; } else if (peer->uname == NULL) { return FALSE; } else if (is_set(peer->processes, crm_get_cluster_proc())) { return TRUE; } return FALSE; } /*! * \internal * \brief Determine if we need to use an alternate node to * fence the target. 
If so return that node's uname * * \retval NULL, no alternate host * \retval uname, uname of alternate host to use */ static const char * check_alternate_host(const char *target) { const char *alternate_host = NULL; if (find_topology_for_host(target) && safe_str_eq(target, stonith_our_uname)) { GHashTableIter gIter; crm_node_t *entry = NULL; g_hash_table_iter_init(&gIter, crm_peer_cache); while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { crm_trace("Checking for %s.%d != %s", entry->uname, entry->id, target); if (fencing_peer_active(entry) && safe_str_neq(entry->uname, target)) { alternate_host = entry->uname; break; } } if (alternate_host == NULL) { crm_err("No alternate host available to handle complex self fencing request"); g_hash_table_iter_init(&gIter, crm_peer_cache); while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { crm_notice("Peer[%d] %s", entry->id, entry->uname); } } } return alternate_host; } static void stonith_send_reply(xmlNode * reply, int call_options, const char *remote_peer, const char *client_id) { if (remote_peer) { send_cluster_message(crm_get_peer(0, remote_peer), crm_msg_stonith_ng, reply, FALSE); } else { do_local_reply(reply, client_id, is_set(call_options, st_opt_sync_call), remote_peer != NULL); } } static int handle_request(crm_client_t * client, uint32_t id, uint32_t flags, xmlNode * request, const char *remote_peer) { int call_options = 0; int rc = -EOPNOTSUPP; xmlNode *data = NULL; xmlNode *reply = NULL; char *output = NULL; const char *op = crm_element_value(request, F_STONITH_OPERATION); const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); if (is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(client == NULL || client->request_id == id); } if (crm_str_eq(op, CRM_OP_REGISTER, TRUE)) { xmlNode *reply = create_xml_node(NULL, "reply"); CRM_ASSERT(client); crm_xml_add(reply, F_STONITH_OPERATION, CRM_OP_REGISTER); 
crm_xml_add(reply, F_STONITH_CLIENTID, client->id); crm_ipcs_send(client, id, reply, flags); client->request_id = 0; free_xml(reply); return 0; } else if (crm_str_eq(op, STONITH_OP_EXEC, TRUE)) { rc = stonith_device_action(request, &output); } else if (crm_str_eq(op, STONITH_OP_TIMEOUT_UPDATE, TRUE)) { const char *call_id = crm_element_value(request, F_STONITH_CALLID); const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); int op_timeout = 0; crm_element_value_int(request, F_STONITH_TIMEOUT, &op_timeout); do_stonith_async_timeout_update(client_id, call_id, op_timeout); return 0; } else if (crm_str_eq(op, STONITH_OP_QUERY, TRUE)) { if (remote_peer) { create_remote_stonith_op(client_id, request, TRUE); /* Record it for the future notification */ } stonith_query(request, remote_peer, client_id, call_options); return 0; } else if (crm_str_eq(op, T_STONITH_NOTIFY, TRUE)) { const char *flag_name = NULL; CRM_ASSERT(client); flag_name = crm_element_value(request, F_STONITH_NOTIFY_ACTIVATE); if (flag_name) { crm_debug("Setting %s callbacks for %s (%s): ON", flag_name, client->name, client->id); client->options |= get_stonith_flag(flag_name); } flag_name = crm_element_value(request, F_STONITH_NOTIFY_DEACTIVATE); if (flag_name) { crm_debug("Setting %s callbacks for %s (%s): off", flag_name, client->name, client->id); client->options |= get_stonith_flag(flag_name); } if (flags & crm_ipc_client_response) { crm_ipcs_send_ack(client, id, flags, "ack", __FUNCTION__, __LINE__); } return 0; } else if (crm_str_eq(op, STONITH_OP_RELAY, TRUE)) { xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE); crm_notice("Peer %s has received a forwarded fencing request from %s to fence (%s) peer %s", stonith_our_uname, client ? 
client->name : remote_peer, crm_element_value(dev, F_STONITH_ACTION), crm_element_value(dev, F_STONITH_TARGET)); if (initiate_remote_stonith_op(NULL, request, FALSE) != NULL) { rc = -EINPROGRESS; } } else if (crm_str_eq(op, STONITH_OP_FENCE, TRUE)) { if (remote_peer || stand_alone) { rc = stonith_fence(request); } else if (call_options & st_opt_manual_ack) { remote_fencing_op_t *rop = NULL; xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE); const char *target = crm_element_value(dev, F_STONITH_TARGET); crm_notice("Received manual confirmation that %s is fenced", target); rop = initiate_remote_stonith_op(client, request, TRUE); rc = stonith_manual_ack(request, rop); } else { const char *alternate_host = NULL; xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE); const char *target = crm_element_value(dev, F_STONITH_TARGET); const char *action = crm_element_value(dev, F_STONITH_ACTION); const char *device = crm_element_value(dev, F_STONITH_DEVICE); if (client) { int tolerance = 0; crm_notice("Client %s.%.8s wants to fence (%s) '%s' with device '%s'", client->name, client->id, action, target, device ? device : "(any)"); crm_element_value_int(dev, F_STONITH_TOLERANCE, &tolerance); if (stonith_check_fence_tolerance(tolerance, target, action)) { rc = 0; goto done; } } else { crm_notice("Peer %s wants to fence (%s) '%s' with device '%s'", remote_peer, action, target, device ? 
device : "(any)"); } alternate_host = check_alternate_host(target); if (alternate_host && client) { const char *client_id = NULL; crm_notice("Forwarding complex self fencing request to peer %s", alternate_host); if (client) { client_id = client->id; } else { client_id = crm_element_value(request, F_STONITH_CLIENTID); } /* Create a record of it, otherwise call_id will be 0 if we need to notify of failures */ create_remote_stonith_op(client_id, request, FALSE); crm_xml_add(request, F_STONITH_OPERATION, STONITH_OP_RELAY); crm_xml_add(request, F_STONITH_CLIENTID, client->id); send_cluster_message(crm_get_peer(0, alternate_host), crm_msg_stonith_ng, request, FALSE); rc = -EINPROGRESS; } else if (initiate_remote_stonith_op(client, request, FALSE) != NULL) { rc = -EINPROGRESS; } } } else if (crm_str_eq(op, STONITH_OP_FENCE_HISTORY, TRUE)) { rc = stonith_fence_history(request, &data); } else if (crm_str_eq(op, STONITH_OP_DEVICE_ADD, TRUE)) { const char *id = NULL; xmlNode *notify_data = create_xml_node(NULL, op); rc = stonith_device_register(request, &id, FALSE); crm_xml_add(notify_data, F_STONITH_DEVICE, id); crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(device_list)); do_stonith_notify(call_options, op, rc, notify_data); free_xml(notify_data); } else if (crm_str_eq(op, STONITH_OP_DEVICE_DEL, TRUE)) { xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); const char *id = crm_element_value(dev, XML_ATTR_ID); xmlNode *notify_data = create_xml_node(NULL, op); rc = stonith_device_remove(id, FALSE); crm_xml_add(notify_data, F_STONITH_DEVICE, id); crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(device_list)); do_stonith_notify(call_options, op, rc, notify_data); free_xml(notify_data); } else if (crm_str_eq(op, STONITH_OP_LEVEL_ADD, TRUE)) { char *id = NULL; xmlNode *notify_data = create_xml_node(NULL, op); rc = stonith_level_register(request, &id); crm_xml_add(notify_data, F_STONITH_DEVICE, id); 
crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(topology)); do_stonith_notify(call_options, op, rc, notify_data); free_xml(notify_data); free(id); } else if (crm_str_eq(op, STONITH_OP_LEVEL_DEL, TRUE)) { char *id = NULL; xmlNode *notify_data = create_xml_node(NULL, op); rc = stonith_level_remove(request, &id); crm_xml_add(notify_data, F_STONITH_DEVICE, id); crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(topology)); do_stonith_notify(call_options, op, rc, notify_data); free_xml(notify_data); } else if (crm_str_eq(op, STONITH_OP_CONFIRM, TRUE)) { async_command_t *cmd = create_async_command(request); xmlNode *reply = stonith_construct_async_reply(cmd, NULL, NULL, 0); crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); crm_notice("Broadcasting manual fencing confirmation for node %s", cmd->victim); send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); free_async_command(cmd); free_xml(reply); } else if(safe_str_eq(op, CRM_OP_RM_NODE_CACHE)) { int id = 0; const char *name = NULL; crm_element_value_int(request, XML_ATTR_ID, &id); name = crm_element_value(request, XML_ATTR_UNAME); reap_crm_member(id, name); return pcmk_ok; } else { crm_err("Unknown %s from %s", op, client ? client->name : remote_peer); crm_log_xml_warn(request, "UnknownOp"); } done: /* Always reply unless the request is in process still. 
* If in progress, a reply will happen async after the request * processing is finished */ if (rc != -EINPROGRESS) { crm_trace("Reply handling: %p %u %u %d %d %s", client, client?client->request_id:0, id, is_set(call_options, st_opt_sync_call), call_options, crm_element_value(request, F_STONITH_CALLOPTS)); if (is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(client == NULL || client->request_id == id); } reply = stonith_construct_reply(request, output, data, rc); stonith_send_reply(reply, call_options, remote_peer, client_id); } free(output); free_xml(data); free_xml(reply); return rc; } static void handle_reply(crm_client_t * client, xmlNode * request, const char *remote_peer) { const char *op = crm_element_value(request, F_STONITH_OPERATION); if (crm_str_eq(op, STONITH_OP_QUERY, TRUE)) { process_remote_stonith_query(request); } else if (crm_str_eq(op, T_STONITH_NOTIFY, TRUE)) { process_remote_stonith_exec(request); } else if (crm_str_eq(op, STONITH_OP_FENCE, TRUE)) { /* Reply to a complex fencing op */ process_remote_stonith_exec(request); } else { crm_err("Unknown %s reply from %s", op, client ? client->name : remote_peer); crm_log_xml_warn(request, "UnknownOp"); } } void stonith_command(crm_client_t * client, uint32_t id, uint32_t flags, xmlNode * request, const char *remote_peer) { int call_options = 0; int rc = 0; gboolean is_reply = FALSE; /* Copy op for reporting. The original might get freed by handle_reply() * before we use it in crm_debug(): * handle_reply() * |- process_remote_stonith_exec() * |-- remote_op_done() * |--- handle_local_reply_and_notify() * |---- crm_xml_add(...F_STONITH_OPERATION...) * |--- free_xml(op->request) */ char *op = crm_element_value_copy(request, F_STONITH_OPERATION); if (get_xpath_object("//" T_STONITH_REPLY, request, LOG_DEBUG_3)) { is_reply = TRUE; } crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); crm_debug("Processing %s%s %u from %s (%16x)", op, is_reply ? " reply" : "", id, client ? 
client->name : remote_peer, call_options); if (is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(client == NULL || client->request_id == id); } if (is_reply) { handle_reply(client, request, remote_peer); } else { rc = handle_request(client, id, flags, request, remote_peer); } crm_debug("Processed %s%s from %s: %s (%d)", op, is_reply ? " reply" : "", client ? client->name : remote_peer, rc > 0 ? "" : pcmk_strerror(rc), rc); free(op); }