diff --git a/cts/CTSaudits.py b/cts/CTSaudits.py index 58123da45f..df6750eeac 100755 --- a/cts/CTSaudits.py +++ b/cts/CTSaudits.py @@ -1,721 +1,741 @@ '''CTS: Cluster Testing System: Audit module ''' __copyright__=''' Copyright (C) 2000, 2001,2005 Alan Robertson Licensed under the GNU GPL. ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. import time, os, string, re import CTS class ClusterAudit: def __init__(self, cm): self.CM = cm def __call__(self): raise ValueError("Abstract Class member (__call__)") def is_applicable(self): '''Return TRUE if we are applicable in the current test configuration''' raise ValueError("Abstract Class member (is_applicable)") return 1 def name(self): raise ValueError("Abstract Class member (name)") AllAuditClasses = [ ] class LogAudit(ClusterAudit): def name(self): return "LogAudit" def __init__(self, cm): self.CM = cm def RestartClusterLogging(self, nodes=None): if not nodes: nodes = self.CM.Env["nodes"] self.CM.log("Restarting logging on: %s" % repr(nodes)) for node in nodes: cmd=self.CM.Env["logrestartcmd"] if self.CM.rsh(node, cmd, blocking=0) != 0: self.CM.log ("ERROR: Cannot restart logging on %s [%s failed]" % (node, cmd)) def TestLogging(self): patterns= [] prefix="Test message from" for node in self.CM.Env["nodes"]: # Look for the node name in two places to make sure # that syslog is logging with the correct hostname patterns.append("%s.*%s %s" % (node, prefix, node)) watch = CTS.LogWatcher(self.CM.Env["LogFileName"], patterns, 60) watch.setwatch() for node in self.CM.Env["nodes"]: cmd="logger -p %s.info %s %s" % (self.CM.Env["SyslogFacility"], prefix, node) if self.CM.rsh(node, cmd, blocking=0) != 0: self.CM.log ("ERROR: Cannot execute remote command [%s] on %s" % (cmd, node)) watch_result = watch.lookforall() if watch.unmatched: for regex in watch.unmatched: self.CM.log ("Test message [%s] not found in logs." 
                              % (regex))
                 return 0
         return 1
 
     def __call__(self):
         max=3
         attempt=0
         self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"])
         while attempt <= max and self.TestLogging() == 0:
             attempt = attempt + 1
             self.RestartClusterLogging()
             time.sleep(60*attempt)
 
         if attempt > max:
             self.CM.log("ERROR: Cluster logging unrecoverable.")
             return 0
 
         return 1
 
     def is_applicable(self):
         if self.CM.Env["DoBSC"]:
             return 0
         return 1
 
 class DiskAudit(ClusterAudit):
     def name(self):
         return "DiskspaceAudit"
 
     def __init__(self, cm):
         self.CM = cm
 
     def __call__(self):
         result=1
         dfcmd="df -k /var/log | tail -1 | tr -s ' ' | cut -d' ' -f2"
 
         self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"])
         for node in self.CM.Env["nodes"]:
             dfout=self.CM.rsh(node, dfcmd, 1)
             if not dfout:
                 self.CM.log ("ERROR: Cannot execute remote df command [%s] on %s" % (dfcmd, node))
             else:
                 try:
                     idfout = int(dfout)
                 except (ValueError, TypeError):
                     self.CM.log("Warning: df output from %s was invalid [%s]" % (node, dfout))
                 else:
                     if idfout == 0:
                         self.CM.log("CRIT: Completely out of log disk space on %s" % node)
                         result=None
                     elif idfout <= 1000:
                         self.CM.log("WARN: Low on log disk space (%d Mbytes) on %s" % (idfout, node))
         return result
 
     def is_applicable(self):
         if self.CM.Env["DoBSC"]:
             return 0
         return 1
 
 class AuditResource:
     def __init__(self, cm, line):
         fields = line.split()
         self.CM = cm
         self.line = line
         self.type = fields[1]
         self.id = fields[2]
         self.clone_id = fields[3]
         self.parent = fields[4]
-        self.managed = fields[5]
-        self.needs_quorum = fields[6]
-        self.unique = fields[7]
-        self.rprovider = fields[8]
-        self.rclass = fields[9]
-        self.rtype = fields[10]
-        self.host = fields[11]
+        self.rprovider = fields[5]
+        self.rclass = fields[6]
+        self.rtype = fields[7]
+        self.host = fields[8]
+        self.needs_quorum = fields[9]
+        self.flags = int(fields[10])
+        self.flags_s = fields[11]
+
+        print "%s with %s: %d %d %d" % (self.id, self.flags_s, self.orphan(), self.unique(), self.managed())
 
         if self.parent == "NA":
             self.parent = None
 
+    def unique(self):
+        if self.flags & int("0x00000020", 16):
+            return 1
+        return 0
+
+    def orphan(self):
+        if self.flags & int("0x00000001", 16):
+            return 1
+        return 0
+
+    def managed(self):
+        if self.flags & int("0x00000002", 16):
+            return 1
+        return 0
+
 class AuditConstraint:
     def __init__(self, cm, line):
         fields = line.split()
         self.CM = cm
         self.line = line
         self.type = fields[1]
         self.id = fields[2]
         self.rsc = fields[3]
         self.target = fields[4]
         self.score = fields[5]
         self.rsc_role = fields[6]
         self.target_role = fields[7]
 
         if self.rsc_role == "NA":
             self.rsc_role = None
         if self.target_role == "NA":
             self.target_role = None
 
 class PrimitiveAudit(ClusterAudit):
     def name(self):
         return "PrimitiveAudit"
 
     def __init__(self, cm):
         self.CM = cm
 
     def doResourceAudit(self, resource):
         rc=1
         active = self.CM.ResourceLocation(resource.id)
 
         if len(active) > 1:
-            if resource.unique == "1":
+            if resource.unique():
                 rc=0
                 self.CM.log("Resource %s is active multiple times: %s" % (resource.id, repr(active)))
             else:
                 self.CM.debug("Non-unique resource %s is active on: %s" % (resource.id, repr(active)))
 
         elif len(active) == 1:
             if self.CM.HasQuorum(None):
                 self.CM.debug("Resource %s active on %s" % (resource.id, repr(active)))
             elif resource.needs_quorum == 1:
                 rc=0
                 self.CM.log("Resource %s active without quorum: %s (%s)" % (resource.id, repr(active), resource.line))
 
-        elif resource.managed == "0":
+        elif not resource.managed():
             self.CM.log("Resource %s not managed" % resource.id)
 
+        elif resource.orphan():
+            self.CM.log("Resource %s is an orphan" % resource.id)
+
         elif len(self.inactive_nodes)
== 0: self.CM.log("WARN: Resource %s not served anywhere" % resource.id) elif self.CM.Env["warn-inactive"] == 1: if self.CM.HasQuorum(None) or not resource.needs_quorum: self.CM.log("WARN: Resource %s not served anywhere (Inactive nodes: %s)" % (resource.id, repr(self.inactive_nodes))) else: self.CM.debug("Resource %s not served anywhere (Inactive nodes: %s)" % (resource.id, repr(self.inactive_nodes))) elif self.CM.HasQuorum(None) or not resource.needs_quorum: self.CM.debug("Resource %s not served anywhere (Inactive nodes: %s)" % (resource.id, repr(self.inactive_nodes))) return rc def setup(self): self.target = None self.resources = [] self.constraints = [] self.active_nodes = [] self.inactive_nodes = [] self.CM.debug("Do Audit %s"%self.name()) for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == "up": self.active_nodes.append(node) else: self.inactive_nodes.append(node) for node in self.CM.Env["nodes"]: if self.target == None and self.CM.ShouldBeStatus[node] == "up": self.target = node if not self.target: # TODO: In Pacemaker 1.0 clusters we'll be able to run crm_resource # with CIB_file=/path/to/cib.xml even when the cluster isn't running self.CM.debug("No nodes active - skipping %s" % self.name()) return 0 (rc, lines) = self.CM.rsh(self.target, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): self.resources.append(AuditResource(self.CM, line)) elif re.search("^Constraint", line): self.constraints.append(AuditConstraint(self.CM, line)) else: self.CM.log("Unknown entry: %s" % line); return 1 def __call__(self): rc = 1 if not self.setup(): return 1 for resource in self.resources: if resource.type == "primitive": if self.doResourceAudit(resource) == 0: rc = 0 return rc def is_applicable(self): if self.CM["Name"] == "crm-lha": return 1 if self.CM["Name"] == "crm-ais": return 1 return 0 class GroupAudit(PrimitiveAudit): def name(self): return "GroupAudit" def __call__(self): rc = 1 if not self.setup(): return 1 for group in self.resources: if group.type == "group": first_match = 1 group_location = None for child in self.resources: if child.parent == group.id: nodes = self.CM.ResourceLocation(child.id) if first_match and len(nodes) > 0: group_location = nodes[0] first_match = 0 if len(nodes) > 1: rc = 0 self.CM.log("Child %s of %s is active more than once: %s" % (child.id, group.id, repr(nodes))) elif len(nodes) == 0: # Groups are allowed to be partially active # However we do need to make sure later children aren't running group_location = None self.CM.debug("Child %s of %s is stopped" % (child.id, group.id)) elif nodes[0] != group_location: rc = 0 self.CM.log("Child %s of %s is active on the wrong node (%s) expected %s" % (child.id, group.id, nodes[0], group_location)) else: self.CM.debug("Child %s of %s is active on %s" % (child.id, group.id, nodes[0])) return rc class CloneAudit(PrimitiveAudit): def name(self): return "CloneAudit" def __call__(self): rc = 1 if not self.setup(): return 1 for clone in self.resources: if clone.type == "clone": for child in self.resources: if child.parent == clone.id and child.type == "primitive": self.CM.debug("Checking child %s of %s..." 
% (child.id, clone.id)) # Check max and node_max # Obtain with: # crm_resource -g clone_max --meta -r child.id # crm_resource -g clone_node_max --meta -r child.id return rc class ColocationAudit(PrimitiveAudit): def name(self): return "ColocationAudit" def crm_location(self, resource): (rc, lines) = self.CM.rsh(self.target, "crm_resource -W -r %s -Q"%resource, None) hosts = [] if rc == 0: for line in lines: fields = line.split() hosts.append(fields[0]) return hosts def __call__(self): rc = 1 if not self.setup(): return 1 for coloc in self.constraints: if coloc.type == "rsc_colocation": source = self.crm_location(coloc.rsc) target = self.crm_location(coloc.target) if len(source) == 0: self.CM.debug("Colocation audit (%s): %s not running" % (coloc.id, coloc.rsc)) else: for node in source: if not node in target: rc = 0 self.CM.log("Colocation audit (%s): %s running on %s (not in %s)" % (coloc.id, coloc.rsc, node, repr(target))) else: self.CM.debug("Colocation audit (%s): %s running on %s (in %s)" % (coloc.id, coloc.rsc, node, repr(target))) return rc class CrmdStateAudit(ClusterAudit): def __init__(self, cm): self.CM = cm self.Stats = {"calls":0 , "success":0 , "failure":0 , "skipped":0 , "auditfail":0} def has_key(self, key): return self.Stats.has_key(key) def __setitem__(self, key, value): self.Stats[key] = value def __getitem__(self, key): return self.Stats[key] def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not self.Stats.has_key(name): self.Stats[name]=0 self.Stats[name] = self.Stats[name]+1 def __call__(self): passed = 1 up_are_down = 0 down_are_up = 0 unstable_list = [] self.CM.debug("Do Audit %s"%self.name()) for node in self.CM.Env["nodes"]: should_be = self.CM.ShouldBeStatus[node] rc = self.CM.test_node_CM(node) if rc > 0: if should_be == "down": down_are_up = down_are_up + 1 if rc == 1: unstable_list.append(node) elif should_be == "up": up_are_down = up_are_down + 1 if len(unstable_list) > 0: passed = 0 self.CM.log("Cluster is not stable: %d (of %d): %s" %(len(unstable_list), self.CM.upcount(), repr(unstable_list))) if up_are_down > 0: passed = 0 self.CM.log("%d (of %d) nodes expected to be up were down." %(up_are_down, len(self.CM.Env["nodes"]))) if down_are_up > 0: passed = 0 self.CM.log("%d (of %d) nodes expected to be down were up." 
%(down_are_up, len(self.CM.Env["nodes"]))) return passed def name(self): return "CrmdStateAudit" def is_applicable(self): if self.CM["Name"] == "crm-lha": return 1 if self.CM["Name"] == "crm-ais": return 1 return 0 class CIBAudit(ClusterAudit): def __init__(self, cm): self.CM = cm self.Stats = {"calls":0 , "success":0 , "failure":0 , "skipped":0 , "auditfail":0} def has_key(self, key): return self.Stats.has_key(key) def __setitem__(self, key, value): self.Stats[key] = value def __getitem__(self, key): return self.Stats[key] def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not self.Stats.has_key(name): self.Stats[name]=0 self.Stats[name] = self.Stats[name]+1 def __call__(self): self.CM.debug("Do Audit %s"%self.name()) passed = 1 ccm_partitions = self.CM.find_partitions() if len(ccm_partitions) == 0: self.CM.debug("\tNo partitions to audit") return 1 for partition in ccm_partitions: self.CM.debug("\tAuditing CIB consistency for: %s" %partition) partition_passed = 0 if self.audit_cib_contents(partition) == 0: passed = 0 return passed def audit_cib_contents(self, hostlist): passed = 1 node0 = None node0_xml = None partition_hosts = hostlist.split() for node in partition_hosts: node_xml = self.store_remote_cib(node, node0) if node_xml == None: self.CM.log("Could not perform audit: No configuration from %s" % node) passed = 0 elif node0 == None: node0 = node node0_xml = node_xml elif node0_xml == None: self.CM.log("Could not perform audit: No configuration from %s" % node0) passed = 0 else: (rc, result) = self.CM.rsh( node0, "crm_diff -VV -cf --new %s --original %s" % (node_xml, node0_xml), None) if rc != 0: self.CM.log("Diff between %s and %s failed: %d" % (node0_xml, node_xml, rc)) passed = 0 for line in result: if not re.search("", line): passed = 0 self.CM.debug("CibDiff[%s-%s]: %s" % (node0, node, line)) else: self.CM.debug("CibDiff[%s-%s] Ignoring: %s" % (node0, node, line)) # self.CM.rsh(node0, "rm -f %s" % node_xml) # self.CM.rsh(node0, "rm -f %s" % node0_xml) return passed def store_remote_cib(self, node, target): combined = "" filename="/tmp/ctsaudit.%s.xml" % node if not target: target = node (rc, lines) = self.CM.rsh(node, self.CM["CibQuery"], None) if rc != 0: self.CM.log("Could not retrieve configuration") return None self.CM.rsh("localhost", "rm -f %s" % filename) for line in lines: self.CM.rsh("localhost", "echo \'%s\' >> %s" % (line[:-1], filename)) if self.CM.rsh.cp(filename, "root@%s:%s" % (target, filename)) != 0: self.CM.log("Could not store configuration") return None return filename def name(self): return "CibAudit" def is_applicable(self): if self.CM["Name"] == "crm-lha": return 1 if self.CM["Name"] == "crm-ais": return 1 return 0 class PartitionAudit(ClusterAudit): def __init__(self, cm): self.CM = cm self.Stats = {"calls":0 , "success":0 , "failure":0 , "skipped":0 , "auditfail":0} self.NodeEpoche={} self.NodeState={} self.NodeQuorum={} def has_key(self, key): return self.Stats.has_key(key) def __setitem__(self, key, value): self.Stats[key] = value def __getitem__(self, key): return self.Stats[key] def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not self.Stats.has_key(name): self.Stats[name]=0 self.Stats[name] = self.Stats[name]+1 def __call__(self): self.CM.debug("Do Audit %s"%self.name()) passed = 1 ccm_partitions = self.CM.find_partitions() if ccm_partitions == None or len(ccm_partitions) == 0: return 1 if len(ccm_partitions) != self.CM.partitions_expected: 
self.CM.log("ERROR: %d cluster partitions detected:" %len(ccm_partitions)) passed = 0 for partition in ccm_partitions: self.CM.log("\t %s" %partition) for partition in ccm_partitions: partition_passed = 0 if self.audit_partition(partition) == 0: passed = 0 return passed def trim_string(self, avalue): if not avalue: return None if len(avalue) > 1: return avalue[:-1] def trim2int(self, avalue): if not avalue: return None if len(avalue) > 1: return int(avalue[:-1]) def audit_partition(self, partition): passed = 1 dc_found = [] dc_allowed_list = [] lowest_epoche = None node_list = partition.split() self.CM.debug("Auditing partition: %s" %(partition)) for node in node_list: if self.CM.ShouldBeStatus[node] != "up": self.CM.log("Warn: Node %s appeared out of nowhere" %(node)) self.CM.ShouldBeStatus[node] = "up" # not in itself a reason to fail the audit (not what we're # checking for in this audit) self.NodeState[node] = self.CM.rsh(node, self.CM["StatusCmd"]%node, 1) self.NodeEpoche[node] = self.CM.rsh(node, self.CM["EpocheCmd"], 1) self.NodeQuorum[node] = self.CM.rsh(node, self.CM["QuorumCmd"], 1) self.CM.debug("Node %s: %s - %s - %s." %(node, self.NodeState[node], self.NodeEpoche[node], self.NodeQuorum[node])) self.NodeState[node] = self.trim_string(self.NodeState[node]) self.NodeEpoche[node] = self.trim2int(self.NodeEpoche[node]) self.NodeQuorum[node] = self.trim_string(self.NodeQuorum[node]) if not self.NodeEpoche[node]: self.CM.log("Warn: Node %s dissappeared: cant determin epoche" %(node)) self.CM.ShouldBeStatus[node] = "down" # not in itself a reason to fail the audit (not what we're # checking for in this audit) elif lowest_epoche == None or self.NodeEpoche[node] < lowest_epoche: lowest_epoche = self.NodeEpoche[node] if not lowest_epoche: self.CM.log("Lowest epoche not determined in %s" % (partition)) passed = 0 for node in node_list: if self.CM.ShouldBeStatus[node] == "up": if self.CM.is_node_dc(node, self.NodeState[node]): dc_found.append(node) if self.NodeEpoche[node] == lowest_epoche: self.CM.debug("%s: OK" % node) elif not self.NodeEpoche[node]: self.CM.debug("Check on %s ignored: no node epoche" % node) elif not lowest_epoche: self.CM.debug("Check on %s ignored: no lowest epoche" % node) else: self.CM.log("DC %s is not the oldest node (%d vs. 
%d)" %(node, self.NodeEpoche[node], lowest_epoche)) passed = 0 if len(dc_found) == 0: self.CM.log("DC not found on any of the %d allowed nodes: %s (of %s)" %(len(dc_allowed_list), str(dc_allowed_list), str(node_list))) elif len(dc_found) > 1: self.CM.log("%d DCs (%s) found in cluster partition: %s" %(len(dc_found), str(dc_found), str(node_list))) passed = 0 if passed == 0: for node in node_list: if self.CM.ShouldBeStatus[node] == "up": self.CM.log("epoche %s : %s" %(self.NodeEpoche[node], self.NodeState[node])) return passed def name(self): return "PartitionAudit" def is_applicable(self): if self.CM["Name"] == "crm-lha": return 1 if self.CM["Name"] == "crm-ais": return 1 return 0 AllAuditClasses.append(DiskAudit) AllAuditClasses.append(LogAudit) AllAuditClasses.append(CrmdStateAudit) AllAuditClasses.append(PartitionAudit) AllAuditClasses.append(PrimitiveAudit) AllAuditClasses.append(GroupAudit) AllAuditClasses.append(CloneAudit) AllAuditClasses.append(ColocationAudit) AllAuditClasses.append(CIBAudit) def AuditList(cm): result = [] for auditclass in AllAuditClasses: result.append(auditclass(cm)) return result diff --git a/cts/CTStests.py b/cts/CTStests.py index 9490f4283e..2e57ef60aa 100644 --- a/cts/CTStests.py +++ b/cts/CTStests.py @@ -1,2355 +1,2355 @@ '''CTS: Cluster Testing System: Tests module There are a few things we want to do here: ''' __copyright__=''' Copyright (C) 2000, 2001 Alan Robertson Licensed under the GNU GPL. Add RecourceRecover testcase Zhao Kai ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # SPECIAL NOTE: # # Tests may NOT implement any cluster-manager-specific code in them. # EXTEND the ClusterManager object to provide the base capabilities # the test needs if you need to do something that the current CM classes # do not. Otherwise you screw up the whole point of the object structure # in CTS. # # Thank you. # import CTS import CTSaudits import time, os, re, types, string, tempfile, sys from CTSaudits import * from stat import * # List of all class objects for tests which we ought to # consider running. class AllTests: ''' A collection of tests which are run at random. 
''' def __init__(self, scenario, cm, tests, Audits): self.CM = cm self.Env = cm.Env self.Scenario = scenario self.Tests = [] self.Audits = [] self.ns=CTS.NodeStatus(self.Env) self.Stats = {"success":0, "failure":0, "BadNews":0, "skipped":0} self.IndividualStats= {} for audit in Audits: if not issubclass(audit.__class__, ClusterAudit): raise ValueError("Init value must be a subclass of ClusterAudit") if audit.is_applicable(): self.Audits.append(audit) for test in tests: if not issubclass(test.__class__, CTSTest): raise ValueError("Init value must be a subclass of CTSTest") if test.is_applicable(): self.Tests.append(test) if not scenario.IsApplicable(): raise ValueError("Scenario not applicable in" " given Environment") def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not self.Stats.has_key(name): self.Stats[name]=0 self.Stats[name] = self.Stats[name]+1 def audit(self, BadNews, test): errcount=0 BadNewsDebug=0 #BadNews.debug=1 ignorelist = [] ignorelist.append(" CTS: ") ignorelist.append("BadNews:") ignorelist.extend(self.CM.errorstoignore()) if test: ignorelist.extend(test.errorstoignore()) while errcount < 1000: if BadNewsDebug: print "Looking for BadNews" match=BadNews.look(0) if match: if BadNewsDebug: print "BadNews found: "+match add_err = 1 for ignore in ignorelist: if add_err == 1 and re.search(ignore, match): if BadNewsDebug: print "Ignoring based on pattern: ("+ignore+")" add_err = 0 if add_err == 1: self.CM.log("BadNews: " + match) self.incr("BadNews") errcount=errcount+1 else: break else: answer = raw_input('Big problems. Continue? [nY]') if answer and answer == "n": self.CM.log("Shutting down.") self.CM.stopall() self.summarize() raise ValueError("Looks like we hit a BadNews jackpot!") for audit in self.Audits: if not audit(): self.CM.log("Audit " + audit.name() + " FAILED.") self.incr("auditfail") if test: test.incr("auditfail") def summarize(self): self.CM.log("****************") self.CM.log("Overall Results:" + repr(self.Stats)) self.CM.log("****************") stat_filter = { "calls":0, "success":0, "failure":0, "skipped":0, "auditfail":0, } self.CM.log("Test Summary") for test in self.Tests: for key in stat_filter.keys(): stat_filter[key] = test.Stats[key] self.CM.log("Test %s: \t%s" %(test.name, repr(stat_filter))) self.CM.debug("Detailed Results") for test in self.Tests: self.CM.debug("Test %s: \t%s" %(test.name, repr(test.Stats))) self.CM.log("<<<<<<<<<<<<<<<< TESTS COMPLETED") def test_loop(self, BadNews, max): testcount=1 self.CM.log("Executing all tests once") for test in self.Tests: if self.run_test(BadNews, test, testcount): testcount += 1 return testcount def run_test(self, BadNews, test, testcount): nodechoice = self.Env.RandomNode() ret = 1 where = "" did_run = 0 starttime=time.time() test.starttime=starttime self.CM.log(("Running test %s" % test.name).ljust(35) + (" (%s) " % nodechoice).ljust(15) +"["+ ("%d" % testcount).rjust(3) +"]") if not test.setup(nodechoice): self.CM.log("Setup failed") ret = 0 elif not test.canrunnow(nodechoice): self.CM.log("Skipped") test.skipped() else: did_run = 1 ret = test(nodechoice) if not test.teardown(nodechoice): self.CM.log("Teardown failed") ret = 0 stoptime=time.time() self.CM.oprofileSave(testcount) elapsed_time = stoptime - starttime test_time = stoptime - test.starttime if not test.has_key("min_time"): test["elapsed_time"] = elapsed_time test["min_time"] = test_time test["max_time"] = test_time else: test["elapsed_time"] = test["elapsed_time"] + elapsed_time if test_time < 
test["min_time"]: test["min_time"] = test_time if test_time > test["max_time"]: test["max_time"] = test_time if ret: self.incr("success") else: self.incr("failure") self.CM.statall() did_run = 1 # Force the test count to be incrimented anyway so test extraction works self.audit(BadNews, test) return did_run def run(self, max=1): ( ''' Set up the given scenario, then run the selected tests at random for the selected number of iterations. ''') BadNews=CTS.LogWatcher(self.CM["LogFileName"], self.CM["BadRegexes"] , timeout=0) BadNews.setwatch() self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"]) self.CM.oprofileStop() self.CM.oprofileStart() if not self.CM.Env["DoBSC"]: audit = LogAudit(self.CM) if not audit(): self.CM.log("Audit " + audit.name() + " FAILED.") return (None, None) else: self.CM.log("Audit " + audit.name() + " passed.") audit = DiskAudit(self.CM) if not audit(): self.CM.log("Audit " + audit.name() + " FAILED.") return (None, None) else: self.CM.log("Audit " + audit.name() + " passed.") if not self.Scenario.SetUp(self.CM): return (None, None) self.CM.oprofileSave(0) time.sleep(30) # This makes sure everything is stabilized before starting... self.audit(BadNews, None) testcount = self.test_loop(BadNews, max) self.Scenario.TearDown(self.CM) self.CM.oprofileSave(testcount) self.CM.oprofileStop() self.audit(BadNews, None) for test in self.Tests: self.IndividualStats[test.name] = test.Stats return self.Stats, self.IndividualStats class RandomTests(AllTests): def test_loop(self, BadNews, max): testcount=1 self.CM.log("Executing tests at random") while testcount <= max: test = self.Env.RandomGen.choice(self.Tests) if self.run_test(BadNews, test, testcount): testcount += 1 return testcount AllTestClasses = [ ] class CTSTest: ''' A Cluster test. We implement the basic set of properties and behaviors for a generic cluster test. Cluster tests track their own statistics. We keep each of the kinds of counts we track as separate {name,value} pairs. ''' def __init__(self, cm): #self.name="the unnamed test" self.Stats = {"calls":0 , "success":0 , "failure":0 , "skipped":0 , "auditfail":0} # if not issubclass(cm.__class__, ClusterManager): # raise ValueError("Must be a ClusterManager object") self.CM = cm self.Audits = [] self.timeout=120 self.starttime=0 self.passed = 1 self.is_loop = 0 self.is_unsafe = 0 self.is_experimental = 0 self.is_valgrind = 0 def has_key(self, key): return self.Stats.has_key(key) def __setitem__(self, key, value): self.Stats[key] = value def __getitem__(self, key): return self.Stats[key] def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not self.Stats.has_key(name): self.Stats[name]=0 self.Stats[name] = self.Stats[name]+1 # Reset the test passed boolean if name == "calls": self.passed = 1 def failure(self, reason="none"): '''Increment the failure count''' self.passed = 0 self.incr("failure") self.CM.log(("Test %s" % self.name).ljust(35) +" FAILED: %s" % reason) return None def success(self): '''Increment the success count''' self.incr("success") return 1 def skipped(self): '''Increment the skipped count''' self.incr("skipped") return 1 def __call__(self, node): '''Perform the given test''' raise ValueError("Abstract Class member (__call__)") self.incr("calls") return self.failure() def audit(self): passed = 1 if len(self.Audits) > 0: for audit in self.Audits: if not audit(): self.CM.log("Internal %s Audit %s FAILED." 
% (self.name, audit.name())) self.incr("auditfail") passed = 0 return passed def setup(self, node): '''Setup the given test''' return self.success() def teardown(self, node): '''Tear down the given test''' return self.success() def local_badnews(self, prefix, watch, local_ignore=[]): errcount = 0 if not prefix: prefix = "LocalBadNews:" ignorelist = [] ignorelist.append(" CTS: ") ignorelist.append(prefix) ignorelist.extend(local_ignore) while errcount < 100: match=watch.look(0) if match: add_err = 1 for ignore in ignorelist: if add_err == 1 and re.search(ignore, match): add_err = 0 if add_err == 1: self.CM.log(prefix + " " + match) errcount=errcount+1 else: break else: self.CM.log("Too many errors!") return errcount def is_applicable(self): return self.is_applicable_common() def is_applicable_common(self): '''Return TRUE if we are applicable in the current test configuration''' #raise ValueError("Abstract Class member (is_applicable)") if self.is_loop and not self.CM.Env["loop-tests"]: return 0 elif self.is_unsafe and not self.CM.Env["unsafe-tests"]: return 0 elif self.is_valgrind and not self.CM.Env["valgrind-tests"]: return 0 elif self.is_experimental and not self.CM.Env["experimental-tests"]: return 0 return 1 def find_ocfs2_resources(self, node): self.r_o2cb = None self.r_ocfs2 = [] (rc, lines) = self.CM.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): r = AuditResource(self.CM, line) if r.rtype == "o2cb" and r.parent != "NA": self.CM.debug("Found o2cb: %s" % self.r_o2cb) self.r_o2cb = r.parent if re.search("^Constraint", line): c = AuditConstraint(self.CM, line) if c.type == "rsc_colocation" and c.target == self.r_o2cb: self.r_ocfs2.append(c.rsc) self.CM.debug("Found ocfs2 filesystems: %s" % repr(self.r_ocfs2)) return len(self.r_ocfs2) def canrunnow(self, node): '''Return TRUE if we can meaningfully run right now''' return 1 def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [] ################################################################### class StopTest(CTSTest): ################################################################### '''Stop (deactivate) the cluster manager on a node''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name="Stop" def __call__(self, node): '''Perform the 'stop' test. 
''' self.incr("calls") if self.CM.ShouldBeStatus[node] != "up": return self.skipped() patterns = [] # Technically we should always be able to notice ourselves stopping patterns.append(self.CM["Pat:We_stopped"] % node) #if self.CM.Env["use_logd"]: # patterns.append(self.CM["Pat:Logd_stopped"] % node) # Any active node needs to notice this one left # NOTE: This wont work if we have multiple partitions for other in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[other] == "up" and other != node: patterns.append(self.CM["Pat:They_stopped"] %(other, node)) #self.debug("Checking %s will notice %s left"%(other, node)) watch = CTS.LogWatcher( self.CM["LogFileName"], patterns, self.CM["DeadTime"]) watch.setwatch() if node == self.CM.OurNode: self.incr("us") else: if self.CM.upcount() <= 1: self.incr("all") else: self.incr("them") self.CM.StopaCM(node) watch_result = watch.lookforall() failreason=None UnmatchedList = "||" if watch.unmatched: (rc, output) = self.CM.rsh(node, "/bin/ps axf", None) for line in output: self.CM.debug(line) for regex in watch.unmatched: self.CM.log ("ERROR: Shutdown pattern not found: %s" % (regex)) UnmatchedList += regex + "||"; failreason="Missing shutdown pattern" self.CM.cluster_stable(self.CM["DeadTime"]) if not watch.unmatched or self.CM.upcount() == 0: return self.success() if len(watch.unmatched) >= self.CM.upcount(): return self.failure("no match against (%s)" % UnmatchedList) if failreason == None: return self.success() else: return self.failure(failreason) # # We don't register StopTest because it's better when called by # another test... # ################################################################### class StartTest(CTSTest): ################################################################### '''Start (activate) the cluster manager on a node''' def __init__(self, cm, debug=None): CTSTest.__init__(self,cm) self.name="start" self.debug = debug def __call__(self, node): '''Perform the 'start' test. ''' self.incr("calls") if self.CM.upcount() == 0: self.incr("us") else: self.incr("them") if self.CM.ShouldBeStatus[node] != "down": return self.skipped() elif self.CM.StartaCM(node): return self.success() else: return self.failure("Startup %s on node %s failed" %(self.CM["Name"], node)) # # We don't register StartTest because it's better when called by # another test... # ################################################################### class FlipTest(CTSTest): ################################################################### '''If it's running, stop it. If it's stopped start it. Overthrow the status quo... ''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Flip" self.start = StartTest(cm) self.stop = StopTest(cm) def __call__(self, node): '''Perform the 'Flip' test. ''' self.incr("calls") if self.CM.ShouldBeStatus[node] == "up": self.incr("stopped") ret = self.stop(node) type="up->down" # Give the cluster time to recognize it's gone... 
time.sleep(self.CM["StableTime"]) elif self.CM.ShouldBeStatus[node] == "down": self.incr("started") ret = self.start(node) type="down->up" else: return self.skipped() self.incr(type) if ret: return self.success() else: return self.failure("%s failure" % type) # Register FlipTest as a good test to run AllTestClasses.append(FlipTest) ################################################################### class RestartTest(CTSTest): ################################################################### '''Stop and restart a node''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Restart" self.start = StartTest(cm) self.stop = StopTest(cm) def __call__(self, node): '''Perform the 'restart' test. ''' self.incr("calls") self.incr("node:" + node) ret1 = 1 if self.CM.StataCM(node): self.incr("WasStopped") if not self.start(node): return self.failure("start (setup) failure: "+node) self.starttime=time.time() if not self.stop(node): return self.failure("stop failure: "+node) if not self.start(node): return self.failure("start failure: "+node) return self.success() # Register RestartTest as a good test to run AllTestClasses.append(RestartTest) ################################################################### class StonithdTest(CTSTest): ################################################################### def __init__(self, cm): CTSTest.__init__(self, cm) self.name="Stonithd" self.startall = SimulStartLite(cm) def __call__(self, node): self.incr("calls") if len(self.CM.Env["nodes"]) < 2: return self.skipped() ret = self.startall(None) if not ret: return self.failure("Setup failed") watchpats = [] watchpats.append("Forcing node %s to be terminated" % node) watchpats.append("Scheduling Node %s for STONITH" % node) watchpats.append("Executing .* fencing operation") watchpats.append("sending fencing op RESET for %s" % node) if not self.CM.is_node_dc(node): # Won't be found if the DC is shot (and there's no equivalent message from stonithd) watchpats.append("tengine_stonith_callback: .*result=0") if self.CM.Env["at-boot"] == 0: self.CM.debug("Expecting %s to stay down" % node) self.CM.ShouldBeStatus[node]="down" else: self.CM.debug("Expecting %s to come up again %d" % (node, self.CM.Env["at-boot"])) watchpats.append("%s crmd: .* S_STARTING -> S_PENDING" % node) watchpats.append("%s crmd: .* S_PENDING -> S_NOT_DC" % node) watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats, self.CM["DeadTime"] + self.CM["StableTime"] + self.CM["StartTime"]) watch.setwatch() self.CM.rsh(node, "crm_attribute --node %s --type status --attr-name terminate --attr-value true" % node) matched = watch.lookforall() if matched: self.CM.debug("Found: "+ repr(matched)) else: self.CM.log("Patterns not found: " + repr(watch.unmatched)) self.CM.debug("Waiting for the cluster to recover") self.CM.cluster_stable() self.CM.debug("Waiting STONITHd node to come back up") self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"], 600) self.CM.debug("Waiting for the cluster to re-stabilize with all nodes") is_stable = self.CM.cluster_stable(self.CM["StartTime"]) if not matched: return self.failure("Didn't find all expected patterns") elif not is_stable: return self.failure("Cluster did not become stable") return self.success() def errorstoignore(self): return [ "Executing .* fencing operation" ] def is_applicable(self): if not self.is_applicable_common(): return 0 if self.CM.Env.has_key("DoStonith"): return self.CM.Env["DoStonith"] return 1 AllTestClasses.append(StonithdTest) 
################################################################### class StartOnebyOne(CTSTest): ################################################################### '''Start all the nodes ~ one by one''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="StartOnebyOne" self.stopall = SimulStopLite(cm) self.start = StartTest(cm) self.ns=CTS.NodeStatus(cm.Env) def __call__(self, dummy): '''Perform the 'StartOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Shut down all the nodes... ret = self.stopall(None) if not ret: return self.failure("Test setup failed") failed=[] self.starttime=time.time() for node in self.CM.Env["nodes"]: if not self.start(node): failed.append(node) if len(failed) > 0: return self.failure("Some node failed to start: " + repr(failed)) return self.success() # Register StartOnebyOne as a good test to run AllTestClasses.append(StartOnebyOne) ################################################################### class SimulStart(CTSTest): ################################################################### '''Start all the nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStart" self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) def __call__(self, dummy): '''Perform the 'SimulStart' test. ''' self.incr("calls") # We ignore the "node" parameter... # Shut down all the nodes... ret = self.stopall(None) if not ret: return self.failure("Setup failed") self.CM.clear_all_caches() if not self.startall(None): return self.failure("Startall failed") return self.success() # Register SimulStart as a good test to run AllTestClasses.append(SimulStart) ################################################################### class SimulStop(CTSTest): ################################################################### '''Stop all the nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStop" self.startall = SimulStartLite(cm) self.stopall = SimulStopLite(cm) def __call__(self, dummy): '''Perform the 'SimulStop' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... ret = self.startall(None) if not ret: return self.failure("Setup failed") if not self.stopall(None): return self.failure("Stopall failed") return self.success() # Register SimulStop as a good test to run AllTestClasses.append(SimulStop) ################################################################### class StopOnebyOne(CTSTest): ################################################################### '''Stop all the nodes in order''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="StopOnebyOne" self.startall = SimulStartLite(cm) self.stop = StopTest(cm) def __call__(self, dummy): '''Perform the 'StopOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... 
ret = self.startall(None) if not ret: return self.failure("Setup failed") failed=[] self.starttime=time.time() for node in self.CM.Env["nodes"]: if not self.stop(node): failed.append(node) if len(failed) > 0: return self.failure("Some node failed to stop: " + repr(failed)) self.CM.clear_all_caches() return self.success() # Register StopOnebyOne as a good test to run AllTestClasses.append(StopOnebyOne) ################################################################### class RestartOnebyOne(CTSTest): ################################################################### '''Restart all the nodes in order''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="RestartOnebyOne" self.startall = SimulStartLite(cm) def __call__(self, dummy): '''Perform the 'RestartOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... ret = self.startall(None) if not ret: return self.failure("Setup failed") did_fail=[] self.starttime=time.time() self.restart = RestartTest(self.CM) for node in self.CM.Env["nodes"]: if not self.restart(node): did_fail.append(node) if did_fail: return self.failure("Could not restart %d nodes: %s" %(len(did_fail), repr(did_fail))) return self.success() # Register StopOnebyOne as a good test to run AllTestClasses.append(RestartOnebyOne) ################################################################### class PartialStart(CTSTest): ################################################################### '''Start a node - but tell it to stop before it finishes starting up''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="PartialStart" self.startall = SimulStartLite(cm) self.stopall = SimulStopLite(cm) #self.is_unsafe = 1 def __call__(self, node): '''Perform the 'PartialStart' test. ''' self.incr("calls") ret = self.stopall(None) if not ret: return self.failure("Setup failed") # FIXME! This should use the CM class to get the pattern # then it would be applicable in general watchpats = [] watchpats.append("Starting crmd") watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats, timeout=self.CM["DeadTime"]+10) watch.setwatch() self.CM.StartaCMnoBlock(node) ret = watch.lookforall() if not ret: self.CM.log("Patterns not found: " + repr(watch.unmatched)) return self.failure("Setup of %s failed" % node) ret = self.stopall(None) if not ret: return self.failure("%s did not stop in time" % node) return self.success() # Register StopOnebyOne as a good test to run AllTestClasses.append(PartialStart) ####################################################################### class StandbyTest(CTSTest): ####################################################################### def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Standby" self.start = StartTest(cm) self.startall = SimulStartLite(cm) # make sure the node is active # set the node to standby mode # check resources, none resource should be running on the node # set the node to active mode # check resouces, resources should have been migrated back (SHOULD THEY?) 
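    # The __call__ method below relies on fixed time.sleep(30) pauses plus
    # cluster_stable() to let the standby attribute propagate and resources
    # migrate before the checks run.  The helper below is a hypothetical,
    # self-contained sketch of the same wait-then-check idea expressed as a
    # poll with a timeout; nothing in CTS or in this patch uses it.

    import time    # already imported at module scope; repeated so the sketch stands alone

    def wait_until(check, timeout=120, interval=5):
        '''Poll check() until it returns true or the timeout expires.'''
        deadline = time.time() + timeout
        while time.time() < deadline:
            if check():
                return 1
            time.sleep(interval)
        return 0

    # Usage idea (hypothetical): instead of a fixed sleep, wait for the node
    # to report standby and to have no active resources, e.g.
    #   wait_until(lambda: self.CM.StandbyStatus(node) == "on"
    #                      and len(self.CM.active_resources(node)) == 0)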
def __call__(self, node): self.incr("calls") ret=self.startall(None) if not ret: return self.failure("Start all nodes failed") self.CM.debug("Make sure node %s is active" % node) if self.CM.StandbyStatus(node) != "off": if not self.CM.SetStandbyMode(node, "off"): return self.failure("can't set node %s to active mode" % node) self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "off": return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status)) self.CM.debug("Getting resources running on node %s" % node) rsc_on_node = self.CM.active_resources(node) self.CM.debug("Setting node %s to standby mode" % node) if not self.CM.SetStandbyMode(node, "on"): return self.failure("can't set node %s to standby mode" % node) time.sleep(30) # Allow time for the update to be applied and cause something self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "on": return self.failure("standby status of %s is [%s] but we expect [on]" % (node, status)) self.CM.debug("Checking resources") bad_run = self.CM.active_resources(node) if len(bad_run) > 0: rc = self.failure("%s set to standby, %s is still running on it" % (node, repr(bad_run))) self.CM.debug("Setting node %s to active mode" % node) self.CM.SetStandbyMode(node, "off") return rc self.CM.debug("Setting node %s to active mode" % node) if not self.CM.SetStandbyMode(node, "off"): return self.failure("can't set node %s to active mode" % node) time.sleep(30) # Allow time for the update to be applied and cause something self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "off": return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status)) return self.success() AllTestClasses.append(StandbyTest) ####################################################################### class ValgrindTest(CTSTest): ####################################################################### '''Check for memory leaks''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Valgrind" self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) self.is_valgrind = 1 self.is_loop = 1 def setup(self, node): self.incr("calls") ret=self.stopall(None) if not ret: return self.failure("Stop all nodes failed") # Enable valgrind self.logPat = "/tmp/%s-*.valgrind" % self.name self.CM.Env["valgrind-prefix"] = self.name self.CM.rsh(node, "rm -f %s" % self.logPat, None) ret=self.startall(None) if not ret: return self.failure("Start all nodes failed") for node in self.CM.Env["nodes"]: (rc, output) = self.CM.rsh(node, "ps u --ppid `pidofproc aisexec`", None) for line in output: self.CM.debug(line) return self.success() def teardown(self, node): # Disable valgrind self.CM.Env["valgrind-prefix"] = None # Return all nodes to normal ret=self.stopall(None) if not ret: return self.failure("Stop all nodes failed") return self.success() def find_leaks(self): # Check for leaks leaked = [] self.stop = StopTest(self.CM) for node in self.CM.Env["nodes"]: (rc, ps_out) = self.CM.rsh(node, "ps u --ppid `pidofproc aisexec`", None) rc = self.stop(node) if not rc: self.failure("Couldn't shut down %s" % node) rc = self.CM.rsh(node, "grep -e indirectly.*lost:.*[1-9] -e definitely.*lost:.*[1-9] -e ERROR.*SUMMARY:.*[1-9].*errors %s" % self.logPat, 0) if rc != 1: leaked.append(node) self.failure("Valgrind errors detected on %s" % node) for line in ps_out: self.CM.log(line) (rc, output) = self.CM.rsh(node, "grep -e lost: -e SUMMARY: %s" % self.logPat, None) for line in output: self.CM.log(line) (rc, 
output) = self.CM.rsh(node, "cat %s" % self.logPat, None) for line in output: self.CM.debug(line) self.CM.rsh(node, "rm -f %s" % self.logPat, None) return leaked def __call__(self, node): leaked = self.find_leaks() if len(leaked) > 0: return self.failure("Nodes %s leaked" % repr(leaked)) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [ """cib:.*readCibXmlFile:""", """HA_VALGRIND_ENABLED""" ] ####################################################################### class StandbyLoopTest(ValgrindTest): ####################################################################### '''Check for memory leaks by putting a node in and out of standby for an hour''' def __init__(self, cm): ValgrindTest.__init__(self,cm) self.name="StandbyLoop" def __call__(self, node): lpc = 0 delay = 2 failed = 0 done=time.time() + self.CM.Env["loop-minutes"]*60 while time.time() <= done and not failed: lpc = lpc + 1 time.sleep(delay) if not self.CM.SetStandbyMode(node, "on"): self.failure("can't set node %s to standby mode" % node) failed = lpc time.sleep(delay) if not self.CM.SetStandbyMode(node, "off"): self.failure("can't set node %s to active mode" % node) failed = lpc leaked = self.find_leaks() if failed: return self.failure("Iteration %d failed" % failed) elif len(leaked) > 0: return self.failure("Nodes %s leaked" % repr(leaked)) return self.success() AllTestClasses.append(StandbyLoopTest) ############################################################################## class BandwidthTest(CTSTest): ############################################################################## # Tests should not be cluster-manager-specific # If you need to find out cluster manager configuration to do this, then # it should be added to the generic cluster manager API. '''Test the bandwidth which heartbeat uses''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name = "Bandwidth" self.start = StartTest(cm) self.__setitem__("min",0) self.__setitem__("max",0) self.__setitem__("totalbandwidth",0) self.tempfile = tempfile.mktemp(".cts") self.startall = SimulStartLite(cm) def __call__(self, node): '''Perform the Bandwidth test''' self.incr("calls") if self.CM.upcount()<1: return self.skipped() Path = self.CM.InternalCommConfig() if "ip" not in Path["mediatype"]: return self.skipped() port = Path["port"][0] port = int(port) ret = self.startall(None) if not ret: return self.failure("Test setup failed") time.sleep(5) # We get extra messages right after startup. fstmpfile = "/var/run/band_estimate" dumpcmd = "tcpdump -p -n -c 102 -i any udp port %d > %s 2>&1" \ % (port, fstmpfile) rc = self.CM.rsh(node, dumpcmd) if rc == 0: farfile = "root@%s:%s" % (node, fstmpfile) self.CM.rsh.cp(farfile, self.tempfile) Bandwidth = self.countbandwidth(self.tempfile) if not Bandwidth: self.CM.log("Could not compute bandwidth.") return self.success() intband = int(Bandwidth + 0.5) self.CM.log("...bandwidth: %d bits/sec" % intband) self.Stats["totalbandwidth"] = self.Stats["totalbandwidth"] + Bandwidth if self.Stats["min"] == 0: self.Stats["min"] = Bandwidth if Bandwidth > self.Stats["max"]: self.Stats["max"] = Bandwidth if Bandwidth < self.Stats["min"]: self.Stats["min"] = Bandwidth self.CM.rsh(node, "rm -f %s" % fstmpfile) os.unlink(self.tempfile) return self.success() else: return self.failure("no response from tcpdump command [%d]!" 
% rc) def countbandwidth(self, file): fp = open(file, "r") fp.seek(0) count = 0 sum = 0 while 1: line = fp.readline() if not line: return None if re.search("udp",line) or re.search("UDP,", line): count=count+1 linesplit = string.split(line," ") for j in range(len(linesplit)-1): if linesplit[j]=="udp": break if linesplit[j]=="length:": break try: sum = sum + int(linesplit[j+1]) except ValueError: self.CM.log("Invalid tcpdump line: %s" % line) return None T1 = linesplit[0] timesplit = string.split(T1,":") time2split = string.split(timesplit[2],".") time1 = (long(timesplit[0])*60+long(timesplit[1]))*60+long(time2split[0])+long(time2split[1])*0.000001 break while count < 100: line = fp.readline() if not line: return None if re.search("udp",line) or re.search("UDP,", line): count = count+1 linessplit = string.split(line," ") for j in range(len(linessplit)-1): if linessplit[j] =="udp": break if linesplit[j]=="length:": break try: sum=int(linessplit[j+1])+sum except ValueError: self.CM.log("Invalid tcpdump line: %s" % line) return None T2 = linessplit[0] timesplit = string.split(T2,":") time2split = string.split(timesplit[2],".") time2 = (long(timesplit[0])*60+long(timesplit[1]))*60+long(time2split[0])+long(time2split[1])*0.000001 time = time2-time1 if (time <= 0): return 0 return (sum*8)/time def is_applicable(self): '''BandwidthTest never applicable''' return 0 AllTestClasses.append(BandwidthTest) ################################################################### class ResourceRecover(CTSTest): ################################################################### def __init__(self, cm): CTSTest.__init__(self,cm) self.name="ResourceRecover" self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.max=30 self.rid=None #self.is_unsafe = 1 # these are the values used for the new LRM API call self.action = "asyncmon" self.interval = 0 def __call__(self, node): '''Perform the 'ResourceRecover' test. ''' self.incr("calls") ret = self.startall(None) if not ret: return self.failure("Setup failed") resourcelist = self.CM.active_resources(node) # if there are no resourcelist, return directly if len(resourcelist)==0: self.CM.log("No active resources on %s" % node) return self.skipped() self.rid = self.CM.Env.RandomGen.choice(resourcelist) rsc = None (rc, lines) = self.CM.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): tmp = AuditResource(self.CM, line) if tmp.id == self.rid: rsc = tmp # Handle anonymous clones that get renamed self.rid = rsc.clone_id break if not rsc: return self.failure("Could not find %s in the resource list" % self.rid) self.CM.debug("Shooting %s aka. 
%s" % (rsc.clone_id, rsc.id)) pats = [] pats.append("Updating failcount for %s on .* after .* %s" % (self.rid, self.action)) - if rsc.managed: + if rsc.managed(): pats.append("crmd:.* Performing .* op=%s_stop_0" % self.rid) - if rsc.unique == "1": + if rsc.unique(): pats.append("crmd:.* Performing .* op=%s_start_0" % self.rid) pats.append("crmd:.* LRM operation %s_start_0.*complete" % self.rid) else: # Anonymous clones may get restarted with a different clone number pats.append("crmd:.* Performing .* op=.*_start_0") pats.append("crmd:.* LRM operation .*_start_0.*complete") watch = CTS.LogWatcher(self.CM["LogFileName"], pats, timeout=60) watch.setwatch() self.CM.rsh(node, "crm_resource -F -r %s -H %s &>/dev/null" % (self.rid, node)) watch.lookforall() self.CM.cluster_stable() recovered=self.CM.ResourceLocation(self.rid) if watch.unmatched: return self.failure("Patterns not found: %s" % repr(watch.unmatched)) - elif rsc.unique == "1" and len(recovered) > 1: + elif rsc.unique() and len(recovered) > 1: return self.failure("%s is now active on more than one node: %s"%(self.rid, repr(recovered))) elif len(recovered) > 0: self.CM.debug("%s is running on: %s" %(self.rid, repr(recovered))) - elif rsc.managed == "1": + elif rsc.managed(): return self.failure("%s was not recovered and is inactive" % self.rid) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [ """Updating failcount for %s""" % self.rid, """Unknown operation: fail""", """ERROR: sending stonithRA op to stonithd failed.""", """ERROR: process_lrm_event: LRM operation %s_%s_%d""" % (self.rid, self.action, self.interval), """ERROR: process_graph_event: Action %s_%s_%d .* initiated outside of a transition""" % (self.rid, self.action, self.interval), ] AllTestClasses.append(ResourceRecover) ################################################################### class ComponentFail(CTSTest): ################################################################### def __init__(self, cm): CTSTest.__init__(self,cm) self.name="ComponentFail" self.startall = SimulStartLite(cm) self.complist = cm.Components() self.patterns = [] self.okerrpatterns = [] self.is_unsafe = 1 def __call__(self, node): '''Perform the 'ComponentFail' test. ''' self.incr("calls") self.patterns = [] self.okerrpatterns = [] # start all nodes ret = self.startall(None) if not ret: return self.failure("Setup failed") if not self.CM.cluster_stable(self.CM["StableTime"]): return self.failure("Setup failed - unstable") node_is_dc = self.CM.is_node_dc(node, None) # select a component to kill chosen = self.CM.Env.RandomGen.choice(self.complist) while chosen.dc_only == 1 and node_is_dc == 0: chosen = self.CM.Env.RandomGen.choice(self.complist) self.CM.debug("...component %s (dc=%d,boot=%d)" % (chosen.name, node_is_dc,chosen.triggersreboot)) self.incr(chosen.name) if chosen.name != "aisexec": if self.CM["Name"] != "crm-lha" or chosen.name != "pengine": self.patterns.append(self.CM["Pat:ChildKilled"] %(node, chosen.name)) self.patterns.append(self.CM["Pat:ChildRespawn"] %(node, chosen.name)) self.patterns.extend(chosen.pats) if node_is_dc: self.patterns.extend(chosen.dc_pats) # In an ideal world, this next stuff should be in the "chosen" object as a member function if self.CM["Name"] == "crm-lha" and chosen.triggersreboot: # Make sure the node goes down and then comes back up if it should reboot... 
for other in self.CM.Env["nodes"]: if other != node: self.patterns.append(self.CM["Pat:They_stopped"] %(other, node)) self.patterns.append(self.CM["Pat:Slave_started"] % node) self.patterns.append(self.CM["Pat:Local_started"] % node) if chosen.dc_only: # Sometimes these will be in the log, and sometimes they won't... self.okerrpatterns.append("%s crmd:.*Process %s:.* exited" %(node, chosen.name)) self.okerrpatterns.append("%s crmd:.*I_ERROR.*crmdManagedChildDied" %node) self.okerrpatterns.append("%s crmd:.*The %s subsystem terminated unexpectedly" %(node, chosen.name)) self.okerrpatterns.append("ERROR: Client .* exited with return code") else: # Sometimes this won't be in the log... self.okerrpatterns.append(self.CM["Pat:ChildKilled"] %(node, chosen.name)) self.okerrpatterns.append(self.CM["Pat:ChildRespawn"] %(node, chosen.name)) self.okerrpatterns.append(self.CM["Pat:ChildExit"]) # supply a copy so self.patterns doesnt end up empty tmpPats = [] tmpPats.extend(self.patterns) self.patterns.extend(chosen.badnews_ignore) # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status stonithPats = [] stonithPats.append("sending fencing op RESET for %s" % node) stonith = CTS.LogWatcher(self.CM["LogFileName"], stonithPats, 0) stonith.setwatch() # set the watch for stable watch = CTS.LogWatcher( self.CM["LogFileName"], tmpPats, self.CM["DeadTime"] + self.CM["StableTime"] + self.CM["StartTime"]) watch.setwatch() # kill the component chosen.kill(node) # check to see Heartbeat noticed matched = watch.lookforall(allow_multiple_matches=1) if matched: self.CM.debug("Found: "+ repr(matched)) else: self.CM.log("Patterns not found: " + repr(watch.unmatched)) if self.CM.Env["at-boot"] == 0: self.CM.debug("Checking if %s was shot" % node) shot = stonith.look(60) if shot: self.CM.debug("Found: "+ repr(shot)) self.CM.ShouldBeStatus[node]="down" self.CM.debug("Waiting for the cluster to recover") self.CM.cluster_stable() self.CM.debug("Waiting for any STONITHd node to come back up") self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"], 600) self.CM.debug("Waiting for the cluster to re-stabilize with all nodes") is_stable = self.CM.cluster_stable(self.CM["StartTime"]) if not matched: return self.failure("Didn't find all expected patterns") elif not is_stable: return self.failure("Cluster did not become stable") return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' # Note that okerrpatterns refers to the last time we ran this test # The good news is that this works fine for us... self.okerrpatterns.extend(self.patterns) return self.okerrpatterns AllTestClasses.append(ComponentFail) #################################################################### class SplitBrainTest(CTSTest): #################################################################### '''It is used to test split-brain. 
When the path between the two nodes breaks, check whether both nodes take over the resources.''' def __init__(self,cm): CTSTest.__init__(self,cm) self.name = "SplitBrain" self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.is_experimental = 1 def isolate_partition(self, partition): other_nodes = [] other_nodes.extend(self.CM.Env["nodes"]) for node in partition: try: other_nodes.remove(node) except ValueError: self.CM.log("Node "+node+" not in " + repr(self.CM.Env["nodes"]) + " from " +repr(partition)) if len(other_nodes) == 0: return 1 self.CM.debug("Creating partition: " + repr(partition)) self.CM.debug("Everyone else: " + repr(other_nodes)) for node in partition: if not self.CM.isolate_node(node, other_nodes): self.CM.log("Could not isolate %s" % node) return 0 return 1 def heal_partition(self, partition): other_nodes = [] other_nodes.extend(self.CM.Env["nodes"]) for node in partition: try: other_nodes.remove(node) except ValueError: self.CM.log("Node "+node+" not in " + repr(self.CM.Env["nodes"])) if len(other_nodes) == 0: return 1 self.CM.debug("Healing partition: " + repr(partition)) self.CM.debug("Everyone else: " + repr(other_nodes)) for node in partition: self.CM.unisolate_node(node, other_nodes) def __call__(self, node): '''Perform split-brain test''' self.incr("calls") self.passed = 1 partitions = {} ret = self.startall(None) if not ret: return self.failure("Setup failed") while 1: # Retry until we get multiple partitions partitions = {} p_max = len(self.CM.Env["nodes"]) for node in self.CM.Env["nodes"]: p = self.CM.Env.RandomGen.randint(1, p_max) if not partitions.has_key(p): partitions[p]= [] partitions[p].append(node) p_max = len(partitions.keys()) if p_max > 1: break # else, try again self.CM.debug("Created %d partitions" % p_max) for key in partitions.keys(): self.CM.debug("Partition["+str(key)+"]:\t"+repr(partitions[key])) # Disabling STONITH to reduce test complexity for now self.CM.rsh(node, "crm_attribute -n stonith-enabled -v false") for key in partitions.keys(): self.isolate_partition(partitions[key]) count = 30 while count > 0: if len(self.CM.find_partitions()) != p_max: time.sleep(10) count -= 1 else: break else: self.failure("Expected partitions were not created") # Target number of partitions formed - wait for stability if not self.CM.cluster_stable(): self.failure("Partitioned cluster not stable") # Now audit the cluster state self.CM.partitions_expected = p_max if not self.audit(): self.failure("Audits failed") self.CM.partitions_expected = 1 # And heal them again for key in partitions.keys(): self.heal_partition(partitions[key]) # Wait for a single partition to form count = 30 while count > 0: if len(self.CM.find_partitions()) != 1: time.sleep(10) count -= 1 else: break else: self.failure("Cluster did not reform") # Wait for it to have the right number of members count = 30 while count > 0: members = [] partitions = self.CM.find_partitions() if len(partitions) > 0: members = partitions[0].split() if len(members) != len(self.CM.Env["nodes"]): time.sleep(10) count -= 1 else: break else: self.failure("Cluster did not completely reform") # Wait up to 20 minutes - the delay is preferable to # trying to continue in a messed-up state if not self.CM.cluster_stable(1200): self.failure("Reformed cluster not stable") answer = raw_input('Continue?
[nY]') if answer and answer == "n": raise ValueError("Reformed cluster not stable") # Turn fencing back on if self.CM.Env["DoStonith"]: self.CM.rsh(node, "crm_attribute -D -n stonith-enabled") self.CM.cluster_stable() if self.passed: return self.success() return self.failure("See previous errors") def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [ "Another DC detected:", "ERROR: attrd_cib_callback: .*Application of an update diff failed", "crmd_ha_msg_callback:.*not in our membership list", "CRIT:.*node.*returning after partition", ] def is_applicable(self): if not self.is_applicable_common(): return 0 return len(self.CM.Env["nodes"]) > 2 AllTestClasses.append(SplitBrainTest) #################################################################### class Reattach(CTSTest): #################################################################### def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Reattach" self.startall = SimulStartLite(cm) self.restart1 = RestartTest(cm) self.stopall = SimulStopLite(cm) self.is_unsafe = 0 # Handled by canrunnow() def setup(self, node): return self.startall(None) def canrunnow(self, node): '''Return TRUE if we can meaningfully run right now''' if self.find_ocfs2_resources(node): self.CM.log("Detach/Reattach scenarios are not possible with OCFS2 services present") return 0 return 1 def __call__(self, node): self.incr("calls") pats = [] managed = CTS.LogWatcher(self.CM["LogFileName"], ["is-managed-default"], timeout=60) managed.setwatch() self.CM.debug("Disable resource management") self.CM.rsh(node, "crm_attribute -n is-managed-default -v false") if not managed.lookforall(): self.CM.log("Patterns not found: " + repr(managed.unmatched)) return self.failure("Resource management not disabled") pats = [] pats.append("crmd:.*Performing.*_stop_0") pats.append("crmd:.*Performing.*_start_0") pats.append("crmd:.*Performing.*_promote_0") pats.append("crmd:.*Performing.*_demote_0") pats.append("crmd:.*Performing.*_migrate_.*_0") watch = CTS.LogWatcher(self.CM["LogFileName"], pats, timeout=60) watch.setwatch() self.CM.debug("Shutting down the cluster") ret = self.stopall(None) if not ret: self.CM.debug("Re-enable resource management") self.CM.rsh(node, "crm_attribute -D -n is-managed-default") return self.failure("Couldn't shut down the cluster") self.CM.debug("Bringing the cluster back up") ret = self.startall(None) if not ret: self.CM.debug("Re-enable resource management") self.CM.rsh(node, "crm_attribute -D -n is-managed-default") return self.failure("Couldn't restart the cluster") if self.local_badnews("ResourceActivity:", watch): self.CM.debug("Re-enable resource management") self.CM.rsh(node, "crm_attribute -D -n is-managed-default") return self.failure("Resources stopped or started during cluster restart") watch = CTS.LogWatcher(self.CM["LogFileName"], pats, timeout=60) watch.setwatch() managed = CTS.LogWatcher(self.CM["LogFileName"], ["is-managed-default"], timeout=60) managed.setwatch() self.CM.debug("Re-enable resource management") self.CM.rsh(node, "crm_attribute -D -n is-managed-default") if not managed.lookforall(): self.CM.log("Patterns not found: " + repr(managed.unmatched)) return self.failure("Resource management not enabled") self.CM.cluster_stable() # Ignore actions for STONITH resources ignore = [] (rc, lines) = self.CM.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): r = AuditResource(self.CM, line) if r.rclass == "stonith": self.CM.debug("Ignoring: 
crmd:.*Performing.*op=%s_.*_0" % r.id) ignore.append("crmd:.*Performing.*op=%s_.*_0" % r.id) if self.local_badnews("ResourceActivity:", watch, ignore): return self.failure("Resources stopped or started after resource management was re-enabled") return ret def errorstoignore(self): '''Return list of errors which should be ignored''' return [ "You may ignore this error if it is unmanaged.", "pingd: .*ERROR: send_ipc_message:", "pingd: .*ERROR: send_update:", ] def is_applicable(self): if self.CM["Name"] == "crm-lha": return None return 1 AllTestClasses.append(Reattach) #################################################################### class SpecialTest1(CTSTest): #################################################################### '''Set up a custom test to cause quorum failure issues for Andrew''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SpecialTest1" self.startall = SimulStartLite(cm) self.restart1 = RestartTest(cm) self.stopall = SimulStopLite(cm) def __call__(self, node): '''Perform the 'SpecialTest1' test for Andrew. ''' self.incr("calls") # Shut down all the nodes... ret = self.stopall(None) if not ret: return ret # Start the selected node ret = self.restart1(node) if not ret: return ret # Start all remaining nodes ret = self.startall(None) return ret AllTestClasses.append(SpecialTest1) #################################################################### class HAETest(CTSTest): #################################################################### '''Set up a custom test to cause quorum failure issues for Andrew''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="HAETest" self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) self.is_loop = 1 def setup(self, node): # Start all remaining nodes ret = self.startall(None) if not ret: return self.failure("Couldn't start all nodes") return self.success() def teardown(self, node): # Stop everything ret = self.stopall(None) if not ret: return self.failure("Couldn't stop all nodes") return self.success() def wait_on_state(self, node, resource, expected_clones, attempts=240): while attempts > 0: active=0 (rc, lines) = self.CM.rsh(node, "crm_resource -r %s -W -Q" % resource, stdout=None) # Hack until crm_resource does the right thing if rc == 0 and lines: active = len(lines) if len(lines) == expected_clones: return 1 elif rc == 1: self.CM.debug("Resource %s is still inactive" % resource) elif rc == 234: self.CM.log("Unknown resource %s" % resource) return 0 elif rc == 246: self.CM.log("Cluster is inactive") return 0 elif rc != 0: self.CM.log("Call to crm_resource failed, rc=%d" % rc) return 0 else: self.CM.debug("Resource %s is active on %d times instead of %d" % (resource, active, expected_clones)) attempts -= 1 time.sleep(1) return 0 def find_dlm(self, node): self.r_dlm = None (rc, lines) = self.CM.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): r = AuditResource(self.CM, line) if r.rtype == "controld" and r.parent != "NA": self.CM.debug("Found dlm: %s" % self.r_dlm) self.r_dlm = r.parent return 1 return 0 def find_hae_resources(self, node): self.r_dlm = None self.r_o2cb = None self.r_ocfs2 = [] if self.find_dlm(node): self.find_ocfs2_resources(node) def is_applicable(self): if not self.is_applicable_common(): return 0 if self.CM.Env["Schema"] == "hae": return 1 return None #################################################################### class HAERoleTest(HAETest): #################################################################### def __init__(self, cm): 
'''Lars' mount/unmount test for the HA extension. ''' HAETest.__init__(self,cm) self.name="HAERoleTest" def change_state(self, node, resource, target): rc = self.CM.rsh(node, "crm_resource -r %s -p target-role -v %s --meta" % (resource, target)) return rc def __call__(self, node): self.incr("calls") lpc = 0 failed = 0 delay = 2 done=time.time() + self.CM.Env["loop-minutes"]*60 self.find_hae_resources(node) clone_max = len(self.CM.Env["nodes"]) while time.time() <= done and not failed: lpc = lpc + 1 self.change_state(node, self.r_dlm, "Stopped") if not self.wait_on_state(node, self.r_dlm, 0): self.failure("%s did not go down correctly" % self.r_dlm) failed = lpc self.change_state(node, self.r_dlm, "Started") if not self.wait_on_state(node, self.r_dlm, clone_max): self.failure("%s did not come up correctly" % self.r_dlm) failed = lpc if not self.wait_on_state(node, self.r_o2cb, clone_max): self.failure("%s did not come up correctly" % self.r_o2cb) failed = lpc for fs in self.r_ocfs2: if not self.wait_on_state(node, fs, clone_max): self.failure("%s did not come up correctly" % fs) failed = lpc if failed: return self.failure("iteration %d failed" % failed) return self.success() AllTestClasses.append(HAERoleTest) #################################################################### class HAEStandbyTest(HAETest): #################################################################### '''Set up a custom test to cause quorum failure issues for Andrew''' def __init__(self, cm): HAETest.__init__(self,cm) self.name="HAEStandbyTest" def change_state(self, node, resource, target): rc = self.CM.rsh(node, "crm_standby -l reboot -v %s" % (target)) return rc def __call__(self, node): self.incr("calls") lpc = 0 failed = 0 done=time.time() + self.CM.Env["loop-minutes"]*60 self.find_hae_resources(node) clone_max = len(self.CM.Env["nodes"]) while time.time() <= done and not failed: lpc = lpc + 1 self.change_state(node, self.r_dlm, "true") if not self.wait_on_state(node, self.r_dlm, clone_max-1): self.failure("%s did not go down correctly" % self.r_dlm) failed = lpc self.change_state(node, self.r_dlm, "false") if not self.wait_on_state(node, self.r_dlm, clone_max): self.failure("%s did not come up correctly" % self.r_dlm) failed = lpc if not self.wait_on_state(node, self.r_o2cb, clone_max): self.failure("%s did not come up correctly" % self.r_o2cb) failed = lpc for fs in self.r_ocfs2: if not self.wait_on_state(node, fs, clone_max): self.failure("%s did not come up correctly" % fs) failed = lpc if failed: return self.failure("iteration %d failed" % failed) return self.success() AllTestClasses.append(HAEStandbyTest) ################################################################### class NearQuorumPointTest(CTSTest): ################################################################### ''' This test brings larger clusters near the quorum point (50%). In addition, it will test doing starts and stops at the same time. Here is how I think it should work: - loop over the nodes and decide randomly which will be up and which will be down Use a 50% probability for each of up/down. - figure out what to do to get into that state from the current state - in parallel, bring up those going up and bring those going down. ''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="NearQuorumPoint" def __call__(self, dummy): '''Perform the 'NearQuorumPoint' test. 
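        For example, on a four-node cluster one random draw might produce
        (illustrative values only):

            startset = ["node1", "node3"]   # started if currently down
            stopset  = ["node2", "node4"]   # stopped if currently up

        Nodes that are already in their chosen state are left alone, and the
        remaining starts and stops are issued with the non-blocking
        StartaCMnoBlock/StopaCMnoBlock calls so that they overlap.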
''' self.incr("calls") startset = [] stopset = [] #decide what to do with each node for node in self.CM.Env["nodes"]: action = self.CM.Env.RandomGen.choice(["start","stop"]) #action = self.CM.Env.RandomGen.choice(["start","stop","no change"]) if action == "start" : startset.append(node) elif action == "stop" : stopset.append(node) self.CM.debug("start nodes:" + repr(startset)) self.CM.debug("stop nodes:" + repr(stopset)) #add search patterns watchpats = [ ] for node in stopset: if self.CM.ShouldBeStatus[node] == "up": watchpats.append(self.CM["Pat:We_stopped"] % node) for node in startset: if self.CM.ShouldBeStatus[node] == "down": #watchpats.append(self.CM["Pat:Slave_started"] % node) watchpats.append(self.CM["Pat:Local_started"] % node) else: for stopping in stopset: if self.CM.ShouldBeStatus[stopping] == "up": watchpats.append(self.CM["Pat:They_stopped"] % (node, stopping)) if len(watchpats) == 0: return self.skipped() if len(startset) != 0: watchpats.append(self.CM["Pat:DC_IDLE"]) watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats , timeout=self.CM["DeadTime"]+10) watch.setwatch() #begin actions for node in stopset: if self.CM.ShouldBeStatus[node] == "up": self.CM.StopaCMnoBlock(node) for node in startset: if self.CM.ShouldBeStatus[node] == "down": self.CM.StartaCMnoBlock(node) #get the result if watch.lookforall(): self.CM.cluster_stable() return self.success() self.CM.log("Warn: Patterns not found: " + repr(watch.unmatched)) #get the "bad" nodes upnodes = [] for node in stopset: if self.CM.StataCM(node) == 1: upnodes.append(node) downnodes = [] for node in startset: if self.CM.StataCM(node) == 0: downnodes.append(node) if upnodes == [] and downnodes == []: self.CM.cluster_stable() return self.success() if len(upnodes) > 0: self.CM.log("Warn: Unstoppable nodes: " + repr(upnodes)) if len(downnodes) > 0: self.CM.log("Warn: Unstartable nodes: " + repr(downnodes)) return self.failure() AllTestClasses.append(NearQuorumPointTest) ################################################################### class RollingUpgradeTest(CTSTest): ################################################################### '''Perform a rolling upgrade of the cluster''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="RollingUpgrade" self.start = StartTest(cm) self.stop = StopTest(cm) self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) def setup(self, node): # Start all remaining nodes ret = self.stopall(None) if not ret: return self.failure("Couldn't stop all nodes") for node in self.CM.Env["nodes"]: if not self.downgrade(node, None): return self.failure("Couldn't downgrade %s" % node) ret = self.startall(None) if not ret: return self.failure("Couldn't start all nodes") return self.success() def teardown(self, node): # Stop everything ret = self.stopall(None) if not ret: return self.failure("Couldn't stop all nodes") for node in self.CM.Env["nodes"]: if not self.upgrade(node, None): return self.failure("Couldn't upgrade %s" % node) return self.success() def install(self, node, version, start=1, flags="--force"): target_dir = "/tmp/rpm-%s" % version src_dir = "%s/%s" % (self.CM.Env["rpm-dir"], version) self.CM.log("Installing %s on %s with %s" % (version, node, flags)) if not self.stop(node): return self.failure("stop failure: "+node) rc = self.CM.rsh(node, "mkdir -p %s" % target_dir) rc = self.CM.rsh(node, "rm -f %s/*.rpm" % target_dir) (rc, lines) = self.CM.rsh(node, "ls -1 %s/*.rpm" % src_dir, None) for line in lines: line = line[:-1] rc = self.CM.rsh.cp("%s" % (line), "%s:%s/" % 
(node, target_dir)) rc = self.CM.rsh(node, "rpm -Uvh %s %s/*.rpm" % (flags, target_dir)) if start and not self.start(node): return self.failure("start failure: "+node) return self.success() def upgrade(self, node, start=1): return self.install(node, self.CM.Env["current-version"], start) def downgrade(self, node, start=1): return self.install(node, self.CM.Env["previous-version"], start, "--force --nodeps") def __call__(self, node): '''Perform the 'Rolling Upgrade' test. ''' self.incr("calls") for node in self.CM.Env["nodes"]: if self.upgrade(node): return self.failure("Couldn't upgrade %s" % node) self.CM.cluster_stable() return self.success() def is_applicable(self): if not self.is_applicable_common(): return None if not self.CM.Env.has_key("rpm-dir"): return None if not self.CM.Env.has_key("current-version"): return None if not self.CM.Env.has_key("previous-version"): return None return 1 # Register RestartTest as a good test to run AllTestClasses.append(RollingUpgradeTest) ################################################################### class BSC_AddResource(CTSTest): ################################################################### '''Add a resource to the cluster''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name="AddResource" self.resource_offset = 0 self.cib_cmd="""cibadmin -C -o %s -X '%s' """ def __call__(self, node): self.incr("calls") self.resource_offset = self.resource_offset + 1 r_id = "bsc-rsc-%s-%d" % (node, self.resource_offset) start_pat = "crmd.*%s_start_0.*complete" patterns = [] patterns.append(start_pat % r_id) watch = CTS.LogWatcher( self.CM["LogFileName"], patterns, self.CM["DeadTime"]) watch.setwatch() fields = string.split(self.CM.Env["IPBase"], '.') fields[3] = str(int(fields[3])+1) ip = string.join(fields, '.') self.CM.Env["IPBase"] = ip if not self.make_ip_resource(node, r_id, "ocf", "IPaddr", ip): return self.failure("Make resource %s failed" % r_id) failed = 0 watch_result = watch.lookforall() if watch.unmatched: for regex in watch.unmatched: self.CM.log ("Warn: Pattern not found: %s" % (regex)) failed = 1 if failed: return self.failure("Resource pattern(s) not found") if not self.CM.cluster_stable(self.CM["DeadTime"]): return self.failure("Unstable cluster") return self.success() def make_ip_resource(self, node, id, rclass, type, ip): self.CM.log("Creating %s::%s:%s (%s) on %s" % (rclass,type,id,ip,node)) rsc_xml=""" """ % (id, rclass, type, id, id, ip) node_constraint=""" """ % (id, id, id, id, node) rc = 0 (rc, lines) = self.CM.rsh(node, self.cib_cmd % ("constraints", node_constraint), None) if rc != 0: self.CM.log("Constraint creation failed: %d" % rc) return None (rc, lines) = self.CM.rsh(node, self.cib_cmd % ("resources", rsc_xml), None) if rc != 0: self.CM.log("Resource creation failed: %d" % rc) return None return 1 def is_applicable(self): if self.CM.Env["DoBSC"]: return 1 return None class SimulStopLite(CTSTest): ################################################################### '''Stop any active nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStopLite" def __call__(self, dummy): '''Perform the 'SimulStopLite' setup work. ''' self.incr("calls") self.CM.debug("Setup: " + self.name) # We ignore the "node" parameter... 
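# SimulStopLite follows the watch-then-act idiom used throughout this file:
# the log watch is registered before any stop is issued, so no shutdown
# message can be missed, and if the watch still times out the test falls back
# to querying each node's state directly. A condensed sketch (the
# 'nodes_to_stop' name is illustrative only):
#
#   watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats,
#                          timeout=self.CM["DeadTime"] + 10)
#   watch.setwatch()                      # register the watch first
#   for node in nodes_to_stop:
#       self.CM.StopaCMnoBlock(node)      # then fire all the stops at once
#   if not watch.lookforall():
#       still_up = [n for n in nodes_to_stop if self.CM.StataCM(n) == 1]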
watchpats = [ ] for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == "up": self.incr("WasStarted") watchpats.append(self.CM["Pat:All_stopped"] % node) #if self.CM.Env["use_logd"]: # watchpats.append(self.CM["Pat:Logd_stopped"] % node) if len(watchpats) == 0: self.CM.clear_all_caches() return self.success() # Stop all the nodes - at about the same time... watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats , timeout=self.CM["DeadTime"]+10) watch.setwatch() self.starttime=time.time() for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == "up": self.CM.StopaCMnoBlock(node) if watch.lookforall(): self.CM.clear_all_caches() return self.success() did_fail=0 up_nodes = [] for node in self.CM.Env["nodes"]: if self.CM.StataCM(node) == 1: did_fail=1 up_nodes.append(node) if did_fail: return self.failure("Active nodes exist: " + repr(up_nodes)) self.CM.log("Warn: All nodes stopped but CTS didnt detect: " + repr(watch.unmatched)) self.CM.clear_all_caches() return self.failure("Missing log message: "+repr(watch.unmatched)) def is_applicable(self): '''SimulStopLite is a setup test and never applicable''' return 0 ################################################################### class SimulStartLite(CTSTest): ################################################################### '''Start any stopped nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStartLite" def __call__(self, dummy): '''Perform the 'SimulStartList' setup work. ''' self.incr("calls") self.CM.debug("Setup: " + self.name) # We ignore the "node" parameter... watchpats = [ ] uppat = self.CM["Pat:Slave_started"] if self.CM.upcount() == 0: uppat = self.CM["Pat:Local_started"] for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == "down": self.incr("WasStopped") watchpats.append(uppat % node) if len(watchpats) == 0: return self.success() watchpats.append(self.CM["Pat:DC_IDLE"]) # Start all the nodes - at about the same time... 
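# If the combined watch below fails, the code distinguishes two cases before
# giving up: nodes that apparently never started (StataCM(node) == 0) and
# nodes that started but never settled (node_stable(node) is false). A
# condensed sketch of that triage, using the same calls as the code below:
#
#   not_started = [n for n in self.CM.Env["nodes"] if self.CM.StataCM(n) == 0]
#   unstable = [n for n in self.CM.Env["nodes"] if not self.CM.node_stable(n)]
#   if not_started:
#       return self.failure("Unstarted nodes exist: " + repr(not_started))
#   if unstable:
#       return self.failure("Unstable cluster nodes exist: " + repr(unstable))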
watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats , timeout=self.CM["DeadTime"]+10) watch.setwatch() self.starttime=time.time() for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == "down": self.CM.StartaCMnoBlock(node) if watch.lookforall(): for attempt in (1, 2, 3, 4, 5): if self.CM.cluster_stable(): return self.success() return self.failure("Cluster did not stabilize") did_fail=0 unstable = [] for node in self.CM.Env["nodes"]: if self.CM.StataCM(node) == 0: did_fail=1 unstable.append(node) if did_fail: return self.failure("Unstarted nodes exist: " + repr(unstable)) unstable = [] for node in self.CM.Env["nodes"]: if not self.CM.node_stable(node): did_fail=1 unstable.append(node) if did_fail: return self.failure("Unstable cluster nodes exist: " + repr(unstable)) self.CM.log("ERROR: All nodes started but CTS didnt detect: " + repr(watch.unmatched)) return self.failure() def is_applicable(self): '''SimulStartLite is a setup test and never applicable''' return 0 def TestList(cm, audits): result = [] for testclass in AllTestClasses: bound_test = testclass(cm) if bound_test.is_applicable(): bound_test.Audits = audits result.append(bound_test) return result diff --git a/tools/crm_resource.c b/tools/crm_resource.c index 77fde2dc32..90793bf579 100644 --- a/tools/crm_resource.c +++ b/tools/crm_resource.c @@ -1,1500 +1,1494 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include gboolean do_force = FALSE; gboolean BE_QUIET = FALSE; const char *attr_set_type = XML_TAG_ATTR_SETS; char *host_id = NULL; const char *rsc_id = NULL; const char *host_uname = NULL; const char *prop_name = NULL; const char *prop_value = NULL; const char *rsc_type = NULL; const char *prop_id = NULL; const char *prop_set = NULL; char *migrate_lifetime = NULL; char rsc_cmd = 'L'; char *our_pid = NULL; IPC_Channel *crmd_channel = NULL; char *xml_file = NULL; int cib_options = cib_sync_call; #define CMD_ERR(fmt, args...) 
do { \ crm_warn(fmt, ##args); \ fprintf(stderr, fmt, ##args); \ } while(0) static int do_find_resource(const char *rsc, pe_working_set_t *data_set) { int found = 0; resource_t *the_rsc = pe_find_resource(data_set->resources, rsc); if(the_rsc == NULL) { return cib_NOTEXISTS; } slist_iter(node, node_t, the_rsc->running_on, lpc, crm_debug_3("resource %s is running on: %s", rsc, node->details->uname); if(BE_QUIET) { fprintf(stdout, "%s\n", node->details->uname); } else { fprintf(stdout, "resource %s is running on: %s\n", rsc, node->details->uname); } found++; ); if(BE_QUIET == FALSE && found == 0) { fprintf(stderr, "resource %s is NOT running\n", rsc); } return 0; } #define cons_string(x) x?x:"NA" static void print_cts_constraints(pe_working_set_t *data_set) { xmlNode *lifetime = NULL; xmlNode * cib_constraints = get_object_root(XML_CIB_TAG_CONSTRAINTS, data_set->input); xml_child_iter(cib_constraints, xml_obj, const char *id = crm_element_value(xml_obj, XML_ATTR_ID); if(id == NULL) { continue; } lifetime = first_named_child(xml_obj, "lifetime"); if(test_ruleset(lifetime, NULL, data_set->now) == FALSE) { continue; } if(safe_str_eq(XML_CONS_TAG_RSC_DEPEND, crm_element_name(xml_obj))) { printf("Constraint %s %s %s %s %s %s %s\n", crm_element_name(xml_obj), cons_string(crm_element_value(xml_obj, XML_ATTR_ID)), cons_string(crm_element_value(xml_obj, XML_COLOC_ATTR_SOURCE)), cons_string(crm_element_value(xml_obj, XML_COLOC_ATTR_TARGET)), cons_string(crm_element_value(xml_obj, XML_RULE_ATTR_SCORE)), cons_string(crm_element_value(xml_obj, XML_COLOC_ATTR_SOURCE_ROLE)), cons_string(crm_element_value(xml_obj, XML_COLOC_ATTR_TARGET_ROLE))); } else if(safe_str_eq(XML_CONS_TAG_RSC_LOCATION, crm_element_name(xml_obj))) { /* unpack_rsc_location(xml_obj, data_set); */ } ); } static void print_cts_rsc(resource_t *rsc) { const char *host = NULL; gboolean needs_quorum = TRUE; - const char *p_id = "NA"; + const char *rtype = crm_element_value(rsc->xml, XML_ATTR_TYPE); const char *rprov = crm_element_value(rsc->xml, XML_AGENT_ATTR_PROVIDER); const char *rclass = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); - const char *rtype = crm_element_value(rsc->xml, XML_ATTR_TYPE); - - if(rsc->parent) { - p_id = rsc->parent->id; - } if(safe_str_eq(rclass, "stonith")) { needs_quorum = FALSE; } else { xml_child_iter_filter(rsc->ops_xml, op, "op", const char *name = crm_element_value(op, "name"); if(safe_str_neq(name, CRMD_ACTION_START)) { const char *value = crm_element_value(op, "requires"); if(safe_str_eq(value, "nothing")) { needs_quorum = FALSE; } break; } ); } if(rsc->running_on != NULL && g_list_length(rsc->running_on) == 1) { node_t *tmp = rsc->running_on->data; host = tmp->details->uname; } - printf("Resource: %s %s %s %s %d %d %d %s %s %s %s\n", + printf("Resource: %s %s %s %s %s %s %s %s %d %lld 0x%.16llx\n", crm_element_name(rsc->xml), rsc->id, - rsc->clone_name?rsc->clone_name:rsc->id, p_id, - is_set(rsc->flags, pe_rsc_managed), needs_quorum, - is_set(rsc->flags, pe_rsc_unique), rprov?rprov:"NA", rclass, rtype, host?host:"NA"); + rsc->clone_name?rsc->clone_name:rsc->id, rsc->parent?rsc->parent->id:"NA", + rprov?rprov:"NA", rclass, rtype, host?host:"NA", needs_quorum, rsc->flags, rsc->flags); slist_iter(child, resource_t, rsc->children, lpc, print_cts_rsc(child); ); } static void print_raw_rsc(resource_t *rsc) { GListPtr children = rsc->children; if(children == NULL) { printf("%s\n", rsc->id); } slist_iter(child, resource_t, children, lpc, print_raw_rsc(child); ); } static int 
do_find_resource_list(pe_working_set_t *data_set, gboolean raw) { int found = 0; slist_iter( rsc, resource_t, data_set->resources, lpc, if(is_set(rsc->flags, pe_rsc_orphan) && rsc->fns->active(rsc, TRUE) == FALSE) { continue; } rsc->fns->print( rsc, NULL, pe_print_printf|pe_print_rsconly, stdout); found++; ); if(found == 0) { printf("NO resources configured\n"); return cib_NOTEXISTS; } return 0; } static resource_t *find_rsc_or_clone(const char *rsc, pe_working_set_t *data_set) { resource_t *the_rsc = pe_find_resource(data_set->resources, rsc); if(the_rsc == NULL) { char *as_clone = crm_concat(rsc, "0", ':'); the_rsc = pe_find_resource(data_set->resources, as_clone); crm_free(as_clone); } return the_rsc; } static int dump_resource(const char *rsc, pe_working_set_t *data_set) { char *rsc_xml = NULL; resource_t *the_rsc = find_rsc_or_clone(rsc, data_set); if(the_rsc == NULL) { return cib_NOTEXISTS; } the_rsc->fns->print(the_rsc, NULL, pe_print_printf, stdout); rsc_xml = dump_xml_formatted(the_rsc->xml); fprintf(stdout, "raw xml:\n%s", rsc_xml); crm_free(rsc_xml); return 0; } static int dump_resource_attr( const char *rsc, const char *attr, pe_working_set_t *data_set) { int rc = cib_NOTEXISTS; node_t *current = NULL; GHashTable *params = NULL; resource_t *the_rsc = find_rsc_or_clone(rsc, data_set); const char *value = NULL; if(the_rsc == NULL) { return cib_NOTEXISTS; } if(g_list_length(the_rsc->running_on) == 1) { current = the_rsc->running_on->data; } else if(g_list_length(the_rsc->running_on) > 1) { CMD_ERR("%s is active on more than one node," " returning the default value for %s\n", the_rsc->id, crm_str(value)); } params = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); if(safe_str_eq(attr_set_type, XML_TAG_ATTR_SETS)) { get_rsc_attributes(params, the_rsc, current, data_set); } else { get_meta_attributes(params, the_rsc, current, data_set); } crm_debug("Looking up %s in %s", attr, the_rsc->id); value = g_hash_table_lookup(params, attr); if(value != NULL) { fprintf(stdout, "%s\n", value); rc = 0; } g_hash_table_destroy(params); return rc; } static int find_resource_attr( cib_t *the_cib, const char *attr, const char *rsc, const char *set_type, const char *set_name, const char *attr_id, const char *attr_name, char **value) { int offset = 0; static int xpath_max = 1024; enum cib_errors rc = cib_ok; xmlNode *xml_search = NULL; char *xpath_string = NULL; CRM_ASSERT(value != NULL); *value = NULL; crm_malloc0(xpath_string, xpath_max); offset += snprintf(xpath_string + offset, xpath_max - offset, "%s", get_object_path("resources")); offset += snprintf(xpath_string + offset, xpath_max - offset, "//*[@id=\"%s\"]", rsc); if(set_type) { offset += snprintf(xpath_string + offset, xpath_max - offset, "//%s", set_type); if(set_name) { offset += snprintf(xpath_string + offset, xpath_max - offset, "[@id=\"%s\"]", set_name); } } offset += snprintf(xpath_string + offset, xpath_max - offset, "//nvpair["); if(attr_id) { offset += snprintf(xpath_string + offset, xpath_max - offset, "@id=\"%s\"", attr_id); } if(attr_name) { if(attr_id) { offset += snprintf(xpath_string + offset, xpath_max - offset, " and "); } offset += snprintf(xpath_string + offset, xpath_max - offset, "@name=\"%s\"", attr_name); } offset += snprintf(xpath_string + offset, xpath_max - offset, "]"); rc = the_cib->cmds->query( the_cib, xpath_string, &xml_search, cib_sync_call|cib_scope_local|cib_xpath); if(rc != cib_ok) { return rc; } crm_log_xml_debug(xml_search, "Match"); if(xml_has_children(xml_search)) 
{ rc = cib_missing_data; printf("Multiple attributes match name=%s\n", attr_name); xml_child_iter(xml_search, child, printf(" Value: %s \t(id=%s)\n", crm_element_value(child, XML_NVPAIR_ATTR_VALUE), ID(child)); ); } else { const char *tmp = crm_element_value(xml_search, attr); if(tmp) { *value = crm_strdup(tmp); } } free_xml(xml_search); return rc; } static int set_resource_attr(const char *rsc_id, const char *attr_set, const char *attr_id, const char *attr_name, const char *attr_value, cib_t *cib, pe_working_set_t *data_set) { int rc = cib_ok; char *local_attr_id = NULL; char *local_attr_set = NULL; xmlNode *xml_top = NULL; xmlNode *xml_obj = NULL; gboolean use_attributes_tag = FALSE; resource_t *rsc = find_rsc_or_clone(rsc_id, data_set); if(rsc == NULL) { return cib_NOTEXISTS; } if(safe_str_eq(attr_set_type, XML_TAG_ATTR_SETS)) { rc = find_resource_attr( cib, XML_ATTR_ID, rsc_id, XML_TAG_META_SETS, attr_set, attr_id, attr_name, &local_attr_id); if(rc == cib_ok) { printf("WARNING: There is already a meta attribute called %s (id=%s)\n", attr_name, local_attr_id); } } rc = find_resource_attr( cib, XML_ATTR_ID, rsc_id, attr_set_type, attr_set, attr_id, attr_name, &local_attr_id); if(rc == cib_ok) { crm_debug("Found a match for name=%s: id=%s", attr_name, local_attr_id); attr_id = local_attr_id; } else if(rc != cib_NOTEXISTS) { return rc; } else { const char *value = NULL; xmlNode *cib_top = NULL; const char *tag = crm_element_name(rsc->xml); rc = cib->cmds->query(cib, "/cib", &cib_top, cib_sync_call|cib_scope_local|cib_xpath|cib_no_children); value = crm_element_value(cib_top, "ignore_dtd"); if(value != NULL) { use_attributes_tag = TRUE; } else { value = crm_element_value(cib_top, XML_ATTR_VALIDATION); if(value && strstr(value, "-0.6")) { use_attributes_tag = TRUE; } } free_xml(cib_top); if(attr_set == NULL) { local_attr_set = crm_concat(rsc_id, attr_set_type, '-'); attr_set = local_attr_set; } if(attr_id == NULL) { local_attr_id = crm_concat(attr_set, attr_name, '-'); attr_id = local_attr_id; } if(use_attributes_tag && safe_str_eq(tag, XML_CIB_TAG_MASTER)) { tag = "master_slave"; /* use the old name */ } xml_top = create_xml_node(NULL, tag); crm_xml_add(xml_top, XML_ATTR_ID, rsc_id); xml_obj = create_xml_node(xml_top, attr_set_type); crm_xml_add(xml_obj, XML_ATTR_ID, attr_set); if(use_attributes_tag) { xml_obj = create_xml_node(xml_obj, XML_TAG_ATTRS); } } xml_obj = create_xml_node(xml_obj, XML_CIB_TAG_NVPAIR); if(xml_top == NULL) { xml_top = xml_obj; } crm_xml_add(xml_obj, XML_ATTR_ID, attr_id); crm_xml_add(xml_obj, XML_NVPAIR_ATTR_NAME, attr_name); crm_xml_add(xml_obj, XML_NVPAIR_ATTR_VALUE, attr_value); crm_log_xml_debug(xml_top, "Update"); rc = cib->cmds->modify(cib, XML_CIB_TAG_RESOURCES, xml_top, cib_options); free_xml(xml_top); crm_free(local_attr_id); crm_free(local_attr_set); return rc; } static int delete_resource_attr( const char *rsc_id, const char *attr_set, const char *attr_id, const char *attr_name, cib_t *cib, pe_working_set_t *data_set) { xmlNode *xml_obj = NULL; int rc = cib_ok; char *local_attr_id = NULL; resource_t *rsc = find_rsc_or_clone(rsc_id, data_set); if(rsc == NULL) { return cib_NOTEXISTS; } rc = find_resource_attr( cib, XML_ATTR_ID, rsc_id, attr_set_type, attr_set, attr_id, attr_name, &local_attr_id); if(rc == cib_NOTEXISTS) { return cib_ok; } else if(rc != cib_ok) { return rc; } if(attr_id == NULL) { attr_id = local_attr_id; } xml_obj = create_xml_node(NULL, XML_CIB_TAG_NVPAIR); crm_xml_add(xml_obj, XML_ATTR_ID, attr_id); crm_xml_add(xml_obj, 
XML_NVPAIR_ATTR_NAME, attr_name); crm_log_xml_debug(xml_obj, "Delete"); rc = cib->cmds->delete(cib, XML_CIB_TAG_RESOURCES, xml_obj, cib_options); if(rc == cib_ok) { printf("Deleted %s option: id=%s%s%s%s%s\n", rsc_id, local_attr_id, attr_set?" set=":"", attr_set?attr_set:"", attr_name?" name=":"", attr_name?attr_name:""); } free_xml(xml_obj); crm_free(local_attr_id); return rc; } static int dump_resource_prop( const char *rsc, const char *attr, pe_working_set_t *data_set) { const char *value = NULL; resource_t *the_rsc = pe_find_resource(data_set->resources, rsc); if(the_rsc == NULL) { return cib_NOTEXISTS; } value = crm_element_value(the_rsc->xml, attr); if(value != NULL) { fprintf(stdout, "%s\n", value); return 0; } return cib_NOTEXISTS; } static void resource_ipc_connection_destroy(gpointer user_data) { crm_info("Connection to CRMd was terminated"); exit(1); } static gboolean crmd_msg_callback(IPC_Channel * server, void *private_data) { int lpc = 0; IPC_Message *msg = NULL; gboolean hack_return_good = TRUE; while (server->ch_status != IPC_DISCONNECT && server->ops->is_message_pending(server) == TRUE) { if (server->ops->recv(server, &msg) != IPC_OK) { perror("Receive failure:"); return !hack_return_good; } if (msg == NULL) { crm_debug_4("No message this time"); continue; } lpc++; msg->msg_done(msg); } if (server->ch_status == IPC_DISCONNECT) { crm_debug_2("admin_msg_callback: received HUP"); return !hack_return_good; } return hack_return_good; } static int send_lrm_rsc_op(IPC_Channel *crmd_channel, const char *op, const char *host_uname, const char *rsc_id, gboolean only_failed, pe_working_set_t *data_set) { char *key = NULL; int rc = cib_send_failed; xmlNode *cmd = NULL; xmlNode *xml_rsc = NULL; const char *value = NULL; xmlNode *params = NULL; xmlNode *msg_data = NULL; resource_t *rsc = pe_find_resource(data_set->resources, rsc_id); if(rsc == NULL) { CMD_ERR("Resource %s not found\n", rsc_id); return cib_NOTEXISTS; } else if(rsc->variant != pe_native) { CMD_ERR("We can only process primitive resources, not %s\n", rsc_id); return cib_invalid_argument; } else if(host_uname == NULL) { CMD_ERR("Please supply a hostname with -H\n"); return cib_invalid_argument; } key = crm_concat("0:0:crm-resource", our_pid, '-'); msg_data = create_xml_node(NULL, XML_GRAPH_TAG_RSC_OP); crm_xml_add(msg_data, XML_ATTR_TRANSITION_KEY, key); crm_free(key); xml_rsc = create_xml_node(msg_data, XML_CIB_TAG_RESOURCE); if(rsc->clone_name) { crm_xml_add(xml_rsc, XML_ATTR_ID, rsc->clone_name); crm_xml_add(xml_rsc, XML_ATTR_ID_LONG, rsc->id); } else { crm_xml_add(xml_rsc, XML_ATTR_ID, rsc->id); crm_xml_add(xml_rsc, XML_ATTR_ID_LONG, rsc->long_name); } value = crm_element_value(rsc->xml, XML_ATTR_TYPE); crm_xml_add(xml_rsc, XML_ATTR_TYPE, value); if(value == NULL) { CMD_ERR("%s has no type! Aborting...\n", rsc_id); return cib_NOTEXISTS; } value = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); crm_xml_add(xml_rsc, XML_AGENT_ATTR_CLASS, value); if(value == NULL) { CMD_ERR("%s has no class! 
Aborting...\n", rsc_id); return cib_NOTEXISTS; } value = crm_element_value(rsc->xml, XML_AGENT_ATTR_PROVIDER); crm_xml_add(xml_rsc, XML_AGENT_ATTR_PROVIDER, value); params = create_xml_node(msg_data, XML_TAG_ATTRS); crm_xml_add(params, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET); key = crm_meta_name(XML_LRM_ATTR_INTERVAL); crm_xml_add(params, key, "60000"); /* 1 minute */ crm_free(key); cmd = create_request(op, msg_data, host_uname, CRM_SYSTEM_CRMD, crm_system_name, our_pid); /* crm_log_xml_warn(cmd, "send_lrm_rsc_op"); */ free_xml(msg_data); if(send_ipc_message(crmd_channel, cmd)) { rc = 0; sleep(1); /* dont exit striaght away, give the crmd time * to process our request */ } else { CMD_ERR("Could not send %s op to the crmd", op); } free_xml(cmd); return rc; } static int delete_lrm_rsc(IPC_Channel *crmd_channel, const char *host_uname, const char *rsc_id, pe_working_set_t *data_set) { return send_lrm_rsc_op(crmd_channel, CRM_OP_LRM_DELETE, host_uname, rsc_id, TRUE, data_set); } static int fail_lrm_rsc(IPC_Channel *crmd_channel, const char *host_uname, const char *rsc_id, pe_working_set_t *data_set) { crm_warn("Failing: %s", rsc_id); return send_lrm_rsc_op(crmd_channel, CRM_OP_LRM_FAIL, host_uname, rsc_id, FALSE, data_set); } static int refresh_lrm(IPC_Channel *crmd_channel, const char *host_uname) { xmlNode *cmd = NULL; int rc = cib_send_failed; cmd = create_request(CRM_OP_LRM_REFRESH, NULL, host_uname, CRM_SYSTEM_CRMD, crm_system_name, our_pid); if(send_ipc_message(crmd_channel, cmd)) { rc = 0; } free_xml(cmd); return rc; } static int migrate_resource( const char *rsc_id, const char *existing_node, const char *preferred_node, cib_t * cib_conn) { char *later_s = NULL; enum cib_errors rc = cib_ok; char *id = NULL; xmlNode *rule = NULL; xmlNode *expr = NULL; xmlNode *constraints = NULL; xmlNode *fragment = NULL; xmlNode *can_run = NULL; xmlNode *dont_run = NULL; fragment = create_xml_node(NULL, XML_CIB_TAG_CONSTRAINTS); constraints = fragment; id = crm_concat("cli-prefer", rsc_id, '-'); can_run = create_xml_node(NULL, XML_CONS_TAG_RSC_LOCATION); crm_xml_add(can_run, XML_ATTR_ID, id); crm_free(id); id = crm_concat("cli-standby", rsc_id, '-'); dont_run = create_xml_node(NULL, XML_CONS_TAG_RSC_LOCATION); crm_xml_add(dont_run, XML_ATTR_ID, id); crm_free(id); if(migrate_lifetime) { char *life = crm_strdup(migrate_lifetime); char *life_mutable = life; ha_time_t *now = NULL; ha_time_t *later = NULL; ha_time_t *duration = parse_time_duration(&life_mutable); if(duration == NULL) { CMD_ERR("Invalid duration specified: %s\n", migrate_lifetime); CMD_ERR("Please refer to" " http://en.wikipedia.org/wiki/ISO_8601#Duration" " for examples of valid durations\n"); crm_free(life); return cib_invalid_argument; } now = new_ha_date(TRUE); later = add_time(now, duration); log_date(LOG_INFO, "now ", now, ha_log_date|ha_log_time); log_date(LOG_INFO, "later ", later, ha_log_date|ha_log_time); log_date(LOG_INFO, "duration", duration, ha_log_date|ha_log_time|ha_log_local); later_s = date_to_string(later, ha_log_date|ha_log_time); printf("Migration will take effect until: %s\n", later_s); free_ha_date(duration); free_ha_date(later); free_ha_date(now); crm_free(life); } if(existing_node == NULL) { crm_log_xml_notice(can_run, "Deleting"); rc = cib_conn->cmds->delete( cib_conn, XML_CIB_TAG_CONSTRAINTS, dont_run, cib_options); if(rc == cib_NOTEXISTS) { rc = cib_ok; } else if(rc != cib_ok) { goto bail; } } else { if(BE_QUIET == FALSE) { fprintf(stderr, "WARNING: Creating rsc_location constraint '%s'" " with a score of -INFINITY 
for resource %s" " on %s.\n", ID(dont_run), rsc_id, existing_node); CMD_ERR("\tThis will prevent %s from running" " on %s until the constraint is removed using" " the 'crm_resource -U' command or manually" " with cibadmin\n", rsc_id, existing_node); CMD_ERR("\tThis will be the case even if %s is" " the last node in the cluster\n", existing_node); CMD_ERR("\tThis message can be disabled with -Q\n"); } crm_xml_add(dont_run, "rsc", rsc_id); rule = create_xml_node(dont_run, XML_TAG_RULE); expr = create_xml_node(rule, XML_TAG_EXPRESSION); id = crm_concat("cli-standby-rule", rsc_id, '-'); crm_xml_add(rule, XML_ATTR_ID, id); crm_free(id); crm_xml_add(rule, XML_RULE_ATTR_SCORE, MINUS_INFINITY_S); crm_xml_add(rule, XML_RULE_ATTR_BOOLEAN_OP, "and"); id = crm_concat("cli-standby-expr", rsc_id, '-'); crm_xml_add(expr, XML_ATTR_ID, id); crm_free(id); crm_xml_add(expr, XML_EXPR_ATTR_ATTRIBUTE, "#uname"); crm_xml_add(expr, XML_EXPR_ATTR_OPERATION, "eq"); crm_xml_add(expr, XML_EXPR_ATTR_VALUE, existing_node); crm_xml_add(expr, XML_EXPR_ATTR_TYPE, "string"); if(later_s) { expr = create_xml_node(rule, "date_expression"); id = crm_concat("cli-standby-lifetime-end",rsc_id,'-'); crm_xml_add(expr, XML_ATTR_ID, id); crm_free(id); crm_xml_add(expr, "operation", "lt"); crm_xml_add(expr, "end", later_s); } add_node_copy(constraints, dont_run); } if(preferred_node == NULL) { crm_log_xml_notice(can_run, "Deleting"); rc = cib_conn->cmds->delete( cib_conn, XML_CIB_TAG_CONSTRAINTS, can_run, cib_options); if(rc == cib_NOTEXISTS) { rc = cib_ok; } else if(rc != cib_ok) { goto bail; } } else { crm_xml_add(can_run, "rsc", rsc_id); rule = create_xml_node(can_run, XML_TAG_RULE); expr = create_xml_node(rule, XML_TAG_EXPRESSION); id = crm_concat("cli-prefer-rule", rsc_id, '-'); crm_xml_add(rule, XML_ATTR_ID, id); crm_free(id); crm_xml_add(rule, XML_RULE_ATTR_SCORE, INFINITY_S); crm_xml_add(rule, XML_RULE_ATTR_BOOLEAN_OP, "and"); id = crm_concat("cli-prefer-expr", rsc_id, '-'); crm_xml_add(expr, XML_ATTR_ID, id); crm_free(id); crm_xml_add(expr, XML_EXPR_ATTR_ATTRIBUTE, "#uname"); crm_xml_add(expr, XML_EXPR_ATTR_OPERATION, "eq"); crm_xml_add(expr, XML_EXPR_ATTR_VALUE, preferred_node); crm_xml_add(expr, XML_EXPR_ATTR_TYPE, "string"); if(later_s) { expr = create_xml_node(rule, "date_expression"); id = crm_concat("cli-prefer-lifetime-end", rsc_id, '-'); crm_xml_add(expr, XML_ATTR_ID, id); crm_free(id); crm_xml_add(expr, "operation", "lt"); crm_xml_add(expr, "end", later_s); } add_node_copy(constraints, can_run); } if(preferred_node != NULL || existing_node != NULL) { crm_log_xml_notice(fragment, "CLI Update"); rc = cib_conn->cmds->update( cib_conn, XML_CIB_TAG_CONSTRAINTS, fragment, cib_options); } bail: free_xml(fragment); free_xml(dont_run); free_xml(can_run); crm_free(later_s); return rc; } static int list_resource_operations( const char *rsc_id, const char *host_uname, gboolean active, pe_working_set_t *data_set) { resource_t *rsc = NULL; int opts = pe_print_printf|pe_print_rsconly|pe_print_suppres_nl; GListPtr ops = find_operations(rsc_id, host_uname, active, data_set); slist_iter(xml_op, xmlNode, ops, lpc, const char *op_rsc = crm_element_value(xml_op, "resource"); const char *last = crm_element_value(xml_op, "last_run"); const char *status_s = crm_element_value(xml_op, XML_LRM_ATTR_OPSTATUS); int status = crm_parse_int(status_s, "0"); rsc = pe_find_resource(data_set->resources, op_rsc); rsc->fns->print(rsc, "", opts, stdout); fprintf(stdout, ": %s (node=%s, call=%s, rc=%s", ID(xml_op), crm_element_value(xml_op, 
XML_ATTR_UNAME), crm_element_value(xml_op, XML_LRM_ATTR_CALLID), crm_element_value(xml_op, XML_LRM_ATTR_RC)); if(last) { time_t run_at = crm_parse_int(last, "0"); fprintf(stdout, ", last-run=%s, exec=%sms\n", ctime(&run_at), crm_element_value(xml_op, "exec_time")); } fprintf(stdout, "): %s\n", op_status2text(status)); ); return cib_ok; } #include "../pengine/pengine.h" static void show_location(resource_t *rsc) { GListPtr list = rsc->rsc_location; slist_iter(cons, rsc_to_node_t, list, lpc, slist_iter(node, node_t, cons->node_list_rh, lpc2, fprintf(stdout, "+ '%s': %s = %s \n", cons->id, node->details->uname, score2char(node->weight)); ); ); } static void show_colocation(resource_t *rsc, gboolean dependants, gboolean raw) { const char *prefix = " "; GListPtr list = rsc->rsc_cons; if(dependants) { prefix = " "; list = rsc->rsc_cons_lhs; } if(is_set(rsc->flags, pe_rsc_allocating)) { /* Break colocation loops */ return; } set_bit(rsc->flags, pe_rsc_allocating); slist_iter(cons, rsc_colocation_t, list, lpc, resource_t *peer = cons->rsc_rh; if(dependants) { peer = cons->rsc_lh; } if(raw) { fprintf(stdout, "%s '%s': %s = %s\n", prefix, cons->id, peer->id, score2char(cons->score)); continue; } if(dependants) { if(is_set(peer->flags, pe_rsc_allocating)) { continue; } show_colocation(peer, dependants, raw); } fprintf(stdout, "%s%s%s\n", prefix, peer->id, is_set(peer->flags, pe_rsc_allocating)?" (loop) ":""); if(!dependants) { show_colocation(peer, dependants, raw); } ); clear_bit(rsc->flags, pe_rsc_allocating); } static struct crm_option long_options[] = { /* Top-level Options */ {"help", 0, 0, '?', "\t\tThis text"}, {"version", 0, 0, '$', "\t\tVersion information" }, {"verbose", 0, 0, 'V', "\t\tIncrease debug output"}, {"quiet", 0, 0, 'Q', "\t\tPrint only the value on stdout\n"}, {"resource", 1, 0, 'r', "\tResource ID" }, {"-spacer-",1, 0, '-', "\nQueries:"}, {"list", 0, 0, 'L', "\t\tList all resources"}, {"list-raw", 0, 0, 'l', "\tList the IDs of all instansiated resources (no groups/clones/...)"}, {"list-cts", 0, 0, 'c', NULL, 1}, {"list-operations", 0, 0, 'O', "\tList active resource operations. Optionally filtered by resource (-r) and/or node (-N)"}, {"list-all-operations", 0, 0, 'o', "List all resource operations. Optionally filtered by resource (-r) and/or node (-N)\n"}, {"query-xml", 0, 0, 'q', "\tQuery the definition of a resource"}, {"locate", 0, 0, 'W', "\t\tDisplay the current location(s) of a resource"}, {"stack", 0, 0, 'A', "\t\tDisplay the pre-requisits and depandants of a resource"}, {"constraints",0, 0, 'a', "\tDisplay the (co)location constraints that apply to a resource"}, {"-spacer-", 1, 0, '-', "\nCommands:"}, {"set-parameter", 1, 0, 'p', "Set the named parameter for a resource. See also -m, --meta"}, {"get-parameter", 1, 0, 'g', "Display the named parameter for a resource. See also -m, --meta"}, {"delete-parameter",1, 0, 'd', "Delete the named parameter for a resource. 
See also -m, --meta"}, {"get-property", 1, 0, 'G', "Display the 'class', 'type' or 'provider' of a resource", 1}, {"set-property", 1, 0, 'S', "(Advanced) Set the class, type or provider of a resource", 1}, {"migrate", 0, 0, 'M', "\t\tMigrate a resource from its current location, optionally specifying a destination (-N) and/or a period for which it should take effect (-u)" "\n\t\t\t\tIf -N is not specified, the cluster will force the resource to move by creating a rule for the current location and a score of -INFINITY" "\n\t\t\t\tNOTE: This will prevent the resource from running on this node until the constraint is removed with -U"}, {"un-migrate", 0, 0, 'U', "\tRemove all constraints created by a migrate command"}, {"-spacer-", 1, 0, '-', "\nAdvanced Commands:"}, {"delete", 0, 0, 'D', "\t\tDelete a resource from the CIB"}, {"fail", 0, 0, 'F', "\t\tTell the cluster this resource has failed"}, {"refresh", 0, 0, 'R', "\t\t(Advanced) Refresh the CIB from the LRM"}, {"cleanup", 0, 0, 'C', "\t\t(Advanced) Delete a resource from the LRM"}, {"reprobe", 0, 0, 'P', "\t\t(Advanced) Re-check for resources started outside of the CRM\n"}, {"-spacer-", 1, 0, '-', "\nAdditional Options:"}, {"node", 1, 0, 'N', "\tHost uname"}, {"resource-type", 1, 0, 't', "Resource type (primitive, clone, group, ...)"}, {"parameter-value", 1, 0, 'v', "Value to use with -p, -g or -d"}, {"lifetime", 1, 0, 'u', "\tLifespan of migration constraints\n"}, {"meta", 0, 0, 'm', "\t\tModify a resource's configuration option rather than one which is passed to the resource agent script. For use with -p, -g, -d"}, {"set-name", 1, 0, 's', "\t(Advanced) ID of the instance_attributes object to change"}, {"nvpair", 1, 0, 'i', "\t(Advanced) ID of the nvpair object to change/delete"}, {"force", 0, 0, 'f', "\n" /* Is this actually true anymore? 
"\t\tForce the resource to move by creating a rule for the current location and a score of -INFINITY" "\n\t\tThis should be used if the resource's stickiness and constraint scores total more than INFINITY (Currently 100,000)" "\n\t\tNOTE: This will prevent the resource from running on this node until the constraint is removed with -U or the --lifetime duration expires\n"*/ }, {"xml-file", 1, 0, 'x', NULL, 1},\ /* legacy options */ {"host-uname", 1, 0, 'H', NULL, 1}, {"-spacer-", 1, 0, '-', "\nExamples:", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', "List the configured resources:", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', " crm_resource --list", pcmk_option_example}, {"-spacer-", 1, 0, '-', "Display the current location of 'myResource':", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', " crm_resource --resource myResource --locate", pcmk_option_example}, {"-spacer-", 1, 0, '-', "Migrate 'myResource' to another machine:", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', " crm_resource --resource myResource --migrate", pcmk_option_example}, {"-spacer-", 1, 0, '-', "Migrate 'myResource' to a specific machine:", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', " crm_resource --resource myResource --migrate --node altNode", pcmk_option_example}, {"-spacer-", 1, 0, '-', "Allow (but not force) 'myResource' to migrate back to its original location:", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', " crm_resource --resource myResource --un-migrate", pcmk_option_example}, {"-spacer-", 1, 0, '-', "Tell the cluster that 'myResource' failed:", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', " crm_resource --resource myResource --fail", pcmk_option_example}, {"-spacer-", 1, 0, '-', "Stop a 'myResource' (and anything that depends on it):", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', " crm_resource --resource myResource --set-parameter target-role --meta --parameter-value Stopped", pcmk_option_example}, {"-spacer-", 1, 0, '-', "Tell the cluster not to manage 'myResource':", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', "The cluster will not attempt to start or stop the resource under any circumstances."}, {"-spacer-", 1, 0, '-', "Useful when performing maintenance tasks on a resource.", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', " crm_resource --resource myResource --set-parameter is-managed --meta --parameter-value false", pcmk_option_example}, {"-spacer-", 1, 0, '-', "Erase the operation history of 'myResource' on 'aNode':", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', "The cluster will 'forget' the existing resource state (including any errors) and attempt to recover the resource."}, {"-spacer-", 1, 0, '-', "Useful when a resource had failed permanently and has been repaired by an administrator.", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', " crm_resource --resource myResource --cleanup --node aNode", pcmk_option_example}, {0, 0, 0, 0} }; int main(int argc, char **argv) { pe_working_set_t data_set; xmlNode *cib_xml_copy = NULL; cib_t * cib_conn = NULL; enum cib_errors rc = cib_ok; gboolean need_cib = TRUE; int option_index = 0; int argerr = 0; int flag; crm_log_init(basename(argv[0]), LOG_ERR, FALSE, FALSE, argc, argv); crm_set_options("V?$LRQxDCPp:WMUr:H:h:v:t:p:g:d:i:s:G:S:fx:lmu:FOocqN:aA", "(query|command) [options]", long_options, "Perform tasks related to cluster resources.\n Allows resources to be queried (definition and location), modified, and moved around the cluster.\n"); if(argc < 2) { crm_help('?', LSB_EXIT_EINVAL); } while (1) { flag = crm_get_option(argc, argv, 
                          &option_index);
        if (flag == -1)
            break;

        switch(flag) {
            case 'V':
                cl_log_enable_stderr(TRUE);
                alter_debug(DEBUG_INC);
                break;
            case '$':
            case '?':
                crm_help(flag, LSB_EXIT_OK);
                break;
            case 'x':
                xml_file = crm_strdup(optarg);
                break;
            case 'Q':
                BE_QUIET = TRUE;
                break;
            case 'm':
                attr_set_type = XML_TAG_META_SETS;
                break;
            case 'u':
                migrate_lifetime = crm_strdup(optarg);
                break;
            case 'f':
                do_force = TRUE;
                break;
            case 'i':
                prop_id = optarg;
                break;
            case 's':
                prop_set = optarg;
                break;
            case 'r':
                rsc_id = optarg;
                break;
            case 'v':
                prop_value = optarg;
                break;
            case 't':
                rsc_type = optarg;
                break;
            case 'R':
            case 'P':
                need_cib = FALSE;
                rsc_cmd = flag;
                break;
            case 'L':
            case 'c':
            case 'l':
            case 'q':
            case 'D':
            case 'F':
            case 'C':
            case 'W':
            case 'M':
            case 'U':
            case 'O':
            case 'o':
            case 'A':
            case 'a':
                rsc_cmd = flag;
                break;
            case 'p':
            case 'g':
            case 'd':
            case 'S':
            case 'G':
                prop_name = optarg;
                rsc_cmd = flag;
                break;
            case 'h':
            case 'H':
            case 'N':
                crm_debug_2("Option %c => %s", flag, optarg);
                host_uname = optarg;
                break;
            default:
                CMD_ERR("Argument code 0%o (%c) is not (?yet?) supported\n", flag, flag);
                ++argerr;
                break;
        }
    }

    if (optind < argc && argv[optind] != NULL) {
        CMD_ERR("non-option ARGV-elements: ");
        while (optind < argc && argv[optind] != NULL) {
            CMD_ERR("%s ", argv[optind++]);
            ++argerr;
        }
        CMD_ERR("\n");
    }

    if (optind > argc) {
        ++argerr;
    }

    if (argerr) {
        crm_help('?', LSB_EXIT_GENERIC);
    }

    crm_malloc0(our_pid, 11);
    if(our_pid != NULL) {
        snprintf(our_pid, 10, "%d", getpid());
        our_pid[10] = '\0';
    }

    if(do_force) {
        crm_debug("Forcing...");
        cib_options |= cib_scope_local|cib_quorum_override;
    }

    if(need_cib) {
        resource_t *rsc = NULL;
        if(xml_file != NULL) {
            cib_xml_copy = filename2xml(xml_file);

        } else {
            cib_conn = cib_new();
            rc = cib_conn->cmds->signon(cib_conn, crm_system_name, cib_command);
            if(rc != cib_ok) {
                CMD_ERR("Error signing on to the CIB service: %s\n", cib_error2string(rc));
                return rc;
            }
            cib_xml_copy = get_cib_copy(cib_conn);
        }

        set_working_set_defaults(&data_set);
        if(cli_config_update(&cib_xml_copy, NULL) == FALSE) {
            return cib_STALE;
        }
        data_set.input = cib_xml_copy;
        data_set.now = new_ha_date(TRUE);

        cluster_status(&data_set);
        if(rsc_id) {
            rsc = find_rsc_or_clone(rsc_id, &data_set);
        }
        if(rsc == NULL) {
            rc = cib_NOTEXISTS;
        }
    }

    if(rsc_cmd == 'R' || rsc_cmd == 'C' || rsc_cmd == 'F' || rsc_cmd == 'P') {
        GCHSource *src = NULL;
        src = init_client_ipc_comms(CRM_SYSTEM_CRMD, crmd_msg_callback, NULL, &crmd_channel);
        if(src == NULL) {
            CMD_ERR("Error signing on to the CRMd service\n");
            return 1;
        }

        send_hello_message(crmd_channel, our_pid, crm_system_name, "0", "1");

        set_IPC_Channel_dnotify(src, resource_ipc_connection_destroy);
    }

    if(rsc_cmd == 'L') {
        rc = cib_ok;
        do_find_resource_list(&data_set, FALSE);

    } else if(rsc_cmd == 'l') {
        int found = 0;
        rc = cib_ok;
        slist_iter(
            rsc, resource_t, data_set.resources, lpc,
            found++;
            print_raw_rsc(rsc);
            );

        if(found == 0) {
            printf("NO resources configured\n");
            return cib_NOTEXISTS;
        }

    } else if(rsc_cmd == 'A') {
        resource_t *rsc = pe_find_resource(data_set.resources, rsc_id);
        xmlNode *cib_constraints = get_object_root(XML_CIB_TAG_CONSTRAINTS, data_set.input);
        unpack_constraints(cib_constraints, &data_set);

        show_colocation(rsc, TRUE, FALSE);
        fprintf(stdout, "* %s\n", rsc->id);
        show_colocation(rsc, FALSE, FALSE);

    } else if(rsc_cmd == 'a') {
        resource_t *rsc = pe_find_resource(data_set.resources, rsc_id);
        xmlNode *cib_constraints = get_object_root(XML_CIB_TAG_CONSTRAINTS, data_set.input);
        unpack_constraints(cib_constraints, &data_set);

        show_colocation(rsc, TRUE, TRUE);
        fprintf(stdout, "* %s\n", rsc->id);
        show_colocation(rsc, FALSE, TRUE);

        show_location(rsc);

    } else if(rsc_cmd == 'c') {
        int found = 0;
        rc = cib_ok;
        slist_iter(
            rsc, resource_t, data_set.resources, lpc,
            found++;
            print_cts_rsc(rsc);
            );
        print_cts_constraints(&data_set);

    } else if(rsc_cmd == 'C') {
        resource_t *rsc = pe_find_resource(data_set.resources, rsc_id);
        if(rsc && rsc->variant != pe_native) {
            fprintf(stderr, "We can only clean up primitive resources and %s is a %s\n",
                    rsc_id, get_resource_typename(rsc->variant));
            rc = cib_NOTEXISTS;

        } else {
            rc = delete_lrm_rsc(crmd_channel, host_uname, rsc_id, &data_set);
        }

        if(rc == cib_ok) {
            char *host_uuid = NULL;
            char *attr_name = crm_concat("fail-count", rsc_id, '-');
            rc = query_node_uuid(cib_conn, host_uname, &host_uuid);

            if(rc != cib_ok) {
                fprintf(stderr, "Could not map uname=%s to a UUID: %s\n",
                        host_uname, cib_error2string(rc));

            } else {
                crm_info("Mapped %s to %s", host_uname, crm_str(host_uuid));
                rc = delete_attr(cib_conn, cib_sync_call, XML_CIB_TAG_STATUS, host_uuid,
                                 NULL, NULL, attr_name, NULL, FALSE);
            }
        }

    } else if(rsc_cmd == 'F') {
        rc = fail_lrm_rsc(crmd_channel, host_uname, rsc_id, &data_set);

    } else if(rsc_cmd == 'O') {
        rc = list_resource_operations(rsc_id, host_uname, TRUE, &data_set);

    } else if(rsc_cmd == 'o') {
        rc = list_resource_operations(rsc_id, host_uname, FALSE, &data_set);

    } else if(rc == cib_NOTEXISTS) {
        CMD_ERR("Resource %s not found: %s\n", crm_str(rsc_id), cib_error2string(rc));

    } else if(rsc_cmd == 'W') {
        if(rsc_id == NULL) {
            CMD_ERR("Must supply a resource id with -r\n");
            return cib_NOTEXISTS;
        }
        rc = do_find_resource(rsc_id, &data_set);

    } else if(rsc_cmd == 'q') {
        if(rsc_id == NULL) {
            CMD_ERR("Must supply a resource id with -r\n");
            return cib_NOTEXISTS;
        }
        rc = dump_resource(rsc_id, &data_set);

    } else if(rsc_cmd == 'U') {
        if(rsc_id == NULL) {
            CMD_ERR("Must supply a resource id with -r\n");
            return cib_NOTEXISTS;
        }
        rc = migrate_resource(rsc_id, NULL, NULL, cib_conn);

    } else if(rsc_cmd == 'M') {
        node_t *dest = NULL;
        node_t *current = NULL;
        const char *current_uname = NULL;
        resource_t *rsc = pe_find_resource(data_set.resources, rsc_id);
        if(rsc != NULL && rsc->running_on != NULL) {
            current = rsc->running_on->data;
            if(current != NULL) {
                current_uname = current->details->uname;
            }
        }

        if(host_uname != NULL) {
            dest = pe_find_node(data_set.nodes, host_uname);
        }

        if(rsc == NULL) {
            CMD_ERR("Resource %s not migrated: not found\n", rsc_id);

        } else if(rsc->variant == pe_native
                  && g_list_length(rsc->running_on) > 1) {
            CMD_ERR("Resource %s not migrated: active on multiple nodes\n", rsc_id);

        } else if(host_uname != NULL && dest == NULL) {
            CMD_ERR("Error performing operation: %s is not a known node\n", host_uname);
            rc = cib_NOTEXISTS;

        } else if(host_uname != NULL
                  && safe_str_eq(current_uname, host_uname)) {
            CMD_ERR("Error performing operation: %s is already active on %s\n",
                    rsc_id, host_uname);

        } else if(current_uname != NULL
                  && (do_force || host_uname == NULL)) {
            rc = migrate_resource(rsc_id, current_uname, host_uname, cib_conn);

        } else if(host_uname != NULL) {
            rc = migrate_resource(rsc_id, NULL, host_uname, cib_conn);

        } else {
            CMD_ERR("Resource %s not migrated: not-active and no prefered location specified.\n",
                    rsc_id);
            rc = cib_missing;
        }

    } else if(rsc_cmd == 'G') {
        if(rsc_id == NULL) {
            CMD_ERR("Must supply a resource id with -r\n");
            return cib_NOTEXISTS;
        }
        rc = dump_resource_prop(rsc_id, prop_name, &data_set);

    } else if(rsc_cmd == 'S') {
        xmlNode *msg_data = NULL;
        if(prop_value == NULL || strlen(prop_value) == 0) {
            CMD_ERR("You need to supply a value with the -v option\n");
            return CIBRES_MISSING_FIELD;

        } else if(cib_conn == NULL) {
            return cib_connection;
        }

        if(rsc_id == NULL) {
            CMD_ERR("Must supply a resource id with -r\n");
            return cib_NOTEXISTS;
        }
        CRM_DEV_ASSERT(rsc_type != NULL);
        CRM_DEV_ASSERT(prop_name != NULL);
        CRM_DEV_ASSERT(prop_value != NULL);

        msg_data = create_xml_node(NULL, rsc_type);
        crm_xml_add(msg_data, XML_ATTR_ID, rsc_id);
        crm_xml_add(msg_data, prop_name, prop_value);

        rc = cib_conn->cmds->modify(cib_conn, XML_CIB_TAG_RESOURCES, msg_data, cib_options);
        free_xml(msg_data);

    } else if(rsc_cmd == 'g') {
        if(rsc_id == NULL) {
            CMD_ERR("Must supply a resource id with -r\n");
            return cib_NOTEXISTS;
        }
        rc = dump_resource_attr(rsc_id, prop_name, &data_set);

    } else if(rsc_cmd == 'p') {
        if(rsc_id == NULL) {
            CMD_ERR("Must supply a resource id with -r\n");
            return cib_NOTEXISTS;
        }
        if(prop_value == NULL || strlen(prop_value) == 0) {
            CMD_ERR("You need to supply a value with the -v option\n");
            return CIBRES_MISSING_FIELD;
        }
        rc = set_resource_attr(rsc_id, prop_set, prop_id, prop_name, prop_value,
                               cib_conn, &data_set);

    } else if(rsc_cmd == 'd') {
        if(rsc_id == NULL) {
            CMD_ERR("Must supply a resource id with -r\n");
            return cib_NOTEXISTS;
        }
        rc = delete_resource_attr(rsc_id, prop_set, prop_id, prop_name,
                                  cib_conn, &data_set);

    } else if(rsc_cmd == 'P') {
        xmlNode *cmd = NULL;
        cmd = create_request(CRM_OP_REPROBE, NULL, host_uname,
                             CRM_SYSTEM_CRMD, crm_system_name, our_pid);
        send_ipc_message(crmd_channel, cmd);
        free_xml(cmd);

    } else if(rsc_cmd == 'R') {
        refresh_lrm(crmd_channel, host_uname);

    } else if(rsc_cmd == 'D') {
        xmlNode *msg_data = NULL;
        if(rsc_id == NULL) {
            CMD_ERR("Must supply a resource id with -r\n");
            return cib_NOTEXISTS;
        }
        if(rsc_type == NULL) {
            CMD_ERR("You need to specify a resource type with -t");
            return cib_NOTEXISTS;

        } else if(cib_conn == NULL) {
            return cib_connection;
        }

        msg_data = create_xml_node(NULL, rsc_type);
        crm_xml_add(msg_data, XML_ATTR_ID, rsc_id);

        rc = cib_conn->cmds->delete(cib_conn, XML_CIB_TAG_RESOURCES, msg_data, cib_options);
        free_xml(msg_data);

    } else {
        CMD_ERR("Unknown command: %c\n", rsc_cmd);
    }

    if(cib_conn != NULL) {
        cleanup_calculations(&data_set);
        cib_conn->cmds->signoff(cib_conn);
    }

    if(rc == cib_no_quorum) {
        CMD_ERR("Error performing operation: %s\n", cib_error2string(rc));
        CMD_ERR("Try using -f\n");

    } else if(rc != cib_ok) {
        CMD_ERR("Error performing operation: %s\n", cib_error2string(rc));
    }

    return rc;
}