diff --git a/cts/CIB.py b/cts/CIB.py index 7a3cbb6a38..7cf325bd98 100644 --- a/cts/CIB.py +++ b/cts/CIB.py @@ -1,522 +1,546 @@ '''CTS: Cluster Testing System: CIB generator ''' __copyright__=''' Author: Andrew Beekhof Copyright (C) 2008 Andrew Beekhof ''' from UserDict import UserDict import sys, time, types, syslog, os, struct, string, signal, traceback, warnings from CTSvars import * from CTS import ClusterManager -from popen2 import Popen3 class CibBase: cts_cib = None cib_tmpfile = None version = "unknown" feature_set = "unknown" target = "localhost" def __init__(self, CM, tmpfile=None): self.CM = CM #self.target = self.CM.Env["nodes"][0] if not tmpfile: warnings.filterwarnings("ignore") self.cib_tmpfile=os.tmpnam() warnings.resetwarnings() else: self.cib_tmpfile = tmpfile def version(self): return self.version def NextIP(self): fields = string.split(self.CM.Env["IPBase"], '.') fields[3] = str(int(fields[3])+1) ip = string.join(fields, '.') self.CM.Env["IPBase"] = ip return ip class CIB06(CibBase): version = "transitional-0.6" coloc_template = """""" cib_template =''' %s %s %s ''' cib_option_template = ''' ''' lsb_resource = ''' ''' clustermon_location_constraint = ''' ''' resource_group_template = '''%s %s %s''' per_node_constraint_template = ''' ''' pingd_constraint_template = ''' ''' dummy_resource_template = ''' ''' clustermon_resource_template = ''' ''' master_slave_resource = ''' ''' pingd_resource_template = """ """ stonith_resource_template = """ """ bsc_template = ''' ''' def NewIP(self, name=None): template = ''' ''' ip = self.NextIP() if not name: name = "r"+ip return template % (name, name, name, name, name, ip) def NewHBIP(self, name=None): template = ''' ''' ip = self.NextIP() if not name: name = "r"+ip return template % (name, name, name, name, ip) def NewDummy(self, name): return self.dummy_resource_template % (name, name, name, name) - def contents(self): + def install(self, target): + self.CM.rsh("localhost", "echo \'" + self.contents() + "\' > " + self.cib_tmpfile) + rc = self.CM.rsh.cp(cib_file, "root@%s:%s/cib.xml" + (target, CTSvars.CRM_CONFIG_DIR)) + if rc != 0: + raise ValueError("Can not copy %s to %s (%d)"%(self.cib_tmpfile, target, rc)) + + self.CM.rsh(target, "chown "+CTSvars.CRM_DAEMON_USER+" "+CTSvars.CRM_CONFIG_DIR+"/cib.xml") + self.CM.rsh("localhost", "rm -f "+self.cib_tmpfile) + + def contents(self, target=None): # fencing resource if self.cts_cib: - return self.cts_cib + return self.cts_cib nodelist = "" num_nodes = 0 for node in self.CM.Env["nodes"]: nodelist += node + " " num_nodes = num_nodes + 1 no_quorum = "stop" if num_nodes < 3: no_quorum = "ignore" self.CM.debug("Cluster only has %d nodes, ignoring quorum" % num_nodes) #make up crm config cib_options = self.cib_option_template % (self.CM.Env["DoFencing"], no_quorum) #create resources and their constraints resources = "" constraints = "" if self.CM.Env["DoBSC"] == 1: cib_options = cib_options + self.bsc_template if self.CM.Env["CIBResource"] != 1: # generate cib self.cts_cib = self.cib_template % (cib_options, resources, constraints) return self.cts_cib if self.CM.cluster_monitor == 1: resources += self.clustermon_resource_template constraints += self.clustermon_location_constraint ip1_rsc = self.NewIP() ip2_rsc = self.NewHBIP() ip3_rsc = self.NewIP() resources += self.resource_group_template % (ip1_rsc, ip2_rsc, ip3_rsc) # lsb resource resources += self.lsb_resource # Mirgator resources += self.NewDummy("migrator") constraints += self.coloc_template % ("group-with-master", "group-1", "master-1", "Master", "INFINITY") constraints += self.coloc_template % ("lsb-with-group", "lsb_dummy", "group-1", "Started", "INFINITY") # per node resource for node in self.CM.Env["nodes"]: per_node_resources = self.NewIP("rsc_"+node) per_node_constraint = self.per_node_constraint_template % (node, "rsc_"+node, node) resources += per_node_resources constraints += per_node_constraint # Ping the test master resources += self.pingd_resource_template % os.uname()[1] # Require conectivity to run constraints += self.pingd_constraint_template % ("master-1", "master-1", "m", "Started", "m", 1) if self.CM.Env["DoFencing"]: p_name = None p_value = None entries = string.split(self.CM.Env["stonith-params"], ',') for entry in entries: (p_name, p_value) = string.split(entry, '=') if p_name == "hostlist" and p_value == "all": p_value = string.join(self.CM.Env["nodes"], " ") stonith_resource = self.stonith_resource_template % (self.CM.Env["stonith-type"], p_name, p_value) resources += stonith_resource #master slave resource resources += self.master_slave_resource % (num_nodes, 1, 1, 1) # generate cib self.cts_cib = self.cib_template % (cib_options, resources, constraints) return self.cts_cib class CIB10(CibBase): feature_set = "3.0" version = "pacemaker-1.0" cib_template = ''' ''' def _create(self, command): fixed = "CIB_file="+self.cib_tmpfile+" crm configure " + command rc = self.CM.rsh(self.target, fixed) if rc != 0: self.CM.log("Configure call failed: "+fixed) sys.exit(1) def _show(self, command=""): output = "" - p = Popen3("CIB_file="+self.cib_tmpfile+" crm configure show "+command, None) - p.tochild.close() - result = p.fromchild.readlines() - p.fromchild.close() - self.lastrc = p.wait() + (rc, result) = self.CM.rsh(self.target, "CIB_file="+self.cib_tmpfile+" crm configure show "+command, None, ) for line in result: output += line self.CM.debug("Generated Config: "+line) return output def NewIP(self, name=None, standard="ocf:heartbeat"): ip = self.NextIP() if not name: name = "r"+ip if not standard: standard = "" else: standard += ":" self._create('''primitive %s %sIPaddr params ip=%s cidr_netmask=32 op monitor interval=5s''' % (name, standard, ip)) return name + def install(self, target): + old = self.cib_tmpfile + + self.cts_cib = None + self.target = target + self.cib_tmpfile = CTSvars.CRM_CONFIG_DIR+"/cib.xml" + + self.contents() + self.CM.rsh(self.target, "chown "+CTSvars.CRM_DAEMON_USER+" "+self.cib_tmpfile) + + self.cib_tmpfile = old + def contents(self): # fencing resource if self.cts_cib: + if target: + self.CM.log("NOT IMPLEMENTED") return self.cts_cib + + if self.target == "localhost": + self.target = self.CM.Env["nodes"][0] - self.CM.rsh(self.target, "rm -f "+self.cib_tmpfile) cib_base = self.cib_template % (self.feature_set, self.version, ''' remote-tls-port='9898' ''') self.CM.rsh(self.target, '''echo "%s" > %s''' % (cib_base, self.cib_tmpfile)) + #self.CM.rsh.cp(self.cib_tmpfile, "root@%s:%s" % (self.target, self.cib_tmpfile)) nodelist = "" self.num_nodes = 0 for node in self.CM.Env["nodes"]: nodelist += node + " " self.num_nodes = self.num_nodes + 1 no_quorum = "stop" if self.num_nodes < 3: no_quorum = "ignore" self.CM.debug("Cluster only has %d nodes, ignoring quorum" % self.num_nodes) self._create('''property start-failure-is-fatal=false pe-input-series-max=5000''') self._create('''property shutdown-escalation=5min startup-fencing=false batch-limit=10''') self._create('''property no-quorum-policy=%s stonith-enabled=%s''' % (no_quorum, self.CM.Env["DoFencing"])) self._create('''property expected-quorum-votes=%d''' % self.num_nodes) if self.CM.Env["DoBSC"] == 1: self._create('''property ident-string="Linux-HA TEST configuration file - REMOVEME!!"''') # Add resources? if self.CM.Env["CIBResource"] == 1: self.add_resources() # Fencing resource if self.CM.Env["DoFencing"]: params = None entries = string.split(self.CM.Env["stonith-params"], ',') for entry in entries: (name, value) = string.split(entry, '=') if name == "hostlist" and value == "all": value = string.join(self.CM.Env["nodes"], " ") if params: - params = ("""%s %s="%s" """ % (params, name, value)) + params = ("""%s '%s="%s"' """ % (params, name, value)) else: - params = ("""%s="%s" """ % (name, value)) + params = ("""'%s="%s"' """ % (name, value)) if params: params = "params %s" % params else: params = "" self._create('''primitive FencingChild stonith::%s %s livedangerously=yes op monitor interval=120s timeout=300 op start interval=0 timeout=180s op stop interval=0 timeout=180s''' % (self.CM.Env["stonith-type"], params)) # Set a threshold for unreliable stonith devices such as the vmware one self._create('''clone Fencing FencingChild meta globally-unique=false migration-threshold=5''') if self.CM.cluster_monitor == 1: self._create('''primitive cluster_mon ocf:pacemaker:ClusterMon params update=10 extra_options="-r -n" user=abeekhof htmlfile=/suse/abeekhof/Export/cluster.html op start interval=0 requires=nothing op monitor interval=5s requires=nothing''') self._create('''location prefer-dc cluster_mon rule -INFINITY: \#is_dc eq false''') # generate cib self.cts_cib = self._show("xml") - self.CM.rsh(self.target, "rm -f "+self.cib_tmpfile) + + if self.cib_tmpfile != CTSvars.CRM_CONFIG_DIR+"/cib.xml": + self.CM.rsh(self.target, "rm -f "+self.cib_tmpfile) + return self.cts_cib def add_resources(self): # Group Resource r1 = self.NewIP() ip = self.NextIP() r2 = "r"+ip self._create('''primitive %s heartbeat::IPaddr params 1=%s/32 op monitor interval=5s''' % (r2, ip)) r3 = self.NewIP() self._create('''group group-1 %s %s %s''' % (r1, r2, r3)) # Per-node resources for node in self.CM.Env["nodes"]: r = self.NewIP("rsc_"+node) self._create('''location prefer-%s %s rule 100: \#uname eq %s''' % (node, r, node)) # LSB resource self._create('''primitive lsb-dummy lsb::''' +CTSvars.CTS_home+ '''/LSBDummy op monitor interval=5s''') self._create('''colocation lsb-with-group INFINITY: lsb-dummy group-1''') self._create('''order lsb-after-group mandatory: group-1 lsb-dummy symmetrical=true''') # Migrator self._create('''primitive migrator ocf:pacemaker:Dummy meta allow-migrate=1 op monitor interval=P10S''') # Ping the test master self._create('''primitive ping-1 ocf:pacemaker:pingd params host_list=%s name=connected op monitor interval=120s''' % os.uname()[1]) self._create('''clone Connectivity ping-1 meta globally-unique=false''') #master slave resource self._create('''primitive stateful-1 ocf:pacemaker:Stateful op monitor interval=15s op monitor interval=16s role=Master''') self._create('''ms master-1 stateful-1 meta clone-max=%d clone-node-max=%d master-max=%d master-node-max=%d''' % (self.num_nodes, 1, 1, 1)) # Require conectivity to run the master self._create('''location %s-is-connected %s rule -INFINITY: connected lt %d''' % ("m1", "master-1", 1)) # Group with the master self._create('''colocation group-with-master INFINITY: group-1 master-1:Master''') self._create('''order group-after-master mandatory: master-1:promote group-1:start symmetrical=true''') class HASI(CIB10): def add_resources(self): # DLM resource self._create('''primitive dlm ocf:pacemaker:controld op monitor interval=120s''') self._create('''clone dlm-clone dlm meta globally-unique=false interleave=true''') # O2CB resource self._create('''primitive o2cb ocf:ocfs2:o2cb op monitor interval=120s''') self._create('''clone o2cb-clone o2cb meta globally-unique=false interleave=true''') self._create('''colocation o2cb-with-dlm INFINITY: o2cb-clone dlm-clone''') self._create('''order start-o2cb-after-dlm mandatory: dlm-clone o2cb-clone''') class ConfigFactory: def __init__(self, CM): self.CM = CM self.register("pacemaker06", CIB06, CM) self.register("pacemaker10", CIB10, CM) self.register("hae", HASI, CM) def register(self, methodName, constructor, *args, **kargs): """register a constructor""" _args = [constructor] _args.extend(args) setattr(self, methodName, apply(ConfigFactoryItem,_args, kargs)) def unregister(self, methodName): """unregister a constructor""" delattr(self, methodName) def createConfig(self, name="pacemaker-1.0"): if name == "pacemaker-0.6": name = "pacemaker06"; elif name == "pacemaker-1.0": name = "pacemaker10"; elif name == "hasi": name = "hae"; if hasattr(self, name): return getattr(self, name)() else: self.CM.log("Configuration variant '%s' is unknown. Defaulting to latest config" % name) return self.pacemaker10() class ConfigFactoryItem: def __init__(self, function, *args, **kargs): assert callable(function), "function should be a callable obj" self._function = function self._args = args self._kargs = kargs def __call__(self, *args, **kargs): """call function""" _args = list(self._args) _args.extend(args) _kargs = self._kargs.copy() _kargs.update(kargs) return apply(self._function,_args,_kargs) #CibFactory = ConfigFactory() diff --git a/cts/CM_lha.py b/cts/CM_lha.py index 533a61055b..468cc2b5a3 100755 --- a/cts/CM_lha.py +++ b/cts/CM_lha.py @@ -1,609 +1,601 @@ '''CTS: Cluster Testing System: LinuxHA v2 dependent modules... ''' __copyright__=''' Author: Huang Zhen Copyright (C) 2004 International Business Machines Additional Audits, Revised Start action, Default Configuration: Copyright (C) 2004 Andrew Beekhof ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. import os,sys,CTS,CTSaudits,CTStests, warnings from CTSvars import * from CTS import * from CTSaudits import ClusterAudit from CTStests import * from CIB import * try: from xml.dom.minidom import * except ImportError: sys.__stdout__.write("Python module xml.dom.minidom not found\n") sys.__stdout__.write("Please install python-xml or similar before continuing\n") sys.__stdout__.flush() sys.exit(1) ####################################################################### # # LinuxHA v2 dependent modules # ####################################################################### class crm_lha(ClusterManager): ''' The linux-ha version 2 cluster manager class. It implements the things we need to talk to and manipulate linux-ha version 2 clusters ''' def __init__(self, Environment, randseed=None): ClusterManager.__init__(self, Environment, randseed=randseed) #HeartbeatCM.__init__(self, Environment, randseed=randseed) self.fastfail = 0 self.clear_cache = 0 self.cib_installed = 0 self.config = None self.cluster_monitor = 0 self.use_short_names = 1 self.update({ "Name" : "crm-lha", "DeadTime" : 300, "StartTime" : 300, # Max time to start up "StableTime" : 30, "StartCmd" : CTSvars.INITDIR+"/heartbeat start > /dev/null 2>&1", "StopCmd" : CTSvars.INITDIR+"/heartbeat stop > /dev/null 2>&1", "ElectionCmd" : "crmadmin -E %s", "StatusCmd" : "crmadmin -t 60000 -S %s 2>/dev/null", "EpocheCmd" : "crm_node -H -e", "QuorumCmd" : "crm_node -H -q", "ParitionCmd" : "crm_node -H -p", "CibQuery" : "cibadmin -Ql", "ExecuteRscOp" : "lrmadmin -n %s -E %s %s 0 %d EVERYTIME 2>&1", "CIBfile" : "%s:"+CTSvars.CRM_CONFIG_DIR+"/cib.xml", "TmpDir" : "/tmp", "BreakCommCmd" : "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1", "FixCommCmd" : "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1", # tc qdisc add dev lo root handle 1: cbq avpkt 1000 bandwidth 1000mbit # tc class add dev lo parent 1: classid 1:1 cbq rate "$RATE"kbps allot 17000 prio 5 bounded isolated # tc filter add dev lo parent 1: protocol ip prio 16 u32 match ip dst 127.0.0.1 match ip sport $PORT 0xFFFF flowid 1:1 # tc qdisc add dev lo parent 1: netem delay "$LATENCY"msec "$(($LATENCY/4))"msec 10% 2> /dev/null > /dev/null "ReduceCommCmd" : "", "RestoreCommCmd" : "tc qdisc del dev lo root", "LogFileName" : Environment["LogFileName"], "StandbyCmd" : "crm_standby -U %s -v %s 2>/dev/null", "UUIDQueryCmd" : "crmadmin -N", "StandbyQueryCmd" : "crm_standby -GQ -U %s 2>/dev/null", # Patterns to look for in the log files for various occasions... "Pat:DC_IDLE" : "crmd.*State transition.*-> S_IDLE", # This wont work if we have multiple partitions "Pat:Local_started" : "%s crmd:.*The local CRM is operational", "Pat:Slave_started" : "%s crmd:.*State transition.*-> S_NOT_DC", "Pat:Master_started" : "%s crmd:.* State transition.*-> S_IDLE", "Pat:We_stopped" : "heartbeat.*%s.*Heartbeat shutdown complete", "Pat:Logd_stopped" : "%s logd:.*Exiting write process", "Pat:They_stopped" : "%s crmd:.*LOST:.* %s ", "Pat:All_stopped" : "heartbeat.*%s.*Heartbeat shutdown complete", "Pat:They_dead" : "node %s.*: is dead", "Pat:TransitionComplete" : "Transition status: Complete: complete", "Pat:ChildKilled" : "%s heartbeat.*%s.*killed by signal 9", "Pat:ChildRespawn" : "%s heartbeat.*Respawning client.*%s", "Pat:ChildExit" : "ERROR: Client .* exited with return code", # Bad news Regexes. Should never occur. "BadRegexes" : ( r"ERROR:", r"CRIT:", r"Shutting down\.", r"Forcing shutdown\.", r"Timer I_TERMINATE just popped", r"input=I_ERROR", r"input=I_FAIL", r"input=I_INTEGRATED cause=C_TIMER_POPPED", r"input=I_FINALIZED cause=C_TIMER_POPPED", r"input=I_ERROR", r", exiting\.", r"WARN.*Ignoring HA message.*vote.*not in our membership list", r"pengine.*Attempting recovery of resource", r"is taking more than 2x its timeout", r"Confirm not received from", r"Welcome reply not received from", r"Attempting to schedule .* after a stop", r"Resource .* was active at shutdown", r"duplicate entries for call_id", r"Search terminated:", r"No need to invoke the TE", r"global_timer_callback:", r"Faking parameter digest creation", r"Parameters to .* action changed:", r"Parameters to .* changed", ), }) if self.Env["DoBSC"]: del self["Pat:They_stopped"] del self["Pat:Logd_stopped"] self.Env["use_logd"] = 0 self._finalConditions() self.check_transitions = 0 self.check_elections = 0 self.CIBsync = {} self.CibFactory = ConfigFactory(self) self.cib = self.CibFactory.createConfig(self.Env["Schema"]) def errorstoignore(self): # At some point implement a more elegant solution that # also produces a report at the end '''Return list of errors which are known and very noisey should be ignored''' if 1: return [ "ERROR: crm_abort: crm_glib_handler: ", "ERROR: Message hist queue is filling up", "stonithd: .*CRIT: external_hostlist: 'vmware gethosts' returned an empty hostlist", "stonithd: .*ERROR: Could not list nodes for stonith RA external/vmware.", "pengine: Preventing .* from re-starting", ] return [] def install_config(self, node): if not self.ns.WaitForNodeToComeUp(node): self.log("Node %s is not up." % node) return None if not self.CIBsync.has_key(node) and self.Env["ClobberCIB"] == 1: self.CIBsync[node] = 1 self.rsh(node, "rm -f "+CTSvars.CRM_CONFIG_DIR+"/cib*") # Only install the CIB on the first node, all the other ones will pick it up from there if self.cib_installed == 1: return None self.cib_installed = 1 if self.Env["CIBfilename"] == None: self.debug("Installing Generated CIB on node %s" %(node)) - warnings.filterwarnings("ignore") - cib_file=os.tmpnam() - warnings.resetwarnings() - self.rsh("localhost", "rm -f "+cib_file) - self.debug("Creating new CIB for " + node + " in: " + cib_file) - self.rsh("localhost", "echo \'" + self.cib.contents() + "\' > " + cib_file) - if 0 != self.rsh.cp(cib_file, "root@" + (self["CIBfile"]%node)): - raise ValueError("Can not copy %s to %s %d"%(cib_file, node)) - - self.rsh("localhost", "rm -f "+cib_file) + self.cib.install(node) + else: self.log("Installing CIB (%s) on node %s" %(self.Env["CIBfilename"], node)) if 0 != self.rsh.cp(self.Env["CIBfilename"], "root@" + (self["CIBfile"]%node)): raise ValueError("Can not scp file to %s %d"%(node)) self.rsh(node, "chown "+CTSvars.CRM_DAEMON_USER+" "+CTSvars.CRM_CONFIG_DIR+"/cib.xml") def prepare(self): '''Finish the Initialization process. Prepare to test...''' self.partitions_expected = 1 for node in self.Env["nodes"]: self.ShouldBeStatus[node] = "" self.unisolate_node(node) self.StataCM(node) def test_node_CM(self, node): '''Report the status of the cluster manager on a given node''' watchpats = [ ] watchpats.append("Current ping state: (S_IDLE|S_NOT_DC)") watchpats.append(self["Pat:Slave_started"]%node) idle_watch = CTS.LogWatcher(self["LogFileName"], watchpats) idle_watch.setwatch() out = self.rsh(node, self["StatusCmd"]%node, 1) self.debug("Node %s status: '%s'" %(node, out)) if not out or string.find(out, 'ok') < 0: if self.ShouldBeStatus[node] == "up": self.log( "Node status for %s is %s but we think it should be %s" %(node, "down", self.ShouldBeStatus[node])) self.ShouldBeStatus[node]="down" return 0 if self.ShouldBeStatus[node] == "down": self.log( "Node status for %s is %s but we think it should be %s: %s" %(node, "up", self.ShouldBeStatus[node], out)) self.ShouldBeStatus[node]="up" # check the output first - because syslog-ng looses messages if string.find(out, 'S_NOT_DC') != -1: # Up and stable return 2 if string.find(out, 'S_IDLE') != -1: # Up and stable return 2 # fall back to syslog-ng and wait if not idle_watch.look(): # just up self.debug("Warn: Node %s is unstable: %s" %(node, out)) return 1 # Up and stable return 2 # Is the node up or is the node down def StataCM(self, node): '''Report the status of the cluster manager on a given node''' if self.test_node_CM(node) > 0: return 1 return None # Being up and being stable is not the same question... def node_stable(self, node): '''Report the status of the cluster manager on a given node''' if self.test_node_CM(node) == 2: return 1 self.log("Warn: Node %s not stable" %(node)) return None def partition_stable(self, nodes, timeout=None): watchpats = [ ] watchpats.append("Current ping state: S_IDLE") watchpats.append(self["Pat:DC_IDLE"]) self.debug("Waiting for cluster stability...") if timeout == None: timeout = self["DeadTime"] idle_watch = CTS.LogWatcher(self["LogFileName"], watchpats, timeout) idle_watch.setwatch() any_up = 0 for node in self.Env["nodes"]: # have each node dump its current state if self.ShouldBeStatus[node] == "up": self.rsh(node, self["StatusCmd"] %node, 1) any_up = 1 if any_up == 0: self.debug("Cluster is inactive") return 1 ret = idle_watch.look() while ret: self.debug(ret) for node in nodes: if re.search(node, ret): return 1 ret = idle_watch.look() self.debug("Warn: Partition %s not IDLE after %ds" % (repr(nodes), timeout)) return None def cluster_stable(self, timeout=None): partitions = self.find_partitions() for partition in partitions: if not self.partition_stable(partition, timeout): return None return 1 def is_node_dc(self, node, status_line=None): rc = 0 if not status_line: status_line = self.rsh(node, self["StatusCmd"]%node, 1) if not status_line: rc = 0 elif string.find(status_line, 'S_IDLE') != -1: rc = 1 elif string.find(status_line, 'S_INTEGRATION') != -1: rc = 1 elif string.find(status_line, 'S_FINALIZE_JOIN') != -1: rc = 1 elif string.find(status_line, 'S_POLICY_ENGINE') != -1: rc = 1 elif string.find(status_line, 'S_TRANSITION_ENGINE') != -1: rc = 1 return rc def active_resources(self, node): # [SM].* {node} matches Started, Slave, Master # Stopped wont be matched as it wont include {node} (rc, output) = self.rsh(node, """crm_resource -c""", None) resources = [] for line in output: if re.search("^Resource", line): tmp = AuditResource(self, line) if tmp.type == "primitive" and tmp.host == node: resources.append(tmp.id) return resources def ResourceOp(self, resource, op, node, interval=0, app="lrmadmin"): ''' Execute an operation on a resource ''' cmd = self["ExecuteRscOp"] % (app, resource, op, interval) (rc, lines) = self.rsh(node, cmd, None) if rc == 127: self.log("Command '%s' failed. Binary not installed?" % cmd) for line in lines: self.log("Output: "+line) return rc def ResourceLocation(self, rid): ResourceNodes = [] for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == "up": dummy = 0 rc = self.ResourceOp(rid, "monitor", node) # Strange error codes from remote_py # 65024 == not installed # 2048 == 8 # 1792 == 7 # 0 == 0 if rc == 127: dummy = 1 elif rc == 254 or rc == 65024: dummy = 1 #self.debug("%s is not installed on %s: %d" % (rid, node, rc)) elif rc == 0 or rc == 2048 or rc == 8: ResourceNodes.append(node) elif rc == 7 or rc == 1792: dummy = 1 #self.debug("%s is not running on %s: %d" % (rid, node, rc)) else: # not active on this node? self.log("Unknown rc code for %s on %s: %d" % (rid, node, rc)) return ResourceNodes def find_partitions(self): ccm_partitions = [] for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == "up": partition = self.rsh(node, self["ParitionCmd"], 1) if not partition: self.log("no partition details for %s" %node) elif len(partition) > 2: partition = partition[:-1] found=0 for a_partition in ccm_partitions: if partition == a_partition: found = 1 if found == 0: self.debug("Adding partition from %s: %s" %(node, partition)) ccm_partitions.append(partition) else: self.debug("Partition '%s' from %s is consistent with existing entries" %(partition, node)) else: self.log("bad partition details for %s" %node) else: self.debug("Node %s is down... skipping" %node) return ccm_partitions def HasQuorum(self, node_list): # If we are auditing a partition, then one side will # have quorum and the other not. # So the caller needs to tell us which we are checking # If no value for node_list is specified... assume all nodes if not node_list: node_list = self.Env["nodes"] for node in node_list: if self.ShouldBeStatus[node] == "up": quorum = self.rsh(node, self["QuorumCmd"], 1) if string.find(quorum, "1") != -1: return 1 elif string.find(quorum, "0") != -1: return 0 else: self.log("WARN: Unexpected quorum test result from "+ node +":"+ quorum) return 0 def Components(self): complist = [] common_ignore = [ "Pending action:", "ERROR: crm_log_message_adv:", "ERROR: MSG: No message to dump", "pending LRM operations at shutdown", "Lost connection to the CIB service", "Connection to the CIB terminated...", "Sending message to CIB service FAILED", "crmd: .*Action A_RECOVER .* not supported", "ERROR: stonithd_op_result_ready: not signed on", "pingd: .*ERROR: send_update: Could not send update", "send_ipc_message: IPC Channel to .* is not connected", "unconfirmed_actions: Waiting on .* unconfirmed actions", "cib_native_msgready: Message pending on command channel", "crmd:.*do_exit: Performing A_EXIT_1 - forcefully exiting the CRMd", "verify_stopped: Resource .* was active at shutdown. You may ignore this error if it is unmanaged.", ] stonith_ignore = [ "ERROR: stonithd_signon: ", "update_failcount: Updating failcount for child_DoFencing", "ERROR: te_connect_stonith: Sign-in failed: triggered a retry", ] stonith_ignore.extend(common_ignore) ccm = Process("ccm", 0, [ "State transition S_IDLE", "CCM connection appears to have failed", "crmd: .*Action A_RECOVER .* not supported", "crmd: .*Input I_TERMINATE from do_recover", "Exiting to recover from CCM connection failure", "crmd:.*do_exit: Could not recover from internal error", "crmd: .*I_ERROR.*(ccm_dispatch|crmd_cib_connection_destroy)", "crmd .*exited with return code 2.", "attrd .*exited with return code 1.", "cib .*exited with return code 2.", # "WARN: determine_online_status: Node .* is unclean", # "Scheduling Node .* for STONITH", # "Executing .* fencing operation", # "tengine_stonith_callback: .*result=0", "A new node joined the cluster", # "Processing I_NODE_JOIN:.* cause=C_HA_MESSAGE", # "State transition S_.* -> S_INTEGRATION.*input=I_NODE_JOIN", "State transition S_STARTING -> S_PENDING", ], [], common_ignore, self.fastfail, self) cib = Process("cib", 0, [ "State transition S_IDLE", "Lost connection to the CIB service", "Connection to the CIB terminated...", "crmd: .*Input I_TERMINATE from do_recover", "crmd: .*I_ERROR.*crmd_cib_connection_destroy", "crmd:.*do_exit: Could not recover from internal error", "crmd .*exited with return code 2.", "attrd .*exited with return code 1.", ], [], common_ignore, self.fastfail, self) lrmd = Process("lrmd", 0, [ "State transition S_IDLE", "LRM Connection failed", "crmd: .*I_ERROR.*lrm_connection_destroy", "State transition S_STARTING -> S_PENDING", "crmd: .*Input I_TERMINATE from do_recover", "crmd:.*do_exit: Could not recover from internal error", "crmd .*exited with return code 2.", ], [], common_ignore, self.fastfail, self) crmd = Process("crmd", 0, [ # "WARN: determine_online_status: Node .* is unclean", # "Scheduling Node .* for STONITH", # "Executing .* fencing operation", # "tengine_stonith_callback: .*result=0", "State transition .* S_IDLE", "State transition S_STARTING -> S_PENDING", ], [ ], common_ignore, self.fastfail, self) pengine = Process("pengine", 1, [ "State transition S_IDLE", "crmd .*exited with return code 2.", "crmd: .*Input I_TERMINATE from do_recover", "crmd: .*do_exit: Could not recover from internal error", "crmd: .*CRIT: pe_connection_destroy: Connection to the Policy Engine failed", "crmd: .*I_ERROR.*save_cib_contents", "crmd .*exited with return code 2.", ], [], common_ignore, self.fastfail, self) if self.Env["DoFencing"] == 1 : complist.append(Process("stonithd", 0, [], [ "crmd: .*CRIT: tengine_stonith_connection_destroy: Fencing daemon connection failed", "Attempting connection to fencing daemon", "te_connect_stonith: Connected", ], stonith_ignore, 0, self)) # complist.append(Process("heartbeat", 0, [], [], [], None, self)) if self.fastfail == 0: ccm.pats.extend([ "attrd .* exited with return code 1", "ERROR: Respawning client .*attrd", "cib .* exited with return code 2", "ERROR: Respawning client .*cib", "crmd .* exited with return code 2", "ERROR: Respawning client .*crmd" ]) cib.pats.extend([ "attrd .* exited with return code 1", "ERROR: Respawning client .*attrd", "crmd .* exited with return code 2", "ERROR: Respawning client .*crmd" ]) lrmd.pats.extend([ "crmd .* exited with return code 2", "ERROR: Respawning client .*crmd" ]) pengine.pats.extend([ "ERROR: Respawning client .*crmd" ]) complist.append(ccm) complist.append(cib) complist.append(lrmd) complist.append(crmd) complist.append(pengine) return complist def NodeUUID(self, node): lines = self.rsh(node, self["UUIDQueryCmd"], 1) for line in lines: self.debug("UUIDLine:"+ line) m = re.search(r'%s.+\((.+)\)' % node, line) if m: return m.group(1) return "" def StandbyStatus(self, node): out=self.rsh(node, self["StandbyQueryCmd"]%node, 1) if not out: return "off" out = out[:-1] self.debug("Standby result: "+out) return out # status == "on" : Enter Standby mode # status == "off": Enter Active mode def SetStandbyMode(self, node, status): current_status = self.StandbyStatus(node) cmd = self["StandbyCmd"] % (node, status) ret = self.rsh(node, cmd) return True ####################################################################### # # A little test code... # # Which you are advised to completely ignore... # ####################################################################### if __name__ == '__main__': pass diff --git a/cts/CTS.py b/cts/CTS.py index 564c0614b6..805e256c91 100755 --- a/cts/CTS.py +++ b/cts/CTS.py @@ -1,1174 +1,1174 @@ '''CTS: Cluster Testing System: Main module Classes related to testing high-availability clusters... ''' __copyright__=''' Copyright (C) 2000, 2001 Alan Robertson Licensed under the GNU GPL. ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. import types, string, select, sys, time, re, os, struct, os, signal import base64, pickle, binascii from UserDict import UserDict from syslog import * from subprocess import Popen,PIPE from CTSvars import * class RemoteExec: '''This is an abstract remote execution class. It runs a command on another machine - somehow. The somehow is up to us. This particular class uses ssh. Most of the work is done by fork/exec of ssh or scp. ''' def __init__(self, Env=None): self.Env = Env # -n: no stdin, -x: no X11 self.Command = "ssh -l root -n -x" # -f: ssh to background self.CommandnoBlock = "ssh -f -l root -n -x" # -B: batch mode, -q: no stats (quiet) self.CpCommand = "scp -B -q" self.OurNode=string.lower(os.uname()[1]) def _fixcmd(self, cmd): return re.sub("\'", "'\\''", cmd) def _cmd(self, *args): '''Compute the string that will run the given command on the given remote system''' args= args[0] sysname = args[0] command = args[1] #print "sysname: %s, us: %s" % (sysname, self.OurNode) if sysname == None or string.lower(sysname) == self.OurNode or sysname == "localhost": ret = command else: ret = self.Command + " " + sysname + " '" + self._fixcmd(command) + "'" #print ("About to run %s\n" % ret) return ret def _cmd_noblock(self, *args): '''Compute the string that will run the given command on the given remote system''' args= args[0] sysname = args[0] command = args[1] #print "sysname: %s, us: %s" % (sysname, self.OurNode) if sysname == None or string.lower(sysname) == self.OurNode or sysname == "localhost": ret = command + " &" else: ret = self.CommandnoBlock + " " + sysname + " '" + self._fixcmd(command) + "'" #print ("About to run %s\n" % ret) return ret def __call__(self, node, command, stdout=0, blocking=1): '''Run the given command on the given remote system If you call this class like a function, this is the function that gets called. It just runs it roughly as though it were a system() call on the remote machine. The first argument is name of the machine to run it on. ''' rc = 0 result = None if not blocking: return os.system(self._cmd_noblock([node, command])) conn = Popen(self._cmd([node, command]), stdin = PIPE, stdout = PIPE, stderr = PIPE, close_fds = True, shell = True) conn.stdin.close() if stdout == 1: result = conn.stdout.readline() else: result = conn.stdout.readlines() conn.stdout.close() rc = conn.wait() self.Env.debug("cmd: target=%s, rc=%d: %s" % (node, rc, command)) if stdout == 1: return result if conn.stderr: errors = conn.stderr.readlines() conn.stderr.close() for err in errors: if not self.Env: print ("stderr: %s" % err) else: self.Env.debug("stderr: %s" % err) if stdout == 0: return rc return (rc, result) def cp(self, *args): '''Perform a remote copy''' cpstring = self.CpCommand for arg in args: cpstring = cpstring + " \'" + arg + "\'" rc = os.system(cpstring) self.Env.debug("cmd: rc=%d: %s" % (rc, cpstring)) return rc class LogWatcher: '''This class watches logs for messages that fit certain regular expressions. Watching logs for events isn't the ideal way to do business, but it's better than nothing :-) On the other hand, this class is really pretty cool ;-) The way you use this class is as follows: Construct a LogWatcher object Call setwatch() when you want to start watching the log Call look() to scan the log looking for the patterns ''' def __init__(self, log, regexes, timeout=10, debug=None): '''This is the constructor for the LogWatcher class. It takes a log name to watch, and a list of regular expressions to watch for." ''' # Validate our arguments. Better sooner than later ;-) for regex in regexes: assert re.compile(regex) self.regexes = regexes self.filename = log self.debug=debug self.whichmatch = -1 self.unmatched = None if self.debug: print "Debug now on for for log", log self.Timeout = int(timeout) self.returnonlymatch = None if not os.access(log, os.R_OK): raise ValueError("File [" + log + "] not accessible (r)") def setwatch(self, frombeginning=None): '''Mark the place to start watching the log from. ''' self.file = open(self.filename, "r") self.size = os.path.getsize(self.filename) if not frombeginning: self.file.seek(0, 2) # 2 means seek to EOF def ReturnOnlyMatch(self, onlymatch=1): '''Mark the place to start watching the log from. ''' self.returnonlymatch = onlymatch def look(self, timeout=None): '''Examine the log looking for the given patterns. It starts looking from the place marked by setwatch(). This function looks in the file in the fashion of tail -f. It properly recovers from log file truncation, but not from removing and recreating the log. It would be nice if it recovered from this as well :-) We return the first line which matches any of our patterns. ''' last_line=None first_line=None if timeout == None: timeout = self.Timeout done=time.time()+timeout+1 if self.debug: print "starting search: timeout=%d" % timeout for regex in self.regexes: print "Looking for regex: ", regex while (timeout <= 0 or time.time() <= done): newsize=os.path.getsize(self.filename) if self.debug > 4: print "newsize = %d" % newsize if newsize < self.size: # Somebody truncated the log! if self.debug: print "Log truncated!" self.setwatch(frombeginning=1) continue if newsize > self.file.tell(): line=self.file.readline() if self.debug > 2: print "Looking at line:", line if line: last_line=line if not first_line: first_line=line if self.debug: print "First line: "+ line which=-1 for regex in self.regexes: which=which+1 if self.debug > 3: print "Comparing line to ", regex #matchobj = re.search(string.lower(regex), string.lower(line)) matchobj = re.search(regex, line) if matchobj: self.whichmatch=which if self.returnonlymatch: return matchobj.group(self.returnonlymatch) else: if self.debug: print "Returning line" return line newsize=os.path.getsize(self.filename) if self.file.tell() == newsize: if timeout > 0: time.sleep(0.025) else: if self.debug: print "End of file" if self.debug: print "Last line: "+last_line return None if self.debug: print "Timeout" if self.debug: print "Last line: "+last_line return None def lookforall(self, timeout=None, allow_multiple_matches=None): '''Examine the log looking for ALL of the given patterns. It starts looking from the place marked by setwatch(). We return when the timeout is reached, or when we have found ALL of the regexes that were part of the watch ''' if timeout == None: timeout = self.Timeout save_regexes = self.regexes returnresult = [] while (len(self.regexes) > 0): oneresult = self.look(timeout) if not oneresult: self.unmatched = self.regexes self.matched = returnresult self.regexes = save_regexes return None returnresult.append(oneresult) if not allow_multiple_matches: del self.regexes[self.whichmatch] else: # Allow multiple regexes to match a single line tmp_regexes = self.regexes self.regexes = [] which = 0 for regex in tmp_regexes: matchobj = re.search(regex, oneresult) if not matchobj: self.regexes.append(regex) self.unmatched = None self.matched = returnresult self.regexes = save_regexes return returnresult class NodeStatus: def __init__(self, Env): self.Env = Env def IsNodeBooted(self, node): '''Return TRUE if the given node is booted (responds to pings)''' return self.Env.rsh("localhost", "ping -nq -c1 -w1 %s >/dev/null 2>&1" % node, 0) == 0 def IsSshdUp(self, node): #return self.rsh(node, "true") == 0; rc = self.Env.rsh(node, "true") return rc == 0 def WaitForNodeToComeUp(self, node, Timeout=300): '''Return TRUE when given node comes up, or None/FALSE if timeout''' timeout=Timeout anytimeouts=0 while timeout > 0: if self.IsNodeBooted(node) and self.IsSshdUp(node): if anytimeouts: # Fudge to wait for the system to finish coming up time.sleep(30) self.Env.debug("Node %s now up" % node) return 1 time.sleep(30) if (not anytimeouts): self.Env.debug("Waiting for node %s to come up" % node) anytimeouts=1 timeout = timeout - 1 self.Env.log("%s did not come up within %d tries" % (node, Timeout)) answer = raw_input('Continue? [nY]') if answer and answer == "n": raise ValueError("%s did not come up within %d tries" % (node, Timeout)) def WaitForAllNodesToComeUp(self, nodes, timeout=300): '''Return TRUE when all nodes come up, or FALSE if timeout''' for node in nodes: if not self.WaitForNodeToComeUp(node, timeout): return None return 1 class ClusterManager(UserDict): '''The Cluster Manager class. This is an subclass of the Python dictionary class. (this is because it contains lots of {name,value} pairs, not because it's behavior is that terribly similar to a dictionary in other ways.) This is an abstract class which class implements high-level operations on the cluster and/or its cluster managers. Actual cluster managers classes are subclassed from this type. One of the things we do is track the state we think every node should be in. ''' def __InitialConditions(self): #if os.geteuid() != 0: # raise ValueError("Must Be Root!") None def _finalConditions(self): for key in self.keys(): if self[key] == None: raise ValueError("Improper derivation: self[" + key + "] must be overridden by subclass.") def __init__(self, Environment, randseed=None): self.Env = Environment self.__InitialConditions() self.clear_cache = 0 self.TestLoggingLevel=0 self.data = { "up" : "up", # Status meaning up "down" : "down", # Status meaning down "StonithCmd" : "stonith -t baytech -p '10.10.10.100 admin admin' %s", "DeadTime" : 30, # Max time to detect dead node... "StartTime" : 90, # Max time to start up # # These next values need to be overridden in the derived class. # "Name" : None, "StartCmd" : None, "StopCmd" : None, "StatusCmd" : None, #"RereadCmd" : None, "BreakCommCmd" : None, "FixCommCmd" : None, #"TestConfigDir" : None, "LogFileName" : None, #"Pat:Master_started" : None, #"Pat:Slave_started" : None, "Pat:We_stopped" : None, "Pat:They_stopped" : None, "BadRegexes" : None, # A set of "bad news" regexes # to apply to the log } self.rsh = self.Env.rsh self.ShouldBeStatus={} self.OurNode=string.lower(os.uname()[1]) self.ShouldBeStatus={} self.ns = NodeStatus(self.Env) def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [] def log(self, args): self.Env.log(args) def debug(self, args): self.Env.debug(args) def prepare(self): '''Finish the Initialization process. Prepare to test...''' for node in self.Env["nodes"]: if self.StataCM(node): self.ShouldBeStatus[node]="up" else: self.ShouldBeStatus[node]="down" self.unisolate_node(node) def upcount(self): '''How many nodes are up?''' count=0 for node in self.Env["nodes"]: if self.ShouldBeStatus[node]=="up": count=count+1 return count def TruncLogs(self): '''Truncate the log for the cluster manager so we can start clean''' if self["LogFileName"] != None: self.Env.rsh("localhost", "cp /dev/null " + self["LogFileName"]) def install_config(self, node): return None def clear_all_caches(self): if self.clear_cache: for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == "down": self.debug("Removing cache file on: "+node) self.rsh(node, "rm -f "+CTSvars.HA_VARLIBHBDIR+"/hostcache") else: self.debug("NOT Removing cache file on: "+node) def StartaCM(self, node): '''Start up the cluster manager on a given node''' self.debug("Starting %s on node %s" %(self["Name"], node)) ret = 1 if not self.ShouldBeStatus.has_key(node): self.ShouldBeStatus[node] = "down" if self.ShouldBeStatus[node] != "down": return 1 patterns = [] # Technically we should always be able to notice ourselves starting patterns.append(self["Pat:Local_started"] % node) if self.upcount() == 0: patterns.append(self["Pat:Master_started"] % node) else: patterns.append(self["Pat:Slave_started"] % node) watch = LogWatcher( self["LogFileName"], patterns, timeout=self["StartTime"]+10) watch.setwatch() self.install_config(node) self.ShouldBeStatus[node] = "any" if self.StataCM(node) and self.cluster_stable(self["DeadTime"]): self.log ("%s was already started" %(node)) return 1 # Clear out the host cache so autojoin can be exercised if self.clear_cache: self.debug("Removing cache file on: "+node) self.rsh(node, "rm -f "+CTSvars.HA_VARLIBHBDIR+"/hostcache") if not(self.Env["valgrind-tests"]): startCmd = self["StartCmd"] else: if self.Env["valgrind-prefix"]: prefix = self.Env["valgrind-prefix"] else: prefix = "cts" startCmd = """G_SLICE=always-malloc HA_VALGRIND_ENABLED='%s' VALGRIND_OPTS='%s --log-file=/tmp/%s-%s.valgrind' %s""" % ( self.Env["valgrind-procs"], self.Env["valgrind-opts"], prefix, """%p""", self["StartCmd"]) if self.rsh(node, startCmd) != 0: self.log ("Warn: Start command failed on node %s" %(node)) return None self.ShouldBeStatus[node]="up" watch_result = watch.lookforall() if watch.unmatched: for regex in watch.unmatched: self.log ("Warn: Startup pattern not found: %s" %(regex)) if watch_result: #self.debug("Found match: "+ repr(watch_result)) self.cluster_stable(self["DeadTime"]) return 1 if self.StataCM(node) and self.cluster_stable(self["DeadTime"]): return 1 self.log ("Warn: Start failed for node %s" %(node)) return None def StartaCMnoBlock(self, node): '''Start up the cluster manager on a given node with none-block mode''' self.debug("Starting %s on node %s" %(self["Name"], node)) # Clear out the host cache so autojoin can be exercised if self.clear_cache: self.debug("Removing cache file on: "+node) self.rsh(node, "rm -f "+CTSvars.HA_VARLIBHBDIR+"/hostcache") if not(self.Env["valgrind-tests"]): startCmd = self["StartCmd"] else: if self.Env["valgrind-prefix"]: prefix = self.Env["valgrind-prefix"] else: prefix = "cts" startCmd = """G_SLICE=always-malloc HA_VALGRIND_ENABLED='%s' VALGRIND_OPTS='%s --log-file=/tmp/%s-%s.valgrind' %s""" % ( self.Env["valgrind-procs"], self.Env["valgrind-opts"], prefix, """%p""", self["StartCmd"]) self.rsh(node, startCmd, blocking=0) self.ShouldBeStatus[node]="up" return 1 def StopaCM(self, node): '''Stop the cluster manager on a given node''' self.debug("Stopping %s on node %s" %(self["Name"], node)) if self.ShouldBeStatus[node] != "up": return 1 if self.rsh(node, self["StopCmd"]) == 0: self.ShouldBeStatus[node]="down" self.cluster_stable(self["DeadTime"]) return 1 else: self.log ("Could not stop %s on node %s" %(self["Name"], node)) return None def StopaCMnoBlock(self, node): '''Stop the cluster manager on a given node with none-block mode''' self.debug("Stopping %s on node %s" %(self["Name"], node)) self.rsh(node, self["StopCmd"], blocking=0) self.ShouldBeStatus[node]="down" return 1 def cluster_stable(self, timeout = None): time.sleep(self["StableTime"]) return 1 def node_stable(self, node): return 1 def RereadCM(self, node): '''Force the cluster manager on a given node to reread its config This may be a no-op on certain cluster managers. ''' rc=self.rsh(node, self["RereadCmd"]) if rc == 0: return 1 else: self.log ("Could not force %s on node %s to reread its config" % (self["Name"], node)) return None def StataCM(self, node): '''Report the status of the cluster manager on a given node''' out=self.rsh(node, self["StatusCmd"], 1) ret= (string.find(out, 'stopped') == -1) try: if ret: if self.ShouldBeStatus[node] == "down": self.log( "Node status for %s is %s but we think it should be %s" % (node, "up", self.ShouldBeStatus[node])) else: if self.ShouldBeStatus[node] == "up": self.log( "Node status for %s is %s but we think it should be %s" % (node, "down", self.ShouldBeStatus[node])) except KeyError: pass if ret: self.ShouldBeStatus[node]="up" else: self.ShouldBeStatus[node]="down" return ret def startall(self, nodelist=None): '''Start the cluster manager on every node in the cluster. We can do it on a subset of the cluster if nodelist is not None. ''' ret = 1 map = {} if not nodelist: nodelist=self.Env["nodes"] for node in nodelist: if self.ShouldBeStatus[node] == "down": if not self.StartaCM(node): ret = 0 return ret def stopall(self, nodelist=None): '''Stop the cluster managers on every node in the cluster. We can do it on a subset of the cluster if nodelist is not None. ''' ret = 1 map = {} if not nodelist: nodelist=self.Env["nodes"] for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == "up": if not self.StopaCM(node): ret = 0 return ret def rereadall(self, nodelist=None): '''Force the cluster managers on every node in the cluster to reread their config files. We can do it on a subset of the cluster if nodelist is not None. ''' map = {} if not nodelist: nodelist=self.Env["nodes"] for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == "up": self.RereadCM(node) def statall(self, nodelist=None): '''Return the status of the cluster managers in the cluster. We can do it on a subset of the cluster if nodelist is not None. ''' result={} if not nodelist: nodelist=self.Env["nodes"] for node in nodelist: if self.StataCM(node): result[node] = "up" else: result[node] = "down" return result def isolate_node(self, target, nodes=None): '''isolate the communication between the nodes''' if not nodes: nodes = self.Env["nodes"] for node in nodes: if node != target: rc = self.rsh(target, self["BreakCommCmd"] % node) if rc != 0: self.log("Could not break the communication between %s and %s: %d" % (target, node, rc)) return None else: self.debug("Communication cut between %s and %s" % (target, node)) return 1 def unisolate_node(self, target, nodes=None): '''fix the communication between the nodes''' if not nodes: nodes = self.Env["nodes"] for node in nodes: if node != target: restored = 0 # Limit the amount of time we have asynchronous connectivity for # Restore both sides as simultaneously as possible self.rsh(target, self["FixCommCmd"] % node, blocking=0) self.rsh(node, self["FixCommCmd"] % target, blocking=0) self.debug("Communication restored between %s and %s" % (target, node)) def reducecomm_node(self,node): '''reduce the communication between the nodes''' rc = self.rsh(node, self["ReduceCommCmd"]%(self.Env["XmitLoss"],self.Env["RecvLoss"])) if rc == 0: return 1 else: self.log("Could not reduce the communication between the nodes from node: %s" % node) return None def restorecomm_node(self,node): '''restore the saved communication between the nodes''' rc = 0 if float(self.Env["XmitLoss"])!=0 or float(self.Env["RecvLoss"])!=0 : rc = self.rsh(node, self["RestoreCommCmd"]); if rc == 0: return 1 else: self.log("Could not restore the communication between the nodes from node: %s" % node) return None def HasQuorum(self, node_list): "Return TRUE if the cluster currently has quorum" # If we are auditing a partition, then one side will # have quorum and the other not. # So the caller needs to tell us which we are checking # If no value for node_list is specified... assume all nodes raise ValueError("Abstract Class member (HasQuorum)") def Components(self): raise ValueError("Abstract Class member (Components)") def oprofileStart(self, node=None): if not node: for n in self.Env["oprofile"]: self.oprofileStart(n) elif node in self.Env["oprofile"]: self.debug("Enabling oprofile on %s" % node) self.rsh(node, "opcontrol --init") - self.rsh(node, "opcontrol --setup --no-vmlinux --separate=lib --callgraph=10 --image=all") + self.rsh(node, "opcontrol --setup --no-vmlinux --separate=lib --callgraph=20 --image=all") self.rsh(node, "opcontrol --start") self.rsh(node, "opcontrol --reset") def oprofileSave(self, test, node=None): if not node: for n in self.Env["oprofile"]: self.oprofileSave(test, n) elif node in self.Env["oprofile"]: self.rsh(node, "opcontrol --dump") self.rsh(node, "opcontrol --save=cts.%d" % test) # Read back with: opreport -l session:cts.0 image:/usr/lib/heartbeat/c* if None: self.rsh(node, "opcontrol --reset") else: self.oprofileStop(node) self.oprofileStart(node) def oprofileStop(self, node=None): if not node: for n in self.Env["oprofile"]: self.oprofileStop(n) elif node in self.Env["oprofile"]: self.debug("Stopping oprofile on %s" % node) self.rsh(node, "opcontrol --reset") self.rsh(node, "opcontrol --shutdown 2>&1 > /dev/null") class Resource: ''' This is an HA resource (not a resource group). A resource group is just an ordered list of Resource objects. ''' def __init__(self, cm, rsctype=None, instance=None): self.CM = cm self.ResourceType = rsctype self.Instance = instance self.needs_quorum = 1 def Type(self): return self.ResourceType def Instance(self, nodename): return self.Instance def IsRunningOn(self, nodename): ''' This member function returns true if our resource is running on the given node in the cluster. It is analagous to the "status" operation on SystemV init scripts and heartbeat scripts. FailSafe calls it the "exclusive" operation. ''' raise ValueError("Abstract Class member (IsRunningOn)") return None def IsWorkingCorrectly(self, nodename): ''' This member function returns true if our resource is operating correctly on the given node in the cluster. Heartbeat does not require this operation, but it might be called the Monitor operation, which is what FailSafe calls it. For remotely monitorable resources (like IP addresses), they *should* be monitored remotely for testing. ''' raise ValueError("Abstract Class member (IsWorkingCorrectly)") return None def Start(self, nodename): ''' This member function starts or activates the resource. ''' raise ValueError("Abstract Class member (Start)") return None def Stop(self, nodename): ''' This member function stops or deactivates the resource. ''' raise ValueError("Abstract Class member (Stop)") return None def __repr__(self): if (self.Instance and len(self.Instance) > 1): return "{" + self.ResourceType + "::" + self.Instance + "}" else: return "{" + self.ResourceType + "}" class Component: def kill(self, node): None class Process(Component): def __init__(self, name, dc_only, pats, dc_pats, badnews_ignore, triggersreboot, cm): self.name = str(name) self.dc_only = dc_only self.pats = pats self.dc_pats = dc_pats self.CM = cm self.badnews_ignore = badnews_ignore self.triggersreboot = triggersreboot self.KillCmd = "killall -9 " + self.name def kill(self, node): if self.CM.rsh(node, self.KillCmd) != 0: self.CM.log ("ERROR: Kill %s failed on node %s" %(self.name,node)) return None return 1 class ScenarioComponent: def __init__(self, Env): self.Env = Env def IsApplicable(self): '''Return TRUE if the current ScenarioComponent is applicable in the given LabEnvironment given to the constructor. ''' raise ValueError("Abstract Class member (IsApplicable)") def SetUp(self, CM): '''Set up the given ScenarioComponent''' raise ValueError("Abstract Class member (Setup)") def TearDown(self, CM): '''Tear down (undo) the given ScenarioComponent''' raise ValueError("Abstract Class member (Setup)") class Scenario: ( '''The basic idea of a scenario is that of an ordered list of ScenarioComponent objects. Each ScenarioComponent is SetUp() in turn, and then after the tests have been run, they are torn down using TearDown() (in reverse order). A Scenario is applicable to a particular cluster manager iff each ScenarioComponent is applicable. A partially set up scenario is torn down if it fails during setup. ''') def __init__(self, Components): "Initialize the Scenario from the list of ScenarioComponents" for comp in Components: if not issubclass(comp.__class__, ScenarioComponent): raise ValueError("Init value must be subclass of" " ScenarioComponent") self.Components = Components def IsApplicable(self): ( '''A Scenario IsApplicable() iff each of its ScenarioComponents IsApplicable() ''' ) for comp in self.Components: if not comp.IsApplicable(): return None return 1 def SetUp(self, CM): '''Set up the Scenario. Return TRUE on success.''' j=0 while j < len(self.Components): if not self.Components[j].SetUp(CM): # OOPS! We failed. Tear partial setups down. CM.log("Tearing down partial setup") self.TearDown(CM, j) return None j=j+1 return 1 def TearDown(self, CM, max=None): '''Tear Down the Scenario - in reverse order.''' if max == None: max = len(self.Components)-1 j=max while j >= 0: self.Components[j].TearDown(CM) j=j-1 class InitClusterManager(ScenarioComponent): ( '''InitClusterManager is the most basic of ScenarioComponents. This ScenarioComponent simply starts the cluster manager on all the nodes. It is fairly robust as it waits for all nodes to come up before starting as they might have been rebooted or crashed for some reason beforehand. ''') def __init__(self, Env): pass def IsApplicable(self): '''InitClusterManager is so generic it is always Applicable''' return 1 def SetUp(self, CM): '''Basic Cluster Manager startup. Start everything''' CM.prepare() # Clear out the cobwebs ;-) self.TearDown(CM) # Now start the Cluster Manager on all the nodes. CM.log("Starting Cluster Manager on all nodes.") return CM.startall() def TearDown(self, CM): '''Set up the given ScenarioComponent''' # Stop the cluster manager everywhere CM.log("Stopping Cluster Manager on all nodes") return CM.stopall() class PingFest(ScenarioComponent): ( '''PingFest does a flood ping to each node in the cluster from the test machine. If the LabEnvironment Parameter PingSize is set, it will be used as the size of ping packet requested (via the -s option). If it is not set, it defaults to 1024 bytes. According to the manual page for ping: Outputs packets as fast as they come back or one hundred times per second, whichever is more. For every ECHO_REQUEST sent a period ``.'' is printed, while for every ECHO_REPLY received a backspace is printed. This provides a rapid display of how many packets are being dropped. Only the super-user may use this option. This can be very hard on a net- work and should be used with caution. ''' ) def __init__(self, Env): self.Env = Env def IsApplicable(self): '''PingFests are always applicable ;-) ''' return 1 def SetUp(self, CM): '''Start the PingFest!''' self.PingSize=1024 if CM.Env.has_key("PingSize"): self.PingSize=CM.Env["PingSize"] CM.log("Starting %d byte flood pings" % self.PingSize) self.PingPids=[] for node in CM.Env["nodes"]: self.PingPids.append(self._pingchild(node)) CM.log("Ping PIDs: " + repr(self.PingPids)) return 1 def TearDown(self, CM): '''Stop it right now! My ears are pinging!!''' for pid in self.PingPids: if pid != None: CM.log("Stopping ping process %d" % pid) os.kill(pid, signal.SIGKILL) def _pingchild(self, node): Args = ["ping", "-qfn", "-s", str(self.PingSize), node] sys.stdin.flush() sys.stdout.flush() sys.stderr.flush() pid = os.fork() if pid < 0: self.Env.log("Cannot fork ping child") return None if pid > 0: return pid # Otherwise, we're the child process. os.execvp("ping", Args) self.Env.log("Cannot execvp ping: " + repr(Args)) sys.exit(1) class PacketLoss(ScenarioComponent): ( ''' It would be useful to do some testing of CTS with a modest amount of packet loss enabled - so we could see that everything runs like it should with a certain amount of packet loss present. ''') def IsApplicable(self): '''always Applicable''' return 1 def SetUp(self, CM): '''Reduce the reliability of communications''' if float(CM.Env["XmitLoss"]) == 0 and float(CM.Env["RecvLoss"]) == 0 : return 1 for node in CM.Env["nodes"]: CM.reducecomm_node(node) CM.log("Reduce the reliability of communications") return 1 def TearDown(self, CM): '''Fix the reliability of communications''' if float(CM.Env["XmitLoss"]) == 0 and float(CM.Env["RecvLoss"]) == 0 : return 1 for node in CM.Env["nodes"]: CM.unisolate_node(node) CM.log("Fix the reliability of communications") class BasicSanityCheck(ScenarioComponent): ( ''' ''') def IsApplicable(self): return self.Env["DoBSC"] def SetUp(self, CM): CM.prepare() # Clear out the cobwebs self.TearDown(CM) # Now start the Cluster Manager on all the nodes. CM.log("Starting Cluster Manager on BSC node(s).") return CM.startall() def TearDown(self, CM): CM.log("Stopping Cluster Manager on BSC node(s).") return CM.stopall() class RollingUpgrade(ScenarioComponent): ( ''' Test a rolling upgrade between two versions of the stack ''') def __init__(self, Env): self.Env = Env def IsApplicable(self): if not self.Env["rpm-dir"]: return None if not self.Env["current-version"]: return None if not self.Env["previous-version"]: return None return 1 def install(self, node, version): target_dir = "/tmp/rpm-%s" % version src_dir = "%s/%s" % (self.CM.Env["rpm-dir"], version) rc = self.CM.rsh(node, "mkdir -p %s" % target_dir) rc = self.CM.cp("%s/*.rpm %s:%s" % (src_dir, node, target_dir)) rc = self.CM.rsh(node, "rpm -Uvh --force %s/*.rpm" % (target_dir)) return self.success() def upgrade(self, node): return self.install(node, self.CM.Env["current-version"]) def downgrade(self, node): return self.install(node, self.CM.Env["previous-version"]) def SetUp(self, CM): CM.prepare() # Clear out the cobwebs CM.stopall() CM.log("Downgrading all nodes to %s." % self.Env["previous-version"]) for node in self.Env["nodes"]: if not self.downgrade(node): CM.log("Couldn't downgrade %s" % node) return None return 1 def TearDown(self, CM): # Stop everything CM.log("Stopping Cluster Manager on Upgrade nodes.") CM.stopall() CM.log("Upgrading all nodes to %s." % self.Env["current-version"]) for node in self.Env["nodes"]: if not self.upgrade(node): CM.log("Couldn't upgrade %s" % node) return None return 1