diff --git a/cts/CM_hb.py.in b/cts/CM_hb.py.in index 41682d4762..355a1463f3 100755 --- a/cts/CM_hb.py.in +++ b/cts/CM_hb.py.in @@ -1,649 +1,649 @@ #!@PYTHON@ '''CTS: Cluster Testing System: heartbeat dependent modules... Classes related to testing high-availability clusters... Lots of things are implemented. Lots of things are not implemented. We have many more ideas of what to do than we've implemented. ''' __copyright__=''' -Copyright (C) 2000,2001 Alan Robertson +Copyright (C) 2000,2001,2005 Alan Robertson Licensed under the GNU GPL. ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. from CTS import * class HeartbeatCM(ClusterManager): ''' The heartbeat cluster manager class. It implements the things we need to talk to and manipulate heartbeat clusters ''' def __init__(self, Environment, randseed=None): self.ResourceDirs = ["@sysconfdir@/ha.d/resource.d", "@sysconfdir@/rc.d/init.d", "@sysconfdir@/rc.d/"] self.ResourceFile = Environment["HAdir"] + "/haresources" self.ConfigFile = Environment["HAdir"]+ "/ha.cf" ClusterManager.__init__(self, Environment, randseed=randseed) self.update({ "Name" : "heartbeat", "DeadTime" : 30, "StableTime" : 30, "StartCmd" : "@libdir@/heartbeat/ha_logd -d >/dev/null 2>&1; MALLOC_CHECK_=2 @libdir@/heartbeat/heartbeat >/dev/null 2>&1", "StopCmd" : "@libdir@/heartbeat/heartbeat -k", "StatusCmd" : "@libdir@/heartbeat/heartbeat -s", "RereadCmd" : "@libdir@/heartbeat/heartbeat -r", "StartDRBDCmd" : "@sysconfdir@/init.d/drbd start >/dev/null 2>&1", "StopDRBDCmd" : "@sysconfdir@/init.d/drbd stop", "StatusDRBDCmd" : "@sysconfdir@/init.d/drbd status", "DRBDCheckconf" : "@sysconfdir@/init.d/drbd checkconfig >/var/run/drbdconf 2>&1", "BreakCommCmd" : "@libdir@/heartbeat/TestHeartbeatComm break-communication >/dev/null 2>&1", "FixCommCmd" : "@libdir@/heartbeat/TestHeartbeatComm fix-communication >/dev/null 2>&1", "DelFileCommCmd" : "/usr/lib/heartbeat/TestHeartbeatComm delete-testingfile >/dev/null 2>&1", "SaveFileCmd" : "/usr/lib/heartbeat/TestHeartbeatComm save-testingfile /tmp/OnlyForTesting >/dev/null 2>&1", "ReduceCommCmd" : "/usr/lib/heartbeat/TestHeartbeatComm reduce-communication %s %s>/dev/null 2>&1", "RestoreCommCmd" : "/usr/lib/heartbeat/TestHeartbeatComm restore-communication /tmp/OnlyForTesting >/dev/null 2>&1", "IPaddrCmd" : "@sysconfdir@/ha.d/resource.d/IPaddr %s status", "Standby" : "@libdir@/heartbeat/hb_standby >/dev/null 2>&1", "TestConfigDir" : "@sysconfdir@/ha.d/testconfigs", "LogFileName" : Environment["LogFileName"], # Patterns to look for in the log files for various occasions... 
"Pat:We_started" : " (%s) .* Initial resource acquisition complete", "Pat:They_started" : " (%s) .* Initial resource acquisition complete", "Pat:We_stopped" : "%s heartbeat.*Heartbeat shutdown complete", "Pat:Logd_stopped" : "%s logd:.*Exiting write process", "Pat:They_stopped" : "%s heartbeat.*node (%s).*: is dead", "Pat:They_dead" : "node (%s).*: is dead", "Pat:All_stopped" : " (%s).*heartbeat.*Heartbeat shutdown complete", "Pat:StandbyOK" : "Standby resource acquisition done", "Pat:StandbyNONE" : "No reply to standby request", "Pat:StandbyTRANSIENT" : "standby message.*ignored.*in flux", "Pat:Return_partition" : "Cluster node %s returning after partition", # Bad news Regexes. Should never occur. "BadRegexes" : ( r"Shutting down\.", r"Forcing shutdown\.", r"Both machines own .* resources!", r"No one owns .* resources!", r", exiting\.", r"ERROR:", r"CRIT.*:", ), }) self.cf=HBConfig(Environment["HAdir"]) self._finalConditions() def SetClusterConfig(self, configpath="default", nodelist=None): '''Activate the named test configuration throughout the cluster. This code is specialized to heartbeat. ''' rc=1 Command=''' cd %s%s%s; : cd to test configuration directory for j in * do if [ -f "@sysconfdir@/ha.d/$j" ]; then if cmp $j @sysconfdir@/ha.d/$j >/dev/null 2>&1; then : Config file $j is already up to correct. else echo "Touching $j" cp $j @sysconfdir@/ha.d/$j fi fi done ''' % (self["TestConfigDir"], os.sep, configpath) if nodelist == None: nodelist=self.Env["nodes"] for node in nodelist: if not self.rsh(node, Command): rc=None self.rereadall() return rc def ResourceGroups(self): ''' Return the list of resources groups defined in this configuration. This code is specialized to heartbeat. We make the assumption that the resource file on the local machine is the same as that of a cluster member. We aren't necessarily a member of the cluster (In fact, we usually aren't). ''' RscGroups=[] file = open(self.ResourceFile, "r") while (1): line = file.readline() if line == "": break idx=string.find(line, '#') if idx >= 0: line=line[:idx] if line == "": continue line = string.strip(line) # Is this wrong? tokens = re.split("[ \t]+", line) # Ignore the default server for this resource group del tokens[0] Group=[] for token in tokens: if token != "": idx=string.find(token, "::") if idx > 0: tuple=string.split(token, "::") else: # # Is this an IPaddr default resource type? # if re.match("^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$" , token): tuple=["IPaddr", token] else: tuple = [token, None] Resource = self.hbResource(tuple[0], tuple[1]) Group.append(Resource) RscGroups.append(Group) file.close() return RscGroups def InternalCommConfig(self): ''' Return a list of communication paths. Each path consists of a tuple like this: mediatype serial | ip interface/dev name eth0 | /dev/ttyS0... protocol tcp?? 
| udp | None port Number | None ''' Path = {"mediatype" : None, "interface": None, "protocol" : None, "port": None} cf = self.cf for cfp in cf.Parameters: if cfp == "serial": if Path["mediatype"] == None: Path["mediatype"] = ["serial"] else: Path["mediatype"].append("serial") if Path["interface"] == None: Path["interface"] = cf.Parameters["serial"] else: for serial in cf.Parameters["serial"]: Path["interface"].append(serial) if cfp == "bcast" or cfp == "mcast" or cfp == "ucast" : if Path["mediatype"] == None: Path["mediatype"] = ["ip"] else: Path["mediatype"].append("ip") if cfp == "bcast": interfaces = cf.Parameters[cfp] if cfp == "ucast": interfaces = [cf.Parameters[cfp][0]] if cfp == "mcast": Path["port"] = [cf.Parameters[cfp][0][2]] Path["protocol"] = "udp" interfaces = [cf.Parameters[cfp][0][0]] if Path["interface"] == None: Path["interface"] = interfaces else: for interface in interfaces: if interface not in Path["interface"]: Path["interface"].append(interface) if cfp == "udpport": Path["port"] = cf.Parameters["udpport"] Path["protocol"] = ["udp"] if Path["port"] == None: Path["port"] = [694] return Path def HasQuorum(self, node_list): ( '''Return TRUE if the cluster currently has quorum. According to current heartbeat code this means one node is up. ''') return self.upcount() >= 1 def hbResource(self, type, instance): ''' Our job is to create the right kind of resource. For most resources, we just create an HBResource object, but for IP addresses, we create an HBipResource instead. Some other types of resources may also be added as special cases. ''' if type == "IPaddr": return HBipResource(self, type, instance) return HBResource(self, type, instance) class HBResource(Resource): def IsRunningOn(self, nodename): ''' This member function returns true if our resource is running on the given node in the cluster. We call the status operation for the resource script. ''' return self._ResourceOperation("status", "OK|running", nodename) def _ResourceOperation(self, operation, pattern, nodename): ''' We call the requested operation for the resource script. We don't particularly care what kind of operation we were called to do. When we were created, we were bound to a cluster manager, which has its own remote execution method (which we use here). ''' if self.Instance == None: instance = "" else: instance = self.Instance Rlist = 'LIST="' for dir in self.CM.ResourceDirs: Rlist = Rlist + " " + dir Rlist = Rlist + '"; ' Script= Rlist + ''' T="''' + self.ResourceType + '''"; I="''' + instance + '''"; for dir in $LIST; do if [ -f "$dir/$T" -a -x "$dir/$T" ] then "$dir/$T" $I ''' + operation + ''' exit $? fi done 2>&1; exit 1;''' #print "Running " + Script + "\n" line = self.CM.rsh.readaline(nodename, Script) if operation == "status": if re.search(pattern, line): return 1 return self.CM.rsh.lastrc == 0 def Start(self, nodename): ''' This member function starts or activates the resource. ''' return self._ResourceOperation("start", None, nodename) def Stop(self, nodename): ''' This member function stops or deactivates the resource. ''' return self._ResourceOperation("stop", None, nodename) # def IsWorkingCorrectly(self, nodename): # "We default to returning TRUE for this one..."
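To make `_ResourceOperation` concrete: it assembles a one-shot shell script that walks the configured resource directories, runs the first matching resource script it finds, and propagates the exit code. Below is a minimal sketch of that assembly outside the class, with hypothetical directory paths and an invented IPaddr status probe; it only prints the script that would be handed to the cluster manager's remote-execution method.

```python
# Sketch only: mirrors how _ResourceOperation composes its remote script.
# The directories and the IPaddr example are hypothetical values.
ResourceDirs = ["/etc/ha.d/resource.d", "/etc/rc.d/init.d"]  # assumed paths
resource_type = "IPaddr"
instance = "10.0.0.1"
operation = "status"

Rlist = 'LIST="'
for dir in ResourceDirs:
    Rlist = Rlist + " " + dir
Rlist = Rlist + '"; '

script = Rlist + '''
T="''' + resource_type + '''";
I="''' + instance + '''";
for dir in $LIST; do
  if [ -f "$dir/$T" -a -x "$dir/$T" ]
  then
    "$dir/$T" $I ''' + operation + '''
    exit $?
  fi
done 2>&1; exit 1;'''

print script   # the one-shot script passed to CM.rsh.readaline()
```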
# if self.Instance == None: # self.CM.log("Faking out: " + self.ResourceType) # else: # self.CM.log("Faking out: " + self.ResourceType + self.Instance) # return 1 def IsWorkingCorrectly(self, nodename): return self._ResourceOperation("monitor", "OK", nodename) class HBipResource(HBResource): ''' We are a specialized IP address resource which knows how to test to see if our resource type is actually being served. We cheat and run the IPaddr resource script on the current machine, because it's a more interesting case. ''' def IsWorkingCorrectly(self, nodename): return self._ResourceOperation("monitor", "OK", self.CM.OurNode) # # A heartbeat configuration class... # It reads and parses the heartbeat config # files # class HBConfig: # Which options have multiple words on the line? MultiTokenKeywords = {"mcast" : None , "stonith_host": None} def __init__(self, configdir="/etc/ha.d"): self.Parameters = {} self.ResourceGroups = {} self._ReadConfig(os.path.join(configdir, "ha.cf")) FirstUp_NodeSelection() LastUp_NodeSelection() no_failback = NoAutoFailbackPolicy() auto_failback = AutoFailbackPolicy() # # We allow each resource group to have its own failover/back # policies # if self.Parameters.has_key("nice_failback") \ and self.Parameters["nice_failback"] == "on": HBConfig.DefaultFailbackPolicy = no_failback elif self.Parameters.has_key("auto_failback") \ and self.Parameters["auto_failback"] == "off": HBConfig.DefaultFailbackPolicy = no_failback else: HBConfig.DefaultFailbackPolicy = auto_failback HBConfig.DefaultNodeSelectionPolicy = NodeSelectionPolicies["FirstUp"] self._ReadResourceGroups(os.path.join(configdir, "haresources")) # Read ha.cf config file def _ReadConfig(self, ConfigFile): self.ConfigPath = ConfigFile; fp = open(ConfigFile) while 1: line=fp.readline() if not line: return line = re.sub("#.*", "", line) line = string.rstrip(line) if len(line) < 1: continue tokens = line.split() key = tokens[0] values = tokens[1:] if HBConfig.MultiTokenKeywords.has_key(key): # group items from this line together, and separate # from the items on other lines values = [values] if self.Parameters.has_key(key): if key == "node": self.Parameters[key].extend(values) else: self.Parameters[key].append(values[0]) else: self.Parameters[key] = values # Read a line from the haresources file... # - allow for \ continuations... def _GetRscLine(self, fp): linesofar = None continuation=1 while continuation: continuation = 0 line=fp.readline() if not line: break line = re.sub("#.*", "", line) if line[len(line)-2] == "\\": line = line[0:len(line)-2] + "\n" continuation=1 if linesofar == None: linesofar = line else: linesofar = linesofar + line return linesofar def _ReadResourceGroups(self, RscFile): self.RscPath = RscFile; fp = open(RscFile) thisline = "" while 1: line=self._GetRscLine(fp) if not line: return line = line.strip() if len(line) < 1: continue tokens = line.split() node = tokens[0] resources = tokens[1:] rscargs=[] for resource in resources: name=resource.split("::", 1) if len(name) > 1: args=name[1].split("::") else: args=None name = name[0] rscargs.append(Resource(name, args)) name = tokens[0] + "__" + tokens[1] assert not self.ResourceGroups.has_key(name) # # Create the resource group # self.ResourceGroups[name] = ResourceGroup(name \ , rscargs , node.split(",") # Provide default value , HBConfig.DefaultNodeSelectionPolicy , HBConfig.DefaultFailbackPolicy) # # Return the list of nodes in the cluster...
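Three parsing rules in `_ReadConfig` above are easy to miss: `mcast`/`stonith_host` lines keep their tokens grouped per line, repeated `node` lines accumulate, and any other repeated keyword contributes only the first value of each extra line. Here is a standalone sketch of the same rules against an invented ha.cf fragment (the parameter values are fabricated for illustration):

```python
import re, string

MultiTokenKeywords = {"mcast": None, "stonith_host": None}

def parse_ha_cf(lines):
    # Same accumulation rules as HBConfig._ReadConfig, minus the file I/O.
    Parameters = {}
    for line in lines:
        line = string.rstrip(re.sub("#.*", "", line))
        if len(line) < 1:
            continue
        tokens = line.split()
        key, values = tokens[0], tokens[1:]
        if MultiTokenKeywords.has_key(key):
            values = [values]            # keep this line's tokens together
        if Parameters.has_key(key):
            if key == "node":
                Parameters[key].extend(values)     # node lists accumulate
            else:
                Parameters[key].append(values[0])  # others: first value only
        else:
            Parameters[key] = values
    return Parameters

# Invented sample configuration:
sample = ["node alpha beta", "node gamma", "udpport 694",
          "mcast eth0 225.0.0.1 694 1 0"]
print parse_ha_cf(sample)
# node -> ['alpha', 'beta', 'gamma'], udpport -> ['694'],
# mcast -> [['eth0', '225.0.0.1', '694', '1', '0']]
```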
# def nodes(self): result = self.Parameters["node"] result.sort() return result class ClusterState: pass class ResourceGroup: def __init__(self, name, resourcelist, possiblenodes , nodeselection_policy, failback_policy): self.name = name self.resourcelist = resourcelist self.possiblenodes = possiblenodes self.nodeselection_policy = nodeselection_policy self.failback_policy = failback_policy self.state = None self.attributes = {} self.history = [] def __str__(self): result = string.join(self.possiblenodes, ",") for rsc in self.resourcelist: result = result + " " + str(rsc) return result class Resource: def __init__(self, name, arguments=None): self.name = name self.arguments = arguments def __str__(self): result = self.name try: for arg in self.arguments: result = result + "::" + arg except TypeError: pass return result ####################################################################### # # Base class defining policies for where we put resources # when we're starting, or when a failure has occurred... # ####################################################################### NodeSelectionPolicies = {} class NodeSelectionPolicy: def __init__(self, name): self.name = name NodeSelectionPolicies[name] = self def name(self): return self.name # # nodenames: the list of nodes eligible to run this resource # ResourceGroup: the group to be started... # ClusterState: Cluster state information # def SelectNode(self, nodenames, ResourceGroup, ClusterState): return None # # Choose the first node in the list... # class FirstUp_NodeSelection(NodeSelectionPolicy): def __init__(self): NodeSelectionPolicy.__init__(self, "FirstUp") def SelectNode(self, nodenames, ResourceGroup, ClusterState): return nodenames[0] # # Choose the last node in the list... # (kind of a dumb example) # class LastUp_NodeSelection(NodeSelectionPolicy): def __init__(self): NodeSelectionPolicy.__init__(self, "LastUp") def SelectNode(self, nodenames, ResourceGroup, ClusterState): return nodenames[len(nodenames)-1] ####################################################################### # # Failback policies... # # Where to locate a resource group when an eligible node rejoins # the cluster... # ####################################################################### FailbackPolicies = {} class FailbackPolicy: def __init__(self, name): self.name = name FailbackPolicies[name] = self def name(self): return self.name # # currentnode: The node the service is currently on # returningnode: The node which just rejoined # eligiblenodes: Permitted nodes which are up # SelectionPolicy: the normal NodeSelectionPolicy # Cluster state information... # def SelectNewNode(self, currentnode, returningnode, eligiblenodes , SelectionPolicy, ResourceGroup, ClusterState): return None # # This FailbackPolicy is like "normal failback" in heartbeat # class AutoFailbackPolicy(FailbackPolicy): def __init__(self): FailbackPolicy.__init__(self, "failback") def SelectNewNode(self, currentnode, returningnode, eligiblenodes , SelectionPolicy, ResourceGroup, ClusterState): # Select where it should run based on current normal policy # just as though we were starting it for the first time. return SelectionPolicy(eligiblenodes, ResourceGroup, ClusterState) # # This FailbackPolicy is like "nice failback" in heartbeat # class NoAutoFailbackPolicy(FailbackPolicy): def __init__(self): FailbackPolicy.__init__(self, "failuresonly") def SelectNewNode(self, currentnode, returningnode, eligiblenodes , SelectionPolicy, ResourceGroup, ClusterState): # Always leave the resource where it is...
return currentnode ####################################################################### # # A little test code... # # Which you are advised to completely ignore... # ####################################################################### if __name__ == '__main__': FirstUp_NodeSelection() LastUp_NodeSelection() no_failback = NoAutoFailbackPolicy() auto_failback = AutoFailbackPolicy() cf=HBConfig("/etc/ha.d") print "Cluster configuration:\n" print "Nodes:", cf.nodes(), "\n" print "Config Parameters:", cf.Parameters, "\n" for groupname in cf.ResourceGroups.keys(): print "Resource Group %s:\n\t%s\n" % (groupname, cf.ResourceGroups[groupname]) diff --git a/cts/CTSaudits.py.in b/cts/CTSaudits.py.in index bc98a139ef..1b182e87ea 100755 --- a/cts/CTSaudits.py.in +++ b/cts/CTSaudits.py.in @@ -1,553 +1,553 @@ #!@PYTHON@ '''CTS: Cluster Testing System: Audit module ''' __copyright__=''' -Copyright (C) 2000, 2001 Alan Robertson +Copyright (C) 2000, 2001,2005 Alan Robertson Licensed under the GNU GPL. ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. import time, os, popen2, string import CTS import os import popen2 class ClusterAudit: def __init__(self, cm): self.CM = cm def __call__(self): raise ValueError("Abstract Class member (__call__)") def is_applicable(self): '''Return TRUE if we are applicable in the current test configuration''' raise ValueError("Abstract Class member (is_applicable)") return 1 def name(self): raise ValueError("Abstract Class member (name)") AllAuditClasses = [ ] class ResourceAudit(ClusterAudit): def name(self): return "ResourceAudit" def _doauditRsc(self, resource): ResourceNodes = [] for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["up"]: if resource.IsRunningOn(node): ResourceNodes.append(node) return ResourceNodes def _doaudit(self): '''Check to see if all resources are running in exactly one place in the cluster. We also verify that the members of a resource group are all running on the same node in the cluster, and we monitor that they are all running "properly". ''' Fatal = 0 result = [] # Thought: use self.CM.find_partitions() and make this audit # aware of partitions. Since in a split cluster one # partition may have quorum (and permission to run resources) # and the other not. Groups = self.CM.ResourceGroups() for group in Groups: GrpServedBy = None lastResource = None for resource in group: # # _doauditRsc returns the set of nodes serving # the given resource. This is normally a single node. # ResourceNodes = self._doauditRsc(resource) # Is the resource served without quorum present? if not self.CM.HasQuorum(None) and len(ResourceNodes) != 0 and resource.needs_quorum: result.append("Resource " + repr(resource) + " active without Quorum: " + repr(ResourceNodes)) # Is the resource served at all? 
elif len(ResourceNodes) == 0 and self.CM.HasQuorum(None): result.append("Resource " + repr(resource) + " not served anywhere.") # Is the resource served too many times? elif len(ResourceNodes) > 1: result.append("Resource " + repr(resource) + " served too many times: " + repr(ResourceNodes)) self.CM.log("Resource " + repr(resource) + " served too many times: " + repr(ResourceNodes)) Fatal = 1 elif GrpServedBy == None: GrpServedBy = ResourceNodes # Are all the members of the Rsc Grp served by the same node? elif GrpServedBy != ResourceNodes: result.append("Resource group resource " + repr(resource) + " running on different nodes: " + repr(ResourceNodes)+" vs "+repr(GrpServedBy) + " (otherRsc = " + repr(lastResource) + ")") self.CM.log("Resource group resource " + repr(resource) + " running on different nodes: " + repr(ResourceNodes)+" vs "+repr(GrpServedBy) + " (otherRsc = " + repr(lastResource) + ")") Fatal = 1 if self.CM.Env.has_key("SuppressMonitoring") and \ self.CM.Env["SuppressMonitoring"]: continue # Is the resource working correctly ? if not Fatal and len(ResourceNodes) == 1: beforearpchild = popen2.Popen3("date;/sbin/arp -n|cut -c1-15,26-50,75-" , None) beforearpchild.tochild.close() # /dev/null if not resource.IsWorkingCorrectly(ResourceNodes[0]): afterarpchild = popen2.Popen3("/sbin/arp -n|cut -c1-15,26-50,75-" , None) afterarpchild.tochild.close() # /dev/null result.append("Resource " + repr(resource) + " not operating properly." + " Resource is running on " + ResourceNodes[0]); Fatal = 1 self.CM.log("ARP table before failure ========"); for line in beforearpchild.fromchild.readlines(): self.CM.log(line) self.CM.log("ARP table after failure ========"); for line in afterarpchild.fromchild.readlines(): self.CM.log(line) self.CM.log("End of ARP tables ========"); try: beforearpchild.wait() afterarpchild.wait() except OSError: pass afterarpchild.fromchild.close() beforearpchild.fromchild.close() lastResource = resource if (Fatal): result.insert(0, "FATAL") # Kludgy. return result def __call__(self): # # Audit the resources. Since heartbeat doesn't really # know when resource acquisition is complete, we will # poll until things get stable. # # Having a resource active in more than one place is a Fatal Error # with no tolerance granted. # audresult = self._doaudit() # # Probably the constant below should be a CM parameter. # Then it could be 0 for FailSafe. # Of course, it really depends on what resources # you have in the test suite, and how long it takes # for them to settle. # Recently, we've changed heartbeat so we know better when # resource acquisition is done.
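The comment block above introduces a bounded poll: re-run the audit until it comes back clean or FATAL, or until the attempts run out. The `audcount` loop that follows implements exactly that; the same pattern as a free-standing sketch, with `doaudit` standing in for `_doaudit`:

```python
import time

def poll_audit(doaudit, attempts=5, delay=1):
    # Re-run the audit until it is clean, fatal, or out of attempts.
    # 'doaudit' returns a (possibly empty) list of complaints, with
    # "FATAL" inserted at the front for unrecoverable failures.
    result = doaudit()
    while attempts > 0:
        result = doaudit()
        if len(result) <= 0 or result[0] == "FATAL":
            attempts = 0
        else:
            attempts = attempts - 1
            if attempts > 0:
                time.sleep(delay)
    return result
```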
# audcount=5; while(audcount > 0): audresult = self._doaudit() if (len(audresult) <= 0 or audresult[0] == "FATAL"): audcount=0 else: audcount = audcount - 1 if (audcount > 0): time.sleep(1) if (len(audresult) > 0): self.CM.log("Fatal Audit error: " + repr(audresult)) return (len(audresult) == 0) def is_applicable(self): if self.CM["Name"] == "heartbeat": return 1 return 0 class HAResourceAudit(ClusterAudit): def __init__(self, cm): self.CM = cm def _RscRunningNodes(self, resource): ResourceNodes = [] for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["up"]: if resource.IsRunningOn(node): ResourceNodes.append(node) return ResourceNodes def __call__(self): passed = 1 NodeofRsc = {} NumofInc = {} MaxofInc = {} self.CM.debug("Do Audit HAResourceAudit") #Calculate the count of active nodes up_count = 0; for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["up"]: up_count += 1 #Make sure the resources are running on one and only one node Resources = self.CM.Resources() for resource in Resources : RunningNodes = self._RscRunningNodes(resource) NodeofRsc[resource.rid]=RunningNodes if resource.inc_name == None: #Is the resource served without quorum present? if not self.CM.HasQuorum(None) and len(RunningNodes) != 0 and resource.needs_quorum: self.CM.log("Resource " + repr(resource) + " active without Quorum: " + repr(RunningNodes)) passed = 0 #Is the resource served at all? elif len(RunningNodes) == 0 and self.CM.HasQuorum(None): self.CM.log("Resource " + repr(resource) + " not served anywhere.") passed = 0 # Is the resource served too many times? elif len(RunningNodes) > 1: self.CM.log("Resource " + repr(resource) + " served too many times: " + repr(RunningNodes)) passed = 0 else: if not NumofInc.has_key(resource.inc_name): NumofInc[resource.inc_name]=0 MaxofInc[resource.inc_name]=resource.inc_max running = 1 #Is the resource served without quorum present? if not self.CM.HasQuorum(None) and len(RunningNodes) != 0 and resource.needs_quorum == 1: self.CM.log("Resource " + repr(resource) + " active without Quorum: " + repr(RunningNodes)) passed = 0 #Is the resource served at all? elif len(RunningNodes) == 0 : running = 0 # Is the resource served too many times?
elif len(RunningNodes) > 1: self.CM.log("Resource " + repr(resource) + " served too many times: " + repr(RunningNodes)) passed = 0 if running: NumofInc[resource.inc_name] += 1 if self.CM.HasQuorum(None): for inc_name in NumofInc.keys(): if NumofInc[inc_name] != min(up_count,MaxofInc[inc_name]): passed = 0 self.CM.log("Incarnation %s has %d instances (max %d instances).\ Now %d nodes are up"%(str(inc_name),NumofInc[inc_name], \ MaxofInc[inc_name],up_count)) #Make sure the resources with "must","placement" constraints are running on the same node Dependencies = self.CM.Dependencies() for dependency in Dependencies: if dependency["type"] == "placement" and dependency["strength"] == "must": if NodeofRsc[dependency["from"]] != NodeofRsc[dependency["to"]]: print dependency["from"] + " and " + dependency["to"] + " should run on the same node" passed = 0 return passed def is_applicable(self): if self.CM["Name"] == "linux-ha-v2" and self.CM.Env["ResCanStop"] == 0: return 1 return 0 def name(self): return "HAResourceAudit" class CrmdStateAudit(ClusterAudit): def __init__(self, cm): self.CM = cm self.Stats = {"calls":0 , "success":0 , "failure":0 , "skipped":0 , "auditfail":0} def has_key(self, key): return self.Stats.has_key(key) def __setitem__(self, key, value): self.Stats[key] = value def __getitem__(self, key): return self.Stats[key] def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not self.Stats.has_key(name): self.Stats[name]=0 self.Stats[name] = self.Stats[name]+1 def __call__(self): passed = 1 up_are_down = 0 down_are_up = 0 unstable_list = [] self.CM.debug("Do Audit %s"%self.name()) for node in self.CM.Env["nodes"]: should_be = self.CM.ShouldBeStatus[node] rc = self.CM.StataCM(node) if rc: if should_be == self.CM["down"]: down_are_up = down_are_up + 1 if not self.CM.node_stable(node): unstable_list.append(node) elif should_be == self.CM["up"]: up_are_down = up_are_down + 1 if len(unstable_list) > 0: passed = 0 self.CM.log("Cluster is not stable: %d (of %d): %s" %(len(unstable_list), self.CM.upcount(), repr(unstable_list))) if up_are_down > 0: passed = 0 self.CM.log("%d (of %d) nodes expected to be up were down." %(up_are_down, len(self.CM.Env["nodes"]))) if down_are_up > 0: passed = 0 self.CM.log("%d (of %d) nodes expected to be down were up."
%(down_are_up, len(self.CM.Env["nodes"]))) return passed def name(self): return "CrmdStateAudit" def is_applicable(self): if self.CM["Name"] == "linux-ha-v2": return 1 return 0 class PartitionAudit(ClusterAudit): def __init__(self, cm): self.CM = cm self.Stats = {"calls":0 , "success":0 , "failure":0 , "skipped":0 , "auditfail":0} self.NodeEpoche={} self.NodeState={} self.NodeQuorum={} def has_key(self, key): return self.Stats.has_key(key) def __setitem__(self, key, value): self.Stats[key] = value def __getitem__(self, key): return self.Stats[key] def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not self.Stats.has_key(name): self.Stats[name]=0 self.Stats[name] = self.Stats[name]+1 def __call__(self): self.CM.debug("Do Audit %s"%self.name()) passed = 1 ccm_partitions = self.CM.find_partitions() if len(ccm_partitions) == 0: return 1 if len(ccm_partitions) > 1: self.CM.log("Warn: %d cluster partitions detected:" %len(ccm_partitions)) for partition in ccm_partitions: self.CM.log("\t %s" %partition) for partition in ccm_partitions: partition_passed = 0 if self.audit_partition(partition) == 0: passed = 0 return passed def trim_string(self, avalue): if not avalue: return None if len(avalue) > 1: return avalue[:-1] def trim2int(self, avalue): if not avalue: return None if len(avalue) > 1: return int(avalue[:-1]) def audit_partition(self, partition): passed = 0 dc_found = [] dc_allowed_list = [] lowest_epoche = None node_list = partition.split() self.CM.debug("Auditing partition: %s" %(partition)) for node in node_list: if self.CM.ShouldBeStatus[node] != self.CM["up"]: self.CM.log("Warn: Node %s appeared out of nowhere" %(node)) self.CM.ShouldBeStatus[node] = self.CM["up"] # not in itself a reason to fail the audit (not what we're # checking for in this audit) self.NodeState[node] = self.CM.rsh.readaline( node, self.CM["StatusCmd"]%node) self.NodeEpoche[node] = self.CM.rsh.readaline( node, self.CM["EpocheCmd"]) self.NodeQuorum[node] = self.CM.rsh.readaline( node, self.CM["QuorumCmd"]) self.NodeState[node] = self.trim_string(self.NodeState[node]) self.NodeEpoche[node] = self.trim2int(self.NodeEpoche[node]) self.NodeQuorum[node] = self.trim_string(self.NodeQuorum[node]) if lowest_epoche == None or self.NodeEpoche[node] < lowest_epoche: lowest_epoche = self.NodeEpoche[node] for node in node_list: if self.CM.ShouldBeStatus[node] == self.CM["up"]: if self.CM.is_node_dc(node, self.NodeState[node]): dc_found.append(node) if self.NodeEpoche[node] == lowest_epoche: passed = 1 elif not self.NodeEpoche[node]: self.CM.log("Can't determine epoche for DC %s" %(node)) passed = 0 else: self.CM.log("DC %s is not the oldest node (%d vs.
%d)" %(node, self.NodeEpoche[node], lowest_epoche)) passed = 0 if len(dc_found) == 0: self.CM.log("DC not found on any of the %d allowed nodes: %s (of %s)" %(len(dc_allowed_list), str(dc_allowed_list), str(node_list))) elif len(dc_found) > 1: self.CM.log("%d DCs (%s) found in cluster partition: %s" %(len(dc_found), str(dc_found), str(node_list))) passed = 0 if passed == 0: for node in node_list: if self.CM.ShouldBeStatus[node] == self.CM["up"]: self.CM.log("epoche %s : %s" %(self.NodeEpoche[node], self.NodeState[node])) if self.CM.Env["CIBResource"] == 1 and len(dc_found) > 0 and self.NodeQuorum[dc_found[0]]: if self.audit_dc_resources(node_list, dc_found) == 0: passed = 0 Resources = self.CM.Resources() for node in node_list: for resource in Resources: if resource.rid == "rsc_"+node: if resource.IsRunningOn(node) == 0: self.CM.log("Node %s is not running its own resource" %(node)) passed = 0 elif self.CM.Env["CIBResource"] == 1: # no quorum means no resource management self.CM.debug("Not auditing resources - no quorum") return passed def audit_dc_resources(self, node_list, dc_list): passed = 1 Resources = self.CM.Resources() for resource in Resources: if resource.rid == "DcIPaddr": self.CM.debug("Auditing resource: %s" %(resource)) # All DCs are running the resource for dc in dc_list: if self.NodeQuorum[dc]: if resource.IsRunningOn(dc) == 0: self.CM.log("Resource %s not running on DC: %s" %(resource, dc)) passed = 0 # All nodes running the resource are DCs for node in node_list: if resource.IsRunningOn(node): if self.CM.is_node_dc(node, self.NodeState[node]) == 0: self.CM.log("Resource %s is running on non-DC node %s" %("DcIPaddr", node)) passed = 0 return passed def name(self): return "PartitionAudit" def is_applicable(self): if self.CM["Name"] == "linux-ha-v2": return 1 return 0 AllAuditClasses.append(CrmdStateAudit) AllAuditClasses.append(PartitionAudit) AllAuditClasses.append(ResourceAudit) AllAuditClasses.append(HAResourceAudit) def AuditList(cm): result = [] for auditclass in AllAuditClasses: result.append(auditclass(cm)) return result diff --git a/cts/CTSlab.py.in b/cts/CTSlab.py.in index e652db79fd..031e6fa62a 100755 --- a/cts/CTSlab.py.in +++ b/cts/CTSlab.py.in @@ -1,716 +1,716 @@ #!@PYTHON@ '''CTS: Cluster Testing System: Lab environment module ''' __copyright__=''' -Copyright (C) 2001 Alan Robertson +Copyright (C) 2001,2005 Alan Robertson Licensed under the GNU GPL. ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
from UserDict import UserDict import sys, time, types, syslog, whrandom, os, struct, string from CTS import ClusterManager from CM_hb import HeartbeatCM from CTStests import BSC_AddResource from socket import gethostbyname_ex class ResetMechanism: def reset(self, node): raise ValueError("Abstract class member (reset)") class Stonith(ResetMechanism): def __init__(self, sttype="ssh", parm="foobar" , path="@sbindir@/stonith"): self.pathname=path self.configstring=parm self.stonithtype=sttype def reset(self, node): cmdstring = "%s -t '%s' -p '%s' '%s' 2>/dev/null" % (self.pathname , self.stonithtype, self.configstring, node) return (os.system(cmdstring) == 0) class Stonithd(ResetMechanism): def __init__(self, nodes, sttype = 'ssh'): self.sttype = sttype self.nodes = nodes self.query_cmd_pat = '/usr/lib/heartbeat/stonithdtest/apitest 0 %s 4000 0' self.reset_cmd_pat = '/usr/lib/heartbeat/stonithdtest/apitest 1 %s 4000 0' self.poweron_cmd_pat = '/usr/lib/heartbeat/stonithdtest/apitest 2 %s 4000 0' self.poweroff_cmd_pat= '/usr/lib/heartbeat/stonithdtest/apitest 3 %s 4000 0' self.lrmd_add_pat = '/usr/lib/heartbeat/lrmadmin -A %s stonith ' + sttype + ' NULL hostlist=%s' self.lrmd_start_pat = '/usr/lib/heartbeat/lrmadmin -E %s start 0 0 0' self.lrmd_stop_pat = '/usr/lib/heartbeat/lrmadmin -E %s stop 0 0 0' self.lrmd_del_pat = '/usr/lib/heartbeat/lrmadmin -D %s' self.rsc_id = 'my_stonithd_id' self.command = "/usr/bin/ssh -l root -n -x" self.command_noblock = "/usr/bin/ssh -f -l root -n -x" self.stonithd_started_nodes = [] self.fail_reason = '' def _remote_exec(self, node, cmnd): return (os.system("%s %s %s > /dev/null" % (self.command, node, cmnd)) == 0) def _remote_readlines(self, node, cmnd): f = os.popen("%s %s %s" % (self.command, node, cmnd)) return f.readlines() def _stonithd_started(self, node): return node in self.stonithd_started_nodes def _start_stonithd(self, node, hosts): hostlist = string.join(hosts, ',') lrmd_add_cmd = self.lrmd_add_pat % (self.rsc_id, hostlist) ret = self._remote_exec(node, lrmd_add_cmd) if not ret:return ret lrmd_start_cmd = self.lrmd_start_pat % self.rsc_id ret = self._remote_exec(node, lrmd_start_cmd) if not ret:return ret self.stonithd_started_nodes.append(node) return 1 def _stop_stonithd(self, node): lrmd_stop_cmd = self.lrmd_stop_pat % self.rsc_id ret = self._remote_exec(node, lrmd_stop_cmd) if not ret:return ret lrmd_del_cmd = self.lrmd_del_pat % self.rsc_id ret = self._remote_exec(node, lrmd_del_cmd) if not ret:return ret self.stonithd_started_nodes.remove(node) return 1 def _do_stonith(self, init_node, target_node, action): stonithd_started = self._stonithd_started(init_node) if not stonithd_started: ret = self._start_stonithd(init_node, [target_node]) if not ret: self.fail_reason = "failed to start stonithd on node %s" % init_node return ret command = "" if action == "RESET": command = self.reset_cmd_pat % target_node elif action == "POWEROFF": command = self.poweroff_cmd_pat % target_node elif action == "POWERON": command = self.poweron_cmd_pat % target_node else: self.fail_reason = "unknown operation type %s" % action return 0 lines = self._remote_readlines(init_node, command) result = "".join(lines) if not stonithd_started: self._stop_stonithd(init_node) index = result.find("result=0") if index == -1: self.fail_reason = "unexpected stonithd status: %s" % result return 0 return 1 # Should we randomly choose a node as init_node here if init_node is not specified?
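`Stonithd` above never speaks to stonithd directly; it expands `%s` templates into `lrmadmin` and `apitest` command lines and ships them over ssh. A sketch of what the templates expand to for an invented target node (`node1` is hypothetical; the paths and resource id are the ones hard-coded in the class):

```python
# Hypothetical expansion of the Stonithd command templates.
rsc_id = 'my_stonithd_id'
target = 'node1'                       # invented node name

lrmd_add = '/usr/lib/heartbeat/lrmadmin -A %s stonith ssh NULL hostlist=%s' \
           % (rsc_id, target)
lrmd_start = '/usr/lib/heartbeat/lrmadmin -E %s start 0 0 0' % rsc_id
reset_cmd = '/usr/lib/heartbeat/stonithdtest/apitest 1 %s 4000 0' % target

for cmd in (lrmd_add, lrmd_start, reset_cmd):
    print cmd      # each line is what _remote_exec() runs via ssh
```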
def reset(self, init_node, target_node): return self._do_stonith(init_node, target_node, "RESET") def poweron(self, init_node, target_node): return self._do_stonith(init_node, target_node, "POWERON") def poweroff(self, init_node, target_node): return self._do_stonith(init_node, target_node, "POWEROFF") class Logger: TimeFormat = "%b %d %H:%M:%S\t" def __call__(self, lines): raise ValueError("Abstract class member (__call__)") class SysLog(Logger): # http://docs.python.org/lib/module-syslog.html defaultsource="CTS" defaultfacility= syslog.LOG_LOCAL7 map = { "kernel": syslog.LOG_KERN, "user": syslog.LOG_USER, "mail": syslog.LOG_MAIL, "daemon": syslog.LOG_DAEMON, "auth": syslog.LOG_AUTH, "lpr": syslog.LOG_LPR, "news": syslog.LOG_NEWS, "uucp": syslog.LOG_UUCP, "cron": syslog.LOG_CRON, "local0": syslog.LOG_LOCAL0, "local1": syslog.LOG_LOCAL1, "local2": syslog.LOG_LOCAL2, "local3": syslog.LOG_LOCAL3, "local4": syslog.LOG_LOCAL4, "local5": syslog.LOG_LOCAL5, "local6": syslog.LOG_LOCAL6, "local7": syslog.LOG_LOCAL7, } def __init__(self, labinfo): if labinfo.has_key("syslogsource"): self.source=labinfo["syslogsource"] else: self.source=SysLog.defaultsource if labinfo.has_key("SyslogFacility"): self.facility=labinfo["SyslogFacility"] if SysLog.map.has_key(self.facility): self.facility=SysLog.map[self.facility] else: self.facility=SysLog.defaultfacility syslog.openlog(self.source, 0, self.facility) def setfacility(self, facility): self.facility = facility if SysLog.map.has_key(self.facility): self.facility=SysLog.map[self.facility] syslog.closelog() syslog.openlog(self.source, 0, self.facility) def __call__(self, lines): if isinstance(lines, types.StringType): syslog.syslog(lines) else: for line in lines: syslog.syslog(line) def name(self): return "Syslog" class StdErrLog(Logger): def __init__(self, labinfo): pass def __call__(self, lines): t = time.strftime(Logger.TimeFormat, time.localtime(time.time())) if isinstance(lines, types.StringType): sys.__stderr__.writelines([t, lines, "\n"]) else: for line in lines: sys.__stderr__.writelines([t, line, "\n"]) sys.__stderr__.flush() def name(self): return "StdErrLog" class FileLog(Logger): def __init__(self, labinfo, filename=None): if filename == None: filename=labinfo["LogFileName"] self.logfile=filename import os self.hostname = os.uname()[1]+" " self.source = "CTS: " def __call__(self, lines): fd = open(self.logfile, "a") t = time.strftime(Logger.TimeFormat, time.localtime(time.time())) if isinstance(lines, types.StringType): fd.writelines([t, self.hostname, self.source, lines, "\n"]) else: for line in lines: fd.writelines([t, self.hostname, self.source, line, "\n"]) fd.close() def name(self): return "FileLog" class CtsLab(UserDict): '''This class defines the Lab Environment for the Cluster Test System. It defines those things which are expected to change from test environment to test environment for the same cluster manager. It is where you define the set of nodes that are in your test lab, what kind of reset mechanism you use, etc. This class is derived from a UserDict because we hold many different parameters of different kinds, and this provides a uniform and extensible interface useful for any kind of communication between the user/administrator/tester and CTS. At this point in time, it is the intent of this class to model static configuration and/or environmental data about the environment which doesn't change as the tests proceed. Well-known names (keys) are an important concept in this class.
The HasMinimalKeys member function knows the minimal set of well-known names for the class. The following names are standard (well-known) at this time: nodes An array of the nodes in the cluster reset A ResetMechanism object logger An array of objects that log strings... CMclass The type of ClusterManager we are running (This is a class object, not a class instance) RandSeed Random seed. It is a triple of bytes. (optional) HAdir Base directory for HA installation The CTS code ignores names it doesn't know about/need. The individual tests have access to this information, and it is perfectly acceptable to provide hints, tweaks, fine-tuning directions or other information to the tests through this mechanism. ''' def __init__(self, nodes): self.data = {} self["nodes"] = nodes self.MinimalKeys=["nodes", "reset", "logger", "CMclass", "HAdir"] def HasMinimalKeys(self): 'Return TRUE if our object has the minimal set of keys/values in it' result = 1 for key in self.MinimalKeys: if not self.has_key(key): result = None return result def SupplyDefaults(self): if not self.has_key("logger"): self["logger"] = (SysLog(self), StdErrLog(self)) if not self.has_key("reset"): self["reset"] = Stonith() if not self.has_key("CMclass"): self["CMclass"] = HeartbeatCM if not self.has_key("HAdir"): self["HAdir"] = "@sysconfdir@/ha.d" if not self.has_key("LogFileName"): self["LogFileName"] = "/var/log/ha-log" # # Now set up our random number generator... # self.RandomGen = whrandom.whrandom() # Get a random seed for the random number generator. if self.has_key("RandSeed"): randseed = self["RandSeed"] else: f=open("/dev/urandom", "r") string=f.read(3) f.close() randseed=struct.unpack("BBB", string) self.log("Random seed is: " + str(randseed)) self.randseed=randseed self.RandomGen.seed(randseed[0], randseed[1], randseed[2]) def log(self, args): "Log using each of the supplied logging methods" for logfcn in self._logfunctions: logfcn(string.strip(args)) def debug(self, args): "Log using each of the supplied logging methods" for logfcn in self._logfunctions: if logfcn.name() != "StdErrLog": logfcn(string.strip(args)) def __setitem__(self, key, value): '''Since this function gets called whenever we modify the dictionary (object), we can (and do) validate those keys that we know how to validate. For the most part, we know how to validate the "MinimalKeys" elements. ''' # # List of nodes in the system # if key == "nodes": self.Nodes = {} for node in value: # I don't think I need the IP address, etc. but this validates # the node name against /etc/hosts and/or DNS, so it's a # GoodThing(tm). self.Nodes[node] = gethostbyname_ex(node) # # Reset Mechanism # elif key == "reset": if not issubclass(value.__class__, ResetMechanism): raise ValueError("'reset' Value must be a subclass" " of ResetMechanism") # # List of Logging Mechanism(s) # elif key == "logger": if len(value) < 1: raise ValueError("Must have at least one logging mechanism") for logger in value: if not callable(logger): raise ValueError("'logger' elements must be callable") self._logfunctions = value # # Cluster Manager Class # elif key == "CMclass": if not issubclass(value, ClusterManager): raise ValueError("'CMclass' must be a subclass of" " ClusterManager") # # Initial Random seed... 
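`SupplyDefaults` earlier in this class seeds the generator from three bytes of `/dev/urandom` unless a `RandSeed` triple was supplied, and logs the seed so any run can be reproduced by passing the same triple back in. The seeding in isolation, using the same Python 2 `whrandom` module this file imports:

```python
import struct, whrandom

f = open("/dev/urandom", "r")
three = f.read(3)
f.close()
randseed = struct.unpack("BBB", three)   # e.g. (156, 104, 218)

rng = whrandom.whrandom()
rng.seed(randseed[0], randseed[1], randseed[2])
print "Random seed is:", randseed        # log it so the run is repeatable
```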
# elif key == "RandSeed": if len(value) != 3: raise ValueError("'Randseed' must be a 3-element list/tuple") for elem in value: if not isinstance(elem, types.IntType): raise ValueError("'Randseed' list must all be ints") self.data[key] = value def IsValidNode(self, node): 'Return TRUE if the given node is valid' return self.Nodes.has_key(node) def __CheckNode(self, node): "Raise a ValueError if the given node isn't valid" if not self.IsValidNode(node): raise ValueError("Invalid node [%s] in CheckNode" % node) def RandomNode(self): '''Choose a random node from the cluster''' return self.RandomGen.choice(self["nodes"]) def ResetNode(self, node): "Reset a node, (normally) using a hardware mechanism" self.__CheckNode(node) return self["reset"].reset(node) def ResetNode2(self, init_node, target_node, reasons): self.__CheckNode(target_node) stonithd = Stonithd(self["nodes"]) ret = stonithd.reset(init_node, target_node) if not ret: reasons.append(stonithd.fail_reason) return ret def usage(arg): print "Illegal argument " + arg print "usage: " + sys.argv[0] \ + " --directory config-directory" \ + " -D config-directory" \ + " --logfile system-logfile-name" \ + " --trunc (truncate logfile before starting)" \ + " -L system-logfile-name" \ + " --limit-nodes maxnumnodes" \ + " --xmit-loss lost-rate(0.0-1.0)" \ + " --recv-loss lost-rate(0.0-1.0)" \ + " --suppressmonitoring" \ + " --syslog-facility syslog-facility" \ + " --facility syslog-facility" \ + " --choose testcase-name" \ + " --test-ip-base ip" \ + " (-2 |"\ + " -v2 |"\ + " --crm |"\ + " --classic)"\ + " --resource-can-stop" \ + " --stonith (1 | 0 | yes | no)" \ + " --standby (1 | 0 | yes | no)" \ + " --fencing (1 | 0 | yes | no)" \ + " --suppress_cib_writes (1 | 0 | yes | no)" \ + " [number-of-iterations]" sys.exit(1) # # A little test code... # if __name__ == '__main__': from CTSaudits import AuditList from CTStests import TestList,RandomTests from CTS import Scenario, InitClusterManager, PingFest, PacketLoss, BasicSanityCheck import CM_hb HAdir = "/etc/ha.d" LogFile = "/var/log/ha-log-local7" DoStonith = 1 DoStandby = 1 DoFencing = 0 NumIter = 500 SuppressMonitoring = None Version = 1 CIBfilename = None CIBResource = 0 ClobberCIB = 0 LimitNodes = 0 TestCase = None LogFacility = None TruncateLog = 0 ResCanStop = 0 XmitLoss = "0.0" RecvLoss = "0.0" IPBase = "127.0.0.10" SuppressCib = 1 DoBSC = 0 # # The values of the rest of the parameters are now properly derived from # the configuration files. # # Stonith is configurable because it's slow, I have a few machines which # don't reboot very reliably, and it can mild damage to your machine if # you're using a real power switch. # # Standby is configurable because the test is very heartbeat specific # and I haven't written the code to set it properly yet. Patches are # being accepted... # Process arguments... 
skipthis=None args=sys.argv[1:] for i in range(0, len(args)): if skipthis: skipthis=None continue elif args[i] == "-D" or args[i] == "--directory": skipthis=1 HAdir = args[i+1] elif args[i] == "-l" or args[i] == "--limit-nodes": skipthis=1 LimitNodes = int(args[i+1]) elif args[i] == "-r": CIBResource = 1 elif args[i] == "-L" or args[i] == "--logfile": skipthis=1 LogFile = args[i+1] elif args[i] == "--test-ip-base": skipthis=1 IPBase = args[i+1] elif args[i] == "--trunc": TruncateLog=1 elif args[i] == "-v2": Version=2 elif args[i] == "--stonith": skipthis=1 if args[i+1] == "1" or args[i+1] == "yes": DoStonith=1 elif args[i+1] == "0" or args[i+1] == "no": DoStonith=0 else: usage(args[i+1]) elif args[i] == "--suppress-cib-writes": skipthis=1 if args[i+1] == "1" or args[i+1] == "yes": SuppressCib=1 elif args[i+1] == "0" or args[i+1] == "no": SuppressCib=0 else: usage(args[i+1]) elif args[i] == "--bsc": skipthis=1 if args[i+1] == "1" or args[i+1] == "yes": DoBSC=1 elif args[i+1] == "0" or args[i+1] == "no": DoBSC=0 else: usage(args[i+1]) elif args[i] == "--standby": skipthis=1 if args[i+1] == "1" or args[i+1] == "yes": DoStandby=1 elif args[i+1] == "0" or args[i+1] == "no": DoStandby=0 else: usage(args[i+1]) elif args[i] == "--fencing": skipthis=1 if args[i+1] == "1" or args[i+1] == "yes": DoFencing=1 elif args[i+1] == "0" or args[i+1] == "no": DoFencing=0 else: usage(args[i+1]) elif args[i] == "--suppressmonitoring": SuppressMonitoring = 1 elif args[i] == "--resource-can-stop": ResCanStop = 1 elif args[i] == "-2" or args[i] == "--crm": Version = 2 elif args[i] == "-1" or args[i] == "--classic": Version = 1 elif args[i] == "--clobber-cib" or args[i] == "-c": ClobberCIB = 1 elif args[i] == "--cib-filename": skipthis=1 CIBfilename = args[i+1] elif args[i] == "--xmit-loss": try: float(args[i+1]) except ValueError: print ("--xmit-loss parameter should be float") usage(args[i+1]) skipthis=1 XmitLoss = args[i+1] elif args[i] == "--recv-loss": try: float(args[i+1]) except ValueError: print ("--recv-loss parameter should be float") usage(args[i+1]) skipthis=1 RecvLoss = args[i+1] elif args[i] == "--choose": skipthis=1 TestCase = args[i+1] elif args[i] == "--syslog-facility" or args[i] == "--facility": skipthis=1 LogFacility = args[i+1] else: NumIter=int(args[i]) # # This reading of HBconfig here is ugly, and I suppose ought to # be done by the Cluster manager. This would probably mean moving the # list of cluster nodes into the ClusterManager class. A good thought # for our Copious Spare Time in the future...
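The loop above parses options with a `skipthis` flag rather than getopt: any option that consumes a value sets `skipthis=1` so the next iteration swallows that value, and an unrecognized bare argument becomes the iteration count. The idiom reduced to a toy option table (the option names here are illustrative only):

```python
import sys

args = sys.argv[1:]
NumIter = 500
LogFile = None
skipthis = None
for i in range(0, len(args)):
    if skipthis:                 # previous option already consumed args[i]
        skipthis = None
        continue
    elif args[i] == "-L" or args[i] == "--logfile":
        skipthis = 1             # value-taking option: eat the next token
        LogFile = args[i+1]
    else:
        NumIter = int(args[i])   # bare argument = number of iterations
```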
# config = CM_hb.HBConfig(HAdir) node_list = config.Parameters["node"] if LogFacility == None: if config.Parameters.has_key("logfacility"): LogFacility = config.Parameters["logfacility"][0] else: LogFacility = "local7" if LimitNodes > 0: if len(node_list) > LimitNodes: print("Limiting the number of nodes configured=%d (max=%d)" %(len(node_list), LimitNodes)) while len(node_list) > LimitNodes: node_list.pop(len(node_list)-1) Environment = CtsLab(node_list) Environment["HAdir"] = HAdir Environment["ClobberCIB"] = ClobberCIB Environment["CIBfilename"] = CIBfilename Environment["CIBResource"] = CIBResource Environment["LogFileName"] = LogFile Environment["DoStonith"] = DoStonith Environment["SyslogFacility"] = LogFacility Environment["DoStandby"] = DoStandby Environment["DoFencing"] = DoFencing Environment["ResCanStop"] = ResCanStop Environment["SuppressMonitoring"] = SuppressMonitoring Environment["XmitLoss"] = XmitLoss Environment["RecvLoss"] = RecvLoss Environment["IPBase"] = IPBase Environment["SuppressCib"] = SuppressCib Environment["DoBSC"] = DoBSC Environment["use_logd"] = 0 if config.Parameters.has_key("use_logd"): Environment["use_logd"] = 1 if Version == 2: from CM_LinuxHAv2 import LinuxHAv2 Environment['CMclass']=LinuxHAv2 #Environment["RandSeed"] = (156, 104, 218) Environment["reset"] = Stonith(sttype="external/ssh", parm=string.join(node_list, " ")) Environment.SupplyDefaults() # Your basic start up the world type of test scenario... #scenario = Scenario( #[ InitClusterManager(Environment) #, PingFest(Environment)]) if Environment["DoBSC"]: scenario = Scenario([ BasicSanityCheck(Environment) ]) NumIter = 2 else: scenario = Scenario( [ InitClusterManager(Environment), PacketLoss(Environment)]) # Create the Cluster Manager object cm = Environment['CMclass'](Environment) if TruncateLog: lf = open(LogFile, "w"); if lf != None: lf.truncate(0) lf.close() cm.log(">>>>>>>>>>>>>>>> BEGINNING " + repr(NumIter) + " TESTS ") cm.log("HA configuration directory: " + Environment["HAdir"]) cm.log("System log files: " + Environment["LogFileName"]) cm.log("Enable Stonith: " + ("%d" % Environment["DoStonith"])) cm.log("Enable Standby: " + ("%d" % Environment["DoStandby"])) if Environment.has_key("SuppressMonitoring") \ and Environment["SuppressMonitoring"]: cm.log("Resource Monitoring is disabled") cm.log("Cluster nodes: " + repr(config.Parameters["node"])) Audits = AuditList(cm) Tests = [] if Environment["DoBSC"]: test = BSC_AddResource(cm) Tests.append(test) elif TestCase != None: for test in TestList(cm): if test.name == TestCase: Tests.append(test) if Tests == []: usage("--choose: No applicable/valid tests chosen") else: Tests = TestList(cm) tests = RandomTests(scenario, cm, Tests, Audits) Environment.RandomTests = tests overall, detailed = tests.run(NumIter) tests.summarize()
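Stripped of option handling, the main block above wires the harness together in four steps: build a CtsLab from the configured node list, choose a cluster-manager class, wrap startup in a Scenario, and hand everything to RandomTests. A condensed sketch of that flow, using the same classes this file already imports and defines, and omitting the many Environment[...] knobs, logging, and error handling the real driver sets:

```python
# Condensed restatement of the CTSlab main block (sketch, not a
# replacement for it).  Assumes CtsLab from this file and the same
# sibling modules it imports.
from CTSaudits import AuditList
from CTStests import TestList, RandomTests
from CTS import Scenario, InitClusterManager, PacketLoss
import CM_hb

config = CM_hb.HBConfig("/etc/ha.d")          # cluster nodes from ha.cf
Environment = CtsLab(config.Parameters["node"])
Environment["HAdir"] = "/etc/ha.d"
Environment.SupplyDefaults()                  # logger, reset, CMclass, ...

scenario = Scenario([InitClusterManager(Environment),
                     PacketLoss(Environment)])
cm = Environment["CMclass"](Environment)      # e.g. HeartbeatCM

tests = RandomTests(scenario, cm, TestList(cm), AuditList(cm))
overall, detailed = tests.run(500)            # 500 iterations, as defaulted
tests.summarize()
```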