#!@PYTHON@

'''CTS: Cluster Testing System: LinuxHA v2 dependent modules...
'''

__copyright__='''
Author: Huang Zhen
Copyright (C) 2004 International Business Machines

Additional Audits: Andrew Beekhof
'''

#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

import CTS
from CTS import *
from CM_hb import HeartbeatCM
from xml.dom.minidom import *
import CTSaudits
from CTSaudits import ClusterAudit
import CTStests
from CTStests import *

#######################################################################
#
#  LinuxHA v2 dependent modules
#
#######################################################################


class LinuxHAv2(HeartbeatCM):
    '''
    The linux-ha version 2 cluster manager class.
    It implements the things we need to talk to and manipulate
    linux-ha version 2 clusters
    '''

    def __init__(self, Environment, randseed=None):
        '''Register the v2 commands and log patterns, the built-in CIB,
        and the test/audit classes that apply to a v2 cluster.'''
        HeartbeatCM.__init__(self, Environment, randseed=randseed)

        self.update({
            "Name"           : "linux-ha-v2",
            "DeadTime"       : 90,
            "StartCmd"       : "@libdir@/heartbeat/heartbeat >/dev/null 2>&1",
            "StopCmd"        : "@libdir@/heartbeat/heartbeat -k",
            "StatusCmd"      : "@libdir@/heartbeat/crmadmin -S %s 2>/dev/null",
            "EpocheCmd"      : "@libdir@/heartbeat/ccm_epoche",
            "IsRscRunning"   : "@libdir@/heartbeat/lrmadmin -E %s status 0 0 EVERYTIME 2>/dev/null|grep return",
            "IsIPAddrRscRunning" : "",
            "ExecuteRscOp"   : "@libdir@/heartbeat/lrmadmin -E %s %s 0 0 EVERYTIME 2>/dev/null",
            "CIBfile"        : "%s:@HA_VARLIBDIR@/heartbeat/crm/cib.xml",

            # Patterns to look for in the log files for various occasions...
            # Each one is a template; %s is replaced by a node name.
            "Pat:We_started"   : " %s crmd: .* State transition .*-> (S_NOT_DC|S_IDLE)",
            "Pat:They_started" : " %s crmd: .* State transition .*-> (S_NOT_DC|S_IDLE)",
            "Pat:DC_IDLE"      : " %s crmd: .* State transition .*-> S_IDLE",

            # Bad news Regexes.  Should never occur.
            "BadRegexes"   : (
                r"Shutting down\.",
                r"Forcing shutdown\.",
                r"Timer I_TERMINATE just popped",
                r"Both machines own .* resources!",
                r"No one owns .* resources!",
                r", exiting\.",
                r"ERROR:",
                r"CRIT:",
            ),
        })

        # NOTE(review): the XML body of both CIB literals below was not
        # present in the text this file was reviewed from (only blank lines
        # remained).  Restore the real payloads from version control before
        # relying on ClobberCIB without a CIBfilename.
        self.default_cts_cib='''
'''
        # Manual switch: while this is "if 1:", the literal below replaces
        # the one assigned just above.
        if 1:
            self.default_cts_cib='''
'''

        # KLUDGE!  Expedient, but a Kludge (FIXME)
        # The module-level class lists are overwritten here, as a side
        # effect of constructing this cluster manager.
#        CTStests.AllTestClasses = [FlipTest,RestartTest,StartOnebyOne,SimulStart,SimulStop,Split_brainTest,BandwidthTest]
        CTStests.AllTestClasses = [FlipTest, RestartTest, StartOnebyOne, SimulStart, SimulStop]
#        CTSaudits.AllAuditClasses = [CrmdStateAudit, HAResourceAudit]
        CTSaudits.AllAuditClasses = [CrmdStateAudit, DcAudit, DcIPaddrAudit]

    def StataCM(self, node):
        '''Report the status of the cluster manager on a given node.

        Runs StatusCmd on the node and treats an "ok" in its output as "up".
        A disagreement with the recorded ShouldBeStatus is logged, and the
        record is then overwritten with what was observed.
        Returns true when the node is up.
        '''
        out = self.rsh.readaline(node, self["StatusCmd"] % node)
        ret = (out.find('ok') != -1)

        if ret:
            observed = self["up"]
        else:
            observed = self["down"]

        # A node we have no record for yet has nothing to disagree with.
        try:
            expected = self.ShouldBeStatus[node]
        except KeyError:
            expected = observed

        if expected != observed:
            self.log(
                "Node status for %s is %s but we think it should be %s"
                % (node, observed, expected))

        self.ShouldBeStatus[node] = observed
        return ret

    def _install_default_cib(self, node):
        '''Copy the built-in default CIB to the CIBfile location on node.

        The text is staged in a freshly created private directory rather
        than at a fixed /tmp path, and is written directly instead of being
        passed through a shell "echo", so quotes in the XML are harmless.
        '''
        import os
        import tempfile

        workdir = tempfile.mkdtemp(prefix="cts.")
        path = os.path.join(workdir, "cts.default.cib")
        try:
            cibfile = open(path, "w")
            try:
                # "echo" used to append the trailing newline.
                cibfile.write(self.default_cts_cib + "\n")
            finally:
                cibfile.close()
            self.rsh.cp(path, self["CIBfile"] % node)
        finally:
            if os.path.exists(path):
                os.remove(path)
            os.rmdir(workdir)

    def StartaCM(self, node):
        '''Start up the cluster manager on a given node.

        If Env["ClobberCIB"] is set, the node's CIB is first replaced with
        Env["CIBfilename"], or with the built-in default when no file name
        is given.  Returns 1 once a start pattern is seen in the log, and
        None otherwise (after recording whether the node answers StatusCmd).
        '''
        # Both patterns are templates and must be instantiated with the
        # node name before they are usable as regular expressions.
        watch = CTS.LogWatcher(self["LogFileName"]
        ,       [self["Pat:We_started"] % node, self["Pat:DC_IDLE"] % node]
        ,       60)
        watch.setwatch()

        self.log("CM_LinuxHAv2.py: Starting %s on node %s"
                 % (self["Name"], node))

        if self.Env["ClobberCIB"] is not None:
            if self.Env["CIBfilename"] is None:
                self._install_default_cib(node)
            else:
                self.rsh.cp(self.Env["CIBfilename"], self["CIBfile"] % node)

        self.rsh(node, self["StartCmd"])
        if watch.look():
            self.ShouldBeStatus[node] = self["up"]
            return 1

        # No start pattern was seen: ask the node directly.
        out = self.rsh.readaline(node, self["StatusCmd"] % node)
        if out.find('ok') == -1:
            self.ShouldBeStatus[node] = self["down"]
            self.log("Could not start %s on node %s"
                     % (self["Name"], node))
        else:
            self.ShouldBeStatus[node] = self["up"]
            self.log("%s only partially started on node %s"
                     % (self["Name"], node))
        return None

    def Configuration(self):
        '''Fetch the CIB from the first node and return its <configuration>
        element.  Raises ValueError when the file cannot be copied.'''
        if not self.rsh.cp(self["CIBfile"] % self.Env["nodes"][0], self.Env["HAdir"]):
            raise ValueError("Can not copy file to %s, maybe permission denied" % self.Env["HAdir"])
        cib = parse("%s/cib.xml" % self.Env["HAdir"])
        return cib.getElementsByTagName('configuration')[0]

    def Resources(self):
        '''Return an HAResource for every <resource> element in the CIB.'''
        ResourceList = []
        # read resources in cib
        configuration = self.Configuration()
        # Not used further, but raises IndexError when the CIB has no
        # <resources> section.
        resources = configuration.getElementsByTagName('resources')[0]
        rscs = configuration.getElementsByTagName('resource')
        for rsc in rscs:
            ResourceList.append(HAResource(self, rsc))
        return ResourceList

    def Dependancies(self):
        '''Return one dict per <rsc_to_rsc> element in the CIB, with the
        keys "id", "from", "to", "type" and "strength".'''
        DependancyList = []
        # read dependency in cib
        configuration = self.Configuration()
        # Not used further, but raises IndexError when the CIB has no
        # <constraints> section.
        constraints = configuration.getElementsByTagName('constraints')[0]
        rsc_to_rscs = configuration.getElementsByTagName('rsc_to_rsc')
        for node in rsc_to_rscs:
            dependancy = {}
            dependancy["id"] = node.getAttribute('id')
            dependancy["from"] = node.getAttribute('from')
            dependancy["to"] = node.getAttribute('to')
            dependancy["type"] = node.getAttribute('type')
            dependancy["strength"] = node.getAttribute('strength')
            DependancyList.append(dependancy)
        return DependancyList


class HAResourceAudit(ClusterAudit):
    '''Audit that every resource runs on exactly one node and that
    "must"/"placement" dependencies are honoured.'''

    def __init__(self, cm):
        self.CM = cm

    def _RscRunningNodes(self, resource):
        '''Return the nodes, among those expected to be up, on which
        resource reports itself running.'''
        ResourceNodes = []
        for node in self.CM.Env["nodes"]:
            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                if resource.IsRunningOn(node):
                    ResourceNodes.append(node)
        return ResourceNodes

    def __call__(self):
        '''Run the audit; return 1 when it passes and 0 otherwise.'''
        self.CM.log("Do Audit %s" % self.name())
        passed = 1
        NodeofRsc = {}

        # Make sure the resources are running on one and only one node
        Resources = self.CM.Resources()
        for resource in Resources:
            RunningNodes = self._RscRunningNodes(resource)
            NodeofRsc[resource.rid] = RunningNodes
            if len(RunningNodes) == 0:
                print(resource.rid + " isn't running anywhere")
                passed = 0
            if len(RunningNodes) > 1:
                print(resource.rid + " is running more than once: "
                      + str(RunningNodes))
                passed = 0

        # Make sure the resources with "must","placement" constraint
        # are running on the same node
        Dependancies = self.CM.Dependancies()
        for dependancy in Dependancies:
            if dependancy["type"] == "placement" and dependancy["strength"] == "must":
                if NodeofRsc[dependancy["from"]] != NodeofRsc[dependancy["to"]]:
                    print(dependancy["from"] + " and " + dependancy["to"] + " should be run on same node")
                    passed = 0

        return passed

    def name(self):
        return "HAResourceAudit"


class HAResource(Resource):
    '''A cluster resource described by a <resource> element of the CIB.'''

    def __init__(self, cm, node):
        '''
        Get information from xml node
        '''
        self.rid = node.getAttribute('id')
        self.rclass = node.getAttribute('class')
        self.rtype = node.getAttribute('type')
        self.rparameters = {}

        # Neither lookup is used further, but each raises IndexError when
        # the element is missing from the resource definition.
        attributes = node.getElementsByTagName('instance_attributes')[0]
        parameters = node.getElementsByTagName('rsc_parameters')[0]
        nvpairs = node.getElementsByTagName('nvpair')

        for nvpair in nvpairs:
            name = nvpair.getAttribute('name')
            value = nvpair.getAttribute('value')
            self.rparameters[name] = value

        Resource.__init__(self, cm, self.rtype, self.rid)

    def IsRunningOn(self, nodename):
        '''
        This member function returns true if our resource is running
        on the given node in the cluster.
        We call the status operation for the resource script.
        '''
        out = self.CM.rsh.readaline(nodename, self.CM["IsRscRunning"] % self.rid)
        # NOTE(review): this is true for any "0" in the line, not only a
        # return code of exactly 0 -- confirm against lrmadmin's output.
        return re.search("0", out)

    def _ResourceOperation(self, operation, nodename):
        '''
        Execute an operation on the resource
        '''
        self.CM.rsh.readaline(nodename, self.CM["ExecuteRscOp"] % (self.rid, operation))
        return self.CM.rsh.lastrc == 0

    def Start(self, nodename):
        '''
        This member function starts or activates the resource.
        '''
        return self._ResourceOperation("start", nodename)

    def Stop(self, nodename):
        '''
        This member function stops or deactivates the resource.
        '''
        return self._ResourceOperation("stop", nodename)

    def IsWorkingCorrectly(self, nodename):
        '''Run the "monitor" operation; true when it exits with status 0.'''
        return self._ResourceOperation("monitor", nodename)


class CrmdStateAudit(ClusterAudit):
    '''Audit the crmd state reported by every node: exactly one DC, no
    node in an unstable state, and up/down status matching expectations.'''

    def __init__(self, cm):
        self.CM = cm
        self.Stats = {"calls":0
        ,       "success":0
        ,       "failure":0
        ,       "skipped":0
        ,       "auditfail":0}

    def has_key(self, key):
        return key in self.Stats

    def __setitem__(self, key, value):
        self.Stats[key] = value

    def __getitem__(self, key):
        return self.Stats[key]

    def incr(self, name):
        '''Increment (or initialize) the value associated with the given name'''
        if name not in self.Stats:
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name] + 1

    def __call__(self):
        '''Run the audit; return 1 when it passes and 0 otherwise.

        ShouldBeStatus is corrected for every node whose observed state
        differs from the recorded one.
        '''
        self.CM.log("Do Audit %s" % self.name())
        passed = 1
        dc_list = []
        up_count = 0
        node_count = 0
        up_are_down = 0
        down_are_up = 0
        unstable_list = []

        for node in self.CM.Env["nodes"]:
            out = self.CM.rsh.readaline(node, self.CM["StatusCmd"] % node)
            is_up = (out.find('ok') != -1)
            node_count = node_count + 1

            if not is_up:
                if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                    self.CM.log(
                        "Node %s %s when it should be %s"
                        % (node, self.CM["down"], self.CM.ShouldBeStatus[node]))
                    self.CM.ShouldBeStatus[node] = self.CM["down"]
                    up_are_down = up_are_down + 1
                continue

            up_count = up_count + 1
            if self.CM.ShouldBeStatus[node] == self.CM["down"]:
                self.CM.log(
                    "Node %s %s when it should be %s"
                    % (node, self.CM["up"], self.CM.ShouldBeStatus[node]))
                self.CM.ShouldBeStatus[node] = self.CM["up"]
                down_are_up = down_are_up + 1

            # An up node is a slave (S_NOT_DC), the DC (S_IDLE), or else
            # counted as unstable.
            if out.find('S_NOT_DC') != -1:
                pass
            elif out.find('S_IDLE') != -1:
                dc_list.append(node)
            else:
                unstable_list.append(node)

        if up_count > 0 and len(dc_list) != 1:
            passed = 0
            self.CM.log("Exactly 1 node should be DC. We found %d (of %d): %s"
                        % (len(dc_list), up_count, str(dc_list)))

        if len(unstable_list) > 0:
            passed = 0
            self.CM.log("Cluster is not stable. We found %d (of %d) unstable nodes: %s"
                        % (len(unstable_list), up_count, str(unstable_list)))

        if up_are_down > 0:
            passed = 0
            self.CM.log("%d (of %d) nodes expected to be up were down."
                        % (up_are_down, node_count))

        if down_are_up > 0:
            passed = 0
            self.CM.log("%d (of %d) nodes expected to be down were up."
                        % (down_are_up, node_count))

        return passed

    def name(self):
        return "CrmdStateAudit"


class DcIPaddrAudit(ClusterAudit):
    '''Audit that the "DcIPaddr" resource is running, and only on a node
    that reports itself as the DC.'''

    def __init__(self, cm):
        self.CM = cm
        self.Stats = {"calls":0
        ,       "success":0
        ,       "failure":0
        ,       "skipped":0
        ,       "auditfail":0}

    def has_key(self, key):
        return key in self.Stats

    def __setitem__(self, key, value):
        self.Stats[key] = value

    def __getitem__(self, key):
        return self.Stats[key]

    def incr(self, name):
        '''Increment (or initialize) the value associated with the given name'''
        if name not in self.Stats:
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name] + 1

    def __call__(self):
        '''Run the audit; return 1 when it passes and 0 otherwise.
        With no DC anywhere there is nothing to check and the audit passes.'''
        self.CM.log("Do Audit %s" % self.name())
        passed = 1
        the_dc = self.find_dc()
        if the_dc is None:
            return passed

        # Check every resource named "DcIPaddr" against the DC
        Resources = self.CM.Resources()
        for resource in Resources:
            if resource.rid == "DcIPaddr":
                if self.audit_ip_addr(resource, the_dc) == 0:
                    passed = 0

        return passed

    def is_node_dc(self, node):
        '''True when the node's StatusCmd output mentions S_IDLE.'''
        out = self.CM.rsh.readaline(node, self.CM["StatusCmd"] % node)
        return (out.find('S_IDLE') != -1)

    def audit_ip_addr(self, resource, node):
        '''Return 1 when resource runs somewhere and only on DC nodes,
        0 otherwise.  The node argument is accepted but not consulted.'''
        self.CM.log("Auditing %s" % (resource))
        RunningNodes = self._RscRunningNodes(resource)

        if len(RunningNodes) == 0:
            self.CM.log("%s is not running" % (resource))
            # Dump the state of every node expected to be up, to help
            # diagnose why nothing is hosting the address.
            for up_node in self.CM.Env["nodes"]:
                if self.CM.ShouldBeStatus[up_node] == self.CM["up"]:
                    out = self.CM.rsh.readaline(up_node, self.CM["StatusCmd"] % up_node)
                    self.CM.log("%s" % (out))
            return 0

        if len(RunningNodes) > 1:
            # Logged only; this alone does not fail the audit.
            self.CM.log("%s is running more than once" % (resource))

        for running_on in RunningNodes:
            if not self.is_node_dc(running_on):
                self.CM.log("%s is running on a non-DC node %s"
                            % (resource, running_on))
                return 0
        return 1

    def name(self):
        return "DcIPaddrAudit"

    def find_dc(self):
        '''Return the first node that reports itself as DC, or None.'''
        for node in self.CM.Env["nodes"]:
            if self.is_node_dc(node):
                return node
        return None

    def _RscRunningNodes(self, resource):
        '''Return every node on which resource reports itself running.'''
        ResourceNodes = []
        for node in self.CM.Env["nodes"]:
            if resource.IsRunningOn(node):
                ResourceNodes.append(node)
        return ResourceNodes


class DcAudit(ClusterAudit):
    '''Audit that the DC is one of the nodes reporting the lowest epoche
    among the nodes expected to be up.'''

    def __init__(self, cm):
        self.CM = cm
        self.Stats = {"calls":0
        ,       "success":0
        ,       "failure":0
        ,       "skipped":0
        ,       "auditfail":0}
        self.NodeEpoche = {}
        self.NodeState = {}

    def has_key(self, key):
        return key in self.Stats

    def __setitem__(self, key, value):
        self.Stats[key] = value

    def __getitem__(self, key):
        return self.Stats[key]

    def incr(self, name):
        '''Increment (or initialize) the value associated with the given name'''
        if name not in self.Stats:
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name] + 1

    def __call__(self):
        '''Run the audit; return 1 when it passes and 0 otherwise.
        With no node expected to be up the audit passes.'''
        self.CM.log("Do Audit %s" % self.name())
        passed = 0
        lowest_epoche = None
        nodes_up = 0
        dc_allowed_list = []

        for node in self.CM.Env["nodes"]:
            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                nodes_up = nodes_up + 1

                self.NodeEpoche[node] = self.CM.rsh.readaline(
                    node, self.CM["EpocheCmd"])
                self.NodeState[node] = self.CM.rsh.readaline(
                    node, self.CM["StatusCmd"] % node)

                # Drop the last character of each reply (presumably the
                # trailing newline).
                if len(self.NodeState[node]) > 1:
                    self.NodeState[node] = self.NodeState[node][:-1]
                if len(self.NodeEpoche[node]) > 1:
                    self.NodeEpoche[node] = self.NodeEpoche[node][:-1]

                # NOTE(review): epoches are compared as strings here, so
                # "10" sorts below "9" -- confirm what ccm_epoche prints.
                if lowest_epoche is None or self.NodeEpoche[node] < lowest_epoche:
                    lowest_epoche = self.NodeEpoche[node]

        if nodes_up == 0:
            print("No nodes running")
            return 1

        for node in self.CM.Env["nodes"]:
            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                if self.NodeEpoche[node] == lowest_epoche:
                    dc_allowed_list.append(node)

        for node in dc_allowed_list:
            if self.is_node_dc(self.NodeState[node]):
                passed = 1

        if passed == 0:
            self.CM.log("DC not found on any of the %d allowed nodes: %s"
                        % (len(dc_allowed_list), str(dc_allowed_list)))
            for node in self.CM.Env["nodes"]:
                if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                    self.CM.log("epoche %s : %s"
                                % (self.NodeEpoche[node], self.NodeState[node]))

        return passed

    def is_node_dc(self, status_line):
        '''True when the given status line mentions S_IDLE.'''
        return (status_line.find('S_IDLE') != -1)

    def name(self):
        return "DcAudit"


#######################################################################
#
#   A little test code...
#
#   Which you are advised to completely ignore...
#
#######################################################################
if __name__ == '__main__':
    pass