diff --git a/cts/CM_LinuxHAv2.py.in b/cts/CM_LinuxHAv2.py.in
index 7d9d6c4456..30d4600741 100755
--- a/cts/CM_LinuxHAv2.py.in
+++ b/cts/CM_LinuxHAv2.py.in
@@ -1,524 +1,525 @@
 #!@PYTHON@
 '''CTS: Cluster Testing System: LinuxHA v2 dependent modules...
 '''
 __copyright__='''
 Author: Huang Zhen
 Copyright (C) 2004 International Business Machines

 Additional Audits, Revised Start action, Default Configuration:
     Copyright (C) 2004 Andrew Beekhof
 '''

 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
 # as published by the Free Software Foundation; either version 2
 # of the License, or (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

 import os, sys, CTS, CTSaudits, CTStests
 from CTS import *
 from CM_hb import HeartbeatCM
 from xml.dom.minidom import *
 from CTSaudits import ClusterAudit
 from CTStests import *

 #######################################################################
 #
 #  LinuxHA v2 dependent modules
 #
 #######################################################################
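
 # A sketch of how the tables below are consumed ("node1" is a made-up host
 # name, not from the original file): each entry is a command or log-pattern
 # template, and callers fill in the node with Python's "%" operator:
 #
 #     cmd = cm["StatusCmd"] % "node1"
 #     # -> "@libdir@/heartbeat/crmadmin -S node1 2>/dev/null"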
"BadRegexes" : ( r"Shutting down\.", r"Forcing shutdown\.", r"Timer I_TERMINATE just popped", r"input=I_ERROR", r"input=I_FAIL", r"input=I_INTEGRATED cause=C_TIMER_POPPED", r"input=I_FINALIZED cause=C_TIMER_POPPED", r"input=I_ERROR", r", exiting\.", r"WARN.*Ignoring HA message.* not in our membership list", r"pengine:.*Attempting recovery of resource", r"pengine:.*Handling failed ", r"tengine:.*is taking more than 2x its timeout", r"Confirm not received from", r"Welcome reply not received from", r"ERROR:", r"CRIT:", ), }) del self["Standby"] self.CIBsync = {} cib_prefix=''' ''' cib_options=''' ''' cib_glue_1=''' ''' cib_glue_2=''' ''' cib_suffix=''' ''' resources=''' ''' constraints=''' ''' cib_fencing = "" if self.Env["CIBResource"] == 1: self.log("Enabling DC resource") resources=''' ''' # DcIPaddr cant run anywhere but the DC constraints=''' ''' ip_num=21 for node in self.Env["nodes"]: # These resources prefer to run on the node with the same name node_resource=(""" """ %("rsc_"+node, ip_num)) ip_num=ip_num+1 resources = resources + node_resource node_constraint=(""" """ % ("rsc_"+node, "rsc_"+node, "rsc_"+node, node)) constraints = constraints + node_constraint if self.Env["DoFencing"] == 1 : cib_options=cib_options + ''' ''' nodelist = "" for node in self.Env["nodes"]: nodelist += node + " " stonith_resource=(""" """ %(len(nodelist), len(nodelist), nodelist)) resources = resources + stonith_resource self.default_cts_cib=cib_prefix + cib_options + cib_glue_1 + \ resources + cib_glue_2 + constraints + cib_suffix self.debug(self.default_cts_cib) def errorstoignore(self): # At some point implement a more elegant solution that # also produces a report at the end '''Return list of errors which are known and very noisey should be ignored''' if 0: return [ "Currently no errors to ignore" ] return [] def StataCM(self, node): '''Report the status of the cluster manager on a given node''' out=self.rsh.readaline(node, self["StatusCmd"]%node) ret= (string.find(out, 'ok') != -1) try: if ret: if self.ShouldBeStatus[node] != self["up"]: self.log( "Node status for %s is %s but we think it should be %s" % (node, self["up"], self.ShouldBeStatus[node])) self.log("Expected: %s. 
Actual: %s" % (self.ShouldBeStatus[node], out)) else: if self.ShouldBeStatus[node] != self["down"]: self.log( "Node status for %s is %s but we think it should be %s" % (node, self["down"], self.ShouldBeStatus[node])) except KeyError: pass if ret: self.ShouldBeStatus[node]=self["up"] else: self.ShouldBeStatus[node]=self["down"] return ret def StartaCM(self, node): '''Start up the cluster manager on a given node''' localBadNewsPats = [] patterns = [] patterns.append(self["Pat:We_started"]%node) # only search for this pattern if there is another node out there # that should be the DC if self.any_running() == 1: patterns.append(self["Pat:DC_IDLE"]) localBadNewsPats.append("input=I_DC_TIMEOUT ") localBadNewsPats.append("input=I_ELECTION_DC ") localBadNewsPats.append("WARN:.*Ignoring HA message.*not in our membership list") if not self.CIBsync.has_key(node) and self.Env["ClobberCIB"] == 1: self.CIBsync[node] = 1 if self.Env["CIBfilename"] == None: self.debug("Installing Generated CIB on node %s" %(node)) os.system("rm -f /tmp/cts.default.cib") os.system("echo \'" + self.default_cts_cib + "\' > /tmp/cts.default.cib") if 0!=self.rsh.cp("/tmp/cts.default.cib", "root@" + (self["CIBfile"]%node)): raise ValueError("Can not scp file to %s "%node) os.system("rm -f /tmp/cts.default.cib") else: self.debug("Installing CIB (%s) on node %s" %(self.Env["CIBfilename"], node)) if 0!=self.rsh.cp(self.Env["CIBfilename"], "root@" + (self["CIBfile"]%node)): raise ValueError("Can not scp file to %s "%node) localBadNews = CTS.LogWatcher(self["LogFileName"], localBadNewsPats, 0) localBadNews.setwatch() watch = CTS.LogWatcher(self["LogFileName"], patterns, 180) self.debug("Starting %s on node %s" %(self["Name"], node)) watch.setwatch() self.rsh(node, self["StartCmd"]) self.ShouldBeStatus[node]=self["up"] if watch.lookforall(): match=localBadNews.look() while match: if not re.search("CTS:", match): self.log("(startup) BadNews "+ match) self.Env.RandomTests.incr("BadNews") match=localBadNews.look() return 1 # the watch() failed... 

     def StartaCM(self, node):
         '''Start up the cluster manager on a given node'''

         localBadNewsPats = []
         patterns = []
         patterns.append(self["Pat:We_started"]%node)

         # only search for this pattern if there is another node out there
         #   that should be the DC
         if self.any_running() == 1:
             patterns.append(self["Pat:DC_IDLE"])
             localBadNewsPats.append("input=I_DC_TIMEOUT ")
             localBadNewsPats.append("input=I_ELECTION_DC ")
             localBadNewsPats.append("WARN:.*Ignoring HA message.*not in our membership list")

         if not self.CIBsync.has_key(node) and self.Env["ClobberCIB"] == 1:
             self.CIBsync[node] = 1
             if self.Env["CIBfilename"] == None:
                 self.debug("Installing Generated CIB on node %s" %(node))
                 os.system("rm -f /tmp/cts.default.cib")
                 os.system("echo \'" + self.default_cts_cib + "\' > /tmp/cts.default.cib")
                 if 0 != self.rsh.cp("/tmp/cts.default.cib",
                                     "root@" + (self["CIBfile"]%node)):
                     raise ValueError("Cannot scp file to %s" %node)
                 os.system("rm -f /tmp/cts.default.cib")
             else:
                 self.debug("Installing CIB (%s) on node %s"
                            %(self.Env["CIBfilename"], node))
                 if 0 != self.rsh.cp(self.Env["CIBfilename"],
                                     "root@" + (self["CIBfile"]%node)):
                     raise ValueError("Cannot scp file to %s" %node)

         localBadNews = CTS.LogWatcher(self["LogFileName"], localBadNewsPats, 0)
         localBadNews.setwatch()

         watch = CTS.LogWatcher(self["LogFileName"], patterns, 180)
         self.debug("Starting %s on node %s" %(self["Name"], node))
         watch.setwatch()

         self.rsh(node, self["StartCmd"])
         self.ShouldBeStatus[node]=self["up"]

         if watch.lookforall():
             match=localBadNews.look()
             while match:
                 if not re.search("CTS:", match):
                     self.log("(startup) BadNews " + match)
                     self.Env.RandomTests.incr("BadNews")
                 match=localBadNews.look()
             return 1

         # the watch() failed... let's check to see if the start _really_ failed
         for regex in watch.unmatched:
             self.log("Warn: Startup pattern not found: %s" %(regex))

         match=localBadNews.look()
         while match:
             if not re.search("CTS:", match):
                 self.log("Warn: %s" %match)
                 self.Env.RandomTests.incr("BadNews")
             match=localBadNews.look()

         out = self.rsh.readaline(node, (self["StatusCmd"] %node))
         if string.find(out, 'ok') == -1:
             # yep, it _really_ failed
             self.ShouldBeStatus[node]=self["down"]
             return None

         ret = (string.find(out, 'S_NOT_DC') != -1)
         if ret:
             # actually we joined the cluster just fine
             self.log("%s on %s joined the cluster" %(self["Name"], node))
             return 1

         ret = (string.find(out, 'S_IDLE') != -1)
         if ret:
             # actually we joined the cluster just fine
             self.log("%s on %s joined the cluster as DC" %(self["Name"], node))
             return 1

         self.log("%s on %s started but unstable: %s"
                  %(self["Name"], node, out))
 #        self.ShouldBeStatus[node]=self["down"]
         return None

     def StopaCM(self, node):
         '''Stop the cluster manager on a given node'''

         self.debug("Stopping %s on node %s" %(self["Name"], node))

         rc=self.rsh(node, self["StopCmd"])
         if rc == 0:
             self.ShouldBeStatus[node]=self["down"]
             return 1

         # killall exits 0 only if it found (and signalled) a matching
         # process, so these double as "is it still running?" probes
         still_running = 0
         if self.rsh(node, "killall -INT crmd") == 0:
             still_running = 1
             self.log("%s is still running on node %s" %(self["Name"], node))

         if self.rsh(node, "killall -INT heartbeat") == 0:
             still_running = 1
             self.log("Heartbeat is still running on node %s" %node)

         if still_running == 0:
             self.log("Warn: %s failed, yet nothing is running on node %s"
                      %(self["StopCmd"], node))
             return 1

         return None

     def isolate_node(self, node, allowlist):
         '''Isolate the communication between the nodes'''
         rc = self.rsh(node, self["BreakCommCmd2"]%allowlist)
         if rc == 0:
             return 1
         else:
             self.log("Could not break the communication from node: %s" %node)
         return None

     def IsDC(self, node):
         rc = 0
         status_line = self.rsh.readaline(node, self["StatusCmd"]%node)

         if not status_line:
             rc = 0
         elif string.find(status_line, 'S_IDLE') != -1:
             rc = 1
         elif string.find(status_line, 'S_INTEGRATION') != -1:
             rc = 1
         elif string.find(status_line, 'S_FINALIZE_JOIN') != -1:
             rc = 1
         elif string.find(status_line, 'S_POLICY_ENGINE') != -1:
             rc = 1
         elif string.find(status_line, 'S_TRANSITION_ENGINE') != -1:
             rc = 1

         return rc

     def Configuration(self):
         if self.Env["ClobberCIB"] == 1:
             if self.Env["CIBfilename"] == None:
                 os.system("rm -f /tmp/cts.default.cib")
                 os.system("echo \'" + self.default_cts_cib + "\' > /tmp/cts.default.cib")
                 cib=parse("/tmp/cts.default.cib")
 #                os.system("rm -f /tmp/cts.default.cib")
             else:
                 cib=parse(self.Env["CIBfilename"])
         else:
             local_cib = "%s/cts_cib_%s.xml" %(self["TmpDir"], str(os.getpid()))
             if 0 != self.rsh.cp("root@" + self["CIBfile"]%self.Env["nodes"][0],
                                 local_cib):
                 raise ValueError("Cannot copy file to %s, maybe permission denied"
                                  %self["TmpDir"])
             cib=parse(local_cib)
             os.remove(local_cib)
         return cib.getElementsByTagName('configuration')[0]
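
     # Resources() and Dependancies() below re-fetch the live CIB with
     # rsh.cp().  The copy source is built from the "CIBfile" template; for
     # a hypothetical node "node1" it expands to an scp-style path:
     #
     #     "root@" + (self["CIBfile"] % "node1")
     #     # -> "root@node1:@HA_VARLIBDIR@/heartbeat/crm/cib.xml"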
dependancy["to"]=node.getAttribute('to') dependancy["type"]=node.getAttribute('type') dependancy["strength"]=node.getAttribute('strength') DependancyList.append(dependancy) return DependancyList def any_running(self): for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == self["up"]: return 1 return 0 def find_partitions(self): ccm_partitions = [] for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == self["up"]: partition = self.rsh.readaline(node, self["ParitionCmd"]) if not partition: self.log("no partition details for %s" %node) elif len(partition) > 2: partition = partition[:-1] for a_partition in ccm_partitions: if partition != a_partition: ccm_partitions.append(partition) else: self.log("bad partition details for %s" %node) return ccm_partitions def HasQuorum(self, node_list): # If we are auditing a partition, then one side will # have quorum and the other not. # So the caller needs to tell us which we are checking # If no value for node_list is specified... assume all nodes if not node_list: node_list = self.Env["nodes"] for node in node_list: if self.ShouldBeStatus[node] == self["up"]: quorum = self.rsh.readaline(node, self["QuorumCmd"]) return string.find(quorum,"1") != -1 return 0 class HAResource(Resource): def __init__(self, cm, node): ''' Get information from xml node ''' self.rid = str(node.getAttribute('id')) self.rclass = str(node.getAttribute('class')) self.rtype = str(node.getAttribute('type')) self.rparameters = {} list = node.getElementsByTagName('instance_attributes') if len(list) > 0: attributes = list[0] list = attributes.getElementsByTagName('attributes') if len(list) > 0: parameters = list[0] nvpairs = parameters.getElementsByTagName('nvpair') for nvpair in nvpairs: name=nvpair.getAttribute('name') value=nvpair.getAttribute('value') self.rparameters[name]=value Resource.__init__(self, cm, self.rtype, self.rid) def IsRunningOn(self, nodename): ''' This member function returns true if our resource is running on the given node in the cluster. We call the status operation for the resource script. ''' out=self.CM.rsh.readaline(nodename, self.CM["IsRscRunning"]%self.rid) return re.search("0",out) def RunningNodes(self): ResourceNodes = [] for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["up"]: if self.IsRunningOn(node): ResourceNodes.append(node) return ResourceNodes def _ResourceOperation(self, operation, nodename): ''' Execute an operation on the resource ''' self.CM.rsh.readaline(nodename, self.CM["ExecuteRscOp"]%(self.rid,operation)) return self.CM.rsh.lastrc == 0 def Start(self, nodename): ''' This member function starts or activates the resource. ''' return self._ResourceOperation("start", nodename) def Stop(self, nodename): ''' This member function stops or deactivates the resource. ''' return self._ResourceOperation("stop", nodename) def IsWorkingCorrectly(self, nodename): return self._ResourceOperation("monitor", nodename) ####################################################################### # # A little test code... # # Which you are advised to completely ignore... # ####################################################################### if __name__ == '__main__': pass