# File: crm/admin/Makefile.am
#
# Automake input for the CRM administration tools (crmadmin, cibadmin,
# ccm_tool, crm_diff, crm_mon, iso8601, crm_master, crm_standby,
# crm_attribute, crm_resource and crm_verify).
#
# Copyright (C) 2004 Andrew Beekhof
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
MAINTAINERCLEANFILES	= Makefile.in

# Header search path: generated headers (builddir) are listed before the
# checked-in ones (srcdir) for each directory.
INCLUDES	= -I$(top_builddir)/include -I$(top_srcdir)/include \
		-I$(top_builddir)/libltdl -I$(top_srcdir)/libltdl \
		-I$(top_builddir)/linux-ha -I$(top_srcdir)/linux-ha \
		-I$(top_builddir) -I$(top_srcdir)

# Install directory for everything listed in hasbin_PROGRAMS below
# (automake pairs <prefix>_PROGRAMS with <prefix>dir).
hasbindir	= $(sbindir)
LIBRT		= @LIBRT@

# NOTE(review): every program below defines its own <prog>_CFLAGS. Automake
# does not add AM_CFLAGS to targets that have per-target CFLAGS, so
# $(CRM_DEBUG_FLAGS) presumably never reaches these compilations - confirm,
# and append $(AM_CFLAGS) to the per-target flags if it is meant to apply.
AM_CFLAGS	= @CFLAGS@ $(CRM_DEBUG_FLAGS)

# Libraries linked into every tool; tools that also need the policy engine
# or the CCM client add those libraries in their own _LDADD.
COMMONLIBS	= $(CRM_DEBUG_LIBS) \
		$(top_builddir)/lib/clplumbing/libplumb.la \
		$(top_builddir)/lib/pils/libpils.la \
		$(top_builddir)/lib/crm/common/libcrmcommon.la \
		$(top_builddir)/lib/crm/cib/libcib.la \
		$(top_builddir)/lib/apphb/libapphb.la \
		$(top_builddir)/lib/hbclient/libhbclient.la \
		$(GLIBLIB) \
		$(CURSESLIBS) \
		$(LIBRT)

## binary progs
hasbin_PROGRAMS	= crmadmin cibadmin ccm_tool crm_diff crm_mon iso8601 \
		crm_master crm_standby crm_attribute crm_resource \
		crm_verify

## SOURCES

#noinst_HEADERS	= config.h control.h crmd.h
noinst_HEADERS	=

crmadmin_SOURCES	= crmadmin.c
crmadmin_CFLAGS		= -DHA_VARLIBDIR='"@HA_VARLIBDIR@"'
crmadmin_LDADD		= $(COMMONLIBS) \
			$(top_builddir)/crm/pengine/libpengine.la

cibadmin_SOURCES	= cibadmin.c
cibadmin_CFLAGS		= -DHA_VARLIBDIR='"@HA_VARLIBDIR@"'
cibadmin_LDADD		= $(COMMONLIBS)

# ccm_tool is built from ccm_epoche.c and needs the CCM client library.
ccm_tool_SOURCES	= ccm_epoche.c
ccm_tool_CFLAGS		= -DHA_VARLIBDIR='"@HA_VARLIBDIR@"'
ccm_tool_LDADD		= $(COMMONLIBS) \
			$(top_builddir)/membership/ccm/libccmclient.la

crm_diff_SOURCES	= xml_diff.c
crm_diff_CFLAGS		= -DHA_VARLIBDIR='"@HA_VARLIBDIR@"'
crm_diff_LDADD		= $(COMMONLIBS)

crm_mon_SOURCES		= crm_mon.c
crm_mon_CFLAGS		= -DHA_VARLIBDIR='"@HA_VARLIBDIR@"'
crm_mon_LDADD		= $(COMMONLIBS) \
			$(top_builddir)/crm/pengine/libpengine.la

crm_verify_SOURCES	= crm_verify.c
crm_verify_CFLAGS	= -DHA_VARLIBDIR='"@HA_VARLIBDIR@"'
crm_verify_LDADD	= $(COMMONLIBS) \
			$(top_builddir)/crm/pengine/libpengine.la

# crm_master, crm_standby and crm_attribute are three binaries built from
# the same source file, crm_attribute.c.
crm_master_SOURCES	= crm_attribute.c
crm_master_CFLAGS	= -DHA_VARLIBDIR='"@HA_VARLIBDIR@"'
crm_master_LDADD	= $(COMMONLIBS)

crm_standby_SOURCES	= crm_attribute.c
crm_standby_CFLAGS	= -DHA_VARLIBDIR='"@HA_VARLIBDIR@"'
crm_standby_LDADD	= $(COMMONLIBS)

crm_attribute_SOURCES	= crm_attribute.c
crm_attribute_CFLAGS	= -DHA_VARLIBDIR='"@HA_VARLIBDIR@"'
crm_attribute_LDADD	= $(COMMONLIBS)

crm_resource_SOURCES	= crm_resource.c
crm_resource_CFLAGS	= -DHA_VARLIBDIR='"@HA_VARLIBDIR@"'
crm_resource_LDADD	= $(COMMONLIBS) \
			$(top_builddir)/crm/pengine/libpengine.la

iso8601_SOURCES		= test.iso8601.c
iso8601_CFLAGS		= -DHA_VARLIBDIR='"@HA_VARLIBDIR@"'
iso8601_LDADD		= $(COMMONLIBS)

# Also remove logs, debug output, XML dumps and editor backups on "make clean".
clean-generic:
	rm -f *.log *.debug *.xml *~

# Intentionally empty hooks.
install-exec-local:

uninstall-local:
#!@PYTHON@
# File: cts/CM_LinuxHAv2.py.in

'''CTS: Cluster Testing System: LinuxHA v2 dependent modules...
'''

__copyright__='''
Author: Huang Zhen
Copyright (C) 2004 International Business Machines

Additional Audits, Revised Start action, Default Configuration:
     Copyright (C) 2004 Andrew Beekhof
'''

#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

import os,sys,CTS,CTSaudits,CTStests
import re
from CTS import *
from CM_hb import HeartbeatCM
from xml.dom.minidom import *
from CTSaudits import ClusterAudit
from CTStests import *

# Local scratch file used to stage the generated default CIB.
# NOTE(review): a fixed, predictable name in /tmp - kept because
# Configuration() deliberately leaves the file behind after parsing it.
DEFAULT_CIB_PATH = "/tmp/cts.default.cib"

#######################################################################
#
#  LinuxHA v2 dependent modules
#
#######################################################################


class LinuxHAv2(HeartbeatCM):
    '''
    The linux-ha version 2 cluster manager class.
    It implements the things we need to talk to and manipulate
    linux-ha version 2 clusters
    '''

    def __init__(self, Environment, randseed=None):
        '''Register the v2 commands and log patterns and build the default CIB.

        Environment -- CTS environment; the keys read here are "SuppressCib",
                       "CIBResource", "IPBase", "nodes" and "DoFencing".
        randseed    -- passed through unchanged to HeartbeatCM.__init__.

        Side effect: when "CIBResource" is 1, Environment["IPBase"] is
        advanced by one address for every node.
        '''
        HeartbeatCM.__init__(self, Environment, randseed=randseed)

        self.cluster_monitor = 0
        self.update({
            "Name"           : "linux-ha-v2",
            "DeadTime"       : 300,
            "StartTime"      : 300,        # Max time to start up
            "StableTime"     : 30,
            "StartCmd"       : "/etc/init.d/heartbeat start > /dev/null 2>&1",
            "StopCmd"        : "/etc/init.d/heartbeat stop > /dev/null 2>&1",
            "ElectionCmd"    : "@sbindir@/crmadmin -E %s",
            "StatusCmd"      : "@sbindir@/crmadmin -S %s 2>/dev/null",
            "EpocheCmd"      : "@sbindir@/ccm_tool -e",
            "QuorumCmd"      : "@sbindir@/ccm_tool -q",
            "CibQuery"       : "@sbindir@/cibadmin -Ql",
            "ParitionCmd"    : "@sbindir@/ccm_tool -p",
            "IsRscRunning"   : "@libdir@/heartbeat/lrmadmin -E %s monitor 0 0 EVERYTIME 2>/dev/null|grep return",
            "ExecuteRscOp"   : "@libdir@/heartbeat/lrmadmin -E %s %s 0 0 EVERYTIME 2>/dev/null",
            "CIBfile"        : "%s:@HA_VARLIBDIR@/heartbeat/crm/cib.xml",
            "TmpDir"         : "/tmp",
            # NOTE(review): hard-coded /usr/lib while the lrmadmin commands
            # above use @libdir@ - confirm whether this should follow suit.
            "BreakCommCmd2"  : "/usr/lib/heartbeat/TestHeartbeatComm break-communication %s>/dev/null 2>&1",
            "IsIPAddrRscRunning"   : "",
            "StandbyCmd"     : "@sbindir@/crm_standby -U %s -v %s 2>/dev/null",
            "UUIDQueryCmd"   : "@sbindir@/crmadmin -N",
            "StandbyQueryCmd": "@sbindir@/crm_standby -GQ -U %s 2>/dev/null",

            # Patterns to look for in the log files for various occasions...
            "Pat:DC_IDLE"      : "crmd:.*State transition.*-> S_IDLE",

            # This won't work if we have multiple partitions
            # Use: "Pat:They_started" : "%s crmd:.*State transition.*-> S_NOT_DC",
            "Pat:They_started" : "Updating node state to member for %s",
            "Pat:We_started"   : "%s crmd:.*State transition.*-> S_IDLE",
            "Pat:We_stopped"   : "%s heartbeat.*Heartbeat shutdown complete",
            "Pat:Logd_stopped" : "%s logd:.*Exiting write process",
            "Pat:They_stopped" : "%s crmd:.*LOST:.* %s ",
            "Pat:All_stopped"  : "%s .*heartbeat.*Heartbeat shutdown complete",
            "Pat:They_dead"    : "node %s.*: is dead",
            "Pat:TransitionComplete" : "Transition status: Complete: complete",

            # Bad news Regexes.  Should never occur.
            "BadRegexes"   : (
                r"Shutting down\.",
                r"Forcing shutdown\.",
                r"Timer I_TERMINATE just popped",
                r"input=I_ERROR",
                r"input=I_FAIL",
                r"input=I_INTEGRATED cause=C_TIMER_POPPED",
                r"input=I_FINALIZED cause=C_TIMER_POPPED",
                r"input=I_ERROR",
                r", exiting\.",
                r"WARN.*Ignoring HA message.*vote.*not in our membership list",
                r"pengine:.*Attempting recovery of resource",
                r"pengine:.*Handling failed ",
                r"tengine:.*is taking more than 2x its timeout",
                r"Confirm not received from",
                r"Welcome reply not received from",
                r"Attempting to schedule .* after a stop",
                r"ERROR:",
                r"CRIT:",
            ),
        })
        del self["Standby"]
        self.check_transitions = 0
        self.check_elections = 0
        self.CIBsync = {}

        # NOTE(review): the template literals below are reproduced exactly as
        # they appear in the reviewed text, where each one is a single blank.
        # They are %-formatted, so presumably they carried XML fragments with
        # %s placeholders that were lost before review - confirm against the
        # real file before relying on this block.
        cib_prefix=''' '''
        cib_options=""" """ % self.Env["SuppressCib"]
        cib_glue_1=''' '''
        cib_glue_2=''' '''
        cib_suffix=''' '''
        resources=''' '''
        constraints=''' '''

        if self.Env["CIBResource"] == 1:
            self.log("Enabling DC resource")
            resources=''' ''' % self.Env["IPBase"]
            if self.cluster_monitor == 1:
                resources=resources+''' '''
            # DcIPaddr can't run anywhere but the DC
            constraints=''' '''
            if self.cluster_monitor == 1:
                constraints=constraints+''' '''

            fields = self.Env["IPBase"].split('.')
            for node in self.Env["nodes"]:
                # These resources prefer to run on the node with the same name.
                # Each node gets the next address after the current IPBase.
                fields[3] = str(int(fields[3])+1)
                ip = '.'.join(fields)
                self.Env["IPBase"] = ip
                node_resource=(""" """ %("rsc_"+node, "rsc_"+node, "rsc_"+node, ip))
                resources = resources + node_resource
                node_constraint=(""" """ % ("rsc_"+node, "rsc_"+node, "rsc_"+node, "rsc_"+node, node))
                constraints = constraints + node_constraint

        # always add the fencing resource so that we test incarnations
        nodelist = "".join([node + " " for node in self.Env["nodes"]])
        stonith_resource=(""" """ %(len(self.Env["nodes"]), nodelist))
        resources = resources + stonith_resource

        if self.Env["DoFencing"] == 1:
            cib_options=cib_options + ''' '''

        self.default_cts_cib=cib_prefix + cib_options + cib_glue_1 + \
            resources + cib_glue_2 + constraints + cib_suffix
        self.debug(self.default_cts_cib)

    def errorstoignore(self):
        '''Return list of errors which are known and very noisy should be ignored'''
        # At some point implement a more elegant solution that
        #   also produces a report at the end
        return [ "crmadmin:" ]

    def _write_default_cib(self):
        '''Write the generated default CIB to DEFAULT_CIB_PATH and return the path.

        The file is written directly instead of through a shell "echo", so
        quote characters in the CIB cannot break the command and a failed
        write raises instead of being silently ignored.  A trailing newline
        is appended, as "echo" did.
        '''
        cib_file = open(DEFAULT_CIB_PATH, "w")
        try:
            cib_file.write(self.default_cts_cib + "\n")
        finally:
            cib_file.close()
        return DEFAULT_CIB_PATH

    def install_config(self, node):
        '''Copy the CIB to node, once per node, when Env["ClobberCIB"] is 1.

        The generated default CIB is used when Env["CIBfilename"] is None,
        otherwise the named file is copied.
        Raises ValueError when the remote copy fails.
        '''
        if node in self.CIBsync or self.Env["ClobberCIB"] != 1:
            return

        self.CIBsync[node] = 1
        target = "root@" + (self["CIBfile"]%node)

        if self.Env["CIBfilename"] is None:
            self.debug("Installing Generated CIB on node %s" %(node))
            local_cib = self._write_default_cib()
            try:
                if 0!=self.rsh.cp(local_cib, target):
                    raise ValueError("Can not scp file to %s "%node)
            finally:
                # Remove the staged copy even when the remote copy failed.
                if os.path.exists(local_cib):
                    os.remove(local_cib)
        else:
            self.debug("Installing CIB (%s) on node %s" %(self.Env["CIBfilename"], node))
            if 0!=self.rsh.cp(self.Env["CIBfilename"], target):
                raise ValueError("Can not scp file to %s "%node)

    def prepare(self):
        '''Finish the Initialization process. Prepare to test...'''
        for node in self.Env["nodes"]:
            self.ShouldBeStatus[node] = ""
            self.StataCM(node)

    def test_node_CM(self, node):
        '''Report the status of the cluster manager on a given node

        Returns 0 when the node is down, 1 when it is up but not yet
        stable and 2 when it is up and stable.  ShouldBeStatus[node] is
        updated to match what was observed.
        '''
        watchpats = [ ]
        watchpats.append("Current state: (S_IDLE|S_NOT_DC)")
        watchpats.append(self["Pat:They_started"]%node)
        idle_watch = CTS.LogWatcher(self["LogFileName"], watchpats)
        idle_watch.setwatch()

        out = self.rsh.readaline(node, self["StatusCmd"]%node)
        # Empty output is treated as "not ok" rather than crashing on it.
        ret = bool(out) and 'ok' in out
        self.debug("Node %s status: %s" %(node, out))

        if not ret:
            if self.ShouldBeStatus[node] == self["up"]:
                self.log(
                    "Node status for %s is %s but we think it should be %s"
                    %(node, self["down"], self.ShouldBeStatus[node]))
            self.ShouldBeStatus[node]=self["down"]
            return 0

        if self.ShouldBeStatus[node] == self["down"]:
            self.log(
                "Node status for %s is %s but we think it should be %s: %s"
                %(node, self["up"], self.ShouldBeStatus[node], out))
        self.ShouldBeStatus[node]=self["up"]

        # check the output first - because syslog-ng loses messages
        if 'S_NOT_DC' in out or 'S_IDLE' in out:
            # Up and stable
            return 2

        # fall back to syslog-ng and wait
        if not idle_watch.look():
            # just up
            self.debug("Warn: Node %s is unstable: %s" %(node, out))
            return 1

        # Up and stable
        return 2

    # Is the node up or is the node down
    def StataCM(self, node):
        '''Return 1 when the cluster manager on node is up, None otherwise'''
        if self.test_node_CM(node) > 0:
            return 1
        return None

    # Being up and being stable is not the same question...
    def node_stable(self, node):
        '''Return 1 when node is up and stable; otherwise log a warning and return None'''
        if self.test_node_CM(node) == 2:
            return 1
        self.log("Warn: Node %s not stable" %(node))
        return None

    def cluster_stable(self, timeout=None):
        '''Return 1 when no node is expected up or an idle pattern is logged.

        timeout -- passed to the log watcher; defaults to self["DeadTime"].
        Returns None (after logging a warning) when neither idle pattern
        shows up in time.
        '''
        watchpats = [ ]
        watchpats.append("Current state: S_IDLE")
        watchpats.append(self["Pat:DC_IDLE"])

        if timeout is None:
            timeout = self["DeadTime"]

        idle_watch = CTS.LogWatcher(self["LogFileName"], watchpats, timeout)
        idle_watch.setwatch()

        any_up = 0
        for node in self.Env["nodes"]:
            # have each node dump its current state
            if self.ShouldBeStatus[node] == self["up"]:
                self.rsh.readaline(node, (self["StatusCmd"] %node) )
                any_up = 1

        if any_up == 0 or idle_watch.look():
            return 1

        self.log("Warn: Cluster Master not IDLE")
        return None

    def is_node_dc(self, node, status_line=None):
        '''Return 1 when the status line for node shows one of the DC states, else 0.

        status_line -- output of StatusCmd; queried from node when not given.
        '''
        if not status_line:
            status_line = self.rsh.readaline(node, self["StatusCmd"]%node)
        if not status_line:
            return 0

        dc_states = ('S_IDLE', 'S_INTEGRATION', 'S_FINALIZE_JOIN',
                     'S_POLICY_ENGINE', 'S_TRANSITION_ENGINE')
        for state in dc_states:
            if state in status_line:
                self.debug("%s _is_ the DC" % node)
                return 1
        return 0

    def isolate_node(self, node, allowlist):
        '''isolate the communication between the nodes

        Returns 1 on success, None (after logging) on failure.
        '''
        rc = self.rsh(node, self["BreakCommCmd2"]%allowlist)
        if rc == 0:
            return 1
        # The node name is formatted into the message, like every other log
        # call in this class, rather than passed as a second argument.
        self.log("Could not break the communication from node: %s" % node)
        return None

    def Configuration(self):
        '''Return the first "configuration" element of the CIB under test.

        With Env["ClobberCIB"] == 1 the CIB comes from the generated default
        (when Env["CIBfilename"] is None) or from the named file; otherwise
        it is copied from the first node in Env["nodes"].
        Raises ValueError when that copy fails.
        '''
        if self.Env["ClobberCIB"] == 1:
            if self.Env["CIBfilename"] is None:
                # The staged file is intentionally left in place afterwards.
                cib = parse(self._write_default_cib())
            else:
                cib = parse(self.Env["CIBfilename"])
        else:
            local_cib = "%s/cts_cib_%s.xml"%(self["TmpDir"],str(os.getpid()))
            if 0!=self.rsh.cp("root@"+self["CIBfile"]%self.Env["nodes"][0],local_cib):
                raise ValueError("Can not copy file to %s, maybe permission denied"%self["TmpDir"])
            try:
                cib = parse(local_cib)
            finally:
                # Do not leave the copy behind when parsing fails.
                os.remove(local_cib)
        return cib.getElementsByTagName('configuration')[0]

    def Resources(self):
        '''Return HAResource objects for every resource in the configuration.

        Covers primitives directly under "resources", primitives inside
        groups, and one entry per clone instance (clone_max of them).
        '''
        ResourceList = []
        #read resources in cib
        configuration = self.Configuration()
        resources = configuration.getElementsByTagName('resources')[0]
        rscs = configuration.getElementsByTagName('primitive')
        incs = configuration.getElementsByTagName('clone')
        groups = configuration.getElementsByTagName('group')

        for rsc in rscs:
            if rsc in resources.childNodes:
                ResourceList.append(HAResource(self,rsc))

        for grp in groups:
            for rsc in rscs:
                if rsc in grp.childNodes:
                    ResourceList.append(HAResource(self,rsc,grp.getAttribute('id')))

        for inc in incs:
            clone_max = 0
            inc_name = inc.getAttribute("id")
            instance_attributes = inc.getElementsByTagName('instance_attributes')[0]
            attributes = instance_attributes.getElementsByTagName('attributes')[0]
            for nvpair in attributes.getElementsByTagName('nvpair'):
                if nvpair.getAttribute("name") == "clone_max":
                    clone_max = int(nvpair.getAttribute("value"))
            inc_rsc = inc.getElementsByTagName('primitive')[0]
            for i in range(0,clone_max):
                rsc = HAResource(self,inc_rsc)
                rsc.inc_no = i
                rsc.inc_name = inc_name
                rsc.inc_max = clone_max
                rsc.rid = inc_name+":"+rsc.rid + ":%d"%i
                rsc.Instance = rsc.rid
                ResourceList.append(rsc)
        return ResourceList

    def ResourceGroups(self):
        '''Return one list of HAResource objects per "group" element.'''
        GroupList = []
        #read resources in cib
        configuration = self.Configuration()
        groups = configuration.getElementsByTagName('group')
        rscs = configuration.getElementsByTagName('primitive')
        for grp in groups:
            group = []
            GroupList.append(group)
            for rsc in rscs:
                if rsc in grp.childNodes:
                    group.append(HAResource(self,rsc,grp.getAttribute('id')))
        return GroupList

    def Dependencies(self):
        '''Return one dict per "rsc_to_rsc" element.

        Each dict has the keys "id", "from", "to", "type" and "strength".
        '''
        DependencyList = []
        #read dependency in cib
        configuration=self.Configuration()
        for node in configuration.getElementsByTagName('rsc_to_rsc'):
            dependency = {}
            for attr in ("id", "from", "to", "type", "strength"):
                dependency[attr]=node.getAttribute(attr)
            DependencyList.append(dependency)
        return DependencyList

    def find_partitions(self):
        '''Return the distinct partition strings reported by the nodes expected up.'''
        ccm_partitions = []

        for node in self.Env["nodes"]:
            self.debug("Retrieving partition details for %s" %node)
            if self.ShouldBeStatus[node] != self["up"]:
                continue

            partition = self.rsh.readaline(node, self["ParitionCmd"])
            if not partition:
                self.log("no partition details for %s" %node)
            elif len(partition) > 2:
                # Drop the last character (the line terminator of the output).
                partition = partition[:-1]
                self.debug("partition details for %s: %s" %(node, partition))
                if partition not in ccm_partitions:
                    self.debug("Adding partition")
                    ccm_partitions.append(partition)
            else:
                self.log("bad partition details for %s" %node)

        return ccm_partitions

    def HasQuorum(self, node_list):
        '''Report quorum as seen by the first node in node_list that should be up.

        Returns a true value when that node's QuorumCmd output contains "1",
        and 0 when no listed node is up or the command produced no output.
        '''
        # If we are auditing a partition, then one side will
        #   have quorum and the other not.
        # So the caller needs to tell us which we are checking
        # If no value for node_list is specified... assume all nodes
        if not node_list:
            node_list = self.Env["nodes"]

        for node in node_list:
            if self.ShouldBeStatus[node] == self["up"]:
                quorum = self.rsh.readaline(node, self["QuorumCmd"])
                if not quorum:
                    return 0
                return "1" in quorum
        return 0

    def Components(self):
        '''Return the Process objects for the daemons making up the cluster stack.'''
        complist = [Process("lrmd",self),Process("crmd",self)]
        if self.Env["DoFencing"] == 1 :
            complist.append(Process("stonithd",self))
        complist.append(Process("heartbeat",self))
        return complist

    def NodeUUID(self, node):
        '''Return the parenthesised value following node in UUIDQueryCmd output, or "".'''
        # The node name is escaped so that characters such as "." in a host
        # name are matched literally.
        pattern = r'%s.+\((.+)\)' % re.escape(node)
        for line in self.rsh.readlines(node, self["UUIDQueryCmd"]):
            self.debug("UUIDLine:"+ line)
            m = re.search(pattern, line)
            if m:
                return m.group(1)
        return ""

    def StandbyStatus(self, node):
        '''Return the StandbyQueryCmd output for node, or "off" when there is none.'''
        out=self.rsh.readaline(node, self["StandbyQueryCmd"]%node)
        if not out:
            return "off"
        # Drop the last character (the line terminator of the output).
        out = out[:-1]
        self.debug("Standby result: "+out)
        return out

    # status == "on" : Enter Standby mode
    # status == "off": Enter Active mode
    def SetStandbyMode(self, node, status):
        '''Run StandbyCmd on node with the requested status; always returns True.'''
        # The current status is queried (StandbyStatus logs it) but does not
        # gate the change.
        self.StandbyStatus(node)
        # NOTE(review): the exit status of the command is not checked, so a
        # failed change is still reported as success - confirm intent.
        self.rsh(node, self["StandbyCmd"] % (node, status))
        return True


class HAResource(Resource):
    '''A cluster resource described by a "primitive" XML node from the CIB.'''

    def __init__(self, cm, node, group=None):
        '''
        Get information from xml node

        cm    -- the cluster manager object, handed on to Resource.__init__.
        node  -- XML element providing the "id", "class" and "type" attributes.
        group -- optional group id; when given, rid becomes "group:id".
        '''
        if group is None:
            self.rid = str(node.getAttribute('id'))
        else:
            self.rid = group + ":" + str(node.getAttribute('id'))
        self.rclass = str(node.getAttribute('class'))
        self.rtype = str(node.getAttribute('type'))
        self.inc_name = None
        self.inc_no = -1
        self.inc_max = -1
        self.rparameters = {}

        # Collect the nvpairs of the first instance_attributes/attributes pair.
        inst_attrs = node.getElementsByTagName('instance_attributes')
        if len(inst_attrs) > 0:
            attr_sets = inst_attrs[0].getElementsByTagName('attributes')
            if len(attr_sets) > 0:
                for nvpair in attr_sets[0].getElementsByTagName('nvpair'):
                    name=nvpair.getAttribute('name')
                    value=nvpair.getAttribute('value')
                    self.rparameters[name]=value

        # This should normally be called first... FIXME!
        Resource.__init__(self, cm, self.rtype, self.rid)

        # resources that don't need quorum will have a "start" op whose
        # prereq attribute is "nothing"
        for op in node.getElementsByTagName('op'):
            if op.getAttribute('name') == "start" and op.getAttribute('prereq') == "nothing":
                self.needs_quorum = 0

        if self.needs_quorum == 0:
            self.CM.debug("Resource %s does not need quorum" % self.rid)

    def IsRunningOn(self, nodename):
        '''
        This member function returns true if our resource is running
        on the given node in the cluster.
        We call the status operation for the resource script.
        '''
        out=self.CM.rsh.readaline(nodename, self.CM["IsRscRunning"]%self.rid)
        if not out:
            # No output from the monitor command: not running.
            return None
        return re.search("0",out)

    def RunningNodes(self):
        '''Return the nodes expected up on which this resource is running.'''
        ResourceNodes = []
        for node in self.CM.Env["nodes"]:
            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                if self.IsRunningOn(node):
                    ResourceNodes.append(node)
        return ResourceNodes

    def _ResourceOperation(self, operation, nodename):
        '''
        Execute an operation on the resource

        Returns true when the last remote command exit status was 0.
        '''
        self.CM.rsh.readaline(nodename, self.CM["ExecuteRscOp"]%(self.rid,operation))
        return self.CM.rsh.lastrc == 0

    def Start(self, nodename):
        '''
        This member function starts or activates the resource.
        '''
        return self._ResourceOperation("start", nodename)

    def Stop(self, nodename):
        '''
        This member function stops or deactivates the resource.
        '''
        return self._ResourceOperation("stop", nodename)

    def IsWorkingCorrectly(self, nodename):
        '''Return true when the "monitor" operation succeeds on nodename.'''
        return self._ResourceOperation("monitor", nodename)


#######################################################################
#
#   A little test code...
#
#   Which you are advised to completely ignore...
#
#######################################################################
if __name__ == '__main__':
    pass