diff --git a/cts/CM_LinuxHAv2.py.in b/cts/CM_LinuxHAv2.py.in
index 766a1f030d..4e28c815f7 100755
--- a/cts/CM_LinuxHAv2.py.in
+++ b/cts/CM_LinuxHAv2.py.in
@@ -1,633 +1,621 @@
#!@PYTHON@
'''CTS: Cluster Testing System: LinuxHA v2 dependent modules...
'''
__copyright__='''
Author: Huang Zhen <zhenhltc@cn.ibm.com>
Copyright (C) 2004 International Business Machines
Additional Audits, Revised Start action, Default Configuration:
Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
'''
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import CTS
from CTS import *
from CM_hb import HeartbeatCM
from xml.dom.minidom import *
import CTSaudits
from CTSaudits import ClusterAudit
import CTStests
from CTStests import *
#######################################################################
#
# LinuxHA v2 dependent modules
#
#######################################################################
class LinuxHAv2(HeartbeatCM):
'''
The linux-ha version 2 cluster manager class.
It implements the things we need to talk to and manipulate
linux-ha version 2 clusters
'''
def __init__(self, Environment, randseed=None):
HeartbeatCM.__init__(self, Environment, randseed=randseed)
self.update({
"Name" : "linux-ha-v2",
"DeadTime" : 600,
"StableTime" : 10,
"StartCmd" : "@libdir@/heartbeat/heartbeat >/dev/null 2>&1",
"StopCmd" : "@libdir@/heartbeat/heartbeat -k",
"StatusCmd" : "@libdir@/heartbeat/crmadmin -S %s 2>/dev/null",
"EpocheCmd" : "@libdir@/heartbeat/ccm_tool -e",
"QuorumCmd" : "@libdir@/heartbeat/ccm_tool -q",
"ParitionCmd" : "@libdir@/heartbeat/ccm_tool -p",
"IsRscRunning" : "@libdir@/heartbeat/lrmadmin -E %s status 0 0 EVERYTIME 2>/dev/null|grep return",
"ExecuteRscOp" : "@libdir@/heartbeat/lrmadmin -E %s %s 0 0 EVERYTIME 2>/dev/null",
"CIBfile" : "%s:@HA_VARLIBDIR@/heartbeat/crm/cib.xml",
"IsIPAddrRscRunning" : "",
# Patterns to look for in the log files for various occasions...
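# (callers fill in the %s placeholders with a node name, e.g. in StartaCM)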
"Pat:DC_IDLE" : "crmd:.*State transition.*-> S_IDLE",
"Pat:We_started" : "%s crmd:.*State transition.*-> (S_NOT_DC|S_IDLE)",
"Pat:They_started" : "%s crmd:.*State transition.*-> (S_NOT_DC|S_IDLE)",
"Pat:We_stopped" : ("%s heartbeat.*Heartbeat shutdown complete" %(self.OurNode)),
"Pat:They_stopped" : "%s heartbeat.*Heartbeat shutdown complete",
"Pat:All_stopped" : "%s heartbeat.*Heartbeat shutdown complete",
# Bad news Regexes. Should never occur.
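# (the CTS framework scans the cluster logs for these and reports any match)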
"BadRegexes" : (
r"Shutting down\.",
r"Forcing shutdown\.",
r"Timer I_TERMINATE just popped",
r", exiting\.",
r"ERROR:",
r"CRIT:",
),
})
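# A minimal CIB used when the test environment asks us to clobber the
# existing configuration and no replacement file was supplied (see
# StartaCM below). The rsc_location constraint confines DcIPaddr to
# whichever node is currently the DC.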
self.default_cts_cib='''
<cib cib_feature_revision="1" num_updates="1" have_quorum="false" epoche="1">
<configuration>
<crm_config/>
<nodes/>
<resources>
</resources>
<constraints>
<rsc_location id="run_DcIPaddr" rsc="DcIPaddr">
<rule id="can_run_DcIPaddr" result="can" boolean_op="and">
<expression attribute="is_dc" operation="eq" value="true"/>
</rule>
</rsc_location>
</constraints>
</configuration>
<status/>
</cib>
'''
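# With CIBResource set, use a CIB that actually defines the DcIPaddr
# resource (an IPaddr2 on 127.0.0.10), so the constraint above has
# something to place on the DC.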
if self.Env["CIBResource"] == 1:
print("Enabling DC resource")
self.default_cts_cib='''
<cib cib_feature_revision="1" num_updates="1" have_quorum="false" epoche="1">
<configuration>
<crm_config/>
<nodes/>
<resources>
<resource id="DcIPaddr" class="heartbeat" type="IPaddr2" priority="1.0">
<instance_attributes>
<rsc_parameters>
<nvpair name="1" value="127.0.0.10"/>
</rsc_parameters>
</instance_attributes>
</resource>
</resources>
<constraints>
<rsc_location id="run_DcIPaddr" rsc="DcIPaddr">
<rule id="can_run_DcIPaddr" result="can" boolean_op="and">
<expression attribute="is_dc" operation="eq" value="true"/>
</rule>
</rsc_location>
</constraints>
</configuration>
<status/>
</cib>
'''
# KLUDGE! Expedient, but a Kludge (FIXME)
# CTStests.AllTestClasses = [FlipTest,RestartTest,StartOnebyOne,SimulStart,SimulStop,Split_brainTest,BandwidthTest]
# StartOnebyOne is redundant as it is performed before SimulStop
CTStests.AllTestClasses = [FlipTest, RestartTest, SimulStart, SimulStop]
# CTSaudits.AllAuditClasses = [CrmdStateAudit, HAResourceAudit]
CTSaudits.AllAuditClasses = [CrmdStateAudit, PartitionAudit]
- def errorstoignore(self):
- '''Return list of errors which are 'normal' and should be ignored'''
- if 0:
- return [ "heartbeat.*ERROR: Respawning client \"/usr/lib/heartbeat/ha_logd\"",
- "heartbeat.*ERROR: Irretrievably lost packet",
- "heartbeat.*ERROR: Cannot rexmit pkt .*: seqno too low",
- "heartbeat.*ERROR: Cannot rexmit pkt .*: seqno not found",
- "heartbeat.*ERROR: channel is not connected",
- "ccm: .*ERROR: .*dropping message of type.*Is this a Byzantime failure?"
- ]
- return []
-
def StataCM(self, node):
'''Report the status of the cluster manager on a given node'''
out=self.rsh.readaline(node, self["StatusCmd"]%node)
ret= (string.find(out, 'ok') != -1)
try:
if ret:
if self.ShouldBeStatus[node] != self["up"]:
self.log(
"Node status for %s is %s but we think it should be %s"
% (node, self["up"], self.ShouldBeStatus[node]))
self.log("Expected: %s. Actual: %s"
% (self.ShouldBeStatus[node], out))
else:
if self.ShouldBeStatus[node] != self["down"]:
self.log(
"Node status for %s is %s but we think it should be %s"
% (node, self["down"], self.ShouldBeStatus[node]))
except KeyError: pass
if ret: self.ShouldBeStatus[node]=self["up"]
else: self.ShouldBeStatus[node]=self["down"]
return ret
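# Start sequence: arm a LogWatcher for the expected startup patterns,
# optionally push a fresh CIB onto the node, launch heartbeat, then fall
# back to polling crmadmin if the patterns never show up in the logs.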
def StartaCM(self, node):
'''Start up the cluster manager on a given node'''
patterns = []
patterns.append(self["Pat:We_started"]%node)
# only search for this pattern if there is another node out there
# that should be the DC
if self.any_running() == 1:
patterns.append(self["Pat:DC_IDLE"])
watch = CTS.LogWatcher(self["LogFileName"], patterns, 120)
watch.setwatch()
self.log ("Starting %s on node %s" %(self["Name"], node))
if self.Env["ClobberCIB"] != None:
if self.Env["CIBfilename"] == None:
os.system("rm -f /tmp/cts.default.cib")
os.system("echo \'" + self.default_cts_cib + "\' > /tmp/cts.default.cib")
self.rsh.cp("/tmp/cts.default.cib", self["CIBfile"]%node)
os.system("rm -f /tmp/cts.default.cib")
else:
self.rsh.cp(self.Env["CIBfilename"], self["CIBfile"]%node)
self.rsh(node, self["StartCmd"])
self.ShouldBeStatus[node]=self["up"]
if watch.lookforall():
return 1
# the watch() failed... let's check to see if the start _really_ failed
for regex in watch.unmatched:
self.log ("Startup pattern not found: %s" %(regex))
out = self.rsh.readaline(node, self["StatusCmd"])
if string.find(out, 'ok') == -1:
# yep, it _really_ failed
self.ShouldBeStatus[node]=self["down"]
self.log ("Could not start %s on node %s" %(self["Name"], node))
return None
ret=(string.find(out, 'S_NOT_DC') != -1)
if ret:
# actually we joined the cluster just fine
self.log ("%s on %s joined the cluster" %(self["Name"], node))
return 1
ret= (string.find(out, 'S_IDLE') != -1)
if ret:
# actually we joined the cluster just fine
self.log ("%s on %s joined the cluster as DC" %(self["Name"], node))
return 1
self.log ("%s on %s started but unstable: %s"
%(self["Name"], node, out))
# self.ShouldBeStatus[node]=self["down"]
return None
def Configuration(self):
if not self.rsh.cp(self["CIBfile"]%self.Env["nodes"][0],self.Env["HAdir"]):
raise ValueError("Cannot copy file to %s, maybe permission denied"%self.Env["HAdir"])
cib=parse("%s/cib.xml"%self.Env["HAdir"])
return cib.getElementsByTagName('configuration')[0]
def Resources(self):
ResourceList = []
#read resources in cib
configuration=self.Configuration()
resources=configuration.getElementsByTagName('resources')[0]
rscs=configuration.getElementsByTagName('resource')
for rsc in rscs:
ResourceList.append(HAResource(self,rsc))
return ResourceList
def Dependancies(self):
DependancyList = []
#read dependencies from the cib
configuration=self.Configuration()
constraints=configuration.getElementsByTagName('constraints')[0]
rsc_to_rscs=configuration.getElementsByTagName('rsc_to_rsc')
for node in rsc_to_rscs:
dependancy = {}
dependancy["id"]=node.getAttribute('id')
dependancy["from"]=node.getAttribute('from')
dependancy["to"]=node.getAttribute('to')
dependancy["type"]=node.getAttribute('type')
dependancy["strength"]=node.getAttribute('strength')
DependancyList.append(dependancy)
return DependancyList
def any_running(self):
for node in self.Env["nodes"]:
if self.ShouldBeStatus[node] == self["up"]:
return 1
return 0
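# Audits that every resource is running on exactly one node, and that
# resources tied together by a "must" placement constraint ended up on
# the same node.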
class HAResourceAudit(ClusterAudit):
def __init__(self, cm):
self.CM = cm
def _RscRunningNodes(self, resource):
ResourceNodes = []
for node in self.CM.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
if resource.IsRunningOn(node):
ResourceNodes.append(node)
return ResourceNodes
def __call__(self):
self.CM.log ("Do Audit %s"%self.name())
passed = 1
NodeofRsc = {}
#Make sure the resources are running on one and only one node
Resources = self.CM.Resources()
for resource in Resources :
RunningNodes = self._RscRunningNodes(resource)
NodeofRsc[resource.rid]=RunningNodes
if len(RunningNodes) == 0 :
print resource.rid + " isn't running anywhere"
passed = 0
if len(RunningNodes) > 1:
print resource.rid + " is running more than once: " \
+ str(RunningNodes)
passed = 0
#Make sure resources with a "must" placement constraint are running on the same node
Dependancies = self.CM.Dependancies()
for dependancy in Dependancies:
if dependancy["type"] == "placement" and dependancy["strength"] == "must":
if NodeofRsc[dependancy["from"]] != NodeofRsc[dependancy["to"]]:
print dependancy["from"] + " and " + dependancy["to"] + " should be running on the same node"
passed = 0
return passed
def name(self):
return "HAResourceAudit"
class HAResource(Resource):
def __init__(self, cm, node):
'''
Get information from xml node
'''
self.rid = node.getAttribute('id')
self.rclass = node.getAttribute('class')
self.rtype = node.getAttribute('type')
self.rparameters = {}
attributes = node.getElementsByTagName('instance_attributes')[0]
parameters = node.getElementsByTagName('rsc_parameters')[0]
nvpairs = node.getElementsByTagName('nvpair')
for nvpair in nvpairs:
name=nvpair.getAttribute('name')
value=nvpair.getAttribute('value')
self.rparameters[name]=value
Resource.__init__(self, cm, self.rtype, self.rid)
def IsRunningOn(self, nodename):
'''
This member function returns true if our resource is running
on the given node in the cluster.
We call the status operation for the resource script.
'''
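# lrmadmin's "return" line carries the operation's return code; a "0"
# anywhere in that line is taken to mean rc 0, i.e. the resource is up.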
out=self.CM.rsh.readaline(nodename, self.CM["IsRscRunning"]%self.rid)
return re.search("0",out)
def RunningNodes(self):
ResourceNodes = []
for node in self.CM.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
if self.IsRunningOn(node):
ResourceNodes.append(node)
return ResourceNodes
def _ResourceOperation(self, operation, nodename):
'''
Execute an operation on the resource
'''
self.CM.rsh.readaline(nodename, self.CM["ExecuteRscOp"]%(self.rid,operation))
return self.CM.rsh.lastrc == 0
def Start(self, nodename):
'''
This member function starts or activates the resource.
'''
return self._ResourceOperation("start", nodename)
def Stop(self, nodename):
'''
This member function stops or deactivates the resource.
'''
return self._ResourceOperation("stop", nodename)
def IsWorkingCorrectly(self, nodename):
return self._ResourceOperation("monitor", nodename)
class CrmdStateAudit(ClusterAudit):
def __init__(self, cm):
self.CM = cm
self.Stats = {"calls":0
, "success":0
, "failure":0
, "skipped":0
, "auditfail":0}
def has_key(self, key):
return self.Stats.has_key(key)
def __setitem__(self, key, value):
self.Stats[key] = value
def __getitem__(self, key):
return self.Stats[key]
def incr(self, name):
'''Increment (or initialize) the value associated with the given name'''
if not self.Stats.has_key(name):
self.Stats[name]=0
self.Stats[name] = self.Stats[name]+1
def __call__(self):
self.CM.log ("Do Audit %s"%self.name())
passed = 1
dc_list = []
up_count = 0
node_count = 0
up_are_down = 0
down_are_up = 0
slave_count = 0
unstable_list = []
for node in self.CM.Env["nodes"]:
out=self.CM.rsh.readaline(node, self.CM["StatusCmd"]%node)
ret = (string.find(out, 'ok') != -1)
node_count = node_count + 1
if ret:
up_count = up_count + 1
if self.CM.ShouldBeStatus[node] == self.CM["down"]:
self.CM.log(
"Node %s %s when it should be %s"
% (node, self.CM["up"], self.CM.ShouldBeStatus[node]))
self.CM.ShouldBeStatus[node] = self.CM["up"]
down_are_up = down_are_up + 1
ret= (string.find(out, 'S_NOT_DC') != -1)
if ret:
slave_count = slave_count + 1
else:
ret= (string.find(out, 'S_IDLE') != -1)
if ret:
dc_list.append(node)
else:
unstable_list.append(out)
else:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
self.CM.log(
"Node %s %s when it should be %s"
% (node, self.CM["down"], self.CM.ShouldBeStatus[node]))
self.CM.ShouldBeStatus[node] = self.CM["down"]
up_are_down = up_are_down + 1
# if up_count > 0 and len(dc_list) != 1:
# passed = 0
# self.CM.log("Exactly 1 node should be DC. We found %d (of %d): %s"
# %(len(dc_list), up_count, str(dc_list)))
if len(unstable_list) > 0:
passed = 0
self.CM.log("Cluster is not stable: %d (of %d)."
%(len(unstable_list), up_count))
for status in unstable_list:
self.CM.log("%s" %(status))
if up_are_down > 0:
passed = 0
self.CM.log("%d (of %d) nodes expected to be up were down."
%(up_are_down, node_count))
if down_are_up > 0:
passed = 0
self.CM.log("%d (of %d) nodes expected to be down were up."
%(down_are_up, node_count))
return passed
def name(self):
return "CrmdStateAudit"
class PartitionAudit(ClusterAudit):
def __init__(self, cm):
self.CM = cm
self.Stats = {"calls":0
, "success":0
, "failure":0
, "skipped":0
, "auditfail":0}
self.NodeEpoche={}
self.NodeState={}
self.NodeQuorum={}
self.NodeCCM={}
def has_key(self, key):
return self.Stats.has_key(key)
def __setitem__(self, key, value):
self.Stats[key] = value
def __getitem__(self, key):
return self.Stats[key]
def incr(self, name):
'''Increment (or initialize) the value associated with the given name'''
if not self.Stats.has_key(name):
self.Stats[name]=0
self.Stats[name] = self.Stats[name]+1
def __call__(self):
self.CM.log ("Do Audit %s"%self.name())
passed = 1
nodes_up = 0
ccm_partitions = []
for node in self.CM.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
nodes_up = nodes_up + 1
# self.PS_State[node] = os.system("@SSH@ root@%s ps -C crmd" %(node))
self.NodeQuorum[node] = self.CM.rsh.readaline(
node, self.CM["QuorumCmd"])
self.NodeCCM[node] = self.CM.rsh.readaline(
node, self.CM["ParitionCmd"])
self.NodeEpoche[node] = self.CM.rsh.readaline(
node, self.CM["EpocheCmd"])
self.NodeState[node] = self.CM.rsh.readaline(
node, self.CM["StatusCmd"]%node)
if len(self.NodeState[node]) > 1:
self.NodeState[node] = self.NodeState[node][:-1]
if len(self.NodeEpoche[node]) > 0:
self.NodeEpoche[node] = int(self.NodeEpoche[node][:-1])
if len(self.NodeQuorum[node]) > 1:
self.NodeQuorum[node] = self.NodeQuorum[node][:-1]
if len(self.NodeCCM[node]) > 1:
self.NodeCCM[node] = self.NodeCCM[node][:-1]
found = 0
for partition in ccm_partitions:
if partition == self.NodeCCM[node]:
found = 1
if found == 0:
ccm_partitions.append(self.NodeCCM[node])
if nodes_up == 0:
return 1
# if len(ccm_partitions) > 1:
# self.CM.log("%d cluster partitions detected:" %len(ccm_partitions))
# for partition in ccm_partitions:
# self.CM.log("\t %s" %partition)
for partition in ccm_partitions:
partition_passed = 0
if self.audit_partition(partition) == 0:
passed = 0
return passed
def audit_partition(self, partition):
passed = 0
dc_found = []
dc_allowed_list = []
lowest_epoche = None
node_list = partition.split()
self.CM.log("Auditing partition: %s" %(partition))
for node in node_list:
if lowest_epoche == None or self.NodeEpoche[node] < lowest_epoche:
lowest_epoche = self.NodeEpoche[node]
for node in node_list:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
if self.is_node_dc(self.NodeState[node]):
dc_found.append(node)
if self.NodeEpoche[node] == lowest_epoche:
passed = 1
else:
self.CM.log("DC %s is not the oldest node (%d vs. %d)"
%(node, self.NodeEpoche[node], lowest_epoche))
passed = 0
if len(dc_found) == 0:
self.CM.log("DC not found on any of the %d allowed nodes: %s (of %s)"
%(len(dc_allowed_list), str(dc_allowed_list), str(node_list)))
elif len(dc_found) > 1:
self.CM.log("%d DCs (%s) found in cluster partition: %s"
%(len(dc_found), str(dc_found), str(node_list)))
passed = 0
if passed == 0:
for node in node_list:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
self.CM.log("epoche %s : %s"
%(self.NodeEpoche[node], self.NodeState[node]))
if self.audit_dc_resources(node_list, dc_found) == 0:
passed = 0
return passed
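# Two invariants for the DcIPaddr resource: every quorate DC must be
# running it, and no non-DC node may be running it.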
def audit_dc_resources(self, node_list, dc_list):
passed = 1
Resources = self.CM.Resources()
for resource in Resources:
self.CM.log("Auditing resource: %s" %(resource))
if resource.rid == "DcIPaddr":
# All DCs are running the resource
for dc in dc_list:
if self.NodeQuorum[dc]:
if resource.IsRunningOn(dc) == 0:
self.CM.log("Resource %s not running on DC: %s"
%(resource, dc))
passed = 0
# All nodes running the resource are DCs
for node in node_list:
if resource.IsRunningOn(node):
if self.is_node_dc(self.NodeState[node]) == 0:
self.CM.log("Resource %s is running on non-DC node %s"
%("DcIPaddr", node))
passed = 0
return passed
def is_node_dc(self, status_line):
return (string.find(status_line, 'S_IDLE') != -1)
def name(self):
return "PartitionAudit"
#######################################################################
#
# A little test code...
#
# Which you are advised to completely ignore...
#
#######################################################################
if __name__ == '__main__':
pass