No OneTemporary
Actions

Size

25 KB

Referenced Files

None

Subscribers

None

View Options

	diff --git a/cts/CTSaudits.py.in b/cts/CTSaudits.py.in
	index aa1e1c077f..abfaab8497 100755
	--- a/cts/CTSaudits.py.in
	+++ b/cts/CTSaudits.py.in
	@@ -1,698 +1,698 @@
	#!@PYTHON@

	'''CTS: Cluster Testing System: Audit module
	'''

	__copyright__='''
	Copyright (C) 2000, 2001,2005 Alan Robertson <alanr@unix.sh>
	Licensed under the GNU GPL.
	'''

	#
	# This program is free software; you can redistribute it and/or
	# modify it under the terms of the GNU General Public License
	# as published by the Free Software Foundation; either version 2
	# of the License, or (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program; if not, write to the Free Software
	# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

	import time, os, popen2, string, re
	import CTS
	import os
	import popen2


	class ClusterAudit:
	    '''Abstract base class for cluster audits.

	    An audit wraps a cluster manager object and is run by calling the
	    instance. The three members below are placeholders that only raise
	    ValueError; concrete audits are expected to override them.
	    '''

	    def __init__(self, cm):
	        '''Remember the cluster manager object (cm) as self.CM.'''
	        self.CM = cm

	    def __call__(self):
	        '''Run the audit. Abstract here: always raises ValueError.'''
	        raise ValueError("Abstract Class member (__call__)")

	    def is_applicable(self):
	        '''Return TRUE if we are applicable in the current test configuration

	        Abstract here: always raises ValueError.
	        '''
	        # An unreachable "return 1" used to follow this raise.
	        raise ValueError("Abstract Class member (is_applicable)")

	    def name(self):
	        '''Return the audit's name. Abstract here: always raises ValueError.'''
	        raise ValueError("Abstract Class member (name)")

	# Registry of audit classes; AuditList() instantiates each entry in order.
	AllAuditClasses = [ ]

	class ResourceAudit(ClusterAudit):
	    '''Audit where the resources of each resource group are being served.

	    For every group returned by CM.ResourceGroups() the audit looks at
	    which "up" nodes run each resource and reports resources that are
	    active without quorum, not served, served more than once, or served
	    by a different node than the rest of their group. Unless monitoring
	    is suppressed it also asks each resource whether it works correctly.
	    '''

	    # Number of audit passes __call__ makes while waiting for resources
	    # to settle, and the pause (in seconds) between two passes.
	    AuditPasses = 5
	    AuditPassDelay = 1

	    def name(self):
	        '''Return the name of this audit.'''
	        return "ResourceAudit"

	    def _doauditRsc(self, resource):
	        '''Return the list of nodes that should be up and on which
	        resource.IsRunningOn() reports the resource as running.
	        This is normally a single node.
	        '''
	        ResourceNodes = []
	        for node in self.CM.Env["nodes"]:
	            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
	                if resource.IsRunningOn(node):
	                    ResourceNodes.append(node)
	        return ResourceNodes

	    def _reapchild(self, child):
	        '''Close the output pipe of a popen2 child and collect its status.'''
	        # Close before waiting: output we never read must not block the child.
	        child.fromchild.close()
	        try:
	            child.wait()
	        except OSError:
	            pass

	    def _auditRscOperation(self, resource, node, result):
	        '''Check that resource works correctly on node.

	        The ARP table is captured before the check, and again after a
	        failed check, and both captures are logged on failure.
	        A problem description is appended to result on failure.
	        Returns 1 if the resource is not working, 0 otherwise.
	        '''
	        failed = 0
	        afterarpchild = None
	        # A plain "|" is required here: "\|" would be passed to arp as a
	        # literal character instead of building a pipeline.
	        beforearpchild = popen2.Popen3("date;/sbin/arp -n|cut -c1-15,26-50,75-"
	                                       , None)
	        beforearpchild.tochild.close() # /dev/null
	        try:
	            if not resource.IsWorkingCorrectly(node):
	                failed = 1
	                afterarpchild = popen2.Popen3("/sbin/arp -n|cut -c1-15,26-50,75-"
	                                              , None)
	                afterarpchild.tochild.close() # /dev/null
	                result.append("Resource " + repr(resource)
	                              + " not operating properly."
	                              + " Resource is running on " + node)
	                self.CM.log("ARP table before failure ========")
	                for line in beforearpchild.fromchild.readlines():
	                    self.CM.log(line)
	                self.CM.log("ARP table after failure ========")
	                for line in afterarpchild.fromchild.readlines():
	                    self.CM.log(line)
	                self.CM.log("End of ARP tables ========")
	        finally:
	            # Reap the children on every path, not only after a failure.
	            self._reapchild(beforearpchild)
	            if afterarpchild is not None:
	                self._reapchild(afterarpchild)
	        return failed

	    def _doaudit(self):
	        '''Check to see if all resources are running in exactly one place
	        in the cluster.
	        We also verify that the members of a resource group are all
	        running on the same node in the cluster,
	        and we monitor that they are all running "properly".

	        Returns a list of problem descriptions (empty when the audit is
	        clean). When a fatal problem was seen, "FATAL" is inserted as the
	        first element.
	        '''
	        Fatal = 0
	        result = []

	        # Env is a project object; its has_key() interface is kept as is.
	        suppress = (self.CM.Env.has_key("SuppressMonitoring")
	                    and self.CM.Env["SuppressMonitoring"])

	        # Thought: use self.CM.find_partitions() and make this audit
	        # aware of partitions. Since in a split cluster one
	        # partition may have quorum (and permission to run resources)
	        # and the other not.

	        for group in self.CM.ResourceGroups():
	            GrpServedBy = None
	            lastResource = None

	            for resource in group:
	                # The set of nodes serving the resource; normally one node.
	                ResourceNodes = self._doauditRsc(resource)

	                # Ask once per resource so both tests see the same answer.
	                quorum = self.CM.HasQuorum(None)

	                if not quorum and len(ResourceNodes) != 0 and resource.needs_quorum:
	                    # Is the resource served without quorum present?
	                    result.append("Resource " + repr(resource)
	                                  + " active without Quorum: "
	                                  + repr(ResourceNodes))

	                elif len(ResourceNodes) == 0 and quorum:
	                    # Is the resource served at all?
	                    result.append("Resource " + repr(resource)
	                                  + " not served anywhere.")

	                elif len(ResourceNodes) > 1:
	                    # Is the resource served too many times?
	                    problem = ("Resource " + repr(resource)
	                               + " served too many times: "
	                               + repr(ResourceNodes))
	                    result.append(problem)
	                    self.CM.log(problem)
	                    Fatal = 1

	                elif GrpServedBy is None:
	                    GrpServedBy = ResourceNodes

	                elif GrpServedBy != ResourceNodes:
	                    # Are all the members of the Rsc Grp served by the same node?
	                    problem = ("Resource group resources" + repr(resource)
	                               + " running on different nodes: "
	                               + repr(ResourceNodes)+" vs "+repr(GrpServedBy)
	                               + "(otherRsc = " + repr(lastResource) + ")")
	                    result.append(problem)
	                    self.CM.log(problem)
	                    Fatal = 1

	                # Is the resource working correctly ?
	                if not suppress and not Fatal and len(ResourceNodes) == 1:
	                    if self._auditRscOperation(resource, ResourceNodes[0], result):
	                        Fatal = 1

	                # Recorded even when monitoring is suppressed, so that the
	                # "otherRsc" part of the group message stays meaningful.
	                lastResource = resource

	        if Fatal:
	            result.insert(0, "FATAL") # Kludgy.

	        return result

	    def __call__(self):
	        '''Run the audit, retrying until it is clean, fatal, or out of passes.

	        Returns true when the last pass found no problems.
	        '''
	        #
	        # Audit the resources. Since heartbeat doesn't really
	        # know when resource acquisition is complete, we will
	        # poll until things get stable.
	        #
	        # Having a resource duplicately implemented is a Fatal Error
	        # with no tolerance granted.
	        #
	        # The number of passes really depends on what resources
	        # you have in the test suite, and how long it takes
	        # for them to settle (it could be 1 for FailSafe).
	        #
	        passes = max(1, self.AuditPasses)
	        audresult = []

	        for attempt in range(passes):
	            audresult = self._doaudit()
	            if len(audresult) == 0 or audresult[0] == "FATAL":
	                break
	            if attempt < passes - 1:
	                time.sleep(self.AuditPassDelay)

	        if len(audresult) > 0:
	            self.CM.log("Fatal Audit error: " + repr(audresult))

	        return (len(audresult) == 0)

	    def is_applicable(self):
	        '''Return 1 when the cluster manager's "Name" is "heartbeat", else 0.'''
	        if self.CM["Name"] == "heartbeat":
	            return 1
	        return 0

	class HAResourceAudit(ClusterAudit):
	    '''Audit resource placement: plain resources, cloned resources
	    (those with an inc_name), resource groups and "must" placement
	    constraints.
	    '''

	    def __init__(self, cm):
	        '''Remember the cluster manager object (cm) as self.CM.'''
	        self.CM = cm

	    def _RscRunningNodes(self, resource):
	        '''Return the list of nodes that should be up and on which
	        resource.IsRunningOn() reports the resource as running.
	        '''
	        ResourceNodes = []
	        for node in self.CM.Env["nodes"]:
	            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
	                if resource.IsRunningOn(node):
	                    ResourceNodes.append(node)
	        return ResourceNodes

	    def _logPlacement(self, rid, nodes):
	        '''Log where the resource with id rid is running, if anywhere.'''
	        if not nodes:
	            self.CM.log("* %s not running" % rid)
	        else:
	            self.CM.log("* %s running on %s"
	                        %(rid, repr(nodes)))

	    def __call__(self):
	        '''Run the audit. Returns 1 if every check passed, 0 otherwise.'''
	        passed = 1
	        NodeofRsc = {}   # resource id -> list of nodes running it
	        NumofInc = {}    # clone name -> number of running instances
	        MaxofInc = {}    # clone name -> inc_max of its first instance seen
	        self.CM.debug("Do Audit HAResourceAudit")

	        #Calculate the count of active nodes
	        up_count = 0
	        for node in self.CM.Env["nodes"]:
	            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
	                up_count += 1

	        #Make sure the resouces are running on one and only one node
	        for resource in self.CM.Resources():
	            RunningNodes = self._RscRunningNodes(resource)
	            NodeofRsc[resource.rid] = RunningNodes

	            if resource.inc_name is None:
	                #Is the resource served without quorum present?
	                if not self.CM.HasQuorum(None) and len(RunningNodes) != 0 and resource.needs_quorum:
	                    self.CM.log("Resource " + repr(resource)
	                                + " active without Quorum: "
	                                + repr(RunningNodes))
	                    passed = 0
	                #Is the resource served at all?
	                elif len(RunningNodes) == 0 and self.CM.HasQuorum(None):
	                    self.CM.log("Resource " + repr(resource)
	                                + " not served anywhere.")
	                    passed = 0
	                # Is the resource served too many times?
	                elif len(RunningNodes) > 1:
	                    self.CM.log("Resource " + repr(resource)
	                                + " served too many times: "
	                                + repr(RunningNodes))
	                    passed = 0
	            else:
	                if resource.inc_name not in NumofInc:
	                    NumofInc[resource.inc_name] = 0
	                    MaxofInc[resource.inc_name] = resource.inc_max
	                running = 1
	                #Is the resource served without quorum present?
	                if not self.CM.HasQuorum(None) and len(RunningNodes) != 0 and resource.needs_quorum == 1:
	                    self.CM.log("Resource " + repr(resource)
	                                + " active without Quorum: "
	                                + repr(RunningNodes))
	                    passed = 0
	                #Is the resource served at all?
	                elif len(RunningNodes) == 0:
	                    # An idle clone instance is not an error by itself;
	                    # the instance count is checked below.
	                    running = 0
	                # Is the resource served too many times?
	                elif len(RunningNodes) > 1:
	                    self.CM.log("Resource " + repr(resource)
	                                + " served too many times: "
	                                + repr(RunningNodes))
	                    passed = 0

	                if running:
	                    NumofInc[resource.inc_name] += 1

	        # With quorum, each clone must have min(active nodes, inc_max)
	        # running instances.
	        if self.CM.HasQuorum(None):
	            for inc_name in NumofInc.keys():
	                if NumofInc[inc_name] != min(up_count, MaxofInc[inc_name]):
	                    passed = 0
	                    self.CM.log("Cloned resource "+ str(inc_name)
	                                +" has "+ str(NumofInc[inc_name])
	                                +" active instances (max: "
	                                + str(MaxofInc[inc_name])
	                                +", active nodes: "+ str(up_count) + ")")

	        # Every member of a group must run where its first member runs.
	        for group in self.CM.ResourceGroups():
	            if not group:
	                # Nothing to compare; group[0] would raise IndexError.
	                continue
	            group_printed = 0
	            first_rsc = group[0].rid
	            RunningNodes = NodeofRsc[first_rsc]
	            for rsc in group:
	                if RunningNodes != NodeofRsc[rsc.rid]:
	                    passed = 0

	                    # Report the group and its first member only once.
	                    if group_printed == 0:
	                        group_printed = 1
	                        self.CM.log("Group audit failed for: %s" % repr(group))
	                        self._logPlacement(first_rsc, NodeofRsc[first_rsc])

	                    self._logPlacement(rsc.rid, NodeofRsc[rsc.rid])

	        # Make sure the resouces with "must","placement" constraint
	        # are running on the same node
	        for dependency in self.CM.Dependencies():
	            if dependency["type"] == "placement" and dependency["strength"] == "must":
	                if NodeofRsc[dependency["from"]] != NodeofRsc[dependency["to"]]:
	                    # Goes to the cluster log like every other finding
	                    # (this used to be a bare print statement).
	                    self.CM.log(dependency["from"] + " and " + dependency["to"] + " should be run on same node")
	                    passed = 0

	        return passed

	    def is_applicable(self):
	        '''Return 1 when the cluster manager's "Name" is "linux-ha-v2"
	        and Env["ResCanStop"] is 0, else 0.'''
	        if self.CM["Name"] == "linux-ha-v2" and self.CM.Env["ResCanStop"] == 0:
	            return 1
	        return 0

	    def name(self):
	        '''Return the name of this audit.'''
	        return "HAResourceAudit"


	class CrmdStateAudit(ClusterAudit):
	    '''Audit that each node's actual state matches the state recorded in
	    CM.ShouldBeStatus, and that the nodes which are up are stable.

	    The instance also behaves like a small mapping over its Stats
	    counters (has_key, item get/set, incr).
	    '''

	    def __init__(self, cm):
	        '''Remember the cluster manager (cm) and zero the counters.'''
	        self.CM = cm
	        self.Stats = {"calls":0
	        ,        "success":0
	        ,        "failure":0
	        ,        "skipped":0
	        ,        "auditfail":0}

	    def has_key(self, key):
	        '''Return true if a counter named key exists.'''
	        return key in self.Stats

	    def __setitem__(self, key, value):
	        self.Stats[key] = value

	    def __getitem__(self, key):
	        return self.Stats[key]

	    def incr(self, name):
	        '''Increment (or initialize) the value associated with the given name'''
	        self.Stats[name] = self.Stats.get(name, 0) + 1

	    def __call__(self):
	        '''Run the audit. Returns 1 if it passed, 0 otherwise.'''
	        passed = 1
	        up_are_down = 0
	        down_are_up = 0
	        unstable_list = []
	        self.CM.debug("Do Audit %s"%self.name())

	        for node in self.CM.Env["nodes"]:
	            should_be = self.CM.ShouldBeStatus[node]
	            rc = self.CM.StataCM(node)
	            if rc:
	                # A true StataCM() result is treated as "node is up".
	                if should_be == self.CM["down"]:
	                    down_are_up = down_are_up + 1
	                if not self.CM.node_stable(node):
	                    unstable_list.append(node)
	            elif should_be == self.CM["up"]:
	                up_are_down = up_are_down + 1

	        if len(unstable_list) > 0:
	            passed = 0
	            self.CM.log("Cluster is not stable: %d (of %d): %s"
	                        %(len(unstable_list), self.CM.upcount(), repr(unstable_list)))

	        if up_are_down > 0:
	            passed = 0
	            self.CM.log("%d (of %d) nodes expected to be up were down."
	                        %(up_are_down, len(self.CM.Env["nodes"])))

	        if down_are_up > 0:
	            passed = 0
	            self.CM.log("%d (of %d) nodes expected to be down were up."
	                        %(down_are_up, len(self.CM.Env["nodes"])))

	        return passed

	    def name(self):
	        '''Return the name of this audit.'''
	        return "CrmdStateAudit"

	    def is_applicable(self):
	        '''Return 1 when the cluster manager's "Name" is "linux-ha-v2", else 0.'''
	        if self.CM["Name"] == "linux-ha-v2":
	            return 1
	        return 0

	class CIBAudit(ClusterAudit):
	    '''Audit that, within each cluster partition, every node holds the
	    same CIB as the first node of that partition (compared with crm_diff).

	    The instance also behaves like a small mapping over its Stats
	    counters (has_key, item get/set, incr).
	    '''

	    def __init__(self, cm):
	        '''Remember the cluster manager (cm) and zero the counters.'''
	        self.CM = cm
	        self.Stats = {"calls":0
	        ,        "success":0
	        ,        "failure":0
	        ,        "skipped":0
	        ,        "auditfail":0}

	    def has_key(self, key):
	        '''Return true if a counter named key exists.'''
	        return key in self.Stats

	    def __setitem__(self, key, value):
	        self.Stats[key] = value

	    def __getitem__(self, key):
	        return self.Stats[key]

	    def incr(self, name):
	        '''Increment (or initialize) the value associated with the given name'''
	        self.Stats[name] = self.Stats.get(name, 0) + 1

	    def __call__(self):
	        '''Run the audit over every partition.
	        Returns 1 if all partitions are consistent (or there is nothing
	        to audit), 0 otherwise.
	        '''
	        self.CM.debug("Do Audit %s"%self.name())
	        passed = 1
	        ccm_partitions = self.CM.find_partitions()

	        # Treat "no answer" (None) the same as "no partitions".
	        if ccm_partitions is None or len(ccm_partitions) == 0:
	            self.CM.debug("\tNo partitions to audit")
	            return 1

	        for partition in ccm_partitions:
	            self.CM.debug("\tAuditing CIB consistency for: %s" %partition)
	            if self.audit_cib_contents(partition) == 0:
	                passed = 0

	        return passed

	    def audit_cib_contents(self, hostlist):
	        '''Compare the CIB of every host in hostlist (a whitespace
	        separated string of host names) against that of the first host.

	        Any crm_diff output line not containing "<diff/>", and any line
	        on its stderr, is logged and fails the audit.
	        Returns 1 if the audit passed, 0 otherwise.
	        '''
	        passed = 1
	        first_host = None
	        first_host_xml = ""
	        for a_host in hostlist.split():
	            if first_host is None:
	                first_host = a_host
	                first_host_xml = self.store_remote_cib(a_host)
	                #self.CM.debug("Retrieved CIB: %s" % first_host_xml)
	                continue

	            a_host_xml = self.store_remote_cib(a_host)
	            # NOTE(review): the CIB text is pasted into a shell command
	            # inside single quotes; a CIB containing a single quote would
	            # break the command. Confirm whether that can occur.
	            diff_cmd="@sbindir@/crm_diff -c -VV -f -N \'%s\' -O '%s'" % (a_host_xml, first_host_xml)

	            infile, outfile, errfile = os.popen3(diff_cmd)
	            try:
	                for line in outfile.readlines():
	                    if not re.search("<diff/>", line):
	                        passed = 0
	                        self.CM.log("CibDiff[%s-%s]: %s"
	                                    % (first_host, a_host, line))
	                    else:
	                        self.CM.debug("CibDiff[%s-%s] Ignoring: %s"
	                                      % (first_host, a_host, line))

	                for line in errfile.readlines():
	                    passed = 0
	                    self.CM.log("CibDiff[%s-%s] ERROR: %s"
	                                % (first_host, a_host, line))
	            finally:
	                # One set of pipes is opened per host; do not leak them.
	                infile.close()
	                outfile.close()
	                errfile.close()

	        return passed

	    def store_remote_cib(self, node):
	        '''Fetch the CIB from node with the "CibQuery" command and return
	        it as one string with the line terminators removed.
	        The first line received is always written to the debug log.
	        '''
	        pieces = []
	        first_line = 1
	        extra_debug = 0   # set to 1 to dump the complete CIB to the debug log
	        #self.CM.debug("\tRetrieving CIB from: %s" % node)
	        lines = self.CM.rsh.readlines(node, self.CM["CibQuery"])
	        if extra_debug:
	            self.CM.debug("Start Cib[%s]" % node)
	        for line in lines:
	            # Strip only a real newline, so an unterminated last line
	            # does not lose its final character.
	            if line.endswith("\n"):
	                pieces.append(line[:-1])
	            else:
	                pieces.append(line)
	            if first_line:
	                self.CM.debug("[Cib]" + line)
	                first_line = 0
	            elif extra_debug:
	                self.CM.debug("[Cib]" + line)

	        if extra_debug:
	            self.CM.debug("End Cib[%s]" % node)

	        #self.CM.debug("Complete CIB: %s" % combined)
	        return "".join(pieces)

	    def name(self):
	        '''Return the name of this audit.'''
	        return "CibAudit"

	    def is_applicable(self):
	        '''Return 1 when the cluster manager's "Name" is "linux-ha-v2", else 0.'''
	        if self.CM["Name"] == "linux-ha-v2":
	            return 1
	        return 0

	class PartitionAudit(ClusterAudit):
	    '''Audit each cluster partition: node epochs, which node is DC, and
	    (when Env["CIBResource"] is 1 and the DC has quorum) where the
	    DC and per-node resources are running.

	    The instance also behaves like a small mapping over its Stats
	    counters (has_key, item get/set, incr).
	    '''

	    def __init__(self, cm):
	        '''Remember the cluster manager (cm), zero the counters and
	        create the per-node caches filled in by audit_partition().'''
	        self.CM = cm
	        self.Stats = {"calls":0
	        ,        "success":0
	        ,        "failure":0
	        ,        "skipped":0
	        ,        "auditfail":0}
	        self.NodeEpoche={}
	        self.NodeState={}
	        self.NodeQuorum={}

	    def has_key(self, key):
	        '''Return true if a counter named key exists.'''
	        return key in self.Stats

	    def __setitem__(self, key, value):
	        self.Stats[key] = value

	    def __getitem__(self, key):
	        return self.Stats[key]

	    def incr(self, name):
	        '''Increment (or initialize) the value associated with the given name'''
	        self.Stats[name] = self.Stats.get(name, 0) + 1

	    def __call__(self):
	        '''Run the audit over every partition.
	        Returns 1 if all partitions passed (or none were found), else 0.
	        '''
	        self.CM.debug("Do Audit %s"%self.name())
	        passed = 1
	        ccm_partitions = self.CM.find_partitions()

	        if ccm_partitions is None or len(ccm_partitions) == 0:
	            return 1

	        if len(ccm_partitions) > 1:
	            self.CM.log("Warn: %d cluster partitions detected:" %len(ccm_partitions))
	            for partition in ccm_partitions:
	                self.CM.log("\t %s" %partition)

	        for partition in ccm_partitions:
	            if self.audit_partition(partition) == 0:
	                passed = 0

	        return passed

	    def trim_string(self, avalue):
	        '''Return avalue without its last character, or None when avalue
	        is empty/None or only one character long.'''
	        if not avalue:
	            return None
	        if len(avalue) > 1:
	            return avalue[:-1]
	        return None

	    def trim2int(self, avalue):
	        '''Return avalue, minus its last character, as an int.
	        Returns None when avalue is empty/None, one character long, or
	        does not hold a number (so that unexpected command output is
	        handled like a missing value instead of aborting the audit).
	        '''
	        trimmed = self.trim_string(avalue)
	        if trimmed is None:
	            return None
	        try:
	            return int(trimmed)
	        except ValueError:
	            return None

	    def audit_partition(self, partition):
	        '''Audit one partition, given as a whitespace separated string
	        of node names. Returns 1 if the audit passed, 0 otherwise.

	        Side effect: CM.ShouldBeStatus is corrected for nodes that
	        appear in the partition unexpectedly or whose epoch cannot
	        be read.
	        '''
	        passed = 1
	        dc_found = []
	        # NOTE(review): never filled in, so the "DC not found" message
	        # below always reports 0 allowed nodes.
	        dc_allowed_list = []
	        lowest_epoche = None
	        node_list = partition.split()

	        self.CM.debug("Auditing partition: %s" %(partition))
	        for node in node_list:
	            if self.CM.ShouldBeStatus[node] != self.CM["up"]:
	                self.CM.log("Warn: Node %s appeared out of nowhere" %(node))
	                self.CM.ShouldBeStatus[node] = self.CM["up"]
	                # not in itself a reason to fail the audit (not what we're
	                # checking for in this audit)

	            # Each readaline() result has its last character dropped by
	            # the trim helpers (presumably the newline).
	            self.NodeState[node] = self.trim_string(self.CM.rsh.readaline(
	                node, self.CM["StatusCmd"]%node))
	            self.NodeEpoche[node] = self.trim2int(self.CM.rsh.readaline(
	                node, self.CM["EpocheCmd"]))
	            self.NodeQuorum[node] = self.trim_string(self.CM.rsh.readaline(
	                node, self.CM["QuorumCmd"]))

	            # NOTE(review): an epoch of 0 is treated like a missing epoch
	            # here and below - confirm 0 can never be a valid value.
	            if not self.NodeEpoche[node]:
	                self.CM.log("Warn: Node %s dissappeared: cant determin epoche" %(node))
	                self.CM.ShouldBeStatus[node] = self.CM["down"]
	                # not in itself a reason to fail the audit (not what we're
	                # checking for in this audit)
	            elif lowest_epoche is None or self.NodeEpoche[node] < lowest_epoche:
	                lowest_epoche = self.NodeEpoche[node]

	        if not lowest_epoche:
	            self.CM.log("Lowest epoche not determined in %s" % (partition))
	            passed = 0

	        # The DC is expected to be the node with the lowest epoch.
	        for node in node_list:
	            if self.CM.ShouldBeStatus[node] != self.CM["up"]:
	                continue
	            if not self.CM.is_node_dc(node, self.NodeState[node]):
	                continue
	            dc_found.append(node)
	            if self.NodeEpoche[node] == lowest_epoche:
	                self.CM.debug("%s: OK" % node)
	            elif not self.NodeEpoche[node]:
	                self.CM.debug("Check on %s ignored: no node epoche" % node)
	            elif not lowest_epoche:
	                self.CM.debug("Check on %s ignored: no lowest epoche" % node)
	            else:
	                self.CM.log("DC %s is not the oldest node (%d vs. %d)"
	                            %(node, self.NodeEpoche[node], lowest_epoche))
	                passed = 0

	        if len(dc_found) == 0:
	            # Logged only; a missing DC does not fail this audit.
	            self.CM.log("DC not found on any of the %d allowed nodes: %s (of %s)"
	                        %(len(dc_allowed_list), str(dc_allowed_list), str(node_list)))

	        elif len(dc_found) > 1:
	            self.CM.log("%d DCs (%s) found in cluster partition: %s"
	                        %(len(dc_found), str(dc_found), str(node_list)))
	            passed = 0

	        elif self.CM.Env["CIBResource"] == 1 and self.NodeQuorum[dc_found[0]]:
	            if self.audit_dc_resources(node_list, dc_found) == 0:
	                passed = 0

	            # Every node that is up must run the resource named after it.
	            Resources = self.CM.Resources()
	            for node in node_list:
	                if self.CM.ShouldBeStatus[node] == self.CM["up"]:
	                    for resource in Resources:
	                        if resource.rid == "rsc_"+node:
	                            if resource.IsRunningOn(node) == 0:
	                                self.CM.log("Node %s is not running its own resource" %(node))
	                                passed = 0

	        elif self.CM.Env["CIBResource"] == 1:
	            # no quorum means no resource management
	            self.CM.debug("Not auditing resources - no quorum")

	        if passed == 0:
	            for node in node_list:
	                if self.CM.ShouldBeStatus[node] == self.CM["up"]:
	                    self.CM.log("epoche %s : %s"
	                                %(self.NodeEpoche[node], self.NodeState[node]))

	        return passed

	    def audit_dc_resources(self, node_list, dc_list):
	        '''Check the "DcIPaddr" resource: it must run on every DC in
	        dc_list that has quorum, and on no node of node_list that is
	        not a DC. Returns 1 if the check passed, 0 otherwise.

	        Relies on self.NodeQuorum / self.NodeState as filled in by
	        audit_partition().
	        '''
	        passed = 1
	        for resource in self.CM.Resources():
	            if resource.rid != "DcIPaddr":
	                continue
	            self.CM.debug("Auditing resource: %s" %(resource))

	            # All DCs are running the resource
	            for dc in dc_list:
	                if self.NodeQuorum[dc]:
	                    if resource.IsRunningOn(dc) == 0:
	                        self.CM.log("Resource %s not running on DC: %s"
	                                    %(resource, dc))
	                        passed = 0

	            # All nodes running the resource are DCs
	            for node in node_list:
	                if resource.IsRunningOn(node):
	                    if self.CM.is_node_dc(node, self.NodeState[node]) == 0:
	                        self.CM.log("Resource %s is running on non-DC node %s"
	                                    %("DcIPaddr", node))
	                        passed = 0

	        return passed

	    def name(self):
	        '''Return the name of this audit.'''
	        return "PartitionAudit"

	    def is_applicable(self):
	        '''Return 1 when the cluster manager's "Name" is "linux-ha-v2", else 0.'''
	        if self.CM["Name"] == "linux-ha-v2":
	            return 1
	        return 0

	# Register the audits; the order here is the order AuditList() returns them in.
	AllAuditClasses.extend([
	    CrmdStateAudit,
	    PartitionAudit,
	    ResourceAudit,
	    HAResourceAudit,
	    CIBAudit,
	])

	def AuditList(cm):
	    '''Return one instance of every class in AllAuditClasses, each
	    constructed with the cluster manager cm, in registration order.'''
	    return [auditclass(cm) for auditclass in AllAuditClasses]

File Metadata

Mime Type: text/x-diff
Expires: Sat, Jan 25, 6:07 AM (1 d, 1 h)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 1321474
Default Alt Text: (25 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions