Page MenuHomeClusterLabs Projects

No OneTemporary

diff --git a/cts/CTSaudits.py.in b/cts/CTSaudits.py.in
index aa1e1c077f..abfaab8497 100755
--- a/cts/CTSaudits.py.in
+++ b/cts/CTSaudits.py.in
@@ -1,698 +1,698 @@
#!@PYTHON@
'''CTS: Cluster Testing System: Audit module
'''
__copyright__='''
Copyright (C) 2000, 2001,2005 Alan Robertson <alanr@unix.sh>
Licensed under the GNU GPL.
'''
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import time, os, popen2, string, re
import CTS
import os
import popen2
class ClusterAudit:
    '''Abstract base class for the cluster audits in this module.

    Concrete audits are expected to override __call__, is_applicable
    and name; the versions defined here only raise ValueError so that
    a missing override is noticed immediately.
    '''

    def __init__(self, cm):
        '''Remember the cluster manager object (cm) the audit works on.'''
        self.CM = cm

    def __call__(self):
        '''Run the audit.  Must be overridden; raises ValueError here.'''
        raise ValueError("Abstract Class member (__call__)")

    def is_applicable(self):
        '''Return TRUE if we are applicable in the current test configuration

        Must be overridden; raises ValueError here.
        '''
        raise ValueError("Abstract Class member (is_applicable)")

    def name(self):
        '''Return the name of the audit.  Must be overridden; raises ValueError here.'''
        raise ValueError("Abstract Class member (name)")
# Registry of the audit classes defined in this module.  The classes are
# added to it near the end of the file, and AuditList() creates one
# instance of every entry.
AllAuditClasses = [ ]
class ResourceAudit(ClusterAudit):
    '''Audit that each resource is served by exactly one node, that the
    members of a resource group share a node, and (unless monitoring is
    suppressed) that each resource is working correctly.

    Applies only when the cluster manager reports the name "heartbeat".
    '''

    def name(self):
        '''Return the name of this audit.'''
        return "ResourceAudit"

    def _doauditRsc(self, resource):
        '''Return the list of nodes that should be up and on which
        the given resource is running.'''
        ResourceNodes = []
        for node in self.CM.Env["nodes"]:
            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                if resource.IsRunningOn(node):
                    ResourceNodes.append(node)
        return ResourceNodes

    def _arp_snapshot(self, command):
        '''Start command as a popen2 child with its input closed and
        return the child object; its output is read later on demand.'''
        child = popen2.Popen3(command, None)
        child.tochild.close()  # /dev/null
        return child

    def _reap(self, child):
        '''Close the output pipe of a popen2 child and collect its exit
        status so that no zombie process or open descriptor is left.'''
        # Close first: a child whose output nobody read must not be
        # able to block on a full pipe while we wait for it.
        child.fromchild.close()
        try:
            child.wait()
        except OSError:
            pass

    def _monitor(self, resource, node, result):
        '''Check that resource works correctly on node.

        Return 1 when it does.  Otherwise append a description of the
        problem to result, log the ARP tables captured before and after
        the check, and return 0.
        '''
        # The "before" snapshot is started ahead of the check so that it
        # shows the ARP table as it was prior to any failure.
        before = self._arp_snapshot("date;/sbin/arp -n|cut -c1-15,26-50,75-")
        try:
            if resource.IsWorkingCorrectly(node):
                return 1
            after = self._arp_snapshot("/sbin/arp -n|cut -c1-15,26-50,75-")
            try:
                result.append("Resource " + repr(resource)
                    + " not operating properly."
                    + " Resource is running on " + node)
                self.CM.log("ARP table before failure ========")
                for line in before.fromchild.readlines():
                    self.CM.log(line)
                self.CM.log("ARP table after failure ========")
                for line in after.fromchild.readlines():
                    self.CM.log(line)
                self.CM.log("End of ARP tables ========")
            finally:
                self._reap(after)
            return 0
        finally:
            # Reaped on the success path too; previously the child was
            # only waited for when the resource check failed.
            self._reap(before)

    def _doaudit(self):
        '''Check to see if all resources are running in exactly one place
        in the cluster.
        We also verify that the members of a resource group are all
        running on the same node in the cluster,
        and we monitor that they are all running "properly".

        Returns a list of problem descriptions; when a fatal problem was
        found the first element of the list is the string "FATAL".
        '''
        Fatal = 0
        result = []

        # Thought: use self.CM.find_partitions() and make this audit
        # aware of partitions. Since in a split cluster one
        # partition may have quorum (and permission to run resources)
        # and the other not.

        for group in self.CM.ResourceGroups():
            GrpServedBy = None
            lastResource = None
            for resource in group:
                #
                # _doauditRsc returns the set of nodes serving
                # the given resource. This is normally a single node.
                #
                ResourceNodes = self._doauditRsc(resource)

                # Is the resource served without quorum present?
                if (not self.CM.HasQuorum(None) and len(ResourceNodes) != 0
                        and resource.needs_quorum):
                    result.append("Resource " + repr(resource)
                        + " active without Quorum: "
                        + repr(ResourceNodes))

                # Is the resource served at all?
                elif len(ResourceNodes) == 0 and self.CM.HasQuorum(None):
                    result.append("Resource " + repr(resource)
                        + " not served anywhere.")

                # Is the resource served too many times?
                elif len(ResourceNodes) > 1:
                    problem = ("Resource " + repr(resource)
                        + " served too many times: "
                        + repr(ResourceNodes))
                    result.append(problem)
                    self.CM.log(problem)
                    Fatal = 1

                elif GrpServedBy is None:
                    GrpServedBy = ResourceNodes

                # Are all the members of the Rsc Grp served by the same node?
                elif GrpServedBy != ResourceNodes:
                    problem = ("Resource group resources" + repr(resource)
                        + " running on different nodes: "
                        + repr(ResourceNodes)+" vs "+repr(GrpServedBy)
                        + "(otherRsc = " + repr(lastResource) + ")")
                    result.append(problem)
                    self.CM.log(problem)
                    Fatal = 1

                suppressed = (self.CM.Env.has_key("SuppressMonitoring")
                    and self.CM.Env["SuppressMonitoring"])

                # Is the resource working correctly ?
                if not suppressed and not Fatal and len(ResourceNodes) == 1:
                    if not self._monitor(resource, ResourceNodes[0], result):
                        Fatal = 1

                # Updated on every pass so the "otherRsc" diagnostic above
                # always names the previous member of the group.
                lastResource = resource

        if Fatal:
            result.insert(0, "FATAL")  # Kludgy.
        return result

    def __call__(self):
        '''Run the resource audit, polling until it is clean or fatal.

        Returns true when the last audit pass reported no problems.
        '''
        #
        # Audit the resources. Since heartbeat doesn't really
        # know when resource acquisition is complete, we will
        # poll until things get stable.
        #
        # Having a resource duplicately implemented is a Fatal Error
        # with no tolerance granted.
        #
        # Probably the constant below should be a CM parameter.
        # Then it could be 0 for FailSafe.
        # Of course, it really depends on what resources
        # you have in the test suite, and how long it takes
        # for them to settle.
        # Recently, we've changed heartbeat so we know better when
        # resource acquisition is done.
        #
        audresult = []
        audcount = 5
        while audcount > 0:
            audresult = self._doaudit()
            if len(audresult) <= 0 or audresult[0] == "FATAL":
                audcount = 0
            else:
                audcount = audcount - 1
            if audcount > 0:
                time.sleep(1)
        if len(audresult) > 0:
            self.CM.log("Fatal Audit error: " + repr(audresult))
        return (len(audresult) == 0)

    def is_applicable(self):
        '''Return 1 when the cluster manager is "heartbeat", else 0.'''
        if self.CM["Name"] == "heartbeat":
            return 1
        return 0
class HAResourceAudit(ClusterAudit):
    '''Audit resource placement: plain resources, cloned resources
    (those with an inc_name), resource groups and "must" placement
    dependencies.

    Applies when the cluster manager is "linux-ha-v2" and the
    environment setting "ResCanStop" is 0.
    '''

    def __init__(self, cm):
        '''Remember the cluster manager object (cm) the audit works on.'''
        self.CM = cm

    def _RscRunningNodes(self, resource):
        '''Return the list of nodes that should be up and on which
        the given resource is running.'''
        ResourceNodes = []
        for node in self.CM.Env["nodes"]:
            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                if resource.IsRunningOn(node):
                    ResourceNodes.append(node)
        return ResourceNodes

    def _log_location(self, rid, nodes):
        '''Log where the resource with id rid is running, if anywhere.'''
        if not nodes:
            self.CM.log("* %s not running" % rid)
        else:
            self.CM.log("* %s running on %s" % (rid, repr(nodes)))

    def __call__(self):
        '''Run the audit and return 1 if everything passed, 0 otherwise.
        Every problem found is written to the cluster manager log.'''
        passed = 1
        NodeofRsc = {}   # resource id -> nodes the resource runs on
        NumofInc = {}    # clone name  -> number of running instances
        MaxofInc = {}    # clone name  -> inc_max of the clone

        self.CM.debug("Do Audit HAResourceAudit")

        #Calculate the count of active nodes
        up_count = 0
        for node in self.CM.Env["nodes"]:
            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                up_count += 1

        #Make sure the resouces are running on one and only one node
        for resource in self.CM.Resources():
            RunningNodes = self._RscRunningNodes(resource)
            NodeofRsc[resource.rid] = RunningNodes

            if resource.inc_name is None:
                #Is the resource served without quorum present?
                if (not self.CM.HasQuorum(None) and len(RunningNodes) != 0
                        and resource.needs_quorum):
                    self.CM.log("Resource " + repr(resource)
                        + " active without Quorum: "
                        + repr(RunningNodes))
                    passed = 0

                #Is the resource served at all?
                elif len(RunningNodes) == 0 and self.CM.HasQuorum(None):
                    self.CM.log("Resource " + repr(resource)
                        + " not served anywhere.")
                    passed = 0

                # Is the resource served too many times?
                elif len(RunningNodes) > 1:
                    self.CM.log("Resource " + repr(resource)
                        + " served too many times: "
                        + repr(RunningNodes))
                    passed = 0

            else:
                if resource.inc_name not in NumofInc:
                    NumofInc[resource.inc_name] = 0
                    MaxofInc[resource.inc_name] = resource.inc_max

                running = 1
                #Is the resource served without quorum present?
                if (not self.CM.HasQuorum(None) and len(RunningNodes) != 0
                        and resource.needs_quorum == 1):
                    self.CM.log("Resource " + repr(resource)
                        + " active without Quorum: "
                        + repr(RunningNodes))
                    passed = 0

                #Is the resource served at all?
                elif len(RunningNodes) == 0:
                    running = 0

                # Is the resource served too many times?
                elif len(RunningNodes) > 1:
                    self.CM.log("Resource " + repr(resource)
                        + " served too many times: "
                        + repr(RunningNodes))
                    passed = 0

                if running:
                    NumofInc[resource.inc_name] += 1

        # With quorum, each clone must have as many running instances as
        # the smaller of its inc_max and the number of active nodes.
        if self.CM.HasQuorum(None):
            for inc_name in NumofInc:
                if NumofInc[inc_name] != min(up_count, MaxofInc[inc_name]):
                    passed = 0
                    self.CM.log("Cloned resource "+ str(inc_name)
                        +" has "+ str(NumofInc[inc_name])
                        +" active instances (max: "
                        + str(MaxofInc[inc_name])
                        +", active nodes: "+ str(up_count) + ")")

        # Every member of a group must run where the first member runs.
        for group in self.CM.ResourceGroups():
            if not group:
                # Nothing to compare in an empty group.
                continue
            group_printed = 0
            first_rsc = group[0].rid
            RunningNodes = NodeofRsc[first_rsc]
            for rsc in group:
                if RunningNodes != NodeofRsc[rsc.rid]:
                    passed = 0
                    if group_printed == 0:
                        # The header and the first member's location are
                        # logged once per failing group.
                        group_printed = 1
                        self.CM.log("Group audit failed for: %s" % repr(group))
                        self._log_location(first_rsc, NodeofRsc[first_rsc])
                    self._log_location(rsc.rid, NodeofRsc[rsc.rid])

        # Make sure the resouces with "must","placement" constraint
        # are running on the same node
        Dependancies = self.CM.Dependencies()
        for dependency in Dependancies:
            if dependency["type"] == "placement" and dependency["strength"] == "must":
                if NodeofRsc[dependency["from"]] != NodeofRsc[dependency["to"]]:
                    # Logged like every other failure instead of being
                    # printed to standard output.
                    self.CM.log(dependency["from"] + " and " + dependency["to"] + " should be run on same node")
                    passed = 0

        return passed

    def is_applicable(self):
        '''Return 1 for "linux-ha-v2" clusters with ResCanStop == 0, else 0.'''
        if self.CM["Name"] == "linux-ha-v2" and self.CM.Env["ResCanStop"] == 0:
            return 1
        return 0

    def name(self):
        '''Return the name of this audit.'''
        return "HAResourceAudit"
class CrmdStateAudit(ClusterAudit):
    '''Audit that every node is in the state the test harness expects
    (up or down) and that the running nodes are stable.

    Applies only when the cluster manager is "linux-ha-v2".
    '''

    def __init__(self, cm):
        '''Remember the cluster manager (cm) and zero the statistics.'''
        self.CM = cm
        self.Stats = {"calls":0
        ,       "success":0
        ,       "failure":0
        ,       "skipped":0
        ,       "auditfail":0}

    def has_key(self, key):
        '''Return true if a statistic named key exists.'''
        return key in self.Stats

    def __setitem__(self, key, value):
        '''Set the statistic named key to value.'''
        self.Stats[key] = value

    def __getitem__(self, key):
        '''Return the statistic named key.'''
        return self.Stats[key]

    def incr(self, name):
        '''Increment (or initialize) the value associated with the given name'''
        if name not in self.Stats:
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name]+1

    def __call__(self):
        '''Run the audit and return 1 if it passed, 0 otherwise.
        Each kind of mismatch found is written to the log.'''
        passed = 1
        up_are_down = 0      # nodes expected up whose status check failed
        down_are_up = 0      # nodes expected down whose status check succeeded
        unstable_list = []   # running nodes reported as not stable

        self.CM.debug("Do Audit %s"%self.name())
        for node in self.CM.Env["nodes"]:
            should_be = self.CM.ShouldBeStatus[node]
            rc = self.CM.StataCM(node)
            if rc:
                if should_be == self.CM["down"]:
                    down_are_up = down_are_up + 1
                if not self.CM.node_stable(node):
                    unstable_list.append(node)
            elif should_be == self.CM["up"]:
                up_are_down = up_are_down + 1

        if len(unstable_list) > 0:
            passed = 0
            self.CM.log("Cluster is not stable: %d (of %d): %s"
                %(len(unstable_list), self.CM.upcount(), repr(unstable_list)))

        if up_are_down > 0:
            passed = 0
            self.CM.log("%d (of %d) nodes expected to be up were down."
                %(up_are_down, len(self.CM.Env["nodes"])))

        if down_are_up > 0:
            passed = 0
            self.CM.log("%d (of %d) nodes expected to be down were up."
                %(down_are_up, len(self.CM.Env["nodes"])))

        return passed

    def name(self):
        '''Return the name of this audit.'''
        return "CrmdStateAudit"

    def is_applicable(self):
        '''Return 1 when the cluster manager is "linux-ha-v2", else 0.'''
        if self.CM["Name"] == "linux-ha-v2":
            return 1
        return 0
class CIBAudit(ClusterAudit):
    '''Audit that all nodes of a cluster partition hold the same CIB.

    The CIB of the first host of each partition is compared against the
    CIB of every other host of that partition with crm_diff.
    Applies only when the cluster manager is "linux-ha-v2".
    '''

    def __init__(self, cm):
        '''Remember the cluster manager (cm) and zero the statistics.'''
        self.CM = cm
        self.Stats = {"calls":0
        ,       "success":0
        ,       "failure":0
        ,       "skipped":0
        ,       "auditfail":0}

    def has_key(self, key):
        '''Return true if a statistic named key exists.'''
        return key in self.Stats

    def __setitem__(self, key, value):
        '''Set the statistic named key to value.'''
        self.Stats[key] = value

    def __getitem__(self, key):
        '''Return the statistic named key.'''
        return self.Stats[key]

    def incr(self, name):
        '''Increment (or initialize) the value associated with the given name'''
        if name not in self.Stats:
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name]+1

    def __call__(self):
        '''Audit every partition; return 1 if all passed, 0 otherwise.
        Returns 1 without auditing when there are no partitions.'''
        self.CM.debug("Do Audit %s"%self.name())
        passed = 1
        ccm_partitions = self.CM.find_partitions()

        # find_partitions() may hand back None as well as an empty list
        # (PartitionAudit guards against both), so check before len().
        if ccm_partitions is None or len(ccm_partitions) == 0:
            self.CM.debug("\tNo partitions to audit")
            return 1

        for partition in ccm_partitions:
            self.CM.debug("\tAuditing CIB consistency for: %s" %partition)
            if self.audit_cib_contents(partition) == 0:
                passed = 0
        return passed

    def audit_cib_contents(self, hostlist):
        '''Compare the CIB of every host in hostlist (a whitespace
        separated string of host names) against that of the first host.

        Return 1 if crm_diff reported no differences and wrote nothing
        to its error output, 0 otherwise.
        '''
        passed = 1
        first_host = None
        first_host_xml = ""
        partition_hosts = hostlist.split()
        for a_host in partition_hosts:
            if first_host is None:
                first_host = a_host
                first_host_xml = self.store_remote_cib(a_host)
                #self.CM.debug("Retrieved CIB: %s" % first_host_xml)
                continue

            a_host_xml = self.store_remote_cib(a_host)
            # NOTE(review): the CIB text is placed inside single quotes on
            # a shell command line; a quote character inside the XML would
            # break the command.
            diff_cmd="@sbindir@/crm_diff -c -VV -f -N \'%s\' -O '%s'" % (a_host_xml, first_host_xml)

            infile, outfile, errfile = os.popen3(diff_cmd)
            try:
                # Any output line that does not contain "<diff/>" is
                # treated as a difference.
                for line in outfile.readlines():
                    if not re.search("<diff/>", line):
                        passed = 0
                        self.CM.log("CibDiff[%s-%s]: %s"
                            % (first_host, a_host, line))
                    else:
                        self.CM.debug("CibDiff[%s-%s] Ignoring: %s"
                            % (first_host, a_host, line))

                # Anything on the error output fails the audit.
                for line in errfile.readlines():
                    passed = 0
                    self.CM.log("CibDiff[%s-%s] ERROR: %s"
                        % (first_host, a_host, line))
            finally:
                # Release the three pipes of every comparison.
                infile.close()
                outfile.close()
                errfile.close()
        return passed

    def store_remote_cib(self, node):
        '''Fetch the CIB from node and return it as a single string.

        The last character of every line read is dropped before the
        lines are joined.  The first line is always written to the
        debug log.
        '''
        pieces = []
        first_line = 1
        extra_debug = 0
        #self.CM.debug("\tRetrieving CIB from: %s" % node)
        lines = self.CM.rsh.readlines(node, self.CM["CibQuery"])
        if extra_debug:
            self.CM.debug("Start Cib[%s]" % node)
        for line in lines:
            pieces.append(line[:-1])
            if first_line:
                self.CM.debug("[Cib]" + line)
                first_line = 0
            elif extra_debug:
                self.CM.debug("[Cib]" + line)
        if extra_debug:
            self.CM.debug("End Cib[%s]" % node)
        combined = "".join(pieces)
        #self.CM.debug("Complete CIB: %s" % combined)
        return combined

    def name(self):
        '''Return the name of this audit.'''
        return "CibAudit"

    def is_applicable(self):
        '''Return 1 when the cluster manager is "linux-ha-v2", else 0.'''
        if self.CM["Name"] == "linux-ha-v2":
            return 1
        return 0
class PartitionAudit(ClusterAudit):
    '''Audit each cluster partition: node epoches, the number of DCs
    and, when CIB resources are in use and the DC has quorum, which
    nodes run the DC-related and per-node resources.

    Applies only when the cluster manager is "linux-ha-v2".
    '''

    def __init__(self, cm):
        '''Remember the cluster manager (cm), zero the statistics and
        create the per-node caches filled in by audit_partition.'''
        self.CM = cm
        self.Stats = {"calls":0
        ,       "success":0
        ,       "failure":0
        ,       "skipped":0
        ,       "auditfail":0}
        self.NodeEpoche={}
        self.NodeState={}
        self.NodeQuorum={}

    def has_key(self, key):
        '''Return true if a statistic named key exists.'''
        return key in self.Stats

    def __setitem__(self, key, value):
        '''Set the statistic named key to value.'''
        self.Stats[key] = value

    def __getitem__(self, key):
        '''Return the statistic named key.'''
        return self.Stats[key]

    def incr(self, name):
        '''Increment (or initialize) the value associated with the given name'''
        if name not in self.Stats:
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name]+1

    def __call__(self):
        '''Audit every partition; return 1 if all passed, 0 otherwise.
        Returns 1 without auditing when no partitions are reported.'''
        self.CM.debug("Do Audit %s"%self.name())
        passed = 1
        ccm_partitions = self.CM.find_partitions()

        if ccm_partitions is None or len(ccm_partitions) == 0:
            return 1

        if len(ccm_partitions) > 1:
            self.CM.log("Warn: %d cluster partitions detected:" %len(ccm_partitions))
            for partition in ccm_partitions:
                self.CM.log("\t %s" %partition)

        for partition in ccm_partitions:
            if self.audit_partition(partition) == 0:
                passed = 0
        return passed

    def trim_string(self, avalue):
        '''Return avalue without its last character, or None when
        avalue is empty, None or only one character long.'''
        if not avalue:
            return None
        if len(avalue) > 1:
            return avalue[:-1]
        return None

    def trim2int(self, avalue):
        '''Return avalue without its last character converted to an
        int, or None when avalue is empty, None or only one character
        long.  A non-numeric value raises ValueError from int().'''
        if not avalue:
            return None
        if len(avalue) > 1:
            return int(avalue[:-1])
        return None

    def audit_partition(self, partition):
        '''Audit one partition, given as a whitespace separated string
        of node names.  Return 1 if the audit passed, 0 otherwise.

        As a side effect the expected status of nodes is corrected:
        partition members not expected to be up are marked up, and
        members whose epoche cannot be determined are marked down.
        '''
        passed = 1
        dc_found = []
        dc_allowed_list = []
        lowest_epoche = None
        node_list = partition.split()

        self.CM.debug("Auditing partition: %s" %(partition))
        for node in node_list:
            if self.CM.ShouldBeStatus[node] != self.CM["up"]:
                self.CM.log("Warn: Node %s appeared out of nowhere" %(node))
                self.CM.ShouldBeStatus[node] = self.CM["up"]
                # not in itself a reason to fail the audit (not what we're
                # checking for in this audit)

            # Query the node, then strip the last character of each answer.
            state = self.CM.rsh.readaline(node, self.CM["StatusCmd"]%node)
            epoche = self.CM.rsh.readaline(node, self.CM["EpocheCmd"])
            quorum = self.CM.rsh.readaline(node, self.CM["QuorumCmd"])

            self.NodeState[node] = self.trim_string(state)
            self.NodeEpoche[node] = self.trim2int(epoche)
            self.NodeQuorum[node] = self.trim_string(quorum)

            if not self.NodeEpoche[node]:
                self.CM.log("Warn: Node %s dissappeared: cant determin epoche" %(node))
                self.CM.ShouldBeStatus[node] = self.CM["down"]
                # not in itself a reason to fail the audit (not what we're
                # checking for in this audit)
            elif lowest_epoche is None or self.NodeEpoche[node] < lowest_epoche:
                lowest_epoche = self.NodeEpoche[node]

        if not lowest_epoche:
            self.CM.log("Lowest epoche not determined in %s" % (partition))
            passed = 0

        # Collect the DCs; a DC must carry the lowest epoche.
        for node in node_list:
            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                if self.CM.is_node_dc(node, self.NodeState[node]):
                    dc_found.append(node)
                    if self.NodeEpoche[node] == lowest_epoche:
                        self.CM.debug("%s: OK" % node)
                    elif not self.NodeEpoche[node]:
                        self.CM.debug("Check on %s ignored: no node epoche" % node)
                    elif not lowest_epoche:
                        self.CM.debug("Check on %s ignored: no lowest epoche" % node)
                    else:
                        self.CM.log("DC %s is not the oldest node (%d vs. %d)"
                            %(node, self.NodeEpoche[node], lowest_epoche))
                        passed = 0

        if len(dc_found) == 0:
            # NOTE(review): dc_allowed_list is never filled in here, so
            # this message always reports 0 allowed nodes; a missing DC
            # does not fail the audit.
            self.CM.log("DC not found on any of the %d allowed nodes: %s (of %s)"
                %(len(dc_allowed_list), str(dc_allowed_list), str(node_list)))

        elif len(dc_found) > 1:
            self.CM.log("%d DCs (%s) found in cluster partition: %s"
                %(len(dc_found), str(dc_found), str(node_list)))
            passed = 0

        elif self.CM.Env["CIBResource"] == 1 and self.NodeQuorum[dc_found[0]]:
            if self.audit_dc_resources(node_list, dc_found) == 0:
                passed = 0

            # Every node expected up must run the resource "rsc_<node>".
            Resources = self.CM.Resources()
            for node in node_list:
                if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                    for resource in Resources:
                        if resource.rid == "rsc_"+node:
                            if resource.IsRunningOn(node) == 0:
                                self.CM.log("Node %s is not running its own resource" %(node))
                                passed = 0

        elif self.CM.Env["CIBResource"] == 1:
            # no quorum means no resource management
            self.CM.debug("Not auditing resources - no quorum")

        if passed == 0:
            # Dump what was learned about each live node to aid debugging.
            for node in node_list:
                if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                    self.CM.log("epoche %s : %s"
                        %(self.NodeEpoche[node], self.NodeState[node]))

        return passed

    def audit_dc_resources(self, node_list, dc_list):
        '''Check the resource with id "DcIPaddr": it must run on every DC
        in dc_list that has quorum, and every node of node_list running
        it must be a DC.  Return 1 if so, 0 otherwise.'''
        passed = 1
        Resources = self.CM.Resources()
        for resource in Resources:
            if resource.rid == "DcIPaddr":
                self.CM.debug("Auditing resource: %s" %(resource))

                # All DCs are running the resource
                for dc in dc_list:
                    if self.NodeQuorum[dc]:
                        if resource.IsRunningOn(dc) == 0:
                            self.CM.log("Resource %s not running on DC: %s"
                                %(resource, dc))
                            passed = 0

                # All nodes running the resource are DCs
                for node in node_list:
                    if resource.IsRunningOn(node):
                        if self.CM.is_node_dc(node, self.NodeState[node]) == 0:
                            self.CM.log("Resource %s is running on non-DC node %s"
                                %("DcIPaddr", node))
                            passed = 0
        return passed

    def name(self):
        '''Return the name of this audit.'''
        return "PartitionAudit"

    def is_applicable(self):
        '''Return 1 when the cluster manager is "linux-ha-v2", else 0.'''
        if self.CM["Name"] == "linux-ha-v2":
            return 1
        return 0
# Register the audits of this module; AuditList() instantiates them
# in this order.
AllAuditClasses.extend([
    CrmdStateAudit,
    PartitionAudit,
    ResourceAudit,
    HAResourceAudit,
    CIBAudit,
])
def AuditList(cm):
    '''Return a list holding one instance of every class in
    AllAuditClasses, each constructed with the cluster manager cm.'''
    return [auditclass(cm) for auditclass in AllAuditClasses]

File Metadata

Mime Type
text/x-diff
Expires
Sat, Jan 25, 6:07 AM (1 d, 1 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1321474
Default Alt Text
(25 KB)

Event Timeline