diff --git a/cts/CTSaudits.py.in b/cts/CTSaudits.py.in
index aa1e1c077f..abfaab8497 100755
--- a/cts/CTSaudits.py.in
+++ b/cts/CTSaudits.py.in
@@ -1,698 +1,698 @@
#!@PYTHON@
'''CTS: Cluster Testing System: Audit module
'''
__copyright__='''
Copyright (C) 2000, 2001,2005 Alan Robertson <alanr@unix.sh>
Licensed under the GNU GPL.
'''
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import time, os, popen2, string, re
import CTS
class ClusterAudit:

    def __init__(self, cm):
        self.CM = cm

    def __call__(self):
        raise ValueError("Abstract Class member (__call__)")

    def is_applicable(self):
        '''Return TRUE if we are applicable in the current test configuration'''
        raise ValueError("Abstract Class member (is_applicable)")

    def name(self):
        raise ValueError("Abstract Class member (name)")
AllAuditClasses = [ ]
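# Illustrative sketch only: a concrete audit subclasses ClusterAudit, overrides
# __call__() (return true-ish on success), name() and is_applicable(), and is
# appended to AllAuditClasses so AuditList() below instantiates it.  The class
# name and the check shown here are hypothetical, not part of this module.
#
#     class ExampleNodeCountAudit(ClusterAudit):
#         def name(self):
#             return "ExampleNodeCountAudit"
#         def is_applicable(self):
#             return 1
#         def __call__(self):
#             # Pass only if the test environment lists at least one node.
#             return len(self.CM.Env["nodes"]) > 0
#
#     AllAuditClasses.append(ExampleNodeCountAudit)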
class ResourceAudit(ClusterAudit):

    def name(self):
        return "ResourceAudit"

    def _doauditRsc(self, resource):
        ResourceNodes = []
        for node in self.CM.Env["nodes"]:
            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                if resource.IsRunningOn(node):
                    ResourceNodes.append(node)
        return ResourceNodes

    def _doaudit(self):
        '''Check to see if all resources are running in exactly one place
        in the cluster.
        We also verify that the members of a resource group are all
        running on the same node in the cluster,
        and we monitor that they are all running "properly".
        '''
        Fatal = 0
        result = []

        # Thought: use self.CM.find_partitions() and make this audit
        # aware of partitions.  Since in a split cluster one
        # partition may have quorum (and permission to run resources)
        # and the other not.

        Groups = self.CM.ResourceGroups()
        for group in Groups:
            GrpServedBy = None
            lastResource = None
            for resource in group:

                #
                # _doauditRsc returns the set of nodes serving
                # the given resource.  This is normally a single node.
                #
                ResourceNodes = self._doauditRsc(resource)

                # Is the resource served without quorum present?
                if not self.CM.HasQuorum(None) and len(ResourceNodes) != 0 and resource.needs_quorum:
                    result.append("Resource " + repr(resource)
                                  + " active without Quorum: "
                                  + repr(ResourceNodes))

                # Is the resource served at all?
                elif len(ResourceNodes) == 0 and self.CM.HasQuorum(None):
                    result.append("Resource " + repr(resource)
                                  + " not served anywhere.")

                # Is the resource served too many times?
                elif len(ResourceNodes) > 1:
                    result.append("Resource " + repr(resource)
                                  + " served too many times: "
                                  + repr(ResourceNodes))
                    self.CM.log("Resource " + repr(resource)
                                + " served too many times: "
                                + repr(ResourceNodes))
                    Fatal = 1

                elif GrpServedBy == None:
                    GrpServedBy = ResourceNodes

                # Are all the members of the Rsc Grp served by the same node?
                elif GrpServedBy != ResourceNodes:
                    result.append("Resource group resources" + repr(resource)
                                  + " running on different nodes: "
                                  + repr(ResourceNodes) + " vs " + repr(GrpServedBy)
                                  + "(otherRsc = " + repr(lastResource) + ")")
                    self.CM.log("Resource group resources" + repr(resource)
                                + " running on different nodes: "
                                + repr(ResourceNodes) + " vs " + repr(GrpServedBy)
                                + "(otherRsc = " + repr(lastResource) + ")")
                    Fatal = 1

                if self.CM.Env.has_key("SuppressMonitoring") and \
                        self.CM.Env["SuppressMonitoring"]:
                    continue

                # Is the resource working correctly?
                if not Fatal and len(ResourceNodes) == 1:
                    beforearpchild = popen2.Popen3("date;/sbin/arp -n|cut -c1-15,26-50,75-"
                                                   , None)
                    beforearpchild.tochild.close()  # /dev/null

                    if not resource.IsWorkingCorrectly(ResourceNodes[0]):
                        afterarpchild = popen2.Popen3("/sbin/arp -n|cut -c1-15,26-50,75-"
                                                      , None)
                        afterarpchild.tochild.close()  # /dev/null

                        result.append("Resource " + repr(resource)
                                      + " not operating properly."
                                      + " Resource is running on " + ResourceNodes[0])
                        Fatal = 1
                        self.CM.log("ARP table before failure ========")
                        for line in beforearpchild.fromchild.readlines():
                            self.CM.log(line)
                        self.CM.log("ARP table after failure ========")
                        for line in afterarpchild.fromchild.readlines():
                            self.CM.log(line)
                        self.CM.log("End of ARP tables ========")
                        try:
                            beforearpchild.wait()
                            afterarpchild.wait()
                        except OSError: pass
                        afterarpchild.fromchild.close()
                        beforearpchild.fromchild.close()

                lastResource = resource

        if (Fatal):
            result.insert(0, "FATAL")  # Kludgy.

        return result

    def __call__(self):

        #
        # Audit the resources.  Since heartbeat doesn't really
        # know when resource acquisition is complete, we will
        # poll until things get stable.
        #
        # Having a resource duplicately implemented is a Fatal Error
        # with no tolerance granted.
        #
        audresult = self._doaudit()

        #
        # Probably the constant below should be a CM parameter.
        # Then it could be 0 for FailSafe.
        # Of course, it really depends on what resources
        # you have in the test suite, and how long it takes
        # for them to settle.
        # Recently, we've changed heartbeat so we know better when
        # resource acquisition is done.
        #
        audcount = 5
        while (audcount > 0):
            audresult = self._doaudit()
            if (len(audresult) <= 0 or audresult[0] == "FATAL"):
                audcount = 0
            else:
                audcount = audcount - 1
            if (audcount > 0):
                time.sleep(1)

        if (len(audresult) > 0):
            self.CM.log("Fatal Audit error: " + repr(audresult))

        return (len(audresult) == 0)

    def is_applicable(self):
        if self.CM["Name"] == "heartbeat":
            return 1
        return 0
class HAResourceAudit(ClusterAudit):

    def __init__(self, cm):
        self.CM = cm

    def _RscRunningNodes(self, resource):
        ResourceNodes = []
        for node in self.CM.Env["nodes"]:
            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                if resource.IsRunningOn(node):
                    ResourceNodes.append(node)
        return ResourceNodes

    def __call__(self):
        passed = 1
        NodeofRsc = {}
        NumofInc = {}
        MaxofInc = {}
        self.CM.debug("Do Audit HAResourceAudit")

        # Calculate the count of active nodes
        up_count = 0
        for node in self.CM.Env["nodes"]:
            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                up_count += 1

        # Make sure the resources are running on one and only one node
        Resources = self.CM.Resources()
        for resource in Resources:
            RunningNodes = self._RscRunningNodes(resource)
            NodeofRsc[resource.rid] = RunningNodes
            if resource.inc_name == None:
                # Is the resource served without quorum present?
                if not self.CM.HasQuorum(None) and len(RunningNodes) != 0 and resource.needs_quorum:
                    self.CM.log("Resource " + repr(resource)
                                + " active without Quorum: "
                                + repr(RunningNodes))
                    passed = 0

                # Is the resource served at all?
                elif len(RunningNodes) == 0 and self.CM.HasQuorum(None):
                    self.CM.log("Resource " + repr(resource)
                                + " not served anywhere.")
                    passed = 0

                # Is the resource served too many times?
                elif len(RunningNodes) > 1:
                    self.CM.log("Resource " + repr(resource)
                                + " served too many times: "
                                + repr(RunningNodes))
                    passed = 0

            else:
                if not NumofInc.has_key(resource.inc_name):
                    NumofInc[resource.inc_name] = 0
                    MaxofInc[resource.inc_name] = resource.inc_max

                running = 1
                # Is the resource served without quorum present?
                if not self.CM.HasQuorum(None) and len(RunningNodes) != 0 and resource.needs_quorum == 1:
                    self.CM.log("Resource " + repr(resource)
                                + " active without Quorum: "
                                + repr(RunningNodes))
                    passed = 0

                # Is the resource served at all?
                elif len(RunningNodes) == 0:
                    running = 0

                # Is the resource served too many times?
                elif len(RunningNodes) > 1:
                    self.CM.log("Resource " + repr(resource)
                                + " served too many times: "
                                + repr(RunningNodes))
                    passed = 0

                if running:
                    NumofInc[resource.inc_name] += 1

        if self.CM.HasQuorum(None):
            for inc_name in NumofInc.keys():
                if NumofInc[inc_name] != min(up_count, MaxofInc[inc_name]):
                    passed = 0
                    self.CM.log("Cloned resource " + str(inc_name)
                                + " has " + str(NumofInc[inc_name])
                                + " active instances (max: "
                                + str(MaxofInc[inc_name])
                                + ", active nodes: " + str(up_count) + ")")

        # Make sure the members of each resource group run on the same node
        Groups = self.CM.ResourceGroups()
        for group in Groups:
            group_printed = 0
            first_rsc = group[0].rid
            RunningNodes = NodeofRsc[first_rsc]
            for rsc in group:
                if RunningNodes != NodeofRsc[rsc.rid]:
                    passed = 0
                    if group_printed == 0:
                        group_printed = 1
                        self.CM.log("Group audit failed for: %s" % repr(group))
                        if not NodeofRsc[first_rsc] or len(NodeofRsc[first_rsc]) == 0:
                            self.CM.log("* %s not running" % first_rsc)
                        else:
                            self.CM.log("* %s running on %s"
                                        % (first_rsc, repr(NodeofRsc[first_rsc])))
                    if not NodeofRsc[rsc.rid] or len(NodeofRsc[rsc.rid]) == 0:
                        self.CM.log("* %s not running" % rsc.rid)
                    else:
                        self.CM.log("* %s running on %s"
                                    % (rsc.rid, repr(NodeofRsc[rsc.rid])))

        # Make sure the resources with "must"/"placement" constraints
        # are running on the same node
        Dependencies = self.CM.Dependencies()
        for dependency in Dependencies:
            if dependency["type"] == "placement" and dependency["strength"] == "must":
                if NodeofRsc[dependency["from"]] != NodeofRsc[dependency["to"]]:
                    print dependency["from"] + " and " + dependency["to"] + " should be run on same node"
                    passed = 0

        return passed

    def is_applicable(self):
        if self.CM["Name"] == "linux-ha-v2" and self.CM.Env["ResCanStop"] == 0:
            return 1
        return 0

    def name(self):
        return "HAResourceAudit"
class CrmdStateAudit(ClusterAudit):

    def __init__(self, cm):
        self.CM = cm
        self.Stats = {"calls":0
            , "success":0
            , "failure":0
            , "skipped":0
            , "auditfail":0}

    def has_key(self, key):
        return self.Stats.has_key(key)

    def __setitem__(self, key, value):
        self.Stats[key] = value

    def __getitem__(self, key):
        return self.Stats[key]

    def incr(self, name):
        '''Increment (or initialize) the value associated with the given name'''
        if not self.Stats.has_key(name):
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name] + 1

    def __call__(self):
        passed = 1
        up_are_down = 0
        down_are_up = 0
        unstable_list = []

        self.CM.debug("Do Audit %s" % self.name())

        for node in self.CM.Env["nodes"]:
            should_be = self.CM.ShouldBeStatus[node]
            rc = self.CM.StataCM(node)
            if rc:
                if should_be == self.CM["down"]:
                    down_are_up = down_are_up + 1
                if not self.CM.node_stable(node):
                    unstable_list.append(node)
            elif should_be == self.CM["up"]:
                up_are_down = up_are_down + 1

        if len(unstable_list) > 0:
            passed = 0
            self.CM.log("Cluster is not stable: %d (of %d): %s"
                        % (len(unstable_list), self.CM.upcount(), repr(unstable_list)))

        if up_are_down > 0:
            passed = 0
            self.CM.log("%d (of %d) nodes expected to be up were down."
                        % (up_are_down, len(self.CM.Env["nodes"])))

        if down_are_up > 0:
            passed = 0
            self.CM.log("%d (of %d) nodes expected to be down were up."
                        % (down_are_up, len(self.CM.Env["nodes"])))

        return passed

    def name(self):
        return "CrmdStateAudit"

    def is_applicable(self):
        if self.CM["Name"] == "linux-ha-v2":
            return 1
        return 0
class CIBAudit(ClusterAudit):

    def __init__(self, cm):
        self.CM = cm
        self.Stats = {"calls":0
            , "success":0
            , "failure":0
            , "skipped":0
            , "auditfail":0}

    def has_key(self, key):
        return self.Stats.has_key(key)

    def __setitem__(self, key, value):
        self.Stats[key] = value

    def __getitem__(self, key):
        return self.Stats[key]

    def incr(self, name):
        '''Increment (or initialize) the value associated with the given name'''
        if not self.Stats.has_key(name):
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name] + 1

    def __call__(self):
        self.CM.debug("Do Audit %s" % self.name())
        passed = 1
        ccm_partitions = self.CM.find_partitions()
        if len(ccm_partitions) == 0:
            self.CM.debug("\tNo partitions to audit")
            return 1

        for partition in ccm_partitions:
            self.CM.debug("\tAuditing CIB consistency for: %s" % partition)
            partition_passed = 0
            if self.audit_cib_contents(partition) == 0:
                passed = 0

        return passed

    def audit_cib_contents(self, hostlist):
        passed = 1
        first_host = None
        first_host_xml = ""
        partition_hosts = hostlist.split()
        for a_host in partition_hosts:
            if first_host == None:
                first_host = a_host
                first_host_xml = self.store_remote_cib(a_host)
                #self.CM.debug("Retrieved CIB: %s" % first_host_xml)
            else:
                a_host_xml = self.store_remote_cib(a_host)
                diff_cmd = "@sbindir@/crm_diff -c -VV -f -N \'%s\' -O '%s'" % (a_host_xml, first_host_xml)

                infile, outfile, errfile = os.popen3(diff_cmd)
                diff_lines = outfile.readlines()
                for line in diff_lines:
                    if not re.search("<diff/>", line):
                        passed = 0
                        self.CM.log("CibDiff[%s-%s]: %s"
                                    % (first_host, a_host, line))
                    else:
                        self.CM.debug("CibDiff[%s-%s] Ignoring: %s"
                                      % (first_host, a_host, line))

                diff_lines = errfile.readlines()
                for line in diff_lines:
                    passed = 0
                    self.CM.log("CibDiff[%s-%s] ERROR: %s"
                                % (first_host, a_host, line))

        return passed

    def store_remote_cib(self, node):
        combined = ""
        first_line = 1
        extra_debug = 0
        #self.CM.debug("\tRetrieving CIB from: %s" % node)
        lines = self.CM.rsh.readlines(node, self.CM["CibQuery"])

        if extra_debug:
            self.CM.debug("Start Cib[%s]" % node)
        for line in lines:
            combined = combined + line[:-1]
            if first_line:
                self.CM.debug("[Cib]" + line)
                first_line = 0
            elif extra_debug:
                self.CM.debug("[Cib]" + line)
        if extra_debug:
            self.CM.debug("End Cib[%s]" % node)
        #self.CM.debug("Complete CIB: %s" % combined)
        return combined

    def name(self):
        return "CibAudit"

    def is_applicable(self):
        if self.CM["Name"] == "linux-ha-v2":
            return 1
        return 0
class PartitionAudit(ClusterAudit):

    def __init__(self, cm):
        self.CM = cm
        self.Stats = {"calls":0
            , "success":0
            , "failure":0
            , "skipped":0
            , "auditfail":0}
        self.NodeEpoche = {}
        self.NodeState = {}
        self.NodeQuorum = {}

    def has_key(self, key):
        return self.Stats.has_key(key)

    def __setitem__(self, key, value):
        self.Stats[key] = value

    def __getitem__(self, key):
        return self.Stats[key]

    def incr(self, name):
        '''Increment (or initialize) the value associated with the given name'''
        if not self.Stats.has_key(name):
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name] + 1

    def __call__(self):
        self.CM.debug("Do Audit %s" % self.name())
        passed = 1
        ccm_partitions = self.CM.find_partitions()
-        if len(ccm_partitions) == 0:
+        if ccm_partitions == None or len(ccm_partitions) == 0:
            return 1

        if len(ccm_partitions) > 1:
            self.CM.log("Warn: %d cluster partitions detected:" % len(ccm_partitions))
            for partition in ccm_partitions:
                self.CM.log("\t %s" % partition)

        for partition in ccm_partitions:
            partition_passed = 0
            if self.audit_partition(partition) == 0:
                passed = 0

        return passed

    def trim_string(self, avalue):
        if not avalue:
            return None
        if len(avalue) > 1:
            return avalue[:-1]

    def trim2int(self, avalue):
        if not avalue:
            return None
        if len(avalue) > 1:
            return int(avalue[:-1])

    def audit_partition(self, partition):
        passed = 1
        dc_found = []
        dc_allowed_list = []
        lowest_epoche = None
        node_list = partition.split()

        self.CM.debug("Auditing partition: %s" % (partition))
        for node in node_list:
            if self.CM.ShouldBeStatus[node] != self.CM["up"]:
                self.CM.log("Warn: Node %s appeared out of nowhere" % (node))
                self.CM.ShouldBeStatus[node] = self.CM["up"]
                # not in itself a reason to fail the audit (not what we're
                # checking for in this audit)

            self.NodeState[node] = self.CM.rsh.readaline(
                node, self.CM["StatusCmd"] % node)
            self.NodeEpoche[node] = self.CM.rsh.readaline(
                node, self.CM["EpocheCmd"])
            self.NodeQuorum[node] = self.CM.rsh.readaline(
                node, self.CM["QuorumCmd"])

            self.NodeState[node] = self.trim_string(self.NodeState[node])
            self.NodeEpoche[node] = self.trim2int(self.NodeEpoche[node])
            self.NodeQuorum[node] = self.trim_string(self.NodeQuorum[node])

            if not self.NodeEpoche[node]:
                self.CM.log("Warn: Node %s disappeared: cannot determine epoche" % (node))
                self.CM.ShouldBeStatus[node] = self.CM["down"]
                # not in itself a reason to fail the audit (not what we're
                # checking for in this audit)
            elif lowest_epoche == None or self.NodeEpoche[node] < lowest_epoche:
                lowest_epoche = self.NodeEpoche[node]

        if not lowest_epoche:
            self.CM.log("Lowest epoche not determined in %s" % (partition))
            passed = 0

        for node in node_list:
            if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                if self.CM.is_node_dc(node, self.NodeState[node]):
                    dc_found.append(node)
                    if self.NodeEpoche[node] == lowest_epoche:
                        self.CM.debug("%s: OK" % node)
                    elif not self.NodeEpoche[node]:
                        self.CM.debug("Check on %s ignored: no node epoche" % node)
                    elif not lowest_epoche:
                        self.CM.debug("Check on %s ignored: no lowest epoche" % node)
                    else:
                        self.CM.log("DC %s is not the oldest node (%d vs. %d)"
                                    % (node, self.NodeEpoche[node], lowest_epoche))
                        passed = 0

        if len(dc_found) == 0:
            self.CM.log("DC not found on any of the %d allowed nodes: %s (of %s)"
                        % (len(dc_allowed_list), str(dc_allowed_list), str(node_list)))

        elif len(dc_found) > 1:
            self.CM.log("%d DCs (%s) found in cluster partition: %s"
                        % (len(dc_found), str(dc_found), str(node_list)))
            passed = 0

        elif self.CM.Env["CIBResource"] == 1 and self.NodeQuorum[dc_found[0]]:
            if self.audit_dc_resources(node_list, dc_found) == 0:
                passed = 0

            Resources = self.CM.Resources()
            for node in node_list:
                if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                    for resource in Resources:
                        if resource.rid == "rsc_" + node:
                            if resource.IsRunningOn(node) == 0:
                                self.CM.log("Node %s is not running its own resource" % (node))
                                passed = 0

        elif self.CM.Env["CIBResource"] == 1:
            # no quorum means no resource management
            self.CM.debug("Not auditing resources - no quorum")

        if passed == 0:
            for node in node_list:
                if self.CM.ShouldBeStatus[node] == self.CM["up"]:
                    self.CM.log("epoche %s : %s"
                                % (self.NodeEpoche[node], self.NodeState[node]))

        return passed

    def audit_dc_resources(self, node_list, dc_list):
        passed = 1
        Resources = self.CM.Resources()
        for resource in Resources:
            if resource.rid == "DcIPaddr":
                self.CM.debug("Auditing resource: %s" % (resource))

                # All DCs are running the resource
                for dc in dc_list:
                    if self.NodeQuorum[dc]:
                        if resource.IsRunningOn(dc) == 0:
                            self.CM.log("Resource %s not running on DC: %s"
                                        % (resource, dc))
                            passed = 0

                # All nodes running the resource are DCs
                for node in node_list:
                    if resource.IsRunningOn(node):
                        if self.CM.is_node_dc(node, self.NodeState[node]) == 0:
                            self.CM.log("Resource %s is running on non-DC node %s"
                                        % ("DcIPaddr", node))
                            passed = 0

        return passed

    def name(self):
        return "PartitionAudit"

    def is_applicable(self):
        if self.CM["Name"] == "linux-ha-v2":
            return 1
        return 0
AllAuditClasses.append(CrmdStateAudit)
AllAuditClasses.append(PartitionAudit)
AllAuditClasses.append(ResourceAudit)
AllAuditClasses.append(HAResourceAudit)
AllAuditClasses.append(CIBAudit)
def AuditList(cm):
    result = []
    for auditclass in AllAuditClasses:
        result.append(auditclass(cm))
    return result
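# Usage sketch (hypothetical caller, assuming `cm` is an already-configured
# CTS cluster manager): a test driver typically builds the audit list once and
# re-runs every applicable audit after each test iteration.
#
#     audits = AuditList(cm)
#     for audit in audits:
#         if audit.is_applicable() and not audit():
#             cm.log("Audit " + audit.name() + " failed.")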