diff --git a/cts/CM_LinuxHAv2.py.in b/cts/CM_LinuxHAv2.py.in
index 766a1f030d..4e28c815f7 100755
--- a/cts/CM_LinuxHAv2.py.in
+++ b/cts/CM_LinuxHAv2.py.in
@@ -1,633 +1,621 @@
#!@PYTHON@
'''CTS: Cluster Testing System: LinuxHA v2 dependent modules...
'''
__copyright__='''
Author: Huang Zhen <zhenhltc@cn.ibm.com>
Copyright (C) 2004 International Business Machines
Additional Audits, Revised Start action, Default Configuration:
Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
'''
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import CTS
from CTS import *
from CM_hb import HeartbeatCM
from xml.dom.minidom import *
import CTSaudits
from CTSaudits import ClusterAudit
import CTStests
from CTStests import *
#######################################################################
#
# LinuxHA v2 dependent modules
#
#######################################################################
class LinuxHAv2(HeartbeatCM):
'''
The linux-ha version 2 cluster manager class.
It implements the things we need to talk to and manipulate
linux-ha version 2 clusters
'''
def __init__(self, Environment, randseed=None):
HeartbeatCM.__init__(self, Environment, randseed=randseed)
self.update({
"Name" : "linux-ha-v2",
"DeadTime" : 600,
"StableTime" : 10,
"StartCmd" : "@libdir@/heartbeat/heartbeat >/dev/null 2>&1",
"StopCmd" : "@libdir@/heartbeat/heartbeat -k",
"StatusCmd" : "@libdir@/heartbeat/crmadmin -S %s 2>/dev/null",
"EpocheCmd" : "@libdir@/heartbeat/ccm_tool -e",
"QuorumCmd" : "@libdir@/heartbeat/ccm_tool -q",
"ParitionCmd" : "@libdir@/heartbeat/ccm_tool -p",
"IsRscRunning" : "@libdir@/heartbeat/lrmadmin -E %s status 0 0 EVERYTIME 2>/dev/null|grep return",
"ExecuteRscOp" : "@libdir@/heartbeat/lrmadmin -E %s %s 0 0 EVERYTIME 2>/dev/null",
"CIBfile" : "%s:@HA_VARLIBDIR@/heartbeat/crm/cib.xml",
"IsIPAddrRscRunning" : "",
# Patterns to look for in the log files for various occasions...
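# (callers fill in the %s placeholders with a node name, e.g. in StartaCM)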
"Pat:DC_IDLE" : "crmd:.*State transition.*-> S_IDLE",
"Pat:We_started" : "%s crmd:.*State transition.*-> (S_NOT_DC|S_IDLE)",
"Pat:They_started" : "%s crmd:.*State transition.*-> (S_NOT_DC|S_IDLE)",
"Pat:We_stopped" : ("%s heartbeat.*Heartbeat shutdown complete" %(self.OurNode)),
"Pat:They_stopped" : "%s heartbeat.*Heartbeat shutdown complete",
"Pat:All_stopped" : "%s heartbeat.*Heartbeat shutdown complete",
# Bad news Regexes. Should never occur.
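# (the CTS framework scans the cluster logs for these and reports any match)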
"BadRegexes" : (
r"Shutting down\.",
r"Forcing shutdown\.",
r"Timer I_TERMINATE just popped",
r", exiting\.",
r"ERROR:",
r"CRIT:",
),
})
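# A minimal CIB used when the test environment asks us to clobber the
# existing configuration and no replacement file was supplied (see
# StartaCM below). The rsc_location constraint confines DcIPaddr to
# whichever node is currently the DC.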
self.default_cts_cib='''
<cib cib_feature_revision="1" num_updates="1" have_quorum="false" epoche="1">
<configuration>
<crm_config/>
<nodes/>
<resources>
</resources>
<constraints>
<rsc_location id="run_DcIPaddr" rsc="DcIPaddr">
<rule id="can_run_DcIPaddr" result="can" boolean_op="and">
<expression attribute="is_dc" operation="eq" value="true"/>
</rule>
</rsc_location>
</constraints>
</configuration>
<status/>
</cib>
'''
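# With CIBResource set, use a CIB that actually defines the DcIPaddr
# resource (an IPaddr2 on 127.0.0.10), so the constraint above has
# something to place on the DC.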
if self.Env["CIBResource"] == 1:
print("Enabling DC resource")
self.default_cts_cib='''
<cib cib_feature_revision="1" num_updates="1" have_quorum="false" epoche="1">
<configuration>
<crm_config/>
<nodes/>
<resources>
<resource id="DcIPaddr" class="heartbeat" type="IPaddr2" priority="1.0">
<instance_attributes>
<rsc_parameters>
<nvpair name="1" value="127.0.0.10"/>
</rsc_parameters>
</instance_attributes>
</resource>
</resources>
<constraints>
<rsc_location id="run_DcIPaddr" rsc="DcIPaddr">
<rule id="can_run_DcIPaddr" result="can" boolean_op="and">
<expression attribute="is_dc" operation="eq" value="true"/>
</rule>
</rsc_location>
</constraints>
</configuration>
<status/>
</cib>
'''
# KLUDGE! Expedient, but a Kludge (FIXME)
# CTStests.AllTestClasses = [FlipTest,RestartTest,StartOnebyOne,SimulStart,SimulStop,Split_brainTest,BandwidthTest]
# StartOnebyOne is redundant as it is performed before SimulStop
CTStests.AllTestClasses = [FlipTest, RestartTest, SimulStart, SimulStop]
# CTSaudits.AllAuditClasses = [CrmdStateAudit, HAResourceAudit]
CTSaudits.AllAuditClasses = [CrmdStateAudit, PartitionAudit]
- def errorstoignore(self):
- '''Return list of errors which are 'normal' and should be ignored'''
- if 0:
- return [ "heartbeat.*ERROR: Respawning client \"/usr/lib/heartbeat/ha_logd\"",
- "heartbeat.*ERROR: Irretrievably lost packet",
- "heartbeat.*ERROR: Cannot rexmit pkt .*: seqno too low",
- "heartbeat.*ERROR: Cannot rexmit pkt .*: seqno not found",
- "heartbeat.*ERROR: channel is not connected",
- "ccm: .*ERROR: .*dropping message of type.*Is this a Byzantime failure?"
- ]
- return []
-
def StataCM(self, node):
'''Report the status of the cluster manager on a given node'''
out=self.rsh.readaline(node, self["StatusCmd"]%node)
ret= (string.find(out, 'ok') != -1)
try:
if ret:
if self.ShouldBeStatus[node] != self["up"]:
self.log(
"Node status for %s is %s but we think it should be %s"
% (node, self["up"], self.ShouldBeStatus[node]))
self.log("Expected: %s. Actual: %s"
% (self.ShouldBeStatus[node], out))
else:
if self.ShouldBeStatus[node] != self["down"]:
self.log(
"Node status for %s is %s but we think it should be %s"
% (node, self["down"], self.ShouldBeStatus[node]))
except KeyError: pass
if ret: self.ShouldBeStatus[node]=self["up"]
else: self.ShouldBeStatus[node]=self["down"]
return ret
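# Start sequence: arm a LogWatcher for the expected startup patterns,
# optionally push a fresh CIB onto the node, launch heartbeat, then fall
# back to polling crmadmin if the patterns never show up in the logs.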
def StartaCM(self, node):
'''Start up the cluster manager on a given node'''
patterns = []
patterns.append(self["Pat:We_started"]%node)
# only search for this pattern if there is another node out there
# that should be the DC
if self.any_running() == 1:
patterns.append(self["Pat:DC_IDLE"])
watch = CTS.LogWatcher(self["LogFileName"], patterns, 120)
watch.setwatch()
self.log ("Starting %s on node %s" %(self["Name"], node))
if self.Env["ClobberCIB"] != None:
if self.Env["CIBfilename"] == None:
os.system("rm -f /tmp/cts.default.cib")
os.system("echo \'" + self.default_cts_cib + "\' > /tmp/cts.default.cib")
self.rsh.cp("/tmp/cts.default.cib", self["CIBfile"]%node)
os.system("rm -f /tmp/cts.default.cib")
else:
self.rsh.cp(self.Env["CIBfilename"], self["CIBfile"]%node)
self.rsh(node, self["StartCmd"])
self.ShouldBeStatus[node]=self["up"]
if watch.lookforall():
return 1
# the watch() failed... let's check to see if the start _really_ failed
for regex in watch.unmatched:
self.log ("Startup pattern not found: %s" %(regex))
out = self.rsh.readaline(node, self["StatusCmd"])
if string.find(out, 'ok') == -1:
# yep, it _really_ failed
self.ShouldBeStatus[node]=self["down"]
self.log ("Could not start %s on node %s" %(self["Name"], node))
return None
ret=(string.find(out, 'S_NOT_DC') != -1)
if ret:
# actually we joined the cluster just fine
self.log ("%s on %s joined the cluster" %(self["Name"], node))
return 1
ret= (string.find(out, 'S_IDLE') != -1)
if ret:
# actually we joined the cluster just fine
self.log ("%s on %s joined the cluster as DC" %(self["Name"], node))
return 1
self.log ("%s on %s started but unstable: %s"
%(self["Name"], node, out))
# self.ShouldBeStatus[node]=self["down"]
return None
def Configuration(self):
if not self.rsh.cp(self["CIBfile"]%self.Env["nodes"][0],self.Env["HAdir"]):
raise ValueError("Cannot copy file to %s, maybe permission denied"%self.Env["HAdir"])
cib=parse("%s/cib.xml"%self.Env["HAdir"])
return cib.getElementsByTagName('configuration')[0]
def Resources(self):
ResourceList = []
#read resources in cib
configuration=self.Configuration()
resources=configuration.getElementsByTagName('resources')[0]
rscs=configuration.getElementsByTagName('resource')
for rsc in rscs:
ResourceList.append(HAResource(self,rsc))
return ResourceList
def Dependancies(self):
DependancyList = []
#read dependencies from the cib
configuration=self.Configuration()
constraints=configuration.getElementsByTagName('constraints')[0]
rsc_to_rscs=configuration.getElementsByTagName('rsc_to_rsc')
for node in rsc_to_rscs:
dependancy = {}
dependancy["id"]=node.getAttribute('id')
dependancy["from"]=node.getAttribute('from')
dependancy["to"]=node.getAttribute('to')
dependancy["type"]=node.getAttribute('type')
dependancy["strength"]=node.getAttribute('strength')
DependancyList.append(dependancy)
return DependancyList
def any_running(self):
for node in self.Env["nodes"]:
if self.ShouldBeStatus[node] == self["up"]:
return 1
return 0
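# Audits that every resource is running on exactly one node, and that
# resources tied together by a "must" placement constraint ended up on
# the same node.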
class HAResourceAudit(ClusterAudit):
def __init__(self, cm):
self.CM = cm
def _RscRunningNodes(self, resource):
ResourceNodes = []
for node in self.CM.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
if resource.IsRunningOn(node):
ResourceNodes.append(node)
return ResourceNodes
def __call__(self):
self.CM.log ("Do Audit %s"%self.name())
passed = 1
NodeofRsc = {}
#Make sure the resources are running on one and only one node
Resources = self.CM.Resources()
for resource in Resources :
RunningNodes = self._RscRunningNodes(resource)
NodeofRsc[resource.rid]=RunningNodes
if len(RunningNodes) == 0 :
print resource.rid + " isn't running anywhere"
passed = 0
if len(RunningNodes) > 1:
print resource.rid + " is running more than once: " \
+ str(RunningNodes)
passed = 0
#Make sure resources with a "must" placement constraint are running on the same node
Dependancies = self.CM.Dependancies()
for dependancy in Dependancies:
if dependancy["type"] == "placement" and dependancy["strength"] == "must":
if NodeofRsc[dependancy["from"]] != NodeofRsc[dependancy["to"]]:
print dependancy["from"] + " and " + dependancy["to"] + " should be running on the same node"
passed = 0
return passed
def name(self):
return "HAResourceAudit"
class HAResource(Resource):
def __init__(self, cm, node):
'''
Get information from xml node
'''
self.rid = node.getAttribute('id')
self.rclass = node.getAttribute('class')
self.rtype = node.getAttribute('type')
self.rparameters = {}
attributes = node.getElementsByTagName('instance_attributes')[0]
parameters = node.getElementsByTagName('rsc_parameters')[0]
nvpairs = node.getElementsByTagName('nvpair')
for nvpair in nvpairs:
name=nvpair.getAttribute('name')
value=nvpair.getAttribute('value')
self.rparameters[name]=value
Resource.__init__(self, cm, self.rtype, self.rid)
def IsRunningOn(self, nodename):
'''
This member function returns true if our resource is running
on the given node in the cluster.
We call the status operation for the resource script.
'''
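# lrmadmin's "return" line carries the operation's return code; a "0"
# anywhere in that line is taken to mean rc 0, i.e. the resource is up.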
out=self.CM.rsh.readaline(nodename, self.CM["IsRscRunning"]%self.rid)
return re.search("0",out)
def RunningNodes(self):
ResourceNodes = []
for node in self.CM.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
if self.IsRunningOn(node):
ResourceNodes.append(node)
return ResourceNodes
def _ResourceOperation(self, operation, nodename):
'''
Execute an operation on the resource
'''
self.CM.rsh.readaline(nodename, self.CM["ExecuteRscOp"]%(self.rid,operation))
return self.CM.rsh.lastrc == 0
def Start(self, nodename):
'''
This member function starts or activates the resource.
'''
return self._ResourceOperation("start", nodename)
def Stop(self, nodename):
'''
This member function stops or deactivates the resource.
'''
return self._ResourceOperation("stop", nodename)
def IsWorkingCorrectly(self, nodename):
return self._ResourceOperation("monitor", nodename)
class CrmdStateAudit(ClusterAudit):
def __init__(self, cm):
self.CM = cm
self.Stats = {"calls":0
, "success":0
, "failure":0
, "skipped":0
, "auditfail":0}
def has_key(self, key):
return self.Stats.has_key(key)
def __setitem__(self, key, value):
self.Stats[key] = value
def __getitem__(self, key):
return self.Stats[key]
def incr(self, name):
'''Increment (or initialize) the value associated with the given name'''
if not self.Stats.has_key(name):
self.Stats[name]=0
self.Stats[name] = self.Stats[name]+1
def __call__(self):
self.CM.log ("Do Audit %s"%self.name())
passed = 1
dc_list = []
up_count = 0
node_count = 0
up_are_down = 0
down_are_up = 0
slave_count = 0
unstable_list = []
for node in self.CM.Env["nodes"]:
out=self.CM.rsh.readaline(node, self.CM["StatusCmd"]%node)
ret = (string.find(out, 'ok') != -1)
node_count = node_count + 1
if ret:
up_count = up_count + 1
if self.CM.ShouldBeStatus[node] == self.CM["down"]:
self.CM.log(
"Node %s %s when it should be %s"
% (node, self.CM["up"], self.CM.ShouldBeStatus[node]))
self.CM.ShouldBeStatus[node] = self.CM["up"]
down_are_up = down_are_up + 1
ret= (string.find(out, 'S_NOT_DC') != -1)
if ret:
slave_count = slave_count + 1
else:
ret= (string.find(out, 'S_IDLE') != -1)
if ret:
dc_list.append(node)
else:
unstable_list.append(out)
else:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
self.CM.log(
"Node %s %s when it should be %s"
% (node, self.CM["down"], self.CM.ShouldBeStatus[node]))
self.CM.ShouldBeStatus[node] = self.CM["down"]
up_are_down = up_are_down + 1
# if up_count > 0 and len(dc_list) != 1:
# passed = 0
# self.CM.log("Exactly 1 node should be DC. We found %d (of %d): %s"
# %(len(dc_list), up_count, str(dc_list)))
if len(unstable_list) > 0:
passed = 0
self.CM.log("Cluster is not stable: %d (of %d)."
%(len(unstable_list), up_count))
for status in unstable_list:
self.CM.log("%s" %(status))
if up_are_down > 0:
passed = 0
self.CM.log("%d (of %d) nodes expected to be up were down."
%(up_are_down, node_count))
if down_are_up > 0:
passed = 0
self.CM.log("%d (of %d) nodes expected to be down were up."
%(down_are_up, node_count))
return passed
def name(self):
return "CrmdStateAudit"
class PartitionAudit(ClusterAudit):
def __init__(self, cm):
self.CM = cm
self.Stats = {"calls":0
, "success":0
, "failure":0
, "skipped":0
, "auditfail":0}
self.NodeEpoche={}
self.NodeState={}
self.NodeQuorum={}
self.NodeCCM={}
def has_key(self, key):
return self.Stats.has_key(key)
def __setitem__(self, key, value):
self.Stats[key] = value
def __getitem__(self, key):
return self.Stats[key]
def incr(self, name):
'''Increment (or initialize) the value associated with the given name'''
if not self.Stats.has_key(name):
self.Stats[name]=0
self.Stats[name] = self.Stats[name]+1
def __call__(self):
self.CM.log ("Do Audit %s"%self.name())
passed = 1
nodes_up = 0
ccm_partitions = []
for node in self.CM.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
nodes_up = nodes_up + 1
# self.PS_State[node] = os.system("@SSH@ root@%s ps -C crmd" %(node))
self.NodeQuorum[node] = self.CM.rsh.readaline(
node, self.CM["QuorumCmd"])
self.NodeCCM[node] = self.CM.rsh.readaline(
node, self.CM["ParitionCmd"])
self.NodeEpoche[node] = self.CM.rsh.readaline(
node, self.CM["EpocheCmd"])
self.NodeState[node] = self.CM.rsh.readaline(
node, self.CM["StatusCmd"]%node)
if len(self.NodeState[node]) > 1:
self.NodeState[node] = self.NodeState[node][:-1]
if len(self.NodeEpoche[node]) > 0:
self.NodeEpoche[node] = int(self.NodeEpoche[node][:-1])
if len(self.NodeQuorum[node]) > 1:
self.NodeQuorum[node] = self.NodeQuorum[node][:-1]
if len(self.NodeCCM[node]) > 1:
self.NodeCCM[node] = self.NodeCCM[node][:-1]
found = 0
for partition in ccm_partitions:
if partition == self.NodeCCM[node]:
found = 1
if found == 0:
ccm_partitions.append(self.NodeCCM[node])
if nodes_up == 0:
return 1
# if len(ccm_partitions) > 1:
# self.CM.log("%d cluster partitions detected:" %len(ccm_partitions))
# for partition in ccm_partitions:
# self.CM.log("\t %s" %partition)
for partition in ccm_partitions:
partition_passed = 0
if self.audit_partition(partition) == 0:
passed = 0
return passed
def audit_partition(self, partition):
passed = 0
dc_found = []
dc_allowed_list = []
lowest_epoche = None
node_list = partition.split()
self.CM.log("Auditing partition: %s" %(partition))
for node in node_list:
if lowest_epoche == None or self.NodeEpoche[node] < lowest_epoche:
lowest_epoche = self.NodeEpoche[node]
for node in node_list:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
if self.is_node_dc(self.NodeState[node]):
dc_found.append(node)
if self.NodeEpoche[node] == lowest_epoche:
passed = 1
else:
self.CM.log("DC %s is not the oldest node (%d vs. %d)"
%(node, self.NodeEpoche[node], lowest_epoche))
passed = 0
if len(dc_found) == 0:
self.CM.log("DC not found on any of the %d allowed nodes: %s (of %s)"
%(len(dc_allowed_list), str(dc_allowed_list), str(node_list)))
elif len(dc_found) > 1:
self.CM.log("%d DCs (%s) found in cluster partition: %s"
%(len(dc_found), str(dc_found), str(node_list)))
passed = 0
if passed == 0:
for node in node_list:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
self.CM.log("epoche %s : %s"
%(self.NodeEpoche[node], self.NodeState[node]))
if self.audit_dc_resources(node_list, dc_found) == 0:
passed = 0
return passed
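# Two invariants for the DcIPaddr resource: every quorate DC must be
# running it, and no non-DC node may be running it.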
def audit_dc_resources(self, node_list, dc_list):
passed = 1
Resources = self.CM.Resources()
for resource in Resources:
self.CM.log("Auditing resource: %s" %(resource))
if resource.rid == "DcIPaddr":
# All DCs are running the resource
for dc in dc_list:
if self.NodeQuorum[dc]:
if resource.IsRunningOn(dc) == 0:
self.CM.log("Resource %s not running on DC: %s"
%(resource, dc))
passed = 0
# All nodes running the resource are DCs
for node in node_list:
if resource.IsRunningOn(node):
if self.is_node_dc(self.NodeState[node]) == 0:
self.CM.log("Resource %s is running on non-DC node %s"
%("DcIPaddr", node))
passed = 0
return passed
def is_node_dc(self, status_line):
return (string.find(status_line, 'S_IDLE') != -1)
def name(self):
return "PartitionAudit"
#######################################################################
#
# A little test code...
#
# Which you are advised to completely ignore...
#
#######################################################################
if __name__ == '__main__':
pass