diff --git a/cts/CM_LinuxHAv2.py.in b/cts/CM_LinuxHAv2.py.in
index 766a1f030d..4e28c815f7 100755
--- a/cts/CM_LinuxHAv2.py.in
+++ b/cts/CM_LinuxHAv2.py.in
@@ -1,633 +1,621 @@
#!@PYTHON@
'''CTS: Cluster Testing System: LinuxHA v2 dependent modules...
'''
__copyright__='''
Author: Huang Zhen <zhenhltc@cn.ibm.com>
Copyright (C) 2004 International Business Machines
Additional Audits, Revised Start action, Default Configuration:
Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
'''
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import CTS
from CTS import *
from CM_hb import HeartbeatCM
from xml.dom.minidom import *
import CTSaudits
from CTSaudits import ClusterAudit
import CTStests
from CTStests import *
#######################################################################
#
# LinuxHA v2 dependent modules
#
#######################################################################
class LinuxHAv2(HeartbeatCM):
'''
The linux-ha version 2 cluster manager class.
It implements the things we need to talk to and manipulate
linux-ha version 2 clusters
'''
def __init__(self, Environment, randseed=None):
HeartbeatCM.__init__(self, Environment, randseed=randseed)
self.update({
"Name" : "linux-ha-v2",
"DeadTime" : 600,
"StableTime" : 10,
"StartCmd" : "@libdir@/heartbeat/heartbeat >/dev/null 2>&1",
"StopCmd" : "@libdir@/heartbeat/heartbeat -k",
"StatusCmd" : "@libdir@/heartbeat/crmadmin -S %s 2>/dev/null",
"EpocheCmd" : "@libdir@/heartbeat/ccm_tool -e",
"QuorumCmd" : "@libdir@/heartbeat/ccm_tool -q",
"ParitionCmd" : "@libdir@/heartbeat/ccm_tool -p",
"IsRscRunning" : "@libdir@/heartbeat/lrmadmin -E %s status 0 0 EVERYTIME 2>/dev/null|grep return",
"ExecuteRscOp" : "@libdir@/heartbeat/lrmadmin -E %s %s 0 0 EVERYTIME 2>/dev/null",
"CIBfile" : "%s:@HA_VARLIBDIR@/heartbeat/crm/cib.xml",
"IsIPAddrRscRunning" : "",
# Patterns to look for in the log files for various occasions...
"Pat:DC_IDLE" : "crmd:.*State transition.*-> S_IDLE",
"Pat:We_started" : "%s crmd:.*State transition.*-> (S_NOT_DC|S_IDLE)",
"Pat:They_started" : "%s crmd:.*State transition.*-> (S_NOT_DC|S_IDLE)",
"Pat:We_stopped" : ("%s heartbeat.*Heartbeat shutdown complete" %(self.OurNode)),
"Pat:They_stopped" : "%s heartbeat.*Heartbeat shutdown complete",
"Pat:All_stopped" : "%s heartbeat.*Heartbeat shutdown complete",
# Bad news Regexes. Should never occur.
"BadRegexes" : (
r"Shutting down\.",
r"Forcing shutdown\.",
r"Timer I_TERMINATE just popped",
r", exiting\.",
r"ERROR:",
r"CRIT:",
),
})
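# Default CIB pushed out to nodes when ClobberCIB is set and no CIBfilename
# is supplied (see StartaCM below); the DcIPaddr variant is substituted in
# when CIBResource is enabled.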
self.default_cts_cib='''
<cib cib_feature_revision="1" num_updates="1" have_quorum="false" epoche="1">
<configuration>
<crm_config/>
<nodes/>
<resources>
</resources>
<constraints>
<rsc_location id="run_DcIPaddr" rsc="DcIPaddr">
<rule id="can_run_DcIPaddr" result="can" boolean_op="and">
<expression attribute="is_dc" operation="eq" value="true"/>
</rule>
</rsc_location>
</constraints>
</configuration>
<status/>
</cib>
'''
if self.Env["CIBResource"] == 1:
print("Enabling DC resource")
self.default_cts_cib='''
<cib cib_feature_revision="1" num_updates="1" have_quorum="false" epoche="1">
<configuration>
<crm_config/>
<nodes/>
<resources>
<resource id="DcIPaddr" class="heartbeat" type="IPaddr2" priority="1.0">
<instance_attributes>
<rsc_parameters>
<nvpair name="1" value="127.0.0.10"/>
</rsc_parameters>
</instance_attributes>
</resource>
</resources>
<constraints>
<rsc_location id="run_DcIPaddr" rsc="DcIPaddr">
<rule id="can_run_DcIPaddr" result="can" boolean_op="and">
<expression attribute="is_dc" operation="eq" value="true"/>
</rule>
</rsc_location>
</constraints>
</configuration>
<status/>
</cib>
'''
# KLUDGE! Expedient, but a Kludge (FIXME)
# CTStests.AllTestClasses = [FlipTest,RestartTest,StartOnebyOne,SimulStart,SimulStop,Split_brainTest,BandwidthTest]
# StartOnebyOne is redundant as it is performed before SimulStop
CTStests.AllTestClasses = [FlipTest, RestartTest, SimulStart, SimulStop]
# CTSaudits.AllAuditClasses = [CrmdStateAudit, HAResourceAudit]
CTSaudits.AllAuditClasses = [CrmdStateAudit, PartitionAudit]
- def errorstoignore(self):
- '''Return list of errors which are 'normal' and should be ignored'''
- if 0:
- return [ "heartbeat.*ERROR: Respawning client \"/usr/lib/heartbeat/ha_logd\"",
- "heartbeat.*ERROR: Irretrievably lost packet",
- "heartbeat.*ERROR: Cannot rexmit pkt .*: seqno too low",
- "heartbeat.*ERROR: Cannot rexmit pkt .*: seqno not found",
- "heartbeat.*ERROR: channel is not connected",
- "ccm: .*ERROR: .*dropping message of type.*Is this a Byzantime failure?"
- ]
- return []
-
def StataCM(self, node):
'''Report the status of the cluster manager on a given node'''
out=self.rsh.readaline(node, self["StatusCmd"]%node)
ret= (string.find(out, 'ok') != -1)
try:
if ret:
if self.ShouldBeStatus[node] != self["up"]:
self.log(
"Node status for %s is %s but we think it should be %s"
% (node, self["up"], self.ShouldBeStatus[node]))
self.log("Expected: %s. Actual: %s"
% (self.ShouldBeStatus[node], out))
else:
if self.ShouldBeStatus[node] != self["down"]:
self.log(
"Node status for %s is %s but we think it should be %s"
% (node, self["down"], self.ShouldBeStatus[node]))
except KeyError: pass
if ret: self.ShouldBeStatus[node]=self["up"]
else: self.ShouldBeStatus[node]=self["down"]
return ret
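# StartaCM: arm a LogWatcher for the startup patterns (plus DC_IDLE if
# another node is already up), optionally push a fresh CIB, run StartCmd,
# then fall back to querying crmadmin directly if the patterns never appear.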
def StartaCM(self, node):
'''Start up the cluster manager on a given node'''
patterns = []
patterns.append(self["Pat:We_started"]%node)
# only search for this pattern if there is another node out there
# that should be the DC
if self.any_running() == 1:
patterns.append(self["Pat:DC_IDLE"])
watch = CTS.LogWatcher(self["LogFileName"], patterns, 120)
watch.setwatch()
self.log ("Starting %s on node %s" %(self["Name"], node))
if self.Env["ClobberCIB"] != None:
if self.Env["CIBfilename"] == None:
os.system("rm -f /tmp/cts.default.cib")
os.system("echo \'" + self.default_cts_cib + "\' > /tmp/cts.default.cib")
self.rsh.cp("/tmp/cts.default.cib", self["CIBfile"]%node)
os.system("rm -f /tmp/cts.default.cib")
else:
self.rsh.cp(self.Env["CIBfilename"], self["CIBfile"]%node)
self.rsh(node, self["StartCmd"])
self.ShouldBeStatus[node]=self["up"]
if watch.lookforall():
return 1
# the watch() failed... let's check whether the start _really_ failed
for regex in watch.unmatched:
self.log ("Startup pattern not found: %s" %(regex))
out = self.rsh.readaline(node, self["StatusCmd"]%node)
if string.find(out, 'ok') == -1:
# yep, it _really_ failed
self.ShouldBeStatus[node]=self["down"]
self.log ("Could not start %s on node %s" %(self["Name"], node))
return None
ret=(string.find(out, 'S_NOT_DC') != -1)
if ret:
# actually we joined the cluster just fine
self.log ("%s on %s joined the cluster" %(self["Name"], node))
return 1
ret= (string.find(out, 'S_IDLE') != -1)
if ret:
# actually we joined the cluster just fine
self.log ("%s on %s joined the cluster as DC" %(self["Name"], node))
return 1
self.log ("%s on %s started but unstable: %s"
%(self["Name"], node, out))
# self.ShouldBeStatus[node]=self["down"]
return None
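# The next three methods pull cib.xml from the first cluster node and parse
# it with minidom: Resources() collects the <resource> elements and
# Dependancies() collects the <rsc_to_rsc> constraints.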
def Configuration(self):
if not self.rsh.cp(self["CIBfile"]%self.Env["nodes"][0],self.Env["HAdir"]):
raise ValueError("Can not copy file to %s, maybe permission denied"%self.Env["HAdir"])
cib=parse("%s/cib.xml"%self.Env["HAdir"])
return cib.getElementsByTagName('configuration')[0]
def Resources(self):
ResourceList = []
# read the resources from the CIB
configuration=self.Configuration()
resources=configuration.getElementsByTagName('resources')[0]
rscs=configuration.getElementsByTagName('resource')
for rsc in rscs:
ResourceList.append(HAResource(self,rsc))
return ResourceList
def Dependancies(self):
DependancyList = []
# read the dependencies from the CIB
configuration=self.Configuration()
constraints=configuration.getElementsByTagName('constraints')[0]
rsc_to_rscs=configuration.getElementsByTagName('rsc_to_rsc')
for node in rsc_to_rscs:
dependancy = {}
dependancy["id"]=node.getAttribute('id')
dependancy["from"]=node.getAttribute('from')
dependancy["to"]=node.getAttribute('to')
dependancy["type"]=node.getAttribute('type')
dependancy["strength"]=node.getAttribute('strength')
DependancyList.append(dependancy)
return DependancyList
def any_running(self):
for node in self.Env["nodes"]:
if self.ShouldBeStatus[node] == self["up"]:
return 1
return 0
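# HAResourceAudit: verify that every resource in the CIB is running on
# exactly one node, and that resources tied together by a "must" placement
# constraint are running on the same node.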
class HAResourceAudit(ClusterAudit):
def __init__(self, cm):
self.CM = cm
def _RscRunningNodes(self, resource):
ResourceNodes = []
for node in self.CM.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
if resource.IsRunningOn(node):
ResourceNodes.append(node)
return ResourceNodes
def __call__(self):
self.CM.log ("Do Audit %s"%self.name())
passed = 1
NodeofRsc = {}
# Make sure each resource is running on one and only one node
Resources = self.CM.Resources()
for resource in Resources :
RunningNodes = self._RscRunningNodes(resource)
NodeofRsc[resource.rid]=RunningNodes
if len(RunningNodes) == 0 :
print resource.rid + " isn't running anywhere"
passed = 0
if len(RunningNodes) > 1:
print resource.rid + " is running more than once: " \
+ str(RunningNodes)
passed = 0
# Make sure resources with a "must" placement constraint are running on the same node
Dependancies = self.CM.Dependancies()
for dependancy in Dependancies:
if dependancy["type"] == "placement" and dependancy["strength"] == "must":
if NodeofRsc[dependancy["from"]] != NodeofRsc[dependancy["to"]]:
print dependancy["from"] + " and " + dependancy["to"] + " should be run on same node"
passed = 0
return passed
def name(self):
return "HAResourceAudit"
class HAResource(Resource):
def __init__(self, cm, node):
'''
Get information from xml node
'''
self.rid = node.getAttribute('id')
self.rclass = node.getAttribute('class')
self.rtype = node.getAttribute('type')
self.rparameters = {}
attributes = node.getElementsByTagName('instance_attributes')[0]
parameters = node.getElementsByTagName('rsc_parameters')[0]
nvpairs = node.getElementsByTagName('nvpair')
for nvpair in nvpairs:
name=nvpair.getAttribute('name')
value=nvpair.getAttribute('value')
self.rparameters[name]=value
Resource.__init__(self, cm, self.rtype, self.rid)
def IsRunningOn(self, nodename):
'''
This member function returns true if our resource is running
on the given node in the cluster.
We call the status operation for the resource script.
'''
out=self.CM.rsh.readaline(nodename, self.CM["IsRscRunning"]%self.rid)
return re.search("0",out)
def RunningNodes(self):
ResourceNodes = []
for node in self.CM.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
if self.IsRunningOn(node):
ResourceNodes.append(node)
return ResourceNodes
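# Resource operations (start/stop/monitor) are driven through lrmadmin via
# the ExecuteRscOp command; success is judged by the remote shell's exit code.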
def _ResourceOperation(self, operation, nodename):
'''
Execute an operation on the resource
'''
self.CM.rsh.readaline(nodename, self.CM["ExecuteRscOp"]%(self.rid,operation))
return self.CM.rsh.lastrc == 0
def Start(self, nodename):
'''
This member function starts or activates the resource.
'''
return self._ResourceOperation("start", nodename)
def Stop(self, nodename):
'''
This member function stops or deactivates the resource.
'''
return self._ResourceOperation("stop", nodename)
def IsWorkingCorrectly(self, nodename):
return self._ResourceOperation("monitor", nodename)
class CrmdStateAudit(ClusterAudit):
def __init__(self, cm):
self.CM = cm
self.Stats = {"calls":0
, "success":0
, "failure":0
, "skipped":0
, "auditfail":0}
def has_key(self, key):
return self.Stats.has_key(key)
def __setitem__(self, key, value):
self.Stats[key] = value
def __getitem__(self, key):
return self.Stats[key]
def incr(self, name):
'''Increment (or initialize) the value associated with the given name'''
if not self.Stats.has_key(name):
self.Stats[name]=0
self.Stats[name] = self.Stats[name]+1
def __call__(self):
self.CM.log ("Do Audit %s"%self.name())
passed = 1
dc_list = []
up_count = 0
node_count = 0
up_are_down = 0
down_are_up = 0
slave_count = 0
unstable_list = []
for node in self.CM.Env["nodes"]:
out=self.CM.rsh.readaline(node, self.CM["StatusCmd"]%node)
ret = (string.find(out, 'ok') != -1)
node_count = node_count + 1
if ret:
up_count = up_count + 1
if self.CM.ShouldBeStatus[node] == self.CM["down"]:
self.CM.log(
"Node %s %s when it should be %s"
% (node, self.CM["up"], self.CM.ShouldBeStatus[node]))
self.CM.ShouldBeStatus[node] = self.CM["up"]
down_are_up = down_are_up + 1
ret= (string.find(out, 'S_NOT_DC') != -1)
if ret:
slave_count = slave_count + 1
else:
ret= (string.find(out, 'S_IDLE') != -1)
if ret:
dc_list.append(node)
else:
unstable_list.append(out)
else:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
self.CM.log(
"Node %s %s when it should be %s"
% (node, self.CM["down"], self.CM.ShouldBeStatus[node]))
self.CM.ShouldBeStatus[node] = self.CM["down"]
up_are_down = up_are_down + 1
# if up_count > 0 and len(dc_list) != 1:
# passed = 0
# self.CM.log("Exactly 1 node should be DC. We found %d (of %d): %s"
# %(len(dc_list), up_count, str(dc_list)))
if len(unstable_list) > 0:
passed = 0
self.CM.log("Cluster is not stable: %d (of %d)."
%(len(unstable_list), up_count))
for status in unstable_list:
self.CM.log("%s" %(status))
if up_are_down > 0:
passed = 0
self.CM.log("%d (of %d) nodes expected to be up were down."
%(up_are_down, node_count))
if down_are_up > 0:
passed = 0
self.CM.log("%d (of %d) nodes expected to be down were up."
%(down_are_up, node_count))
return passed
def name(self):
return "CrmdStateAudit"
class PartitionAudit(ClusterAudit):
def __init__(self, cm):
self.CM = cm
self.Stats = {"calls":0
, "success":0
, "failure":0
, "skipped":0
, "auditfail":0}
self.NodeEpoche={}
self.NodeState={}
self.NodeQuorum={}
self.NodeCCM={}
def has_key(self, key):
return self.Stats.has_key(key)
def __setitem__(self, key, value):
self.Stats[key] = value
def __getitem__(self, key):
return self.Stats[key]
def incr(self, name):
'''Increment (or initialize) the value associated with the given name'''
if not self.Stats.has_key(name):
self.Stats[name]=0
self.Stats[name] = self.Stats[name]+1
def __call__(self):
self.CM.log ("Do Audit %s"%self.name())
passed = 1
nodes_up = 0
ccm_partitions = []
for node in self.CM.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
nodes_up = nodes_up + 1
# self.PS_State[node] = os.system("@SSH@ root@%s ps -C crmd" %(node))
self.NodeQuorum[node] = self.CM.rsh.readaline(
node, self.CM["QuorumCmd"])
self.NodeCCM[node] = self.CM.rsh.readaline(
node, self.CM["ParitionCmd"])
self.NodeEpoche[node] = self.CM.rsh.readaline(
node, self.CM["EpocheCmd"])
self.NodeState[node] = self.CM.rsh.readaline(
node, self.CM["StatusCmd"]%node)
if len(self.NodeState[node]) > 1:
self.NodeState[node] = self.NodeState[node][:-1]
if len(self.NodeEpoche[node]) > 0:
self.NodeEpoche[node] = int(self.NodeEpoche[node][:-1])
if len(self.NodeQuorum[node]) > 1:
self.NodeQuorum[node] = self.NodeQuorum[node][:-1]
if len(self.NodeCCM[node]) > 1:
self.NodeCCM[node] = self.NodeCCM[node][:-1]
found = 0
for partition in ccm_partitions:
if partition == self.NodeCCM[node]:
found = 1
if found == 0:
ccm_partitions.append(self.NodeCCM[node])
if nodes_up == 0:
return 1
# if len(ccm_partitions) > 1:
# self.CM.log("%d cluster partitions detected:" %len(ccm_partitions))
# for partition in ccm_partitions:
# self.CM.log("\t %s" %partition)
for partition in ccm_partitions:
partition_passed = 0
if self.audit_partition(partition) == 0:
passed = 0
return passed
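# A partition string is a whitespace-separated node list from ccm_tool -p.
# Within a partition there should be exactly one DC, and it should be the
# node with the lowest CCM epoche; DcIPaddr should run on DCs that have
# quorum and on no other node.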
def audit_partition(self, partition):
passed = 0
dc_found = []
dc_allowed_list = []
lowest_epoche = None
node_list = partition.split()
self.CM.log("Auditing partition: %s" %(partition))
for node in node_list:
if lowest_epoche == None or self.NodeEpoche[node] < lowest_epoche:
lowest_epoche = self.NodeEpoche[node]
for node in node_list:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
if self.is_node_dc(self.NodeState[node]):
dc_found.append(node)
if self.NodeEpoche[node] == lowest_epoche:
passed = 1
else:
self.CM.log("DC %s is not the oldest node (%d vs. %d)"
%(node, self.NodeEpoche[node], lowest_epoche))
passed = 0
if len(dc_found) == 0:
self.CM.log("DC not found on any of the %d allowed nodes: %s (of %s)"
%(len(dc_allowed_list), str(dc_allowed_list), str(node_list)))
elif len(dc_found) > 1:
self.CM.log("%d DCs (%s) found in cluster partition: %s"
%(len(dc_found), str(dc_found), str(node_list)))
passed = 0
if passed == 0:
for node in node_list:
if self.CM.ShouldBeStatus[node] == self.CM["up"]:
self.CM.log("epoche %s : %s"
%(self.NodeEpoche[node], self.NodeState[node]))
if self.audit_dc_resources(node_list, dc_found) == 0:
passed = 0
return passed
def audit_dc_resources(self, node_list, dc_list):
passed = 1
Resources = self.CM.Resources()
for resource in Resources:
self.CM.log("Auditing resource: %s" %(resource))
if resource.rid == "DcIPaddr":
# All DCs are running the resource
for dc in dc_list:
if self.NodeQuorum[dc]:
if resource.IsRunningOn(dc) == 0:
self.CM.log("Resource %s not running on DC: %s"
%(resource, dc))
passed = 0
# All nodes running the resource are DCs
for node in node_list:
if resource.IsRunningOn(node):
if self.is_node_dc(self.NodeState[node]) == 0:
self.CM.log("Resource %s is running on non-DC node %s"
%("DcIPaddr", node))
passed = 0
return passed
def is_node_dc(self, status_line):
return (string.find(status_line, 'S_IDLE') != -1)
def name(self):
return "PartitionAudit"
#######################################################################
#
# A little test code...
#
# Which you are advised to completely ignore...
#
#######################################################################
if __name__ == '__main__':
pass
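# In normal use this module is not run directly: the CTS driver imports it,
# instantiates LinuxHAv2 with an Environment object, and exercises the test
# and audit classes registered above. A rough sketch (hypothetical driver
# code, not part of this file):
#
#   cm = LinuxHAv2(env)
#   for node in env["nodes"]:
#       cm.StartaCM(node)
#   audit = PartitionAudit(cm)
#   audit()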
