diff --git a/cts/CM_LinuxHAv2.py.in b/cts/CM_LinuxHAv2.py.in
index 029a1083b7..b85e6b0960 100755
--- a/cts/CM_LinuxHAv2.py.in
+++ b/cts/CM_LinuxHAv2.py.in
@@ -1,593 +1,580 @@
#!@PYTHON@
'''CTS: Cluster Testing System: LinuxHA v2 dependent modules...
'''
__copyright__='''
Author: Huang Zhen <zhenhltc@cn.ibm.com>
Copyright (C) 2004 International Business Machines
Additional Audits, Revised Start action, Default Configuration:
Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
'''
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import os, sys, CTS, CTSaudits, CTStests, warnings
from CTS import *
from CM_hb import HeartbeatCM
from CTSaudits import ClusterAudit
from CTStests import *
from CIB import *
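# NOTE: several names used below (string, re, Process, ...) appear to be
# pulled in by the wildcard imports above -- a Python 2-era idiom; this is
# an inference from usage, not something the file states explicitly.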
try:
from xml.dom.minidom import *
except ImportError:
sys.__stdout__.write("Python module xml.dom.minidom not found\n")
sys.__stdout__.write("Please install python-xml or similar before continuing\n")
sys.__stdout__.flush()
sys.exit(1)
#######################################################################
#
# LinuxHA v2 dependent modules
#
#######################################################################
class LinuxHAv2(HeartbeatCM):
'''
The linux-ha version 2 cluster manager class.
It implements the things we need to talk to and manipulate
linux-ha version 2 clusters
'''
def __init__(self, Environment, randseed=None):
HeartbeatCM.__init__(self, Environment, randseed=randseed)
self.fastfail = 0
self.clear_cache = 0
self.cib_installed = 0
self.config = None
self.cluster_monitor = 0
self.use_short_names = 1
self.update({
"Name" : "linux-ha-v2",
"DeadTime" : 300,
"StartTime" : 300, # Max time to start up
"StableTime" : 30,
"StartCmd" : "@INITDIR@/heartbeat@INIT_EXT@ start > /dev/null 2>&1",
"StopCmd" : "@INITDIR@/heartbeat@INIT_EXT@ stop > /dev/null 2>&1",
"ElectionCmd" : "@sbindir@/crmadmin -E %s",
"StatusCmd" : "@sbindir@/crmadmin -S %s 2>/dev/null",
"EpocheCmd" : "@sbindir@/ccm_tool -e",
"QuorumCmd" : "@sbindir@/ccm_tool -q",
"CibQuery" : "@sbindir@/cibadmin -Ql",
"ParitionCmd" : "@sbindir@/ccm_tool -p",
"ExecuteRscOp" : "@libdir@/heartbeat/lrmadmin -n %s -E %s %s 0 %d EVERYTIME 2>&1",
"CIBfile" : "%s:@HA_VARLIBDIR@/heartbeat/crm/cib.xml",
"TmpDir" : "/tmp",
"BreakCommCmd2" : "@HA_NOARCHDATAHBDIR@/TestHeartbeatComm break-communication %s>/dev/null 2>&1",
"IsIPAddrRscRunning" : "",
"StandbyCmd" : "@sbindir@/crm_standby -U %s -v %s 2>/dev/null",
"UUIDQueryCmd" : "@sbindir@/crmadmin -N",
"StandbyQueryCmd" : "@sbindir@/crm_standby -GQ -U %s 2>/dev/null",
# Patterns to look for in the log files for various occasions...
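# The %s placeholders in the node-specific patterns below are filled in
# with a node name at match time (e.g. self["Pat:Slave_started"] % node
# in test_node_CM() further down).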
"Pat:DC_IDLE" : "crmd.*State transition.*-> S_IDLE",
# This won't work if we have multiple partitions
"Pat:Local_started" : "%s crmd:.*The local CRM is operational",
"Pat:Slave_started" : "%s crmd:.*State transition.*-> S_NOT_DC",
"Pat:Master_started" : "%s crmd:.* State transition.*-> S_IDLE",
"Pat:We_stopped" : "heartbeat.*%s.*Heartbeat shutdown complete",
"Pat:Logd_stopped" : "%s logd:.*Exiting write process",
"Pat:They_stopped" : "%s crmd:.*LOST:.* %s ",
"Pat:All_stopped" : "heartbeat.*%s.*Heartbeat shutdown complete",
"Pat:They_dead" : "node %s.*: is dead",
"Pat:TransitionComplete" : "Transition status: Complete: complete",
# Bad news Regexes. Should never occur.
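# Presumably consumed by the CTS log-watching machinery, which flags a
# test run whenever one of these matches -- an assumption based on how
# CTS uses such lists, not stated in this file.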
"BadRegexes" : (
r"ERROR:",
r"CRIT:",
r"Shutting down\.",
r"Forcing shutdown\.",
r"Timer I_TERMINATE just popped",
r"input=I_ERROR",
r"input=I_FAIL",
r"input=I_INTEGRATED cause=C_TIMER_POPPED",
r"input=I_FINALIZED cause=C_TIMER_POPPED",
r"input=I_ERROR",
r", exiting\.",
r"WARN.*Ignoring HA message.*vote.*not in our membership list",
r"pengine.*Attempting recovery of resource",
- r"tengine.*is taking more than 2x its timeout",
+ r"is taking more than 2x its timeout",
r"Confirm not received from",
r"Welcome reply not received from",
r"Attempting to schedule .* after a stop",
r"Resource .* was active at shutdown",
r"duplicate entries for call_id",
r"Search terminated:",
r"No need to invoke the TE",
r":global_timer_callback",
r"Faking parameter digest creation",
r"Parameters to .* action changed:",
r"Parameters to .* changed",
),
})
del self["Standby"]
if self.Env["DoBSC"]:
del self["Pat:They_stopped"]
del self["Pat:Logd_stopped"]
self.Env["use_logd"] = 0
self.check_transitions = 0
self.check_elections = 0
self.CIBsync = {}
self.default_cts_cib=CIB(self).cib()
self.debug(self.default_cts_cib)
def errorstoignore(self):
# At some point implement a more elegant solution that
# also produces a report at the end
'''Return the list of errors which are known, very noisy, and should be ignored'''
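# The 'if 1:' below is just a crude toggle: flip it to 'if 0:' to stop
# ignoring these messages and fall through to the empty list.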
if 1:
return [
"ERROR: Message hist queue is filling up",
"stonithd: .*CRIT: external_hostlist: 'vmware gethosts' returned an empty hostlist",
"stonithd: .*ERROR: Could not list nodes for stonith RA external/vmware.",
"pengine: Preventing .* from re-starting",
]
return []
def install_config(self, node):
if not self.ns.WaitForNodeToComeUp(node):
self.log("Node %s is not up." % node)
return None
if not self.CIBsync.has_key(node) and self.Env["ClobberCIB"] == 1:
self.CIBsync[node] = 1
self.rsh.remote_py(node, "os", "system", "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml")
self.rsh.remote_py(node, "os", "system", "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml.sig")
self.rsh.remote_py(node, "os", "system", "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml.last")
self.rsh.remote_py(node, "os", "system", "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml.sig.last")
# Only install the CIB on the first node, all the other ones will pick it up from there
if self.cib_installed == 1:
return None
self.cib_installed = 1
if self.Env["CIBfilename"] == None:
self.debug("Installing Generated CIB on node %s" %(node))
warnings.filterwarnings("ignore")
cib_file=os.tmpnam()
warnings.resetwarnings()
os.system("rm -f "+cib_file)
self.debug("Creating new CIB for " + node + " in: " + cib_file)
os.system("echo \'" + self.default_cts_cib + "\' > " + cib_file)
if 0!=self.rsh.echo_cp(None, cib_file, node, "@HA_VARLIBDIR@/heartbeat/crm/cib.xml"):
raise ValueError("Can not create CIB on %s "%node)
os.system("rm -f "+cib_file)
else:
self.debug("Installing CIB (%s) on node %s" %(self.Env["CIBfilename"], node))
if 0!=self.rsh.cp(self.Env["CIBfilename"], "root@" + (self["CIBfile"]%node)):
raise ValueError("Can not scp file to %s "%node)
self.rsh.remote_py(node, "os", "system", "chown @HA_CCMUSER@ @HA_VARLIBDIR@/heartbeat/crm/cib.xml")
def prepare(self):
'''Finish the Initialization process. Prepare to test...'''
for node in self.Env["nodes"]:
self.ShouldBeStatus[node] = ""
self.StataCM(node)
def test_node_CM(self, node):
'''Report the status of the cluster manager on a given node'''
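# Returns 0 (down), 1 (up but not yet stable) or 2 (up and stable);
# StataCM() and node_stable() below rely on exactly this encoding.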
watchpats = [ ]
watchpats.append("Current ping state: (S_IDLE|S_NOT_DC)")
watchpats.append(self["Pat:Slave_started"]%node)
idle_watch = CTS.LogWatcher(self["LogFileName"], watchpats)
idle_watch.setwatch()
out=self.rsh.readaline(node, self["StatusCmd"]%node)
self.debug("Node %s status: '%s'" %(node, out))
if not out or string.find(out, 'ok') < 0:
if self.ShouldBeStatus[node] == self["up"]:
self.log(
"Node status for %s is %s but we think it should be %s"
%(node, self["down"], self.ShouldBeStatus[node]))
self.ShouldBeStatus[node]=self["down"]
return 0
if self.ShouldBeStatus[node] == self["down"]:
self.log(
"Node status for %s is %s but we think it should be %s: %s"
%(node, self["up"], self.ShouldBeStatus[node], out))
self.ShouldBeStatus[node]=self["up"]
# check the output first - because syslog-ng loses messages
if string.find(out, 'S_NOT_DC') != -1:
# Up and stable
return 2
if string.find(out, 'S_IDLE') != -1:
# Up and stable
return 2
# fall back to syslog-ng and wait
if not idle_watch.look():
# just up
self.debug("Warn: Node %s is unstable: %s" %(node, out))
return 1
# Up and stable
return 2
# Is the node up or is the node down
def StataCM(self, node):
'''Report the status of the cluster manager on a given node'''
if self.test_node_CM(node) > 0:
return 1
return None
# Being up and being stable is not the same question...
def node_stable(self, node):
'''Report the status of the cluster manager on a given node'''
if self.test_node_CM(node) == 2:
return 1
self.log("Warn: Node %s not stable" %(node))
return None
def cluster_stable(self, timeout=None):
watchpats = [ ]
watchpats.append("Current ping state: S_IDLE")
watchpats.append(self["Pat:DC_IDLE"])
self.debug("Waiting for cluster stability...")
if timeout == None:
timeout = self["DeadTime"]
idle_watch = CTS.LogWatcher(self["LogFileName"], watchpats, timeout)
idle_watch.setwatch()
any_up = 0
for node in self.Env["nodes"]:
# have each node dump its current state
if self.ShouldBeStatus[node] == self["up"]:
self.rsh.readaline(node, (self["StatusCmd"] %node) )
any_up = 1
if any_up == 0:
self.debug("Cluster is inactive")
return 1
ret = idle_watch.look()
if ret:
self.debug(ret)
return 1
self.log("Warn: Cluster Master not IDLE after %ds" % timeout)
return None
def is_node_dc(self, node, status_line=None):
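# S_IDLE, S_INTEGRATION, S_FINALIZE_JOIN, S_POLICY_ENGINE and
# S_TRANSITION_ENGINE are FSM states that only the DC enters, which is
# why matching any of them identifies the node as the DC.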
rc = 0
if not status_line:
status_line = self.rsh.readaline(node, self["StatusCmd"]%node)
if not status_line:
rc = 0
elif string.find(status_line, 'S_IDLE') != -1:
rc = 1
elif string.find(status_line, 'S_INTEGRATION') != -1:
rc = 1
elif string.find(status_line, 'S_FINALIZE_JOIN') != -1:
rc = 1
elif string.find(status_line, 'S_POLICY_ENGINE') != -1:
rc = 1
elif string.find(status_line, 'S_TRANSITION_ENGINE') != -1:
rc = 1
if rc == 1:
self.debug("%s _is_ the DC" % node)
return rc
def active_resources(self, node):
# [SM].* {node} matches Started, Slave, Master
# Stopped won't be matched as it won't include {node}
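# An illustrative (hypothetical) crm_mon -1 line that the grep matches:
#   rsc_c001n01 (ocf::heartbeat:IPaddr): Started c001n01
# fields[0] is then the resource id.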
(rc, output) = self.rsh.remote_py(
node, "os", "system", """@sbindir@/crm_mon -1 | grep -e "[SM].* %s" """ % node)
resources = []
for line in output:
fields = line.split()
resources.append(fields[0])
return resources
def ResourceOp(self, resource, op, node, interval=0, app="lrmadmin"):
'''
Execute an operation on a resource
'''
cmd = self["ExecuteRscOp"] % (app, resource, op, interval)
(rc, lines) = self.rsh.remote_py(node, "os", "system", cmd)
- self.debug("RscOp '%s' on %s: %d" % (cmd, node, rc))
+ #self.debug("RscOp '%s' on %s: %d" % (cmd, node, rc))
#for line in lines:
# self.debug("RscOp: "+line)
return rc
def ResourceLocation(self, rid):
ResourceNodes = []
for node in self.Env["nodes"]:
if self.ShouldBeStatus[node] == self["up"]:
rc = self.ResourceOp(rid, "monitor", node)
# Strange error codes from remote_py
# 65024 == not installed
# 2048 == 8
# 1792 == 7
# 0 == 0
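# These look like raw os.system()/wait() statuses, i.e. exit code << 8:
# 65024 >> 8 == 254, 2048 >> 8 == 8, 1792 >> 8 == 7. Mapping 7 to
# "not running" matches the OCF return-code convention -- an inference,
# not something this file states.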
if rc == 65024:
self.debug("%s is not installed on %s: %d" % (rid, node, rc))
if rc == 0 or rc == 2048 or rc == 8:
ResourceNodes.append(node)
elif rc == 7 or rc == 1792:
self.debug("%s is not running on %s: %d" % (rid, node, rc))
else:
# not active on this node?
self.debug("Unknown rc code for %s on %s: %d" % (rid, node, rc))
return ResourceNodes
def isolate_node(self, node, allowlist):
'''isolate the communication between the nodes'''
rc = self.rsh(node, self["BreakCommCmd2"]%allowlist)
if rc == 0:
return 1
else:
self.log("Could not break the communication from node: %s",node)
return None
def find_partitions(self):
ccm_partitions = []
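# readaline() appears to return the line newline-terminated (stripped
# below via partition[:-1]); nodes in the same partition report an
# identical membership string, so duplicates collapse into one entry.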
for node in self.Env["nodes"]:
if self.ShouldBeStatus[node] == self["up"]:
partition = self.rsh.readaline(node, self["ParitionCmd"])
if not partition:
self.log("no partition details for %s" %node)
elif len(partition) > 2:
partition = partition[:-1]
found=0
for a_partition in ccm_partitions:
if partition == a_partition:
found = 1
if found == 0:
self.debug("Adding partition from %s: %s" %(node, partition))
ccm_partitions.append(partition)
else:
self.debug("Partition '%s' is consistent with existing entries" %(partition))
else:
self.log("bad partition details for %s" %node)
else:
self.debug("Node %s is down... skipping" %node)
return ccm_partitions
def HasQuorum(self, node_list):
# If we are auditing a partition, then one side will
# have quorum and the other not.
# So the caller needs to tell us which we are checking
# If no value for node_list is specified... assume all nodes
if not node_list:
node_list = self.Env["nodes"]
for node in node_list:
if self.ShouldBeStatus[node] == self["up"]:
quorum = self.rsh.readaline(node, self["QuorumCmd"])
if string.find(quorum, "1") != -1:
return 1
elif string.find(quorum, "0") != -1:
return 0
else:
self.log("WARN: Unexpected quorum test result from "+ node +":"+ quorum)
return 0
def Components(self):
complist = []
common_ignore = [
"Pending action:",
"ERROR: crm_log_message_adv:",
"ERROR: MSG: No message to dump",
"pending LRM operations at shutdown",
"Lost connection to the CIB service",
"Connection to the CIB terminated...",
"Sending message to CIB service FAILED",
"crmd: .*Action A_RECOVER .* not supported",
"ERROR: stonithd_op_result_ready: not signed on",
"send_ipc_message: IPC Channel to .* is not connected",
"unconfirmed_actions: Waiting on .* unconfirmed actions",
"cib_native_msgready: Message pending on command channel",
"crmd:.*do_exit: Performing A_EXIT_1 - forcefully exiting the CRMd",
"verify_stopped: Resource .* was active at shutdown. You may ignore this error if it is unmanaged.",
]
stonith_ignore = [
"ERROR: stonithd_signon: ",
"update_failcount: Updating failcount for child_DoFencing",
"ERROR: te_connect_stonith: Sign-in failed: triggered a retry",
]
stonith_ignore.extend(common_ignore)
ccm = Process("ccm", 0, [
"State transition S_IDLE",
"CCM connection appears to have failed",
"crmd: .*Action A_RECOVER .* not supported",
"crmd: .*Input I_TERMINATE from do_recover",
"Exiting to recover from CCM connection failure",
"crmd:.*do_exit: Could not recover from internal error",
"crmd: .*I_ERROR.*(ccm_dispatch|crmd_cib_connection_destroy)",
# "WARN: determine_online_status: Node .* is unclean",
# "Scheduling Node .* for STONITH",
# "Executing .* fencing operation",
# "tengine_stonith_callback: .*result=0",
"A new node joined the cluster",
# "Processing I_NODE_JOIN:.* cause=C_HA_MESSAGE",
# "State transition S_.* -> S_INTEGRATION.*input=I_NODE_JOIN",
"State transition S_STARTING -> S_PENDING",
], [], common_ignore, self.fastfail, self)
cib = Process("cib", 0, [
"State transition S_IDLE",
"Lost connection to the CIB service",
"Connection to the CIB terminated...",
"crmd: .*Input I_TERMINATE from do_recover",
"crmd: .*I_ERROR.*crmd_cib_connection_destroy",
"crmd:.*do_exit: Could not recover from internal error",
], [], common_ignore, self.fastfail, self)
lrmd = Process("lrmd", 0, [
"State transition S_IDLE",
"LRM Connection failed",
"crmd: .*I_ERROR.*lrm_dispatch",
"State transition S_STARTING -> S_PENDING",
".*crmd .*exited with return code 2.",
"crmd: .*Input I_TERMINATE from do_recover",
"crmd:.*do_exit: Could not recover from internal error",
], [], common_ignore, self.fastfail, self)
crmd = Process("crmd", 0, [
# "WARN: determine_online_status: Node .* is unclean",
# "Scheduling Node .* for STONITH",
# "Executing .* fencing operation",
# "tengine_stonith_callback: .*result=0",
"State transition S_IDLE",
"State transition S_STARTING -> S_PENDING",
], [
- "tengine: .*ERROR: subsystem_msg_dispatch: The server .* has left us: Shutting down...NOW",
"pengine: .*ERROR: subsystem_msg_dispatch: The server .* has left us: Shutting down...NOW",
], common_ignore, self.fastfail, self)
pengine = Process("pengine", 1, [
"State transition S_IDLE",
".*crmd .*exited with return code 2.",
"crmd: .*Input I_TERMINATE from do_recover",
"crmd:.*do_exit: Could not recover from internal error",
], [], common_ignore, self.fastfail, self)
- tengine = Process("tengine", 1, [
- "State transition S_IDLE",
- ".*crmd .*exited with return code 2.",
- "crmd: .*Input I_TERMINATE from do_recover",
- "crmd:.*do_exit: Could not recover from internal error",
- ], [], common_ignore, self.fastfail, self)
-
if self.Env["DoFencing"] == 1 :
complist.append(Process("stonithd", 0, [], [
"tengine_stonith_connection_destroy: Fencing daemon has left us",
"Attempting connection to fencing daemon",
"te_connect_stonith: Connected",
], stonith_ignore, 0, self))
# complist.append(Process("heartbeat", 0, [], [], [], None, self))
if self.fastfail == 0:
ccm.pats.extend([
"ERROR: Client .*attrd exited with return code 1",
"ERROR: Respawning client .*attrd",
"ERROR: Client .*cib exited with return code 2",
"ERROR: Respawning client .*cib",
"ERROR: Client .*crmd exited with return code 2",
"ERROR: Respawning client .*crmd"
])
cib.pats.extend([
"ERROR: Client .*attrd exited with return code 1",
"ERROR: Respawning client .*attrd",
"ERROR: Client .*crmd exited with return code 2",
"ERROR: Respawning client .*crmd"
])
lrmd.pats.extend([
"ERROR: Client .*crmd exited with return code 2",
"ERROR: Respawning client .*crmd"
])
pengine.pats.extend([
"ERROR: Client .*crmd exited with return code 2",
"ERROR: Respawning client .*crmd"
])
- tengine.pats.extend([
- "ERROR: Client .*crmd exited with return code 2",
- "ERROR: Respawning client .*crmd"
- ])
complist.append(ccm)
complist.append(cib)
complist.append(lrmd)
complist.append(crmd)
complist.append(pengine)
- complist.append(tengine)
return complist
def NodeUUID(self, node):
lines = self.rsh.readlines(node, self["UUIDQueryCmd"])
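# crmadmin -N output is assumed to contain lines of the form
# "... <node> (<uuid>)", e.g. "normal node: c001n01 (some-uuid)" --
# a hypothetical example; only the parenthesised group is returned.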
for line in lines:
self.debug("UUIDLine:"+ line)
m = re.search(r'%s.+\((.+)\)' % node, line)
if m:
return m.group(1)
return ""
def StandbyStatus(self, node):
out=self.rsh.readaline(node, self["StandbyQueryCmd"]%node)
if not out:
return "off"
out = out[:-1]
self.debug("Standby result: "+out)
return out
# status == "on" : Enter Standby mode
# status == "off": Enter Active mode
def SetStandbyMode(self, node, status):
current_status = self.StandbyStatus(node)
cmd = self["StandbyCmd"] % (node, status)
ret = self.rsh(node, cmd)
return True
#######################################################################
#
# A little test code...
#
# Which you are advised to completely ignore...
#
#######################################################################
if __name__ == '__main__':
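# A minimal, hypothetical sketch of how this class is driven -- the
# real entry point is the CTS harness (CTSlab.py), not this stub:
#   env = CTS.Environment(...)   # assumed constructor
#   cm = LinuxHAv2(env)
#   cm.prepare()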
pass