Page MenuHomeClusterLabs Projects

No OneTemporary

diff --git a/cts/CM_common.py b/cts/CM_common.py
index da6b68a96d..506a2de054 100755
--- a/cts/CM_common.py
+++ b/cts/CM_common.py
@@ -1,502 +1,486 @@
'''CTS: Cluster Testing System: Cluster Manager Common Class
This was originally the cluster manager class for the Heartbeat stack.
It is retained for use as a base class by other cluster manager classes.
It could be merged into the ClusterManager class directly, but this is
easier.
'''
__copyright__ = '''
Author: Huang Zhen <zhenhltc@cn.ibm.com>
Copyright (C) 2004 International Business Machines
Additional Audits, Revised Start action, Default Configuration:
Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
'''
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
import sys
from cts.CTSvars import *
from cts.CTS import *
from cts.CIB import *
from cts.CTStests import AuditResource
from cts.watcher import LogWatcher
class crm_common(ClusterManager):
def __init__(self, Environment, randseed=None, name=None):
    '''Initialise the common cluster-manager state.

    Environment and randseed are handed to ClusterManager.__init__.
    The name argument is accepted but not used by this constructor.
    '''
    ClusterManager.__init__(self, Environment, randseed=randseed)

    # Flags that start out cleared
    self.fastfail = self.cib_installed = self.cluster_monitor = 0
    self.config = None
    self.use_short_names = 1

    if self.Env["DoBSC"]:
        del self.templates["Pat:They_stopped"]

    self._finalConditions()

    # Auditing switches and CIB bookkeeping
    self.check_transitions = self.check_elections = 0
    self.CIBsync = {}
    self.CibFactory = ConfigFactory(self)
    self.cib = self.CibFactory.createConfig(self.Env["Schema"])
def errorstoignore(self):
    '''Return the list of known, very noisy errors that should be ignored'''
    # At some point implement a more elegant solution that
    # also produces a report at the end
    selector = PatternSelector()
    return selector.get_patterns(self.name, "BadNewsIgnore")
def install_config(self, node):
    '''Install the initial CIB on node, at most once per cluster.

    Always returns None.  Nothing is done when the node never comes up,
    when the node has already been handled, or when the "ClobberCIB"
    setting is not 1.  Otherwise the existing cib* files on the node are
    removed and, if no node has received a CIB yet, either the generated
    CIB or the file named by "CIBfilename" is installed and handed to the
    cluster daemon user.

    Raises ValueError when copying the CIB file to the node fails.
    '''
    if not self.ns.WaitForNodeToComeUp(node):
        self.log("Node %s is not up." % node)
        return None

    if node not in self.CIBsync and self.Env["ClobberCIB"] == 1:
        self.CIBsync[node] = 1
        self.rsh(node, "rm -f "+CTSvars.CRM_CONFIG_DIR+"/cib*")

        # Only install the CIB on the first node, all the other ones will pick it up from there
        if self.cib_installed == 1:
            return None

        self.cib_installed = 1
        if self.Env["CIBfilename"] is None:
            self.log("Installing Generated CIB on node %s" % (node))
            self.cib.install(node)

        else:
            self.log("Installing CIB (%s) on node %s" % (self.Env["CIBfilename"], node))
            rc = self.rsh.cp(self.Env["CIBfilename"], "root@" + (self.templates["CIBfile"] % node))
            if rc != 0:
                # Supply both values the message asks for; with only the
                # node the formatting itself failed with a TypeError.
                raise ValueError("Can not scp file to %s %d" % (node, rc))

        self.rsh(node, "chown "+CTSvars.CRM_DAEMON_USER+" "+CTSvars.CRM_CONFIG_DIR+"/cib.xml")
def prepare(self):
    '''Finish the initialization process and get ready to test.

    Expects a single partition, then for every configured node clears
    the expected status, removes any isolation and probes the cluster
    manager.
    '''
    self.partitions_expected = 1

    all_nodes = self.Env["nodes"]
    for member in all_nodes:
        self.ShouldBeStatus[member] = ""
        self.unisolate_node(member)
        self.StataCM(member)
def test_node_CM(self, node):
    '''Probe the cluster manager on node and classify its state.

    Returns 0 when the status command produces no "ok" answer (down),
    1 when the node is up but not yet stable, and 2 when it is up and
    stable.  self.ShouldBeStatus[node] is set to "up" or "down" to match,
    and a message is logged when that contradicts the earlier expectation.
    '''
    watchpats = [
        "Current ping state: (S_IDLE|S_NOT_DC)",
        self.templates["Pat:Slave_started"]%node,
        self.templates["Pat:Master_started"]%node,
    ]

    # Start watching the logs before asking, so no message is missed
    idle_watch = LogWatcher(self.Env["LogFileName"], watchpats, "ClusterIdle", hosts=[node], kind=self.Env["LogWatcher"])
    idle_watch.setwatch()

    out = self.rsh(node, self.templates["StatusCmd"]%node, 1)
    self.debug("Node %s status: '%s'" %(node, out))

    node_is_up = bool(out) and 'ok' in out
    if not node_is_up:
        if self.ShouldBeStatus[node] == "up":
            self.log(
                "Node status for %s is %s but we think it should be %s"
                % (node, "down", self.ShouldBeStatus[node]))
        self.ShouldBeStatus[node] = "down"
        return 0

    if self.ShouldBeStatus[node] == "down":
        self.log(
            "Node status for %s is %s but we think it should be %s: %s"
            % (node, "up", self.ShouldBeStatus[node], out))

    self.ShouldBeStatus[node] = "up"

    # check the output first - because syslog-ng loses messages
    if 'S_NOT_DC' in out or 'S_IDLE' in out:
        # Up and stable
        return 2

    # fall back to syslog-ng and wait
    if idle_watch.look():
        # Up and stable
        return 2

    # just up
    self.debug("Warn: Node %s is unstable: %s" % (node, out))
    return 1
# Is the node up or is the node down
def StataCM(self, node):
    '''Return 1 when the cluster manager on node is up (stable or not), else None'''
    state = self.test_node_CM(node)
    return 1 if state > 0 else None
# Being up and being stable is not the same question...
def node_stable(self, node):
    '''Return 1 when node is up and stable; otherwise log a warning and return None'''
    if self.test_node_CM(node) != 2:
        self.log("Warn: Node %s not stable" % (node))
        return None
    return 1
def partition_stable(self, nodes, timeout=None):
    '''Wait for the partition described by nodes to report an idle state.

    nodes is a whitespace-separated string of node names; timeout
    defaults to the "DeadTime" setting.  Returns 1 as soon as a watched
    log line mentions one of the nodes, or straight away when the nodes
    string is shorter than three characters.  Returns None when the
    watcher runs out of matches first.
    '''
    watchpats = [
        "Current ping state: S_IDLE",
        self.templates["Pat:DC_IDLE"],
    ]

    self.debug("Waiting for cluster stability...")

    if timeout is None:
        timeout = self.Env["DeadTime"]

    if len(nodes) < 3:
        self.debug("Cluster is inactive")
        return 1

    members = nodes.split()
    idle_watch = LogWatcher(self.Env["LogFileName"], watchpats, "ClusterStable", timeout, hosts=members, kind=self.Env["LogWatcher"])
    idle_watch.setwatch()

    # have each node dump its current state
    for member in members:
        self.rsh(member, self.templates["StatusCmd"] % member, 1)

    match = idle_watch.look()
    while match:
        self.debug(match)
        if any(re.search(member, match) for member in members):
            return 1
        match = idle_watch.look()

    self.debug("Warn: Partition %s not IDLE after %ds" % (repr(nodes), timeout))
    return None
def cluster_stable(self, timeout=None, double_check=False):
    '''Return 1 when every known partition is stable, otherwise None.

    With double_check set, the partitions found at the start are checked
    a second time after a five second pause.
    '''
    partitions = self.find_partitions()

    def all_stable():
        # Stops at the first partition that is not stable
        return all(self.partition_stable(part, timeout) for part in partitions)

    if not all_stable():
        return None

    if double_check:
        # Make sure we are really stable and that all resources,
        # including those that depend on transient node attributes,
        # are started if they were going to be
        time.sleep(5)
        if not all_stable():
            return None

    return 1
def is_node_dc(self, node, status_line=None):
    '''Return 1 when the node's status line shows a state treated here as DC, else 0.

    When no status_line is supplied, the status command is run on node
    to obtain one.  An empty result yields 0.
    '''
    if not status_line:
        status_line = self.rsh(node, self.templates["StatusCmd"]%node, 1)

    if not status_line:
        return 0

    dc_states = (
        'S_IDLE',
        'S_INTEGRATION',
        'S_FINALIZE_JOIN',
        'S_POLICY_ENGINE',
        'S_TRANSITION_ENGINE',
    )
    if any(state in status_line for state in dc_states):
        return 1
    return 0
def active_resources(self, node):
    '''Return the ids of primitive resources that crm_resource reports on node.

    Only output lines starting with "Resource" are parsed; a resource is
    kept when its type is "primitive" and its host equals node.
    '''
    (rc, output) = self.rsh(node, """crm_resource -c""", None)

    found = []
    for entry in output:
        if not re.search("^Resource", entry):
            continue
        rsc = AuditResource(self, entry)
        if rsc.type == "primitive" and rsc.host == node:
            found.append(rsc.id)
    return found
def ResourceLocation(self, rid):
    '''Return the nodes, among those expected to be up, where the
    "RscRunning" command for resource rid exits with status 0.

    An exit status of 127 is logged together with the command output.
    '''
    running_on = []
    for candidate in self.Env["nodes"]:
        if self.ShouldBeStatus[candidate] != "up":
            continue

        cmd = self.templates["RscRunning"] % (rid)
        (rc, lines) = self.rsh(candidate, cmd, None)

        if rc == 0:
            running_on.append(candidate)
        elif rc == 127:
            self.log("Command '%s' failed. Binary or pacemaker-cts package not installed?" % cmd)
            for text in lines:
                self.log("Output: "+text)

    return running_on
def find_partitions(self):
    '''Collect the distinct partitions reported by the nodes expected to be up.

    Each partition is returned as a string of sorted, space-separated
    node names, in the order the partitions were first seen.
    '''
    ccm_partitions = []

    for member in self.Env["nodes"]:
        if self.ShouldBeStatus[member] != "up":
            self.debug("Node %s is down... skipping" % member)
            continue

        reported = self.rsh(member, self.templates["PartitionCmd"], 1)

        if not reported:
            self.log("no partition details for %s" % member)
            continue

        if len(reported) <= 2:
            self.log("bad partition details for %s" % member)
            continue

        # Normalise so the same membership always compares equal
        canonical = ' '.join(sorted(reported.split()))

        if canonical in ccm_partitions:
            self.debug("Partition '%s' from %s is consistent with existing entries" % (canonical, member))
        else:
            self.debug("Adding partition from %s: %s" % (member, canonical))
            ccm_partitions.append(canonical)

    self.debug("Found partitions: %s" % repr(ccm_partitions) )
    return ccm_partitions
def HasQuorum(self, node_list):
    '''Return 1 or 0 according to the first usable quorum answer.

    Nodes expected to be up are asked in order; the first answer
    containing "1" gives 1 and the first containing "0" gives 0.  Any
    other answer is noted in the debug log and the next node is tried.
    Returns 0 when no node gives a usable answer.
    '''
    # If we are auditing a partition, then one side will
    # have quorum and the other not.
    # So the caller needs to tell us which we are checking
    # If no value for node_list is specified... assume all nodes
    candidates = node_list if node_list else self.Env["nodes"]

    for member in candidates:
        if self.ShouldBeStatus[member] != "up":
            continue

        answer = self.rsh(member, self.templates["QuorumCmd"], 1)
        if "1" in answer:
            return 1
        if "0" in answer:
            return 0
        self.debug("WARN: Unexpected quorum test result from " + member + ":" + answer)

    return 0
def Components(self):
complist = []
common_ignore = [
"Pending action:",
"(ERROR|error): crm_log_message_adv:",
"(ERROR|error): MSG: No message to dump",
"pending LRM operations at shutdown",
"Lost connection to the CIB service",
"Connection to the CIB terminated...",
"Sending message to CIB service FAILED",
"Action A_RECOVER .* not supported",
"(ERROR|error): stonithd_op_result_ready: not signed on",
"pingd.*(ERROR|error): send_update: Could not send update",
"send_ipc_message: IPC Channel to .* is not connected",
"unconfirmed_actions: Waiting on .* unconfirmed actions",
"cib_native_msgready: Message pending on command channel",
r": Performing A_EXIT_1 - forcefully exiting the CRMd",
r"Resource .* was active at shutdown. You may ignore this error if it is unmanaged.",
]
stonith_ignore = [
r"Updating failcount for child_DoFencing",
r"(ERROR|error).*: Sign-in failed: triggered a retry",
"lrmd.*(ERROR|error): stonithd_receive_ops_result failed.",
]
stonith_ignore.extend(common_ignore)
- ccm_ignore = [
- "(ERROR|error): get_channel_token: No reply message - disconnected"
- ]
-
- ccm_ignore.extend(common_ignore)
-
ccm = Process(self, "ccm", triggersreboot=self.fastfail, pats = [
"State transition .* S_RECOVERY",
- "CCM connection appears to have failed",
"crmd.*Action A_RECOVER .* not supported",
r"crmd.*: Input I_TERMINATE .*from do_recover",
r"crmd.*: Could not recover from internal error",
- "crmd.*I_ERROR.*(ccm_dispatch|crmd_cib_connection_destroy)",
+ "crmd.*I_ERROR.*crmd_cib_connection_destroy",
# these status numbers are likely wrong now
r"crmd.*exited with status 2",
r"attrd.*exited with status 1",
r"cib.*exited with status 2",
# Not if it was fenced
# "A new node joined the cluster",
# "WARN: determine_online_status: Node .* is unclean",
# "Scheduling Node .* for STONITH",
# "Executing .* fencing operation",
# "tengine_stonith_callback: .*result=0",
# "Processing I_NODE_JOIN:.* cause=C_HA_MESSAGE",
# "State transition S_.* -> S_INTEGRATION.*input=I_NODE_JOIN",
"State transition S_STARTING -> S_PENDING",
- ], badnews_ignore = ccm_ignore)
+ ], badnews_ignore = common_ignore)
cib = Process(self, "cib", triggersreboot=self.fastfail, pats = [
"State transition .* S_RECOVERY",
"Lost connection to the CIB service",
"Connection to the CIB terminated...",
r"crmd.*: Input I_TERMINATE .*from do_recover",
"crmd.*I_ERROR.*crmd_cib_connection_destroy",
r"crmd.*: Could not recover from internal error",
# these status numbers are likely wrong now
r"crmd.*exited with status 2",
r"attrd.*exited with status 1",
], badnews_ignore = common_ignore)
lrmd = Process(self, "lrmd", triggersreboot=self.fastfail, pats = [
"State transition .* S_RECOVERY",
"LRM Connection failed",
"crmd.*I_ERROR.*lrm_connection_destroy",
"State transition S_STARTING -> S_PENDING",
r"crmd.*: Input I_TERMINATE .*from do_recover",
r"crmd.*: Could not recover from internal error",
# this status number is likely wrong now
r"crmd.*exited with status 2",
], badnews_ignore = common_ignore)
crmd = Process(self, "crmd", triggersreboot=self.fastfail, pats = [
# "WARN: determine_online_status: Node .* is unclean",
# "Scheduling Node .* for STONITH",
# "Executing .* fencing operation",
# "tengine_stonith_callback: .*result=0",
"State transition .* S_IDLE",
"State transition S_STARTING -> S_PENDING",
], badnews_ignore = common_ignore)
pengine = Process(self, "pengine", triggersreboot=self.fastfail, pats = [
"State transition .* S_RECOVERY",
r"crmd.*: Input I_TERMINATE .*from do_recover",
r"crmd.*: Could not recover from internal error",
r"crmd.*CRIT.*: Connection to the Policy Engine failed",
"crmd.*I_ERROR.*save_cib_contents",
# this status number is likely wrong now
r"crmd.*exited with status 2",
], badnews_ignore = common_ignore, dc_only=1)
if self.Env["DoFencing"] == 1 :
complist.append(Process(self, "stoniths", triggersreboot=self.fastfail, dc_pats = [
r"crmd.*CRIT.*: Fencing daemon connection failed",
"Attempting connection to fencing daemon",
], badnews_ignore = stonith_ignore))
if self.fastfail == 0:
ccm.pats.extend([
# these status numbers are likely wrong now
r"attrd.*exited with status 1",
- "(ERROR|error): Respawning client .*attrd",
r"cib.*exited with status 2",
- "(ERROR|error): Respawning client .*cib",
r"crmd.*exited with status 2",
- "(ERROR|error): Respawning client .*crmd"
])
cib.pats.extend([
# these status numbers are likely wrong now
r"attrd.*exited with status 1",
- "(ERROR|error): Respawning client .*attrd",
r"crmd.*exited with status 2",
- "(ERROR|error): Respawning client .*crmd"
])
lrmd.pats.extend([
# these status numbers are likely wrong now
r"crmd.*exited with status 2",
- "(ERROR|error): Respawning client .*crmd"
- ])
- pengine.pats.extend([
- "(ERROR|error): Respawning client .*crmd"
])
complist.append(ccm)
complist.append(cib)
complist.append(lrmd)
complist.append(crmd)
complist.append(pengine)
return complist
def StandbyStatus(self, node):
    '''Return the standby value queried from node, or "off" when the query gives no output.

    The last character of the output is dropped before it is logged and returned.
    '''
    query = self.templates["StandbyQueryCmd"] % node
    raw = self.rsh(node, query, 1)
    if not raw:
        return "off"

    status = raw[:-1]
    self.debug("Standby result: "+status)
    return status
# status == "on" : Enter Standby mode
# status == "off": Enter Active mode
def SetStandbyMode(self, node, status):
    '''Run the standby command for node with the given status ("on" or "off").

    The current standby status is queried first but its value is not
    used; neither is the command's result.  Always returns True.
    '''
    self.StandbyStatus(node)
    self.rsh(node, self.templates["StandbyCmd"] % (node, status))
    return True
def AddDummyRsc(self, node, rid):
    '''Add a Dummy resource named rid and pin it to node.

    Two "CibAddXml" commands are run on node: one adding an
    ocf:pacemaker:Dummy primitive with a 10s monitor operation, and one
    adding a location constraint with score INFINITY for that node.
    '''
    # The <op/> element must be closed with "/>"; without the ">" the
    # resource XML is not well-formed.
    rsc_xml = """ '<resources>
<primitive class=\"ocf\" id=\"%s\" provider=\"pacemaker\" type=\"Dummy\">
<operations>
<op id=\"%s-interval-10s\" interval=\"10s\" name=\"monitor\"/>
</operations>
</primitive>
</resources>'""" % (rid, rid)
    constraint_xml = """ '<constraints>
<rsc_location id=\"location-%s-%s\" node=\"%s\" rsc=\"%s\" score=\"INFINITY\"/>
</constraints>'
""" % (rid, node, node, rid)

    self.rsh(node, self.templates['CibAddXml'] % (rsc_xml))
    self.rsh(node, self.templates['CibAddXml'] % (constraint_xml))
def RemoveDummyRsc(self, node, rid):
    '''Delete the location constraint for rid, then the primitive itself.

    Both deletions are "CibDelXpath" commands run on node.
    '''
    xpaths = (
        "\"//rsc_location[@rsc='%s']\"" % (rid),
        "\"//primitive[@id='%s']\"" % (rid),
    )
    # Constraint first, resource second
    for xpath in xpaths:
        self.rsh(node, self.templates['CibDelXpath'] % xpath)
#######################################################################
#
# A little test code...
#
# Which you are advised to completely ignore...
#
#######################################################################
if __name__ == '__main__':
pass
diff --git a/cts/patterns.py b/cts/patterns.py
index cbaeb54c29..88797b7d84 100644
--- a/cts/patterns.py
+++ b/cts/patterns.py
@@ -1,403 +1,401 @@
from __future__ import print_function
import sys, os
from cts.CTSvars import *
patternvariants = {}
class BasePatterns(object):
    '''Base set of command templates and log search patterns.

    Every instance registers itself in the module-level patternvariants
    dictionary under its name.  Lookups by key go through __getitem__,
    which consults the command templates first and the search patterns
    second.
    '''

    def __init__(self, name):
        '''Create the pattern set called name and register it.'''
        self.name = name
        patternvariants[name] = self
        self.ignore = [
            "avoid confusing Valgrind",
        ]
        self.BadNews = []
        self.components = {}
        self.commands = {
            "StatusCmd"      : "crmadmin -t 60000 -S %s 2>/dev/null",
            "CibQuery"       : "cibadmin -Ql",
            "CibAddXml"      : "cibadmin --modify -c --xml-text %s",
            "CibDelXpath"    : "cibadmin --delete --xpath %s",
            # 300,000 == 5 minutes
            "RscRunning"     : CTSvars.CRM_DAEMON_DIR + "/lrmd_test -R -r %s",
            "CIBfile"        : "%s:"+CTSvars.CRM_CONFIG_DIR+"/cib.xml",
            "TmpDir"         : "/tmp",

            "BreakCommCmd"   : "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1",
            "FixCommCmd"     : "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1",

            # tc qdisc add dev lo root handle 1: cbq avpkt 1000 bandwidth 1000mbit
            # tc class add dev lo parent 1: classid 1:1 cbq rate "$RATE"kbps allot 17000 prio 5 bounded isolated
            # tc filter add dev lo parent 1: protocol ip prio 16 u32 match ip dst 127.0.0.1 match ip sport $PORT 0xFFFF flowid 1:1
            # tc qdisc add dev lo parent 1: netem delay "$LATENCY"msec "$(($LATENCY/4))"msec 10% 2> /dev/null > /dev/null
            "ReduceCommCmd"  : "",
            "RestoreCommCmd" : "tc qdisc del dev lo root",

            "SetCheckInterval"    : "cibadmin --modify -c --xml-text '<cluster_property_set id=\"cib-bootstrap-options\"><nvpair id=\"cts-recheck-interval-setting\" name=\"cluster-recheck-interval\" value=\"%s\"/></cluster_property_set>'",
            "ClearCheckInterval"  : "cibadmin --delete --xpath \"//nvpair[@name='cluster-recheck-interval']\"",

            "MaintenanceModeOn"   : "cibadmin --modify -c --xml-text '<cluster_property_set id=\"cib-bootstrap-options\"><nvpair id=\"cts-maintenance-mode-setting\" name=\"maintenance-mode\" value=\"true\"/></cluster_property_set>'",
            "MaintenanceModeOff"  : "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"",

            "StandbyCmd"      : "crm_attribute -Vq  -U %s -n standby -l forever -v %s 2>/dev/null".replace("-Vq  ", "-Vq "),
            "StandbyQueryCmd" : "crm_attribute -qG -U %s -n standby -l forever -d off 2>/dev/null",
        }
        # The patterns containing \W are raw strings: the values are the
        # same as before, but "\W" is not a valid escape sequence in an
        # ordinary string literal.
        self.search = {
            "Pat:DC_IDLE"       : "crmd.*State transition.*-> S_IDLE",

            # This won't work if we have multiple partitions
            "Pat:Local_started" : r"%s\W.*The local CRM is operational",
            "Pat:Slave_started" : r"%s\W.*State transition.*-> S_NOT_DC",
            "Pat:Master_started": r"%s\W.*State transition.*-> S_IDLE",
            "Pat:We_stopped"    : r"%s\W.*OVERRIDE THIS PATTERN",
            "Pat:They_stopped"  : r"%s\W.*LOST:.* %s ",
            "Pat:They_dead"     : "node %s.*: is dead",
            "Pat:TransitionComplete" : "Transition status: Complete: complete",

            "Pat:Fencing_start"   : "(Initiating remote operation|Requesting peer fencing ).* (for|of) %s",
            "Pat:Fencing_ok"      : r"stonith.*:\s*Operation .* of %s by .* for .*@.*: OK",
            "Pat:Fencing_recover" : r"pengine.*: Recover %s",

            "Pat:RscOpOK"         : r"crmd.*:\s+Result of %s operation for %s.*: (0 \()?ok",
            "Pat:RscRemoteOpOK"   : r"crmd.*:\s+Result of %s operation for %s on %s: (0 \()?ok",
            "Pat:NodeFenced"      : r"crmd.*:\s* Peer %s was terminated \(.*\) by .* on behalf of .*: OK",
            "Pat:FenceOpOK"       : "Operation .* for host '%s' with device .* returned: 0",
        }

    def get_component(self, key):
        '''Return the pattern list for component key, or [] (with a printed note) if unknown.'''
        if key in self.components:
            return self.components[key]

        print("Unknown component '%s' for %s" % (key, self.name))
        return []

    def get_patterns(self, key):
        '''Return the collection selected by key.

        Recognised keys are "BadNews", "BadNewsIgnore", "Commands",
        "Search" and "Components"; any other key yields None.
        '''
        if key == "BadNews":
            return self.BadNews
        elif key == "BadNewsIgnore":
            return self.ignore
        elif key == "Commands":
            return self.commands
        elif key == "Search":
            return self.search
        elif key == "Components":
            return self.components
        return None

    def __getitem__(self, key):
        '''Look key up as "Name", then as a command, then as a search pattern.

        Unknown keys print a note and yield None.
        '''
        if key == "Name":
            return self.name
        elif key in self.commands:
            return self.commands[key]
        elif key in self.search:
            return self.search[key]
        else:
            print("Unknown template '%s' for %s" % (key, self.name))
            return None
class crm_corosync(BasePatterns):
'''
Patterns for Corosync version 2 cluster manager class
'''
def __init__(self, name):
BasePatterns.__init__(self, name)
self.commands.update({
"StartCmd" : "service corosync start && service pacemaker start",
"StopCmd" : "service pacemaker stop; [ ! -e /usr/sbin/pacemaker_remoted ] || service pacemaker_remote stop; service corosync stop",
"EpochCmd" : "crm_node -e",
"QuorumCmd" : "crm_node -q",
"PartitionCmd" : "crm_node -p",
})
self.search.update({
# Close enough... "Corosync Cluster Engine exiting normally" isn't printed
# reliably and there's little interest in doing anything about it
"Pat:We_stopped" : "%s\W.*Unloading all Corosync service engines",
"Pat:They_stopped" : "%s\W.*crmd.*Node %s(\[|\s).*state is now lost",
"Pat:They_dead" : "crmd.*Node %s(\[|\s).*state is now lost",
"Pat:ChildExit" : r"\[[0-9]+\] exited with status [0-9]+ \(",
"Pat:ChildKilled" : r"%s\W.*pacemakerd.*%s\[[0-9]+\] terminated with signal 9",
"Pat:ChildRespawn" : "%s\W.*pacemakerd.*Respawning failed child process: %s",
"Pat:InfraUp" : "%s\W.*corosync.*Initializing transport",
"Pat:PacemakerUp" : "%s\W.*pacemakerd.*Starting Pacemaker",
})
self.ignore = self.ignore + [
r"crm_mon:",
r"crmadmin:",
r"update_trace_data",
r"async_notify:.*strange, client not found",
r"Parse error: Ignoring unknown option .*nodename",
r"error.*: Operation 'reboot' .* with device 'FencingFail' returned:",
r"Child process .* terminated with signal 9",
r"getinfo response error: 1$",
"sbd.* error: inquisitor_child: DEBUG MODE IS ACTIVE",
r"sbd.* pcmk:\s*error:.*Connection to cib_ro failed",
r"sbd.* pcmk:\s*error:.*Connection to cib_ro.* closed .I/O condition=17",
]
self.BadNews = [
r"error:",
r"crit:",
r"ERROR:",
r"CRIT:",
r"Shutting down...NOW",
r"Timer I_TERMINATE just popped",
r"input=I_ERROR",
r"input=I_FAIL",
r"input=I_INTEGRATED cause=C_TIMER_POPPED",
r"input=I_FINALIZED cause=C_TIMER_POPPED",
r"input=I_ERROR",
r"(pacemakerd|lrmd|crmd):.*, exiting",
- r"(WARN|warn).*Ignoring HA message.*vote.*not in our membership list",
r"pengine.*Attempting recovery of resource",
r"is taking more than 2x its timeout",
r"Confirm not received from",
r"Welcome reply not received from",
r"Attempting to schedule .* after a stop",
r"Resource .* was active at shutdown",
r"duplicate entries for call_id",
r"Search terminated:",
r":global_timer_callback",
r"Faking parameter digest creation",
r"Parameters to .* action changed:",
r"Parameters to .* changed",
r"\[[0-9]+\] terminated with signal [0-9]+ \(",
r"Child process .* terminated with signal",
r"pengine:.*Recover .*\(.* -\> .*\)",
r"rsyslogd.* imuxsock lost .* messages from pid .* due to rate-limiting",
r"Peer is not part of our cluster",
r"We appear to be in an election loop",
r"Unknown node -> we will not deliver message",
r"(Blackbox dump requested|Problem detected)",
r"pacemakerd.*Could not connect to Cluster Configuration Database API",
r"Receiving messages from a node we think is dead",
r"share the same cluster nodeid",
r"share the same name",
#r"crm_ipc_send:.*Request .* failed",
#r"crm_ipc_send:.*Sending to .* is disabled until pending reply is received",
# Not inherently bad, but worth tracking
#r"No need to invoke the TE",
#r"ping.*: DEBUG: Updated connected = 0",
#r"Digest mis-match:",
r"crmd:.*Transition failed: terminated",
r"Local CIB .* differs from .*:",
r"warn.*:\s*Continuing but .* will NOT be used",
r"warn.*:\s*Cluster configuration file .* is corrupt",
#r"Executing .* fencing operation",
r"Election storm",
r"stalled the FSA with pending inputs",
]
self.components["common-ignore"] = [
"Pending action:",
"error: crm_log_message_adv:",
r"resource( was|s were) active at shutdown",
"pending LRM operations at shutdown",
"Lost connection to the CIB service",
"Connection to the CIB terminated...",
"Sending message to CIB service FAILED",
"apply_xml_diff:.*Diff application failed!",
r"crmd.*:\s*Action A_RECOVER .* not supported",
"unconfirmed_actions:.*Waiting on .* unconfirmed actions",
"cib_native_msgready:.*Message pending on command channel",
r"crmd.*:\s*Performing A_EXIT_1 - forcefully exiting the CRMd",
"verify_stopped:.*Resource .* was active at shutdown. You may ignore this error if it is unmanaged.",
"error: attrd_connection_destroy:.*Lost connection to attrd",
r".*:\s*Executing .* fencing operation \(.*\) on ",
r".*:\s*Requesting fencing \([^)]+\) of node ",
r"(Blackbox dump requested|Problem detected)",
# "error: native_create_actions: Resource .*stonith::.* is active on 2 nodes attempting recovery",
# "error: process_pe_message: Transition .* ERRORs found during PE processing",
]
self.components["corosync-ignore"] = [
r"error:.*Connection to the CPG API failed: Library error",
r"\[[0-9]+\] exited with status [0-9]+ \(",
r"pacemakerd.*error:.*Child process .* exited",
r"cib.*error:.*Corosync connection lost",
r"stonith-ng.*error:.*Corosync connection terminated",
r"error:.*Child process cib .* exited: Invalid argument",
r"error:.*Child process attrd .* exited: Transport endpoint is not connected",
r"error:.*Child process crmd .* exited: Link has been severed",
r"lrmd.*error:.*Connection to stonith-ng.* (failed|closed)",
r"lrmd.*error:.*LRMD lost STONITH connection",
r"crmd.*State transition .* S_RECOVERY",
r"crmd.*error:.*Input (I_ERROR|I_TERMINATE ) .*received in state",
r"crmd.*error:.*Could not recover from internal error",
r"error:.*Connection to cib_(shm|rw).* (failed|closed)",
r"error:.*STONITH connection failed",
r"error: Connection to stonith-ng.* (failed|closed)",
r"crit: Fencing daemon connection failed",
]
self.components["corosync"] = [
r"pacemakerd.*error:.*Connection destroyed",
r"attrd.*:\s*(crit|error):.*Lost connection to (Corosync|CIB) service",
r"stonith.*:\s*(Corosync connection terminated|Shutting down)",
r"cib.*:\s*Corosync connection lost!\s+Exiting.",
r"crmd.*:\s*(connection terminated|Disconnected from Corosync)",
r"pengine.*Scheduling Node .* for STONITH",
r"crmd.*:\s*Peer .* was terminated \(.*\) by .* for .*:\s*OK",
]
self.components["cib-ignore"] = [
"lrmd.*Connection to stonith-ng failed",
"lrmd.*Connection to stonith-ng.* closed",
"lrmd.*LRMD lost STONITH connection",
"lrmd.*STONITH connection failed, finalizing .* pending operations",
]
self.components["cib"] = [
"State transition .* S_RECOVERY",
- "Respawning .* crmd",
- "Respawning .* attrd",
+ r"Respawning failed child process: (attrd|crmd)",
"Connection to cib_.* failed",
"Connection to cib_.* closed",
r"crmd.*:.*Connection to the CIB terminated...",
r"attrd.*:.*(Lost connection to CIB service|Connection to the CIB terminated)",
r"crmd\[[0-9]+\] exited with status 1 \(",
r"attrd\[[0-9]+\] exited with status 102 \(",
r"crmd.*: Input I_TERMINATE .*from do_recover",
"crmd.*I_ERROR.*crmd_cib_connection_destroy",
"crmd.*Could not recover from internal error",
]
self.components["lrmd"] = [
"State transition .* S_RECOVERY",
"LRM Connection failed",
- "Respawning .* crmd",
+ r"Respawning failed child process: crmd",
"Connection to lrmd failed",
"Connection to lrmd.* closed",
"crmd.*I_ERROR.*lrm_connection_destroy",
r"crmd\[[0-9]+\] exited with status 1 \(",
r"crmd.*: Input I_TERMINATE .*from do_recover",
"crmd.*Could not recover from internal error",
]
self.components["lrmd-ignore"] = []
self.components["crmd"] = [
# "WARN: determine_online_status: Node .* is unclean",
# "Scheduling Node .* for STONITH",
# "Executing .* fencing operation",
# Only if the node wasn't the DC: "State transition S_IDLE",
"State transition .* -> S_IDLE",
]
self.components["crmd-ignore"] = []
self.components["attrd"] = []
self.components["attrd-ignore"] = []
self.components["pengine"] = [
"State transition .* S_RECOVERY",
- "Respawning .* crmd",
+ r"Respawning failed child process: crmd",
r"crmd\[[0-9]+\] exited with status 1 \(",
"Connection to pengine failed",
"Connection to pengine.* closed",
"Connection to the Policy Engine failed",
"crmd.*I_ERROR.*save_cib_contents",
r"crmd.*: Input I_TERMINATE .*from do_recover",
"crmd.*Could not recover from internal error",
]
self.components["pengine-ignore"] = []
self.components["stonith"] = [
"Connection to stonith-ng failed",
"LRMD lost STONITH connection",
"Connection to stonith-ng.* closed",
"Fencing daemon connection failed",
r"crmd.*:\s*warn.*:\s*Callback already present",
]
self.components["stonith-ignore"] = [
r"pengine.*: Recover Fencing",
r"Updating failcount for Fencing",
r"error:.*Connection to stonith-ng failed",
r"error:.*Connection to stonith-ng.*closed \(I/O condition=17\)",
r"crit:.*Fencing daemon connection failed",
r"error:.*Sign-in failed: triggered a retry",
"STONITH connection failed, finalizing .* pending operations.",
r"crmd.*:\s+Result of .* operation for Fencing.*Error",
]
self.components["stonith-ignore"].extend(self.components["common-ignore"])
class crm_corosync_docker(crm_corosync):
    '''
    The crm_corosync patterns with the start and stop commands
    replaced by pcmk_start and pcmk_stop
    '''
    def __init__(self, name):
        super(crm_corosync_docker, self).__init__(name)

        # Override only the two commands that differ
        self.commands["StartCmd"] = "pcmk_start"
        self.commands["StopCmd"] = "pcmk_stop"
class PatternSelector(object):
    '''Front end that resolves pattern and template lookups by variant name.

    Constructing a selector always creates the "crm-base" fallback set
    and, for a recognised name (or no name at all), registers the
    matching variant.
    '''

    def __init__(self, name=None):
        self.name = name
        self.base = BasePatterns("crm-base")

        # No name means the plain corosync variant
        wanted = name if name else "crm-corosync"
        if wanted == "crm-corosync":
            crm_corosync(wanted)
        elif wanted == "crm-corosync-docker":
            crm_corosync_docker(wanted)

    def get_variant(self, variant):
        '''Return the registered pattern set for variant, else the crm-base set.'''
        if variant not in patternvariants:
            print("defaulting to crm-base for %s" % variant)
            return self.base
        return patternvariants[variant]

    def get_patterns(self, variant, kind):
        '''Return the kind collection of the given variant.'''
        chosen = self.get_variant(variant)
        return chosen.get_patterns(kind)

    def get_template(self, variant, key):
        '''Return the template or pattern stored under key for variant.'''
        return self.get_variant(variant)[key]

    def get_component(self, variant, kind):
        '''Return the component pattern list kind of the given variant.'''
        chosen = self.get_variant(variant)
        return chosen.get_component(kind)

    def __getitem__(self, key):
        '''Look key up in the variant this selector was created for.'''
        return self.get_template(self.name, key)
# python cts/patterns.py -k crm-corosync -t StartCmd
if __name__ == '__main__':
    # Print one template of one pattern variant, chosen with
    # -k/--kind and -t/--template.

    pdir=os.path.dirname(sys.path[0])
    sys.path.insert(0, pdir) # So that things work from the source directory

    kind=None
    template=None

    # Use one iterator for options and their values, so that an option
    # given without a value is reported instead of raising IndexError.
    remaining = iter(sys.argv[1:])
    for arg in remaining:
        if arg in ("-k", "--kind", "-t", "--template"):
            value = next(remaining, None)
            if value is None:
                print("Missing value for " + arg)
                sys.exit(1)
            if arg in ("-k", "--kind"):
                kind = value
            else:
                template = value
        else:
            print("Illegal argument " + arg)

    print(PatternSelector(kind)[template])

File Metadata

Mime Type
text/x-diff
Expires
Sun, Jul 20, 7:29 PM (3 h, 17 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2081322
Default Alt Text
(38 KB)

Event Timeline