No OneTemporary
Actions

Size

38 KB

Referenced Files

None

Subscribers

None

View Options

	diff --git a/cts/CM_common.py b/cts/CM_common.py
	index da6b68a96d..506a2de054 100755
	--- a/cts/CM_common.py
	+++ b/cts/CM_common.py
	@@ -1,502 +1,486 @@
	'''CTS: Cluster Testing System: Cluster Manager Common Class

	This was originally the cluster manager class for the Heartbeat stack.
	It is retained for use as a base class by other cluster manager classes.
	It could be merged into the ClusterManager class directly, but this is
	easier.
	'''

	__copyright__ = '''
	Author: Huang Zhen <zhenhltc@cn.ibm.com>
	Copyright (C) 2004 International Business Machines

	Additional Audits, Revised Start action, Default Configuration:
	Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>

	'''

	#
	# This program is free software; you can redistribute it and/or
	# modify it under the terms of the GNU General Public License
	# as published by the Free Software Foundation; either version 2
	# of the License, or (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program; if not, write to the Free Software
	# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.

	import sys
	from cts.CTSvars import *
	from cts.CTS import *
	from cts.CIB import *
	from cts.CTStests import AuditResource
	from cts.watcher import LogWatcher

	class crm_common(ClusterManager):
	def __init__(self, Environment, randseed=None, name=None):
	'''Initialise crm_common state on top of ClusterManager.

	Environment and randseed are passed through to ClusterManager.__init__.
	The name argument is accepted but not referenced in this method.
	'''
	ClusterManager.__init__(self, Environment, randseed=randseed)

	self.fastfail = 0
	# Set to 1 by install_config() once a CIB has been installed
	self.cib_installed = 0
	self.config = None
	self.cluster_monitor = 0
	self.use_short_names = 1

	# With DoBSC set, the "Pat:They_stopped" template is removed
	if self.Env["DoBSC"]:
	del self.templates["Pat:They_stopped"]

	self._finalConditions()

	self.check_transitions = 0
	self.check_elections = 0
	# Nodes for which install_config() has already removed the on-disk cib* files
	self.CIBsync = {}
	self.CibFactory = ConfigFactory(self)
	# Configuration object created for the schema named in Env["Schema"]
	self.cib = self.CibFactory.createConfig(self.Env["Schema"])

	def errorstoignore(self):
	# At some point implement a more elegant solution that
	# also produces a report at the end
	'''Return the list of known, very noisy error patterns that should be ignored.

	The list is the "BadNewsIgnore" pattern set that a fresh
	PatternSelector returns for self.name.
	'''
	return PatternSelector().get_patterns(self.name, "BadNewsIgnore")

	def install_config(self, node):
	if not self.ns.WaitForNodeToComeUp(node):
	self.log("Node %s is not up." % node)
	return None

	if not node in self.CIBsync and self.Env["ClobberCIB"] == 1:
	self.CIBsync[node] = 1
	self.rsh(node, "rm -f "+CTSvars.CRM_CONFIG_DIR+"/cib*")

	# Only install the CIB on the first node, all the other ones will pick it up from there
	if self.cib_installed == 1:
	return None

	self.cib_installed = 1
	if self.Env["CIBfilename"] == None:
	self.log("Installing Generated CIB on node %s" % (node))
	self.cib.install(node)

	else:
	self.log("Installing CIB (%s) on node %s" % (self.Env["CIBfilename"], node))
	if 0 != self.rsh.cp(self.Env["CIBfilename"], "root@" + (self.templates["CIBfile"] % node)):
	raise ValueError("Can not scp file to %s %d"%(node))

	self.rsh(node, "chown "+CTSvars.CRM_DAEMON_USER+" "+CTSvars.CRM_CONFIG_DIR+"/cib.xml")

	def prepare(self):
	'''Finish the Initialization process. Prepare to test...'''

	self.partitions_expected = 1
	# For every configured node: clear the expected status, call
	# unisolate_node(), then query the node via StataCM(), which records
	# "up" or "down" in ShouldBeStatus through test_node_CM()
	for node in self.Env["nodes"]:
	self.ShouldBeStatus[node] = ""
	self.unisolate_node(node)
	self.StataCM(node)

	def test_node_CM(self, node):
	'''Report the status of the cluster manager on a given node'''

	watchpats = [ ]
	watchpats.append("Current ping state: (S_IDLE\|S_NOT_DC)")
	watchpats.append(self.templates["Pat:Slave_started"]%node)
	watchpats.append(self.templates["Pat:Master_started"]%node)
	idle_watch = LogWatcher(self.Env["LogFileName"], watchpats, "ClusterIdle", hosts=[node], kind=self.Env["LogWatcher"])
	idle_watch.setwatch()

	out = self.rsh(node, self.templates["StatusCmd"]%node, 1)
	self.debug("Node %s status: '%s'" %(node, out))

	if not out or str.find(out, 'ok') < 0:
	if self.ShouldBeStatus[node] == "up":
	self.log(
	"Node status for %s is %s but we think it should be %s"
	% (node, "down", self.ShouldBeStatus[node]))
	self.ShouldBeStatus[node] = "down"
	return 0

	if self.ShouldBeStatus[node] == "down":
	self.log(
	"Node status for %s is %s but we think it should be %s: %s"
	% (node, "up", self.ShouldBeStatus[node], out))

	self.ShouldBeStatus[node] = "up"

	# check the output first - because syslog-ng loses messages
	if str.find(out, 'S_NOT_DC') != -1:
	# Up and stable
	return 2
	if str.find(out, 'S_IDLE') != -1:
	# Up and stable
	return 2

	# fall back to syslog-ng and wait
	if not idle_watch.look():
	# just up
	self.debug("Warn: Node %s is unstable: %s" % (node, out))
	return 1

	# Up and stable
	return 2

	# Is the node up or is the node down
	def StataCM(self, node):
	'''Return 1 if test_node_CM() reports the node as up (result > 0), else None'''

	if self.test_node_CM(node) > 0:
	return 1
	return None

	# Being up and being stable is not the same question...
	def node_stable(self, node):
	'''Return 1 if test_node_CM() reports the node as up and stable (result 2)

	Otherwise log a warning and return None.
	'''

	if self.test_node_CM(node) == 2:
	return 1
	self.log("Warn: Node %s not stable" % (node))
	return None

	def partition_stable(self, nodes, timeout=None):
	watchpats = [ ]
	watchpats.append("Current ping state: S_IDLE")
	watchpats.append(self.templates["Pat:DC_IDLE"])
	self.debug("Waiting for cluster stability...")

	if timeout == None:
	timeout = self.Env["DeadTime"]

	if len(nodes) < 3:
	self.debug("Cluster is inactive")
	return 1

	idle_watch = LogWatcher(self.Env["LogFileName"], watchpats, "ClusterStable", timeout, hosts=nodes.split(), kind=self.Env["LogWatcher"])
	idle_watch.setwatch()

	for node in nodes.split():
	# have each node dump its current state
	self.rsh(node, self.templates["StatusCmd"] % node, 1)

	ret = idle_watch.look()
	while ret:
	self.debug(ret)
	for node in nodes.split():
	if re.search(node, ret):
	return 1
	ret = idle_watch.look()

	self.debug("Warn: Partition %s not IDLE after %ds" % (repr(nodes), timeout))
	return None

	def cluster_stable(self, timeout=None, double_check=False):
	'''Return 1 if every partition from find_partitions() is stable, else None.

	timeout is handed to partition_stable() for each partition.  With
	double_check set, the same partitions are checked a second time
	after a five second pause.
	'''
	partitions = self.find_partitions()

	for partition in partitions:
	if not self.partition_stable(partition, timeout):
	return None

	if double_check:
	# Make sure we are really stable and that all resources,
	# including those that depend on transient node attributes,
	# are started if they were going to be
	time.sleep(5)
	for partition in partitions:
	if not self.partition_stable(partition, timeout):
	return None

	return 1

	def is_node_dc(self, node, status_line=None):
	'''Return 1 if the node's status line names a state listed below, else 0.

	When status_line is not supplied it is obtained by running the
	StatusCmd template on the node.  The states checked for are S_IDLE,
	S_INTEGRATION, S_FINALIZE_JOIN, S_POLICY_ENGINE and
	S_TRANSITION_ENGINE.
	'''
	rc = 0

	if not status_line:
	status_line = self.rsh(node, self.templates["StatusCmd"]%node, 1)

	# An empty status (command produced nothing) counts as "not DC"
	if not status_line:
	rc = 0
	elif str.find(status_line, 'S_IDLE') != -1:
	rc = 1
	elif str.find(status_line, 'S_INTEGRATION') != -1:
	rc = 1
	elif str.find(status_line, 'S_FINALIZE_JOIN') != -1:
	rc = 1
	elif str.find(status_line, 'S_POLICY_ENGINE') != -1:
	rc = 1
	elif str.find(status_line, 'S_TRANSITION_ENGINE') != -1:
	rc = 1

	return rc

	def active_resources(self, node):
	'''Return the ids of primitive resources that crm_resource -c places on node.

	Each output line starting with "Resource" is parsed into an
	AuditResource; it is kept when its type is "primitive" and its host
	equals node.
	'''
	# [SM].* {node} matches Started, Slave, Master
	# Stopped wont be matched as it wont include {node}
	(rc, output) = self.rsh(node, """crm_resource -c""", None)

	resources = []
	for line in output:
	if re.search("^Resource", line):
	tmp = AuditResource(self, line)
	if tmp.type == "primitive" and tmp.host == node:
	resources.append(tmp.id)
	return resources

	def ResourceLocation(self, rid):
	'''Return the list of nodes on which the RscRunning command for rid exits 0.

	Only nodes whose expected status is "up" are queried.  Exit code 127
	is logged together with the command output and the node is not
	added to the result.
	'''
	ResourceNodes = []
	for node in self.Env["nodes"]:
	if self.ShouldBeStatus[node] == "up":

	cmd = self.templates["RscRunning"] % (rid)
	(rc, lines) = self.rsh(node, cmd, None)

	# 127 is reported as a missing binary or package
	if rc == 127:
	self.log("Command '%s' failed. Binary or pacemaker-cts package not installed?" % cmd)
	for line in lines:
	self.log("Output: "+line)
	elif rc == 0:
	ResourceNodes.append(node)

	return ResourceNodes

	def find_partitions(self):
	'''Return the distinct partitions reported by the nodes expected to be up.

	Each partition is a string: the output of the PartitionCmd template
	split on whitespace, sorted, and re-joined with single spaces, so
	that the same membership reported by different nodes compares equal.
	Nodes with empty output, or output of two characters or fewer, are
	logged and contribute nothing.
	'''
	ccm_partitions = []

	for node in self.Env["nodes"]:
	if self.ShouldBeStatus[node] == "up":
	partition = self.rsh(node, self.templates["PartitionCmd"], 1)

	if not partition:
	self.log("no partition details for %s" % node)
	elif len(partition) > 2:
	# Normalise the member order before comparing with known partitions
	nodes = partition.split()
	nodes.sort()
	partition = ' '.join(nodes)

	found = 0
	for a_partition in ccm_partitions:
	if partition == a_partition:
	found = 1
	if found == 0:
	self.debug("Adding partition from %s: %s" % (node, partition))
	ccm_partitions.append(partition)
	else:
	self.debug("Partition '%s' from %s is consistent with existing entries" % (partition, node))

	else:
	self.log("bad partition details for %s" % node)
	else:
	self.debug("Node %s is down... skipping" % node)

	self.debug("Found partitions: %s" % repr(ccm_partitions) )
	return ccm_partitions

	def HasQuorum(self, node_list):
	'''Return 1 if the checked nodes report quorum, otherwise 0.

	Nodes expected to be up are asked in order using the QuorumCmd
	template.  The first answer containing "1" yields 1 and the first
	containing "0" yields 0; any other answer is logged and the next
	node is tried.  Returns 0 when no node gives a recognisable answer.
	'''
	# If we are auditing a partition, then one side will
	# have quorum and the other not.
	# So the caller needs to tell us which we are checking
	# If no value for node_list is specified... assume all nodes
	if not node_list:
	node_list = self.Env["nodes"]

	for node in node_list:
	if self.ShouldBeStatus[node] == "up":
	# NOTE(review): the result goes straight into str.find(); presumably
	# rsh always returns a string here - confirm, since other callers
	# in this class guard against an empty result first
	quorum = self.rsh(node, self.templates["QuorumCmd"], 1)
	if str.find(quorum, "1") != -1:
	return 1
	elif str.find(quorum, "0") != -1:
	return 0
	else:
	self.debug("WARN: Unexpected quorum test result from " + node + ":" + quorum)

	return 0
	def Components(self):
	complist = []
	common_ignore = [
	"Pending action:",
	"(ERROR\|error): crm_log_message_adv:",
	"(ERROR\|error): MSG: No message to dump",
	"pending LRM operations at shutdown",
	"Lost connection to the CIB service",
	"Connection to the CIB terminated...",
	"Sending message to CIB service FAILED",
	"Action A_RECOVER .* not supported",
	"(ERROR\|error): stonithd_op_result_ready: not signed on",
	"pingd.*(ERROR\|error): send_update: Could not send update",
	"send_ipc_message: IPC Channel to .* is not connected",
	"unconfirmed_actions: Waiting on .* unconfirmed actions",
	"cib_native_msgready: Message pending on command channel",
	r": Performing A_EXIT_1 - forcefully exiting the CRMd",
	r"Resource .* was active at shutdown. You may ignore this error if it is unmanaged.",
	]

	stonith_ignore = [
	r"Updating failcount for child_DoFencing",
	r"(ERROR\|error).*: Sign-in failed: triggered a retry",
	"lrmd.*(ERROR\|error): stonithd_receive_ops_result failed.",
	]

	stonith_ignore.extend(common_ignore)

	- ccm_ignore = [
	- "(ERROR\|error): get_channel_token: No reply message - disconnected"
	- ]
	-
	- ccm_ignore.extend(common_ignore)
	-
	ccm = Process(self, "ccm", triggersreboot=self.fastfail, pats = [
	"State transition .* S_RECOVERY",
	- "CCM connection appears to have failed",
	"crmd.Action A_RECOVER . not supported",
	r"crmd.: Input I_TERMINATE .from do_recover",
	r"crmd.*: Could not recover from internal error",
	- "crmd.I_ERROR.(ccm_dispatch\|crmd_cib_connection_destroy)",
	+ "crmd.I_ERROR.crmd_cib_connection_destroy",
	# these status numbers are likely wrong now
	r"crmd.*exited with status 2",
	r"attrd.*exited with status 1",
	r"cib.*exited with status 2",

	# Not if it was fenced
	# "A new node joined the cluster",

	# "WARN: determine_online_status: Node .* is unclean",
	# "Scheduling Node .* for STONITH",
	# "Executing .* fencing operation",
	# "tengine_stonith_callback: .*result=0",
	# "Processing I_NODE_JOIN:.* cause=C_HA_MESSAGE",
	# "State transition S_.* -> S_INTEGRATION.*input=I_NODE_JOIN",
	"State transition S_STARTING -> S_PENDING",
	- ], badnews_ignore = ccm_ignore)
	+ ], badnews_ignore = common_ignore)

	cib = Process(self, "cib", triggersreboot=self.fastfail, pats = [
	"State transition .* S_RECOVERY",
	"Lost connection to the CIB service",
	"Connection to the CIB terminated...",
	r"crmd.: Input I_TERMINATE .from do_recover",
	"crmd.I_ERROR.crmd_cib_connection_destroy",
	r"crmd.*: Could not recover from internal error",
	# these status numbers are likely wrong now
	r"crmd.*exited with status 2",
	r"attrd.*exited with status 1",
	], badnews_ignore = common_ignore)

	lrmd = Process(self, "lrmd", triggersreboot=self.fastfail, pats = [
	"State transition .* S_RECOVERY",
	"LRM Connection failed",
	"crmd.I_ERROR.lrm_connection_destroy",
	"State transition S_STARTING -> S_PENDING",
	r"crmd.: Input I_TERMINATE .from do_recover",
	r"crmd.*: Could not recover from internal error",
	# this status number is likely wrong now
	r"crmd.*exited with status 2",
	], badnews_ignore = common_ignore)

	crmd = Process(self, "crmd", triggersreboot=self.fastfail, pats = [
	# "WARN: determine_online_status: Node .* is unclean",
	# "Scheduling Node .* for STONITH",
	# "Executing .* fencing operation",
	# "tengine_stonith_callback: .*result=0",
	"State transition .* S_IDLE",
	"State transition S_STARTING -> S_PENDING",
	], badnews_ignore = common_ignore)

	pengine = Process(self, "pengine", triggersreboot=self.fastfail, pats = [
	"State transition .* S_RECOVERY",
	r"crmd.: Input I_TERMINATE .from do_recover",
	r"crmd.*: Could not recover from internal error",
	r"crmd.CRIT.: Connection to the Policy Engine failed",
	"crmd.I_ERROR.save_cib_contents",
	# this status number is likely wrong now
	r"crmd.*exited with status 2",
	], badnews_ignore = common_ignore, dc_only=1)

	if self.Env["DoFencing"] == 1 :
	complist.append(Process(self, "stoniths", triggersreboot=self.fastfail, dc_pats = [
	r"crmd.CRIT.: Fencing daemon connection failed",
	"Attempting connection to fencing daemon",
	], badnews_ignore = stonith_ignore))

	if self.fastfail == 0:
	ccm.pats.extend([
	# these status numbers are likely wrong now
	r"attrd.*exited with status 1",
	- "(ERROR\|error): Respawning client .*attrd",
	r"cib.*exited with status 2",
	- "(ERROR\|error): Respawning client .*cib",
	r"crmd.*exited with status 2",
	- "(ERROR\|error): Respawning client .*crmd"
	])
	cib.pats.extend([
	# these status numbers are likely wrong now
	r"attrd.*exited with status 1",
	- "(ERROR\|error): Respawning client .*attrd",
	r"crmd.*exited with status 2",
	- "(ERROR\|error): Respawning client .*crmd"
	])
	lrmd.pats.extend([
	# these status numbers are likely wrong now
	r"crmd.*exited with status 2",
	- "(ERROR\|error): Respawning client .*crmd"
	- ])
	- pengine.pats.extend([
	- "(ERROR\|error): Respawning client .*crmd"
	])

	complist.append(ccm)
	complist.append(cib)
	complist.append(lrmd)
	complist.append(crmd)
	complist.append(pengine)

	return complist

	def StandbyStatus(self, node):
	'''Return the output of the StandbyQueryCmd template for node.

	An empty result is reported as "off".  Otherwise the last character
	of the output is dropped (presumably the trailing newline) before it
	is logged and returned.
	'''
	out=self.rsh(node, self.templates["StandbyQueryCmd"] % node, 1)
	if not out:
	return "off"
	out = out[:-1]
	self.debug("Standby result: "+out)
	return out

	# status == "on" : Enter Standby mode
	# status == "off": Enter Active mode
	def SetStandbyMode(self, node, status):
	'''Run the StandbyCmd template on node with the requested status.

	Always returns True: neither the status queried beforehand nor the
	result of the command is examined.
	'''
	# Queried but not used afterwards; the call still runs the query on the node
	current_status = self.StandbyStatus(node)
	cmd = self.templates["StandbyCmd"] % (node, status)
	ret = self.rsh(node, cmd)
	return True

	def AddDummyRsc(self, node, rid):
	rsc_xml = """ '<resources>
	<primitive class=\"ocf\" id=\"%s\" provider=\"pacemaker\" type=\"Dummy\">
	<operations>
	<op id=\"%s-interval-10s\" interval=\"10s\" name=\"monitor\"/
	</operations>
	</primitive>
	</resources>'""" % (rid, rid)
	constraint_xml = """ '<constraints>
	<rsc_location id=\"location-%s-%s\" node=\"%s\" rsc=\"%s\" score=\"INFINITY\"/>
	</constraints>'
	""" % (rid, node, node, rid)

	self.rsh(node, self.templates['CibAddXml'] % (rsc_xml))
	self.rsh(node, self.templates['CibAddXml'] % (constraint_xml))

	def RemoveDummyRsc(self, node, rid):
	'''Delete the location constraint and the primitive for rid via node.

	Both are removed with the CibDelXpath template: first every
	rsc_location whose rsc attribute is rid, then the primitive whose id
	is rid.
	'''
	constraint = "\"//rsc_location[@rsc='%s']\"" % (rid)
	rsc = "\"//primitive[@id='%s']\"" % (rid)

	self.rsh(node, self.templates['CibDelXpath'] % constraint)
	self.rsh(node, self.templates['CibDelXpath'] % rsc)


	#######################################################################
	#
	# A little test code...
	#
	# Which you are advised to completely ignore...
	#
	#######################################################################
	if __name__ == '__main__':
	pass
	diff --git a/cts/patterns.py b/cts/patterns.py
	index cbaeb54c29..88797b7d84 100644
	--- a/cts/patterns.py
	+++ b/cts/patterns.py
	@@ -1,403 +1,401 @@
	from __future__ import print_function
	import sys, os

	from cts.CTSvars import *

	patternvariants = {}
	class BasePatterns(object):
	'''Base set of command templates and log search patterns.

	Every instance registers itself in the module-level patternvariants
	dictionary under its name.
	'''
	def __init__(self, name):
	'''Register this variant under name and set up the default tables'''
	self.name = name
	patternvariants[name] = self
	# Returned by get_patterns("BadNewsIgnore")
	self.ignore = [
	"avoid confusing Valgrind",
	]
	# Returned by get_patterns("BadNews")
	self.BadNews = []
	# Per-component pattern lists, looked up by get_component()
	self.components = {}
	# Command templates; entries containing %s are filled in by the caller
	self.commands = {
	"StatusCmd" : "crmadmin -t 60000 -S %s 2>/dev/null",
	"CibQuery" : "cibadmin -Ql",
	"CibAddXml" : "cibadmin --modify -c --xml-text %s",
	"CibDelXpath" : "cibadmin --delete --xpath %s",
	# 300,000 == 5 minutes
	"RscRunning" : CTSvars.CRM_DAEMON_DIR + "/lrmd_test -R -r %s",
	"CIBfile" : "%s:"+CTSvars.CRM_CONFIG_DIR+"/cib.xml",
	"TmpDir" : "/tmp",

	"BreakCommCmd" : "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1",
	"FixCommCmd" : "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1",

	# tc qdisc add dev lo root handle 1: cbq avpkt 1000 bandwidth 1000mbit
	# tc class add dev lo parent 1: classid 1:1 cbq rate "$RATE"kbps allot 17000 prio 5 bounded isolated
	# tc filter add dev lo parent 1: protocol ip prio 16 u32 match ip dst 127.0.0.1 match ip sport $PORT 0xFFFF flowid 1:1
	# tc qdisc add dev lo parent 1: netem delay "$LATENCY"msec "$(($LATENCY/4))"msec 10% 2> /dev/null > /dev/null
	"ReduceCommCmd" : "",
	"RestoreCommCmd" : "tc qdisc del dev lo root",

	"SetCheckInterval" : "cibadmin --modify -c --xml-text '<cluster_property_set id=\"cib-bootstrap-options\"><nvpair id=\"cts-recheck-interval-setting\" name=\"cluster-recheck-interval\" value=\"%s\"/></cluster_property_set>'",
	"ClearCheckInterval" : "cibadmin --delete --xpath \"//nvpair[@name='cluster-recheck-interval']\"",

	"MaintenanceModeOn" : "cibadmin --modify -c --xml-text '<cluster_property_set id=\"cib-bootstrap-options\"><nvpair id=\"cts-maintenance-mode-setting\" name=\"maintenance-mode\" value=\"true\"/></cluster_property_set>'",
	"MaintenanceModeOff" : "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"",

	"StandbyCmd" : "crm_attribute -Vq -U %s -n standby -l forever -v %s 2>/dev/null",
	"StandbyQueryCmd" : "crm_attribute -qG -U %s -n standby -l forever -d off 2>/dev/null",
	}
	# Log search patterns; entries containing %s are filled in by the caller
	self.search = {
	"Pat:DC_IDLE" : "crmd.State transition.-> S_IDLE",

	# This won't work if we have multiple partitions
	"Pat:Local_started" : "%s\W.*The local CRM is operational",
	"Pat:Slave_started" : "%s\W.State transition.-> S_NOT_DC",
	"Pat:Master_started": "%s\W.State transition.-> S_IDLE",
	"Pat:We_stopped" : "%s\W.*OVERRIDE THIS PATTERN",
	"Pat:They_stopped" : "%s\W.LOST:. %s ",
	"Pat:They_dead" : "node %s.*: is dead",
	"Pat:TransitionComplete" : "Transition status: Complete: complete",

	"Pat:Fencing_start" : "(Initiating remote operation\|Requesting peer fencing ).* (for\|of) %s",
	"Pat:Fencing_ok" : r"stonith.:\sOperation .* of %s by .* for .@.: OK",
	"Pat:Fencing_recover" : r"pengine.*: Recover %s",

	"Pat:RscOpOK" : r"crmd.:\s+Result of %s operation for %s.: (0 \()?ok",
	"Pat:RscRemoteOpOK" : r"crmd.*:\s+Result of %s operation for %s on %s: (0 \()?ok",
	"Pat:NodeFenced" : r"crmd.:\s Peer %s was terminated $.$ by . on behalf of .*: OK",
	"Pat:FenceOpOK" : "Operation .* for host '%s' with device .* returned: 0",
	}

	def get_component(self, key):
	'''Return the pattern list stored for component key

	Unknown keys print a message and yield an empty list.
	'''
	if key in self.components:
	return self.components[key]
	print("Unknown component '%s' for %s" % (key, self.name))
	return []

	def get_patterns(self, key):
	'''Return one of the pattern tables, selected by key

	Recognised keys are "BadNews", "BadNewsIgnore", "Commands", "Search"
	and "Components".  No branch handles any other key, so the result is
	then None.
	'''
	if key == "BadNews":
	return self.BadNews
	elif key == "BadNewsIgnore":
	return self.ignore
	elif key == "Commands":
	return self.commands
	elif key == "Search":
	return self.search
	elif key == "Components":
	return self.components

	def __getitem__(self, key):
	'''Look up a template: "Name", then a command, then a search pattern

	Unknown keys print a message and yield None.
	'''
	if key == "Name":
	return self.name
	elif key in self.commands:
	return self.commands[key]
	elif key in self.search:
	return self.search[key]
	else:
	print("Unknown template '%s' for %s" % (key, self.name))
	return None


	class crm_corosync(BasePatterns):
	'''
	Patterns for Corosync version 2 cluster manager class
	'''

	def __init__(self, name):
	BasePatterns.__init__(self, name)

	self.commands.update({
	"StartCmd" : "service corosync start && service pacemaker start",
	"StopCmd" : "service pacemaker stop; [ ! -e /usr/sbin/pacemaker_remoted ] \|\| service pacemaker_remote stop; service corosync stop",

	"EpochCmd" : "crm_node -e",
	"QuorumCmd" : "crm_node -q",
	"PartitionCmd" : "crm_node -p",
	})

	self.search.update({
	# Close enough... "Corosync Cluster Engine exiting normally" isn't printed
	# reliably and there's little interest in doing anything about it
	"Pat:We_stopped" : "%s\W.*Unloading all Corosync service engines",
	"Pat:They_stopped" : "%s\W.crmd.Node %s(\[\|\s).*state is now lost",
	"Pat:They_dead" : "crmd.Node %s(\[\|\s).state is now lost",

	"Pat:ChildExit" : r"\[[0-9]+\] exited with status [0-9]+ \(",
	"Pat:ChildKilled" : r"%s\W.pacemakerd.%s\[[0-9]+\] terminated with signal 9",
	"Pat:ChildRespawn" : "%s\W.pacemakerd.Respawning failed child process: %s",

	"Pat:InfraUp" : "%s\W.corosync.Initializing transport",
	"Pat:PacemakerUp" : "%s\W.pacemakerd.Starting Pacemaker",
	})

	self.ignore = self.ignore + [
	r"crm_mon:",
	r"crmadmin:",
	r"update_trace_data",
	r"async_notify:.*strange, client not found",
	r"Parse error: Ignoring unknown option .*nodename",
	r"error.: Operation 'reboot' . with device 'FencingFail' returned:",
	r"Child process .* terminated with signal 9",
	r"getinfo response error: 1$",
	"sbd.* error: inquisitor_child: DEBUG MODE IS ACTIVE",
	r"sbd.* pcmk:\serror:.Connection to cib_ro failed",
	r"sbd.* pcmk:\serror:.Connection to cib_ro.* closed .I/O condition=17",
	]

	self.BadNews = [
	r"error:",
	r"crit:",
	r"ERROR:",
	r"CRIT:",
	r"Shutting down...NOW",
	r"Timer I_TERMINATE just popped",
	r"input=I_ERROR",
	r"input=I_FAIL",
	r"input=I_INTEGRATED cause=C_TIMER_POPPED",
	r"input=I_FINALIZED cause=C_TIMER_POPPED",
	r"input=I_ERROR",
	r"(pacemakerd\|lrmd\|crmd):.*, exiting",
	- r"(WARN\|warn).Ignoring HA message.vote.*not in our membership list",
	r"pengine.*Attempting recovery of resource",
	r"is taking more than 2x its timeout",
	r"Confirm not received from",
	r"Welcome reply not received from",
	r"Attempting to schedule .* after a stop",
	r"Resource .* was active at shutdown",
	r"duplicate entries for call_id",
	r"Search terminated:",
	r":global_timer_callback",
	r"Faking parameter digest creation",
	r"Parameters to .* action changed:",
	r"Parameters to .* changed",
	r"\[[0-9]+\] terminated with signal [0-9]+ \(",
	r"Child process .* terminated with signal",
	r"pengine:.Recover .$.* -\> .*$",
	r"rsyslogd.* imuxsock lost .* messages from pid .* due to rate-limiting",
	r"Peer is not part of our cluster",
	r"We appear to be in an election loop",
	r"Unknown node -> we will not deliver message",
	r"(Blackbox dump requested\|Problem detected)",
	r"pacemakerd.*Could not connect to Cluster Configuration Database API",
	r"Receiving messages from a node we think is dead",
	r"share the same cluster nodeid",
	r"share the same name",

	#r"crm_ipc_send:.Request . failed",
	#r"crm_ipc_send:.Sending to . is disabled until pending reply is received",

	# Not inherently bad, but worth tracking
	#r"No need to invoke the TE",
	#r"ping.*: DEBUG: Updated connected = 0",
	#r"Digest mis-match:",
	r"crmd:.*Transition failed: terminated",
	r"Local CIB .* differs from .*:",
	r"warn.:\sContinuing but .* will NOT be used",
	r"warn.:\sCluster configuration file .* is corrupt",
	#r"Executing .* fencing operation",
	r"Election storm",
	r"stalled the FSA with pending inputs",
	]

	self.components["common-ignore"] = [
	"Pending action:",
	"error: crm_log_message_adv:",
	r"resource( was\|s were) active at shutdown",
	"pending LRM operations at shutdown",
	"Lost connection to the CIB service",
	"Connection to the CIB terminated...",
	"Sending message to CIB service FAILED",
	"apply_xml_diff:.*Diff application failed!",
	r"crmd.:\sAction A_RECOVER .* not supported",
	"unconfirmed_actions:.Waiting on . unconfirmed actions",
	"cib_native_msgready:.*Message pending on command channel",
	r"crmd.:\sPerforming A_EXIT_1 - forcefully exiting the CRMd",
	"verify_stopped:.Resource . was active at shutdown. You may ignore this error if it is unmanaged.",
	"error: attrd_connection_destroy:.*Lost connection to attrd",
	r".:\sExecuting .* fencing operation $.*$ on ",
	r".:\sRequesting fencing $[^)]+$ of node ",
	r"(Blackbox dump requested\|Problem detected)",
	# "error: native_create_actions: Resource .stonith::. is active on 2 nodes attempting recovery",
	# "error: process_pe_message: Transition .* ERRORs found during PE processing",
	]

	self.components["corosync-ignore"] = [
	r"error:.*Connection to the CPG API failed: Library error",
	r"\[[0-9]+\] exited with status [0-9]+ \(",
	r"pacemakerd.error:.Child process .* exited",
	r"cib.error:.Corosync connection lost",
	r"stonith-ng.error:.Corosync connection terminated",
	r"error:.Child process cib . exited: Invalid argument",
	r"error:.Child process attrd . exited: Transport endpoint is not connected",
	r"error:.Child process crmd . exited: Link has been severed",
	r"lrmd.error:.Connection to stonith-ng.* (failed\|closed)",
	r"lrmd.error:.LRMD lost STONITH connection",
	r"crmd.State transition . S_RECOVERY",
	r"crmd.error:.Input (I_ERROR\|I_TERMINATE ) .*received in state",
	r"crmd.error:.Could not recover from internal error",
	r"error:.Connection to cib_(shm\|rw). (failed\|closed)",
	r"error:.*STONITH connection failed",
	r"error: Connection to stonith-ng.* (failed\|closed)",
	r"crit: Fencing daemon connection failed",
	]

	self.components["corosync"] = [
	r"pacemakerd.error:.Connection destroyed",
	r"attrd.:\s(crit\|error):.*Lost connection to (Corosync\|CIB) service",
	r"stonith.:\s(Corosync connection terminated\|Shutting down)",
	r"cib.:\sCorosync connection lost!\s+Exiting.",
	r"crmd.:\s(connection terminated\|Disconnected from Corosync)",
	r"pengine.Scheduling Node . for STONITH",
	r"crmd.:\sPeer .* was terminated $.$ by . for .:\sOK",
	]

	self.components["cib-ignore"] = [
	"lrmd.*Connection to stonith-ng failed",
	"lrmd.Connection to stonith-ng. closed",
	"lrmd.*LRMD lost STONITH connection",
	"lrmd.STONITH connection failed, finalizing . pending operations",
	]

	self.components["cib"] = [
	"State transition .* S_RECOVERY",
	- "Respawning .* crmd",
	- "Respawning .* attrd",
	+ r"Respawning failed child process: (attrd\|crmd)",
	"Connection to cib_.* failed",
	"Connection to cib_.* closed",
	r"crmd.:.Connection to the CIB terminated...",
	r"attrd.:.(Lost connection to CIB service\|Connection to the CIB terminated)",
	r"crmd\[[0-9]+\] exited with status 1 \(",
	r"attrd\[[0-9]+\] exited with status 102 \(",
	r"crmd.: Input I_TERMINATE .from do_recover",
	"crmd.I_ERROR.crmd_cib_connection_destroy",
	"crmd.*Could not recover from internal error",
	]

	self.components["lrmd"] = [
	"State transition .* S_RECOVERY",
	"LRM Connection failed",
	- "Respawning .* crmd",
	+ r"Respawning failed child process: crmd",
	"Connection to lrmd failed",
	"Connection to lrmd.* closed",
	"crmd.I_ERROR.lrm_connection_destroy",
	r"crmd\[[0-9]+\] exited with status 1 \(",
	r"crmd.: Input I_TERMINATE .from do_recover",
	"crmd.*Could not recover from internal error",
	]
	self.components["lrmd-ignore"] = []

	self.components["crmd"] = [
	# "WARN: determine_online_status: Node .* is unclean",
	# "Scheduling Node .* for STONITH",
	# "Executing .* fencing operation",
	# Only if the node wasn't the DC: "State transition S_IDLE",
	"State transition .* -> S_IDLE",
	]
	self.components["crmd-ignore"] = []

	self.components["attrd"] = []
	self.components["attrd-ignore"] = []

	self.components["pengine"] = [
	"State transition .* S_RECOVERY",
	- "Respawning .* crmd",
	+ r"Respawning failed child process: crmd",
	r"crmd\[[0-9]+\] exited with status 1 \(",
	"Connection to pengine failed",
	"Connection to pengine.* closed",
	"Connection to the Policy Engine failed",
	"crmd.I_ERROR.save_cib_contents",
	r"crmd.: Input I_TERMINATE .from do_recover",
	"crmd.*Could not recover from internal error",
	]
	self.components["pengine-ignore"] = []

	self.components["stonith"] = [
	"Connection to stonith-ng failed",
	"LRMD lost STONITH connection",
	"Connection to stonith-ng.* closed",
	"Fencing daemon connection failed",
	r"crmd.:\swarn.:\sCallback already present",
	]
	self.components["stonith-ignore"] = [
	r"pengine.*: Recover Fencing",
	r"Updating failcount for Fencing",
	r"error:.*Connection to stonith-ng failed",
	r"error:.Connection to stonith-ng.closed $I/O condition=17$",
	r"crit:.*Fencing daemon connection failed",
	r"error:.*Sign-in failed: triggered a retry",
	"STONITH connection failed, finalizing .* pending operations.",
	r"crmd.:\s+Result of . operation for Fencing.*Error",
	]
	self.components["stonith-ignore"].extend(self.components["common-ignore"])


	class crm_corosync_docker(crm_corosync):
	'''
	Variant of the crm_corosync patterns that replaces only the start and
	stop commands, using pcmk_start and pcmk_stop instead
	'''
	def __init__(self, name):
	'''Build the crm_corosync tables, then override StartCmd and StopCmd'''
	crm_corosync.__init__(self, name)

	self.commands.update({
	"StartCmd" : "pcmk_start",
	"StopCmd" : "pcmk_stop",
	})


	class PatternSelector(object):
	'''Look up patterns, templates and components by variant name.

	Variants are found in the module-level patternvariants dictionary,
	which the pattern classes fill in when they are instantiated.
	'''

	def __init__(self, name=None):
	'''Instantiate the base patterns plus the variant selected by name

	With no name, the "crm-corosync" variant is created.  A name other
	than "crm-corosync" or "crm-corosync-docker" creates no variant
	beyond "crm-base".
	'''
	self.name = name
	self.base = BasePatterns("crm-base")

	# The instances are not kept here; constructing them registers them
	# in patternvariants
	if not name:
	crm_corosync("crm-corosync")
	elif name == "crm-corosync":
	crm_corosync(name)
	elif name == "crm-corosync-docker":
	crm_corosync_docker(name)

	def get_variant(self, variant):
	'''Return the registered variant, or the crm-base patterns if unknown'''
	if variant in patternvariants:
	return patternvariants[variant]
	print("defaulting to crm-base for %s" % variant)
	return self.base

	def get_patterns(self, variant, kind):
	'''Return the pattern table kind of the given variant'''
	return self.get_variant(variant).get_patterns(kind)

	def get_template(self, variant, key):
	'''Return the template key of the given variant'''
	v = self.get_variant(variant)
	return v[key]

	def get_component(self, variant, kind):
	'''Return the component pattern list kind of the given variant'''
	return self.get_variant(variant).get_component(kind)

	def __getitem__(self, key):
	'''Return the template key for the variant this selector was created with'''
	return self.get_template(self.name, key)

	# python cts/CTSpatt.py -k crm-corosync -t StartCmd
	if __name__ == '__main__':
	# Command-line use: print one template of one pattern variant.
	# Options: -k/--kind <variant name>, -t/--template <template key>

	pdir=os.path.dirname(sys.path[0])
	sys.path.insert(0, pdir) # So that things work from the source directory

	kind=None
	template=None

	# skipthis marks the next argument as an option value that was already consumed
	skipthis=None
	args=sys.argv[1:]
	for i in range(0, len(args)):
	if skipthis:
	skipthis=None
	continue

	# NOTE(review): args[i+1] is read without a bounds check, so an option
	# given as the last argument would raise IndexError
	elif args[i] == "-k" or args[i] == "--kind":
	skipthis=1
	kind = args[i+1]

	elif args[i] == "-t" or args[i] == "--template":
	skipthis=1
	template = args[i+1]

	else:
	print("Illegal argument " + args[i])


	# With no -t option template stays None and is looked up as such
	print(PatternSelector(kind)[template])

File Metadata

Mime Type: text/x-diff
Expires: Sun, Jul 20, 7:29 PM (3 h, 17 m)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 2081322
Default Alt Text: (38 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions