diff --git a/cts/CM_common.py b/cts/CM_common.py index 0112fec463..b7ff223e90 100755 --- a/cts/CM_common.py +++ b/cts/CM_common.py @@ -1,471 +1,471 @@ """ Cluster Manager common class for Pacemaker's Cluster Test Suite (CTS) This was originally the cluster manager class for the Heartbeat stack. It is retained for use as a base class by other cluster manager classes. It could be merged into the ClusterManager class directly, but this is easier. """ # Pacemaker targets compatibility with Python 2.7 and 3.2+ from __future__ import print_function, unicode_literals, absolute_import, division __copyright__ = """Original Author: Huang Zhen Copyright 2004 International Business Machines -with later changes copyright 2004-2018 the Pacemaker project contributors. +with later changes copyright 2004-2019 the Pacemaker project contributors. The version control history for this file may have further details. """ __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import sys from cts.CTSvars import * from cts.CTS import * from cts.CIB import * from cts.CTStests import AuditResource from cts.watcher import LogWatcher class crm_common(ClusterManager): def __init__(self, Environment, randseed=None, name=None): ClusterManager.__init__(self, Environment, randseed=randseed) self.fastfail = 0 self.cib_installed = 0 self.config = None self.cluster_monitor = 0 self.use_short_names = 1 if self.Env["DoBSC"]: del self.templates["Pat:They_stopped"] self._finalConditions() self.check_transitions = 0 self.check_elections = 0 self.CIBsync = {} self.CibFactory = ConfigFactory(self) self.cib = self.CibFactory.createConfig(self.Env["Schema"]) def errorstoignore(self): # At some point implement a more elegant solution that # also produces a report at the end """ Return a list of known error messages that should be ignored """ return PatternSelector().get_patterns(self.name, "BadNewsIgnore") def install_config(self, node): if not self.ns.WaitForNodeToComeUp(node): self.log("Node %s is not up." % node) return None if not node in self.CIBsync and self.Env["ClobberCIB"] == 1: self.CIBsync[node] = 1 self.rsh(node, "rm -f "+CTSvars.CRM_CONFIG_DIR+"/cib*") # Only install the CIB on the first node, all the other ones will pick it up from there if self.cib_installed == 1: return None self.cib_installed = 1 if self.Env["CIBfilename"] == None: self.log("Installing Generated CIB on node %s" % (node)) self.cib.install(node) else: self.log("Installing CIB (%s) on node %s" % (self.Env["CIBfilename"], node)) if 0 != self.rsh.cp(self.Env["CIBfilename"], "root@" + (self.templates["CIBfile"] % node)): raise ValueError("Can not scp file to %s %d"%(node)) self.rsh(node, "chown "+CTSvars.CRM_DAEMON_USER+" "+CTSvars.CRM_CONFIG_DIR+"/cib.xml") def prepare(self): '''Finish the Initialization process. Prepare to test...''' self.partitions_expected = 1 for node in self.Env["nodes"]: self.ShouldBeStatus[node] = "" if self.Env["experimental-tests"]: self.unisolate_node(node) self.StataCM(node) def test_node_CM(self, node): '''Report the status of the cluster manager on a given node''' watchpats = [ ] watchpats.append("Current ping state: (S_IDLE|S_NOT_DC)") watchpats.append(self.templates["Pat:NonDC_started"] % node) watchpats.append(self.templates["Pat:DC_started"] % node) idle_watch = LogWatcher(self.Env["LogFileName"], watchpats, "ClusterIdle", hosts=[node], kind=self.Env["LogWatcher"]) idle_watch.setwatch() out = self.rsh(node, self.templates["StatusCmd"]%node, 1) self.debug("Node %s status: '%s'" %(node, out)) if not out or (out.find('ok') < 0): if self.ShouldBeStatus[node] == "up": self.log( "Node status for %s is %s but we think it should be %s" % (node, "down", self.ShouldBeStatus[node])) self.ShouldBeStatus[node] = "down" return 0 if self.ShouldBeStatus[node] == "down": self.log( "Node status for %s is %s but we think it should be %s: %s" % (node, "up", self.ShouldBeStatus[node], out)) self.ShouldBeStatus[node] = "up" # check the output first - because syslog-ng loses messages if out.find('S_NOT_DC') != -1: # Up and stable return 2 if out.find('S_IDLE') != -1: # Up and stable return 2 # fall back to syslog-ng and wait if not idle_watch.look(): # just up self.debug("Warn: Node %s is unstable: %s" % (node, out)) return 1 # Up and stable return 2 # Is the node up or is the node down def StataCM(self, node): '''Report the status of the cluster manager on a given node''' if self.test_node_CM(node) > 0: return 1 return None # Being up and being stable is not the same question... def node_stable(self, node): '''Report the status of the cluster manager on a given node''' if self.test_node_CM(node) == 2: return 1 self.log("Warn: Node %s not stable" % (node)) return None def partition_stable(self, nodes, timeout=None): watchpats = [ ] watchpats.append("Current ping state: S_IDLE") watchpats.append(self.templates["Pat:DC_IDLE"]) self.debug("Waiting for cluster stability...") if timeout == None: timeout = self.Env["DeadTime"] if len(nodes) < 3: self.debug("Cluster is inactive") return 1 idle_watch = LogWatcher(self.Env["LogFileName"], watchpats, "ClusterStable", timeout, hosts=nodes.split(), kind=self.Env["LogWatcher"]) idle_watch.setwatch() for node in nodes.split(): # have each node dump its current state self.rsh(node, self.templates["StatusCmd"] % node, 1) ret = idle_watch.look() while ret: self.debug(ret) for node in nodes.split(): if re.search(node, ret): return 1 ret = idle_watch.look() self.debug("Warn: Partition %s not IDLE after %ds" % (repr(nodes), timeout)) return None def cluster_stable(self, timeout=None, double_check=False): partitions = self.find_partitions() for partition in partitions: if not self.partition_stable(partition, timeout): return None if double_check: # Make sure we are really stable and that all resources, # including those that depend on transient node attributes, # are started if they were going to be time.sleep(5) for partition in partitions: if not self.partition_stable(partition, timeout): return None return 1 def is_node_dc(self, node, status_line=None): rc = 0 if not status_line: status_line = self.rsh(node, self.templates["StatusCmd"]%node, 1) if not status_line: rc = 0 elif status_line.find('S_IDLE') != -1: rc = 1 elif status_line.find('S_INTEGRATION') != -1: rc = 1 elif status_line.find('S_FINALIZE_JOIN') != -1: rc = 1 elif status_line.find('S_POLICY_ENGINE') != -1: rc = 1 elif status_line.find('S_TRANSITION_ENGINE') != -1: rc = 1 return rc def active_resources(self, node): (rc, output) = self.rsh(node, """crm_resource -c""", None) resources = [] for line in output: if re.search("^Resource", line): tmp = AuditResource(self, line) if tmp.type == "primitive" and tmp.host == node: resources.append(tmp.id) return resources def ResourceLocation(self, rid): ResourceNodes = [] for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == "up": cmd = self.templates["RscRunning"] % (rid) (rc, lines) = self.rsh(node, cmd, None) if rc == 127: self.log("Command '%s' failed. Binary or pacemaker-cts package not installed?" % cmd) for line in lines: self.log("Output: "+line) elif rc == 0: ResourceNodes.append(node) return ResourceNodes def find_partitions(self): ccm_partitions = [] for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == "up": partition = self.rsh(node, self.templates["PartitionCmd"], 1) if not partition: self.log("no partition details for %s" % node) elif len(partition) > 2: nodes = partition.split() nodes.sort() partition = ' '.join(nodes) found = 0 for a_partition in ccm_partitions: if partition == a_partition: found = 1 if found == 0: self.debug("Adding partition from %s: %s" % (node, partition)) ccm_partitions.append(partition) else: self.debug("Partition '%s' from %s is consistent with existing entries" % (partition, node)) else: self.log("bad partition details for %s" % node) else: self.debug("Node %s is down... skipping" % node) self.debug("Found partitions: %s" % repr(ccm_partitions) ) return ccm_partitions def HasQuorum(self, node_list): # If we are auditing a partition, then one side will # have quorum and the other not. # So the caller needs to tell us which we are checking # If no value for node_list is specified... assume all nodes if not node_list: node_list = self.Env["nodes"] for node in node_list: if self.ShouldBeStatus[node] == "up": quorum = self.rsh(node, self.templates["QuorumCmd"], 1) if quorum.find("1") != -1: return 1 elif quorum.find("0") != -1: return 0 else: self.debug("WARN: Unexpected quorum test result from " + node + ":" + quorum) return 0 def Components(self): complist = [] common_ignore = [ "Pending action:", "(ERROR|error): crm_log_message_adv:", "(ERROR|error): MSG: No message to dump", "pending LRM operations at shutdown", "Lost connection to the CIB manager", "Connection to the CIB terminated...", "Sending message to the CIB manager FAILED", "Action A_RECOVER .* not supported", "(ERROR|error): stonithd_op_result_ready: not signed on", "pingd.*(ERROR|error): send_update: Could not send update", "send_ipc_message: IPC Channel to .* is not connected", "unconfirmed_actions: Waiting on .* unconfirmed actions", "cib_native_msgready: Message pending on command channel", r": Performing A_EXIT_1 - forcefully exiting ", r"Resource .* was active at shutdown. You may ignore this error if it is unmanaged.", ] stonith_ignore = [ r"Updating failcount for child_DoFencing", - r"(ERROR|error).*: Sign-in failed: triggered a retry", + r"error.*: Fencer connection failed \(will retry\)", "pacemaker-execd.*(ERROR|error): stonithd_receive_ops_result failed.", ] stonith_ignore.extend(common_ignore) ccm = Process(self, "ccm", triggersreboot=self.fastfail, pats = [ "State transition .* S_RECOVERY", "pacemaker-controld.*Action A_RECOVER .* not supported", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*: Could not recover from internal error", "pacemaker-controld.*I_ERROR.*crmd_cib_connection_destroy", # these status numbers are likely wrong now r"pacemaker-controld.*exited with status 2", r"attrd.*exited with status 1", r"cib.*exited with status 2", # Not if it was fenced # "A new node joined the cluster", # "WARN: determine_online_status: Node .* is unclean", # "Scheduling Node .* for STONITH", # "Executing .* fencing operation", # "tengine_stonith_callback: .*result=0", # "Processing I_NODE_JOIN:.* cause=C_HA_MESSAGE", # "State transition S_.* -> S_INTEGRATION.*input=I_NODE_JOIN", "State transition S_STARTING -> S_PENDING", ], badnews_ignore = common_ignore) based = Process(self, "pacemaker-based", triggersreboot=self.fastfail, pats = [ "State transition .* S_RECOVERY", "Lost connection to the CIB manager", "Connection to the CIB manager terminated", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", "pacemaker-controld.*I_ERROR.*crmd_cib_connection_destroy", r"pacemaker-controld.*: Could not recover from internal error", # these status numbers are likely wrong now r"pacemaker-controld.*exited with status 2", r"attrd.*exited with status 1", ], badnews_ignore = common_ignore) execd = Process(self, "pacemaker-execd", triggersreboot=self.fastfail, pats = [ "State transition .* S_RECOVERY", "LRM Connection failed", "pacemaker-controld.*I_ERROR.*lrm_connection_destroy", "State transition S_STARTING -> S_PENDING", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*: Could not recover from internal error", # this status number is likely wrong now r"pacemaker-controld.*exited with status 2", ], badnews_ignore = common_ignore) controld = Process(self, "pacemaker-controld", triggersreboot=self.fastfail, pats = [ # "WARN: determine_online_status: Node .* is unclean", # "Scheduling Node .* for STONITH", # "Executing .* fencing operation", # "tengine_stonith_callback: .*result=0", "State transition .* S_IDLE", "State transition S_STARTING -> S_PENDING", ], badnews_ignore = common_ignore) schedulerd = Process(self, "pacemaker-schedulerd", triggersreboot=self.fastfail, pats = [ "State transition .* S_RECOVERY", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*: Could not recover from internal error", r"pacemaker-controld.*CRIT.*: Connection to the scheduler failed", "pacemaker-controld.*I_ERROR.*save_cib_contents", # this status number is likely wrong now r"pacemaker-controld.*exited with status 2", ], badnews_ignore = common_ignore, dc_only=1) if self.Env["DoFencing"] == 1 : complist.append(Process(self, "stoniths", triggersreboot=self.fastfail, dc_pats = [ r"pacemaker-controld.*CRIT.*: Fencing daemon connection failed", "Attempting connection to fencing daemon", ], badnews_ignore = stonith_ignore)) if self.fastfail == 0: ccm.pats.extend([ # these status numbers are likely wrong now r"attrd.*exited with status 1", r"pacemaker-(based|controld).*exited with status 2", ]) based.pats.extend([ # these status numbers are likely wrong now r"attrd.*exited with status 1", r"pacemaker-controld.*exited with status 2", ]) execd.pats.extend([ # these status numbers are likely wrong now r"pacemaker-controld.*exited with status 2", ]) complist.append(ccm) complist.append(based) complist.append(execd) complist.append(controld) complist.append(schedulerd) return complist def StandbyStatus(self, node): out=self.rsh(node, self.templates["StandbyQueryCmd"] % node, 1) if not out: return "off" out = out[:-1] self.debug("Standby result: "+out) return out # status == "on" : Enter Standby mode # status == "off": Enter Active mode def SetStandbyMode(self, node, status): current_status = self.StandbyStatus(node) cmd = self.templates["StandbyCmd"] % (node, status) ret = self.rsh(node, cmd) return True def AddDummyRsc(self, node, rid): rsc_xml = """ ' '""" % (rid, rid) constraint_xml = """ ' ' """ % (rid, node, node, rid) self.rsh(node, self.templates['CibAddXml'] % (rsc_xml)) self.rsh(node, self.templates['CibAddXml'] % (constraint_xml)) def RemoveDummyRsc(self, node, rid): constraint = "\"//rsc_location[@rsc='%s']\"" % (rid) rsc = "\"//primitive[@id='%s']\"" % (rid) self.rsh(node, self.templates['CibDelXpath'] % constraint) self.rsh(node, self.templates['CibDelXpath'] % rsc) ####################################################################### # # A little test code... # # Which you are advised to completely ignore... # ####################################################################### if __name__ == '__main__': pass diff --git a/cts/patterns.py b/cts/patterns.py index 1bdace0ead..1b86ee7c31 100644 --- a/cts/patterns.py +++ b/cts/patterns.py @@ -1,391 +1,391 @@ """ Pattern-holding classes for Pacemaker's Cluster Test Suite (CTS) """ # Pacemaker targets compatibility with Python 2.7 and 3.2+ from __future__ import print_function, unicode_literals, absolute_import, division __copyright__ = "Copyright 2008-2019 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import sys, os from cts.CTSvars import * patternvariants = {} class BasePatterns(object): def __init__(self, name): self.name = name patternvariants[name] = self self.ignore = [ "avoid confusing Valgrind", # Logging bug in some versions of libvirtd r"libvirtd.*: internal error: Failed to parse PCI config address", ] self.BadNews = [] self.components = {} self.commands = { "StatusCmd" : "crmadmin -t 60000 -S %s 2>/dev/null", "CibQuery" : "cibadmin -Ql", "CibAddXml" : "cibadmin --modify -c --xml-text %s", "CibDelXpath" : "cibadmin --delete --xpath %s", # 300,000 == 5 minutes "RscRunning" : CTSvars.CRM_DAEMON_DIR + "/cts-exec-helper -R -r %s", "CIBfile" : "%s:"+CTSvars.CRM_CONFIG_DIR+"/cib.xml", "TmpDir" : "/tmp", "BreakCommCmd" : "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1", "FixCommCmd" : "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1", # tc qdisc add dev lo root handle 1: cbq avpkt 1000 bandwidth 1000mbit # tc class add dev lo parent 1: classid 1:1 cbq rate "$RATE"kbps allot 17000 prio 5 bounded isolated # tc filter add dev lo parent 1: protocol ip prio 16 u32 match ip dst 127.0.0.1 match ip sport $PORT 0xFFFF flowid 1:1 # tc qdisc add dev lo parent 1: netem delay "$LATENCY"msec "$(($LATENCY/4))"msec 10% 2> /dev/null > /dev/null "ReduceCommCmd" : "", "RestoreCommCmd" : "tc qdisc del dev lo root", "SetCheckInterval" : "cibadmin --modify -c --xml-text ''", "ClearCheckInterval" : "cibadmin --delete --xpath \"//nvpair[@name='cluster-recheck-interval']\"", "MaintenanceModeOn" : "cibadmin --modify -c --xml-text ''", "MaintenanceModeOff" : "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"", "StandbyCmd" : "crm_attribute -Vq -U %s -n standby -l forever -v %s 2>/dev/null", "StandbyQueryCmd" : "crm_attribute -qG -U %s -n standby -l forever -d off 2>/dev/null", } self.search = { "Pat:DC_IDLE" : "pacemaker-controld.*State transition.*-> S_IDLE", # This won't work if we have multiple partitions "Pat:Local_started" : "%s\W.*controller successfully started", "Pat:NonDC_started" : r"%s\W.*State transition.*-> S_NOT_DC", "Pat:DC_started" : r"%s\W.*State transition.*-> S_IDLE", "Pat:We_stopped" : "%s\W.*OVERRIDE THIS PATTERN", "Pat:They_stopped" : "%s\W.*LOST:.* %s ", "Pat:They_dead" : "node %s.*: is dead", "Pat:TransitionComplete" : "Transition status: Complete: complete", "Pat:Fencing_start" : "(Initiating remote operation|Requesting peer fencing ).* (for|of) %s", "Pat:Fencing_ok" : r"pacemaker-fenced.*:\s*Operation .* of %s by .* for .*@.*: OK", "Pat:Fencing_recover" : r"schedulerd.*: Recover %s", "Pat:RscOpOK" : r"pacemaker-controld.*:\s+Result of %s operation for %s.*: (0 \()?ok", "Pat:RscRemoteOpOK" : r"pacemaker-controld.*:\s+Result of %s operation for %s on %s: (0 \()?ok", "Pat:NodeFenced" : r"pacemaker-controld.*:\s* Peer %s was terminated \(.*\) by .* on behalf of .*: OK", "Pat:FenceOpOK" : "Operation .* for host '%s' with device .* returned: 0", } def get_component(self, key): if key in self.components: return self.components[key] print("Unknown component '%s' for %s" % (key, self.name)) return [] def get_patterns(self, key): if key == "BadNews": return self.BadNews elif key == "BadNewsIgnore": return self.ignore elif key == "Commands": return self.commands elif key == "Search": return self.search elif key == "Components": return self.components def __getitem__(self, key): if key == "Name": return self.name elif key in self.commands: return self.commands[key] elif key in self.search: return self.search[key] else: print("Unknown template '%s' for %s" % (key, self.name)) return None class crm_corosync(BasePatterns): ''' Patterns for Corosync version 2 cluster manager class ''' def __init__(self, name): BasePatterns.__init__(self, name) self.commands.update({ "StartCmd" : "service corosync start && service pacemaker start", "StopCmd" : "service pacemaker stop; [ ! -e /usr/sbin/pacemaker-remoted ] || service pacemaker_remote stop; service corosync stop", "EpochCmd" : "crm_node -e", "QuorumCmd" : "crm_node -q", "PartitionCmd" : "crm_node -p", }) self.search.update({ # Close enough ... "Corosync Cluster Engine exiting normally" isn't # printed reliably. "Pat:We_stopped" : "%s\W.*Unloading all Corosync service engines", "Pat:They_stopped" : "%s\W.*pacemaker-controld.*Node %s(\[|\s).*state is now lost", "Pat:They_dead" : "pacemaker-controld.*Node %s(\[|\s).*state is now lost", "Pat:ChildExit" : r"\[[0-9]+\] exited with status [0-9]+ \(", "Pat:ChildKilled" : r"%s\W.*pacemakerd.*%s\[[0-9]+\] terminated with signal 9", "Pat:ChildRespawn" : "%s\W.*pacemakerd.*Respawning failed child process: %s", "Pat:InfraUp" : "%s\W.*corosync.*Initializing transport", "Pat:PacemakerUp" : "%s\W.*pacemakerd.*Starting Pacemaker", }) self.ignore = self.ignore + [ r"crm_mon:", r"crmadmin:", r"update_trace_data", r"async_notify:.*strange, client not found", r"Parse error: Ignoring unknown option .*nodename", r"error.*: Operation 'reboot' .* with device 'FencingFail' returned:", r"getinfo response error: 1$", r"sbd.* error: inquisitor_child: DEBUG MODE IS ACTIVE", r"sbd.* pcmk:\s*error:.*Connection to cib_ro.* (failed|closed)", ] self.BadNews = [ r"error:", r"crit:", r"ERROR:", r"CRIT:", r"Shutting down...NOW", r"Timer I_TERMINATE just popped", r"input=I_ERROR", r"input=I_FAIL", r"input=I_INTEGRATED cause=C_TIMER_POPPED", r"input=I_FINALIZED cause=C_TIMER_POPPED", r"input=I_ERROR", r"(pacemakerd|pacemaker-execd|pacemaker-controld):.*, exiting", r"schedulerd.*Attempting recovery of resource", r"is taking more than 2x its timeout", r"Confirm not received from", r"Welcome reply not received from", r"Attempting to schedule .* after a stop", r"Resource .* was active at shutdown", r"duplicate entries for call_id", r"Search terminated:", r":global_timer_callback", r"Faking parameter digest creation", r"Parameters to .* action changed:", r"Parameters to .* changed", r"\[[0-9]+\] terminated with signal [0-9]+ \(", r"schedulerd:.*Recover .*\(.* -\> .*\)", r"rsyslogd.* imuxsock lost .* messages from pid .* due to rate-limiting", r"Peer is not part of our cluster", r"We appear to be in an election loop", r"Unknown node -> we will not deliver message", r"(Blackbox dump requested|Problem detected)", r"pacemakerd.*Could not connect to Cluster Configuration Database API", r"Receiving messages from a node we think is dead", r"share the same cluster nodeid", r"share the same name", #r"crm_ipc_send:.*Request .* failed", #r"crm_ipc_send:.*Sending to .* is disabled until pending reply is received", # Not inherently bad, but worth tracking #r"No need to invoke the TE", #r"ping.*: DEBUG: Updated connected = 0", #r"Digest mis-match:", r"pacemaker-controld:.*Transition failed: terminated", r"Local CIB .* differs from .*:", r"warn.*:\s*Continuing but .* will NOT be used", r"warn.*:\s*Cluster configuration file .* is corrupt", #r"Executing .* fencing operation", r"Election storm", r"stalled the FSA with pending inputs", ] self.components["common-ignore"] = [ r"Pending action:", r"resource( was|s were) active at shutdown", r"pending LRM operations at shutdown", r"Lost connection to the CIB manager", r"pacemaker-controld.*:\s*Action A_RECOVER .* not supported", r"pacemaker-controld.*:\s*Performing A_EXIT_1 - forcefully exiting ", r".*:\s*Executing .* fencing operation \(.*\) on ", r".*:\s*Requesting fencing \([^)]+\) of node ", r"(Blackbox dump requested|Problem detected)", # "Resource .*stonith::.* is active on 2 nodes attempting recovery", # "Transition .* ERRORs found during PE processing", ] self.components["corosync-ignore"] = [ r"error:.*Connection to the CPG API failed: Library error", r"\[[0-9]+\] exited with status [0-9]+ \(", r"pacemaker-based.*error:.*Corosync connection lost", r"pacemaker-fenced.*error:.*Corosync connection terminated", r"pacemaker-controld.*State transition .* S_RECOVERY", r"pacemaker-controld.*error:.*Input (I_ERROR|I_TERMINATE ) .*received in state", r"pacemaker-controld.*error:.*Could not recover from internal error", r"error:.*Connection to cib_(shm|rw).* (failed|closed)", r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"crit: Fencing daemon connection failed", ] self.components["corosync"] = [ # We expect each daemon to lose its cluster connection. # However, if the CIB manager loses its connection first, # it's possible for another daemon to lose that connection and # exit before losing the cluster connection. r"pacemakerd.*:\s*(crit|error):.*Lost connection to cluster layer", r"pacemaker-attrd.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", r"pacemaker-based.*:\s*(crit|error):.*Lost connection to cluster layer", r"pacemaker-controld.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", r"pacemaker-fenced.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", r"schedulerd.*Scheduling Node .* for STONITH", r"pacemaker-controld.*:\s*Peer .* was terminated \(.*\) by .* on behalf of .*:\s*OK", ] self.components["pacemaker-based"] = [ r"pacemakerd.* pacemaker-attrd\[[0-9]+\] exited with status 102", r"pacemakerd.* pacemaker-controld\[[0-9]+\] exited with status 1", r"pacemakerd.* Respawning failed child process: pacemaker-attrd", r"pacemakerd.* Respawning failed child process: pacemaker-based", r"pacemakerd.* Respawning failed child process: pacemaker-controld", r"pacemakerd.* Respawning failed child process: pacemaker-fenced", r"pacemaker-.* Connection to cib_.* (failed|closed)", r"pacemaker-attrd.*:.*Lost connection to the CIB manager", r"pacemaker-controld.*:.*Lost connection to the CIB manager", r"pacemaker-controld.*I_ERROR.*crmd_cib_connection_destroy", r"pacemaker-controld.* State transition .* S_RECOVERY", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*Could not recover from internal error", ] self.components["pacemaker-based-ignore"] = [ r"pacemaker-execd.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", ] self.components["pacemaker-execd"] = [ r"pacemaker-controld.*Connection to (pacemaker-execd|lrmd|executor) (failed|closed)", r"pacemaker-controld.*I_ERROR.*lrm_connection_destroy", r"pacemaker-controld.*State transition .* S_RECOVERY", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*Could not recover from internal error", r"pacemakerd.*pacemaker-execd.* terminated with signal 9", r"pacemakerd.*pacemaker-controld\[[0-9]+\] exited with status 1", r"pacemakerd.*Respawning failed child process: pacemaker-execd", r"pacemakerd.*Respawning failed child process: pacemaker-controld", ] self.components["pacemaker-execd-ignore"] = [] self.components["pacemaker-controld"] = [ # "WARN: determine_online_status: Node .* is unclean", # "Scheduling Node .* for STONITH", # "Executing .* fencing operation", # Only if the node wasn't the DC: "State transition S_IDLE", "State transition .* -> S_IDLE", ] self.components["pacemaker-controld-ignore"] = [] self.components["pacemaker-attrd"] = [] self.components["pacemaker-attrd-ignore"] = [] self.components["pacemaker-schedulerd"] = [ "State transition .* S_RECOVERY", r"Respawning failed child process: pacemaker-controld", r"pacemaker-controld\[[0-9]+\] exited with status 1 \(", "Connection to pengine failed", "Connection to pengine.* closed", r"Connection to the scheduler failed", "pacemaker-controld.*I_ERROR.*save_cib_contents", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", "pacemaker-controld.*Could not recover from internal error", ] self.components["pacemaker-schedulerd-ignore"] = [] self.components["pacemaker-fenced"] = [ r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"Fencing daemon connection failed", r"pacemaker-controld.*:\s*warn.*:\s*Callback already present", ] self.components["pacemaker-fenced-ignore"] = [ r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"crit:.*Fencing daemon connection failed", - r"error:.*Sign-in failed: triggered a retry", + r"error:.*Fencer connection failed \(will retry\)", r"Connection to (fencer|stonith-ng) failed, finalizing .* pending operations", r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error", ] self.components["pacemaker-fenced-ignore"].extend(self.components["common-ignore"]) class crm_corosync_docker(crm_corosync): ''' Patterns for Corosync version 2 cluster manager class ''' def __init__(self, name): crm_corosync.__init__(self, name) self.commands.update({ "StartCmd" : "pcmk_start", "StopCmd" : "pcmk_stop", }) class PatternSelector(object): def __init__(self, name=None): self.name = name self.base = BasePatterns("crm-base") if not name: crm_corosync("crm-corosync") elif name == "crm-corosync": crm_corosync(name) elif name == "crm-corosync-docker": crm_corosync_docker(name) def get_variant(self, variant): if variant in patternvariants: return patternvariants[variant] print("defaulting to crm-base for %s" % variant) return self.base def get_patterns(self, variant, kind): return self.get_variant(variant).get_patterns(kind) def get_template(self, variant, key): v = self.get_variant(variant) return v[key] def get_component(self, variant, kind): return self.get_variant(variant).get_component(kind) def __getitem__(self, key): return self.get_template(self.name, key) # python cts/CTSpatt.py -k crm-corosync -t StartCmd if __name__ == '__main__': pdir=os.path.dirname(sys.path[0]) sys.path.insert(0, pdir) # So that things work from the source directory kind=None template=None skipthis=None args=sys.argv[1:] for i in range(0, len(args)): if skipthis: skipthis=None continue elif args[i] == "-k" or args[i] == "--kind": skipthis=1 kind = args[i+1] elif args[i] == "-t" or args[i] == "--template": skipthis=1 template = args[i+1] else: print("Illegal argument " + args[i]) print(PatternSelector(kind)[template])