diff --git a/cts/CM_ais.py.in b/cts/CM_ais.py.in
index c21e012001..a6c56eb4b1 100644
--- a/cts/CM_ais.py.in
+++ b/cts/CM_ais.py.in
@@ -1,233 +1,233 @@
#!@PYTHON@
'''CTS: Cluster Testing System: AIS dependent modules...
'''
__copyright__='''
Copyright (C) 2007 Andrew Beekhof
'''

#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

import os,sys,CTS,CTSaudits,CTStests, warnings
from CTS import *
-from CM_lha import LinuxHAv2
+from CM_lha import crm_lha
from CTSaudits import ClusterAudit
from CTStests import *
from CIB import *

try:
    from xml.dom.minidom import *
except ImportError:
    sys.__stdout__.write("Python module xml.dom.minidom not found\n")
    sys.__stdout__.write("Please install python-xml or similar before continuing\n")
    sys.__stdout__.flush()
    sys.exit(1)

#######################################################################
#
#  LinuxHA v2 dependent modules
#
#######################################################################

-class crm_ais(LinuxHAv2):
+class crm_ais(crm_lha):
    '''
    The crm version 3 cluster manager class.
    It implements the things we need to talk to and manipulate
    crm clusters running on top of openais
    '''
    def __init__(self, Environment, randseed=None):
-        LinuxHAv2.__init__(self, Environment, randseed=randseed)
+        crm_lha.__init__(self, Environment, randseed=randseed)

        self.update({
            "Name"             : "crm-ais",
            "StartCmd"         : "@INITDIR@/openais start > /dev/null 2>&1",
            "StopCmd"          : "@INITDIR@/openais stop > /dev/null 2>&1",
            "UUIDQueryCmd"     : "@sbindir@/crmadmin -N",
            "EpocheCmd"        : "@sbindir@/crm_node -e",
            "QuorumCmd"        : "@sbindir@/crm_node -q",
            "ParitionCmd"      : "@sbindir@/crm_node -p",

            "Pat:We_stopped"   : "%s.*openais.*crm_exec_exit_fn: Shutdown complete",
            "Pat:They_stopped" : "%s crmd:.*Node %s: .* state=lost .new",
            "Pat:All_stopped"  : "%s.*openais.*crm_exec_exit_fn: Shutdown complete",
            "Pat:They_dead"    : "openais:.*Node %s is now: lost",

            "Pat:ChildKilled"  : "%s openais.*Child process %s terminated with signal 9",
            "Pat:ChildRespawn" : "%s openais.*Respawning failed child process: %s",
            "Pat:ChildExit"    : "Child process .* exited",

            # Bad news Regexes.  Should never occur.
            "BadRegexes" : (
                r"ERROR:",
                r"CRIT:",
                r"Shutting down\.",
                r"Forcing shutdown\.",
                r"Timer I_TERMINATE just popped",
                r"input=I_ERROR",
                r"input=I_FAIL",
                r"input=I_INTEGRATED cause=C_TIMER_POPPED",
                r"input=I_FINALIZED cause=C_TIMER_POPPED",
                r"input=I_ERROR",
                r", exiting\.",
                r"WARN.*Ignoring HA message.*vote.*not in our membership list",
                r"pengine.*Attempting recovery of resource",
                r"is taking more than 2x its timeout",
                r"Confirm not received from",
                r"Welcome reply not received from",
                r"Attempting to schedule .* after a stop",
                r"Resource .* was active at shutdown",
                r"duplicate entries for call_id",
                r"Search terminated:",
                r"No need to invoke the TE",
                r":global_timer_callback",
                r"Faking parameter digest creation",
                r"Parameters to .* action changed:",
                r"Parameters to .* changed",
                r"Child process .* terminated with signal 11",
                r"Executing .* fencing operation",
            ),
        })

    def errorstoignore(self):
        # At some point implement a more elegant solution that
        # also produces a report at the end
        '''Return the list of errors which are known, very noisy and should be ignored'''
        if 1:
            return [
                "crm_mon:",
                "crmadmin:",
                "async_notify: strange, client not found",
                "ERROR: Message hist queue is filling up"
                ]
        return []

    def Components(self):
        complist = []
        common_ignore = [
            "Pending action:",
            "ERROR: crm_log_message_adv:",
            "ERROR: MSG: No message to dump",
            "pending LRM operations at shutdown",
            "Lost connection to the CIB service",
            "Connection to the CIB terminated...",
            "Sending message to CIB service FAILED",
            "apply_xml_diff: Diff application failed!",
            "crmd: .*Action A_RECOVER .* not supported",
            "pingd: .*ERROR: send_update: Could not send update",
            "send_ipc_message: IPC Channel to .* is not connected",
            "unconfirmed_actions: Waiting on .* unconfirmed actions",
            "cib_native_msgready: Message pending on command channel",
            "crmd:.*do_exit: Performing A_EXIT_1 - forcefully exiting the CRMd",
            "verify_stopped: Resource .* was active at shutdown. You may ignore this error if it is unmanaged.",
            "ERROR: stonithd_op_result_ready: not signed on",
            "ERROR: attrd_connection_destroy: Lost connection to attrd",
            "nfo: te_fence_node: Executing .* fencing operation",
            ]

        complist.append(Process("cib", 0, [
                    "State transition S_IDLE",
                    "Respawning .* crmd",
                    "Respawning .* attrd",
                    "Lost connection to the CIB service",
                    "Connection to the CIB terminated...",
                    "Child process crmd exited .* rc=2",
                    "Child process attrd exited .* rc=1",
                    "crmd: .*Input I_TERMINATE from do_recover",
                    "crmd: .*I_ERROR.*crmd_cib_connection_destroy",
                    "crmd:.*do_exit: Could not recover from internal error",
                    ], [], common_ignore, 0, self))

        complist.append(Process("lrmd", 0, [
                    "State transition S_IDLE",
                    "LRM Connection failed",
                    "Respawning .* crmd",
                    "crmd: .*I_ERROR.*lrm_connection_destroy",
                    "Child process crmd exited .* rc=2",
                    "crmd: .*Input I_TERMINATE from do_recover",
                    "crmd:.*do_exit: Could not recover from internal error",
                    ], [], common_ignore, 0, self))

        complist.append(Process("crmd", 0, [
#                    "WARN: determine_online_status: Node .* is unclean",
#                    "Scheduling Node .* for STONITH",
#                    "Executing .* fencing operation",
                    # Only if the node wasn't the DC:
                    "State transition S_IDLE",
                    "State transition .* -> S_IDLE",
                    ], [], common_ignore, 0, self))

        complist.append(Process("attrd", 0, [
                    "crmd: .*ERROR: attrd_connection_destroy: Lost connection to attrd"
                    ], [], common_ignore, 0, self))

        aisexec_ignore = [
            "ERROR: ais_dispatch: Receiving message .* failed",
            "crmd: .*I_ERROR.*crmd_cib_connection_destroy",
            "cib: .*ERROR: cib_ais_destroy: AIS connection terminated",
            "attrd: .*CRIT: attrd_ais_destroy: Lost connection to OpenAIS service!",
            "stonithd: .*ERROR: AIS connection terminated",
            ]
        aisexec_ignore.extend(common_ignore)

        complist.append(Process("aisexec", 0, [
                    "ERROR: ais_dispatch: AIS connection failed",
                    "crmd: .*I_TERMINATE.*do_recover",
                    "crmd: .*ERROR: do_exit: Could not recover from internal error",
                    "pengine: .*Scheduling Node .* for STONITH",
                    "stonithd: .*requests a STONITH operation RESET on node",
                    "stonithd: .*Succeeded to STONITH the node",
                    ], [], aisexec_ignore, 0, self))

        complist.append(Process("pengine", 0, [
                    ], [
                    "State transition S_IDLE",
                    "Respawning .* crmd",
                    "Child process crmd exited .* rc=2",
                    "crmd: .*pe_connection_destroy: Connection to the Policy Engine failed",
                    "crmd: .*I_ERROR.*save_cib_contents",
                    "crmd: .*Input I_TERMINATE from do_recover",
                    "crmd:.*do_exit: Could not recover from internal error",
                    ], common_ignore, 0, self))

        if self.Env["DoFencing"] == 1 :
            stonith_ignore = [
                "ERROR: stonithd_signon: ",
                "update_failcount: Updating failcount for child_DoFencing",
                "ERROR: te_connect_stonith: Sign-in failed: triggered a retry",
                ]
            stonith_ignore.extend(common_ignore)
            complist.append(Process("stonithd", 0, [], [
                        "tengine_stonith_connection_destroy: Fencing daemon connection failed",
                        "Attempting connection to fencing daemon",
                        "te_connect_stonith: Connected",
                        ], stonith_ignore, 0, self))

        return complist

    def NodeUUID(self, node):
        return node

#######################################################################
#
#   A little test code...
#
#   Which you are advised to completely ignore...
#
#######################################################################
if __name__ == '__main__':
    pass
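[Reviewer note -- illustration, not part of the patch]
The rename above (LinuxHAv2 -> crm_lha, with crm_ais deriving from it) leans on a simple pattern: the base manager fills a dictionary of named commands and log patterns, and each stack subclass calls update() to overwrite only the entries that differ. A minimal sketch of that layering, with all class and key names invented for illustration (this is not CTS code):

class BaseCM(dict):
    """Base manager: fills a shared table of named commands/patterns."""
    def __init__(self):
        super().__init__()
        self.update({
            "Name":      "base",
            "StatusCmd": "crmadmin -t 60000 -S %s",  # shared by both stacks
        })

class CrmLha(BaseCM):
    """Heartbeat stack: overrides only what differs from the base."""
    def __init__(self):
        super().__init__()
        self.update({"Name": "crm-lha", "StartCmd": "service heartbeat start"})

class CrmAis(CrmLha):
    """openais stack: same trick, one level further down."""
    def __init__(self):
        super().__init__()
        self.update({"Name": "crm-ais", "StartCmd": "service openais start"})

cm = CrmAis()
print(cm["Name"])       # crm-ais
print(cm["StartCmd"])   # the openais override
print(cm["StatusCmd"])  # inherited unchanged from the base table

Because lookups go through one flat dictionary, tests can stay stack-agnostic: they ask for self["StartCmd"] and get whichever command the concrete manager installed.
[/Reviewer note]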
diff --git a/cts/CM_lha.py.in b/cts/CM_lha.py.in
index 7cecc93a35..7b8d41e128 100755
--- a/cts/CM_lha.py.in
+++ b/cts/CM_lha.py.in
@@ -1,609 +1,609 @@
#!@PYTHON@
'''CTS: Cluster Testing System: LinuxHA v2 dependent modules...
'''
__copyright__='''
Author: Huang Zhen
Copyright (C) 2004 International Business Machines

Additional Audits, Revised Start action, Default Configuration:
Copyright (C) 2004 Andrew Beekhof
'''

#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

import os,sys,CTS,CTSaudits,CTStests, warnings
from CTS import *
from CTSaudits import ClusterAudit
from CTStests import *
from CIB import *

try:
    from xml.dom.minidom import *
except ImportError:
    sys.__stdout__.write("Python module xml.dom.minidom not found\n")
    sys.__stdout__.write("Please install python-xml or similar before continuing\n")
    sys.__stdout__.flush()
    sys.exit(1)

#######################################################################
#
#  LinuxHA v2 dependent modules
#
#######################################################################

-class LinuxHAv2(ClusterManager):
+class crm_lha(ClusterManager):
    '''
    The linux-ha version 2 cluster manager class.
    It implements the things we need to talk to and manipulate
    linux-ha version 2 clusters
    '''
    def __init__(self, Environment, randseed=None):
        ClusterManager.__init__(self, Environment, randseed=randseed)
        #HeartbeatCM.__init__(self, Environment, randseed=randseed)

        self.fastfail = 0
        self.clear_cache = 0
        self.cib_installed = 0
        self.config = None
        self.cluster_monitor = 0
        self.use_short_names = 1

        self.update({
            "Name"            : "crm-lha",
            "DeadTime"        : 300,
            "StartTime"       : 300,        # Max time to start up
            "StableTime"      : 30,
            "StartCmd"        : "@INITDIR@/heartbeat@INIT_EXT@ start > /dev/null 2>&1",
            "StopCmd"         : "@INITDIR@/heartbeat@INIT_EXT@ stop > /dev/null 2>&1",
            "ElectionCmd"     : "@sbindir@/crmadmin -E %s",
            "StatusCmd"       : "@sbindir@/crmadmin -t 60000 -S %s 2>/dev/null",
            "EpocheCmd"       : "@sbindir@/crm_node -H -e",
            "QuorumCmd"       : "@sbindir@/crm_node -H -q",
            "ParitionCmd"     : "@sbindir@/crm_node -H -p",
            "CibQuery"        : "@sbindir@/cibadmin -Ql",
            "ExecuteRscOp"    : "@libdir@/heartbeat/lrmadmin -n %s -E %s %s 0 %d EVERYTIME 2>&1",
            "CIBfile"         : "%s:@HA_VARLIBDIR@/heartbeat/crm/cib.xml",
            "TmpDir"          : "/tmp",
            "BreakCommCmd"    : "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1",
            "FixCommCmd"      : "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1",

            # tc qdisc add dev lo root handle 1: cbq avpkt 1000 bandwidth 1000mbit
            # tc class add dev lo parent 1: classid 1:1 cbq rate "$RATE"kbps allot 17000 prio 5 bounded isolated
            # tc filter add dev lo parent 1: protocol ip prio 16 u32 match ip dst 127.0.0.1 match ip sport $PORT 0xFFFF flowid 1:1
            # tc qdisc add dev lo parent 1: netem delay "$LATENCY"msec "$(($LATENCY/4))"msec 10% 2> /dev/null > /dev/null
            "ReduceCommCmd"   : "",
            "RestoreCommCmd"  : "tc qdisc del dev lo root",

            "LogFileName"     : Environment["LogFileName"],
            "StandbyCmd"      : "@sbindir@/crm_standby -U %s -v %s 2>/dev/null",
            "UUIDQueryCmd"    : "@sbindir@/crmadmin -N",
            "StandbyQueryCmd" : "@sbindir@/crm_standby -GQ -U %s 2>/dev/null",

            # Patterns to look for in the log files for various occasions...
            "Pat:DC_IDLE"      : "crmd.*State transition.*-> S_IDLE",

            # This won't work if we have multiple partitions
            "Pat:Local_started"  : "%s crmd:.*The local CRM is operational",
            "Pat:Slave_started"  : "%s crmd:.*State transition.*-> S_NOT_DC",
            "Pat:Master_started" : "%s crmd:.* State transition.*-> S_IDLE",
            "Pat:We_stopped"     : "heartbeat.*%s.*Heartbeat shutdown complete",
            "Pat:Logd_stopped"   : "%s logd:.*Exiting write process",
            "Pat:They_stopped"   : "%s crmd:.*LOST:.* %s ",
            "Pat:All_stopped"    : "heartbeat.*%s.*Heartbeat shutdown complete",
            "Pat:They_dead"      : "node %s.*: is dead",

            "Pat:TransitionComplete" : "Transition status: Complete: complete",

            "Pat:ChildKilled"  : "%s heartbeat.*%s.*killed by signal 9",
            "Pat:ChildRespawn" : "%s heartbeat.*Respawning client.*%s",
            "Pat:ChildExit"    : "ERROR: Client .* exited with return code",

            # Bad news Regexes.  Should never occur.
            "BadRegexes" : (
                r"ERROR:",
                r"CRIT:",
                r"Shutting down\.",
                r"Forcing shutdown\.",
                r"Timer I_TERMINATE just popped",
                r"input=I_ERROR",
                r"input=I_FAIL",
                r"input=I_INTEGRATED cause=C_TIMER_POPPED",
                r"input=I_FINALIZED cause=C_TIMER_POPPED",
                r"input=I_ERROR",
                r", exiting\.",
                r"WARN.*Ignoring HA message.*vote.*not in our membership list",
                r"pengine.*Attempting recovery of resource",
                r"is taking more than 2x its timeout",
                r"Confirm not received from",
                r"Welcome reply not received from",
                r"Attempting to schedule .* after a stop",
                r"Resource .* was active at shutdown",
                r"duplicate entries for call_id",
                r"Search terminated:",
                r"No need to invoke the TE",
                r"global_timer_callback:",
                r"Faking parameter digest creation",
                r"Parameters to .* action changed:",
                r"Parameters to .* changed",
            ),
        })

        if self.Env["DoBSC"]:
            del self["Pat:They_stopped"]
            del self["Pat:Logd_stopped"]
            self.Env["use_logd"] = 0

        self._finalConditions()

        self.check_transitions = 0
        self.check_elections = 0
        self.CIBsync = {}
        self.CibFactory = ConfigFactory(self)
        self.cib = self.CibFactory.createConfig(self.Env["Schema"])

    def errorstoignore(self):
        # At some point implement a more elegant solution that
        # also produces a report at the end
        '''Return the list of errors which are known, very noisy and should be ignored'''
        if 1:
            return [
                "ERROR: crm_abort: crm_glib_handler: ",
                "ERROR: Message hist queue is filling up",
                "stonithd: .*CRIT: external_hostlist: 'vmware gethosts' returned an empty hostlist",
                "stonithd: .*ERROR: Could not list nodes for stonith RA external/vmware.",
                "pengine: Preventing .* from re-starting",
                ]
        return []
    def install_config(self, node):
        if not self.ns.WaitForNodeToComeUp(node):
            self.log("Node %s is not up." % node)
            return None

        if not self.CIBsync.has_key(node) and self.Env["ClobberCIB"] == 1:
            self.CIBsync[node] = 1
            self.rsh(node, "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml")
            self.rsh(node, "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml.sig")
            self.rsh(node, "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml.last")
            self.rsh(node, "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml.sig.last")

            # Only install the CIB on the first node, all the other ones will pick it up from there
            if self.cib_installed == 1:
                return None

            self.cib_installed = 1
            if self.Env["CIBfilename"] == None:
                self.debug("Installing Generated CIB on node %s" %(node))
                warnings.filterwarnings("ignore")
                cib_file=os.tmpnam()
                warnings.resetwarnings()
                self.rsh("localhost", "rm -f "+cib_file)
                self.debug("Creating new CIB for " + node + " in: " + cib_file)
                self.rsh("localhost", "echo \'" + self.cib.contents() + "\' > " + cib_file)
                if 0!=self.rsh.echo_cp(None, cib_file, node, "@HA_VARLIBDIR@/heartbeat/crm/cib.xml"):
                    #if 0!=self.rsh.cp(cib_file, "root@%s:@HA_VARLIBDIR@/heartbeat/crm/cib.xml" % node):
                    raise ValueError("Can not create CIB on %s "%node)
                self.rsh("localhost", "rm -f "+cib_file)
            else:
                self.debug("Installing CIB (%s) on node %s" %(self.Env["CIBfilename"], node))
                if 0 != self.rsh.cp(self.Env["CIBfilename"], "root@" + (self["CIBfile"]%node)):
                    raise ValueError("Can not scp file to %s "%node)

            self.rsh(node, "chown @HA_CCMUSER@ @HA_VARLIBDIR@/heartbeat/crm/cib.xml")

    def prepare(self):
        '''Finish the Initialization process. Prepare to test...'''
        self.partitions_expected = 1
        for node in self.Env["nodes"]:
            self.ShouldBeStatus[node] = ""
            self.unisolate_node(node)
            self.StataCM(node)

    def test_node_CM(self, node):
        '''Report the status of the cluster manager on a given node'''

        watchpats = [ ]
        watchpats.append("Current ping state: (S_IDLE|S_NOT_DC)")
        watchpats.append(self["Pat:Slave_started"]%node)

        idle_watch = CTS.LogWatcher(self["LogFileName"], watchpats)
        idle_watch.setwatch()

        out = self.rsh(node, self["StatusCmd"]%node, 1)
        self.debug("Node %s status: '%s'" %(node, out))

        if not out or string.find(out, 'ok') < 0:
            if self.ShouldBeStatus[node] == "up":
                self.log(
                    "Node status for %s is %s but we think it should be %s"
                    %(node, "down", self.ShouldBeStatus[node]))
            self.ShouldBeStatus[node]="down"
            return 0

        if self.ShouldBeStatus[node] == "down":
            self.log(
                "Node status for %s is %s but we think it should be %s: %s"
                %(node, "up", self.ShouldBeStatus[node], out))

        self.ShouldBeStatus[node]="up"

        # check the output first - because syslog-ng loses messages
        if string.find(out, 'S_NOT_DC') != -1:
            # Up and stable
            return 2
        if string.find(out, 'S_IDLE') != -1:
            # Up and stable
            return 2

        # fall back to syslog-ng and wait
        if not idle_watch.look():
            # just up
            self.debug("Warn: Node %s is unstable: %s" %(node, out))
            return 1

        # Up and stable
        return 2

    # Is the node up or is the node down
    def StataCM(self, node):
        '''Report the status of the cluster manager on a given node'''

        if self.test_node_CM(node) > 0:
            return 1
        return None

    # Being up and being stable is not the same question...
    def node_stable(self, node):
        '''Report the status of the cluster manager on a given node'''

        if self.test_node_CM(node) == 2:
            return 1
        self.log("Warn: Node %s not stable" %(node))
        return None

    def partition_stable(self, nodes, timeout=None):
        watchpats = [ ]
        watchpats.append("Current ping state: S_IDLE")
        watchpats.append(self["Pat:DC_IDLE"])
        self.debug("Waiting for cluster stability...")

        if timeout == None:
            timeout = self["DeadTime"]

        idle_watch = CTS.LogWatcher(self["LogFileName"], watchpats, timeout)
        idle_watch.setwatch()

        any_up = 0
        for node in self.Env["nodes"]:
            # have each node dump its current state
            if self.ShouldBeStatus[node] == "up":
                self.rsh(node, self["StatusCmd"] %node, 1)
                any_up = 1

        if any_up == 0:
            self.debug("Cluster is inactive")
            return 1

        ret = idle_watch.look()
        while ret:
            self.debug(ret)
            for node in nodes:
                if re.search(node, ret):
                    return 1
            ret = idle_watch.look()

        self.debug("Warn: Partition %s not IDLE after %ds" % (repr(nodes), timeout))
        return None

    def cluster_stable(self, timeout=None):
        partitions = self.find_partitions()

        for partition in partitions:
            if not self.partition_stable(partition, timeout):
                return None

        return 1

    def is_node_dc(self, node, status_line=None):
        rc = 0

        if not status_line:
            status_line = self.rsh(node, self["StatusCmd"]%node, 1)

        if not status_line:
            rc = 0
        elif string.find(status_line, 'S_IDLE') != -1:
            rc = 1
        elif string.find(status_line, 'S_INTEGRATION') != -1:
            rc = 1
        elif string.find(status_line, 'S_FINALIZE_JOIN') != -1:
            rc = 1
        elif string.find(status_line, 'S_POLICY_ENGINE') != -1:
            rc = 1
        elif string.find(status_line, 'S_TRANSITION_ENGINE') != -1:
            rc = 1

        return rc

    def active_resources(self, node):
        # [SM].* {node} matches Started, Slave, Master
        # Stopped won't be matched as it won't include {node}
        (rc, output) = self.rsh(node, """@sbindir@/crm_mon -1 | grep -e "[SM].* %s" """ % node, None)

        resources = []
        for line in output:
            fields = line.split()
            resources.append(fields[0])
        return resources

    def ResourceOp(self, resource, op, node, interval=0, app="lrmadmin"):
        '''
        Execute an operation on a resource
        '''
        cmd = self["ExecuteRscOp"] % (app, resource, op, interval)
        (rc, lines) = self.rsh(node, cmd, None)

        #self.debug("RscOp '%s' on %s: %d" % (cmd, node, rc))
        #for line in lines:
        #    self.debug("RscOp: "+line)

        return rc

    def ResourceLocation(self, rid):
        ResourceNodes = []
        for node in self.Env["nodes"]:
            if self.ShouldBeStatus[node] == "up":
                dummy = 0
                rc = self.ResourceOp(rid, "monitor", node)

                # Strange error codes from remote_py
                # 65024 == not installed
                #  2048 == 8
                #  1792 == 7
                #     0 == 0
                if rc == 65024:
                    dummy = 1
                    #self.debug("%s is not installed on %s: %d" % (rid, node, rc))
                elif rc == 0 or rc == 2048 or rc == 8:
                    ResourceNodes.append(node)
                elif rc == 7 or rc == 1792:
                    dummy = 1
                    #self.debug("%s is not running on %s: %d" % (rid, node, rc))
                else:
                    # not active on this node?
                    self.log("Unknown rc code for %s on %s: %d" % (rid, node, rc))

        return ResourceNodes

    def find_partitions(self):
        ccm_partitions = []

        for node in self.Env["nodes"]:
            if self.ShouldBeStatus[node] == "up":
                partition = self.rsh(node, self["ParitionCmd"], 1)

                if not partition:
                    self.log("no partition details for %s" %node)
                elif len(partition) > 2:
                    partition = partition[:-1]
                    found=0
                    for a_partition in ccm_partitions:
                        if partition == a_partition:
                            found = 1

                    if found == 0:
                        self.debug("Adding partition from %s: %s" %(node, partition))
                        ccm_partitions.append(partition)
                    else:
                        self.debug("Partition '%s' from %s is consistent with existing entries" %(partition, node))
                else:
                    self.log("bad partition details for %s" %node)
            else:
                self.debug("Node %s is down... skipping" %node)

        return ccm_partitions

    def HasQuorum(self, node_list):
        # If we are auditing a partition, then one side will
        #   have quorum and the other not.
        # So the caller needs to tell us which we are checking
        # If no value for node_list is specified... assume all nodes
        if not node_list:
            node_list = self.Env["nodes"]

        for node in node_list:
            if self.ShouldBeStatus[node] == "up":
                quorum = self.rsh(node, self["QuorumCmd"], 1)
                if string.find(quorum, "1") != -1:
                    return 1
                elif string.find(quorum, "0") != -1:
                    return 0
                else:
                    self.log("WARN: Unexpected quorum test result from "+ node +":"+ quorum)

        return 0

    def Components(self):
        complist = []
        common_ignore = [
            "Pending action:",
            "ERROR: crm_log_message_adv:",
            "ERROR: MSG: No message to dump",
            "pending LRM operations at shutdown",
            "Lost connection to the CIB service",
            "Connection to the CIB terminated...",
            "Sending message to CIB service FAILED",
            "crmd: .*Action A_RECOVER .* not supported",
            "ERROR: stonithd_op_result_ready: not signed on",
            "pingd: .*ERROR: send_update: Could not send update",
            "send_ipc_message: IPC Channel to .* is not connected",
            "unconfirmed_actions: Waiting on .* unconfirmed actions",
            "cib_native_msgready: Message pending on command channel",
            "crmd:.*do_exit: Performing A_EXIT_1 - forcefully exiting the CRMd",
            "verify_stopped: Resource .* was active at shutdown. You may ignore this error if it is unmanaged.",
            ]

        stonith_ignore = [
            "ERROR: stonithd_signon: ",
            "update_failcount: Updating failcount for child_DoFencing",
            "ERROR: te_connect_stonith: Sign-in failed: triggered a retry",
            ]
        stonith_ignore.extend(common_ignore)

        ccm = Process("ccm", 0, [
                    "State transition S_IDLE",
                    "CCM connection appears to have failed",
                    "crmd: .*Action A_RECOVER .* not supported",
                    "crmd: .*Input I_TERMINATE from do_recover",
                    "Exiting to recover from CCM connection failure",
                    "crmd:.*do_exit: Could not recover from internal error",
                    "crmd: .*I_ERROR.*(ccm_dispatch|crmd_cib_connection_destroy)",
                    "crmd .*exited with return code 2.",
                    "attrd .*exited with return code 1.",
                    "cib .*exited with return code 2.",
                    "crmd:.*get_channel_token: No reply message - disconnected",
#                    "WARN: determine_online_status: Node .* is unclean",
#                    "Scheduling Node .* for STONITH",
#                    "Executing .* fencing operation",
#                    "tengine_stonith_callback: .*result=0",
                    "A new node joined the cluster",
#                    "Processing I_NODE_JOIN:.* cause=C_HA_MESSAGE",
#                    "State transition S_.* -> S_INTEGRATION.*input=I_NODE_JOIN",
                    "State transition S_STARTING -> S_PENDING",
                    ], [], common_ignore, self.fastfail, self)

        cib = Process("cib", 0, [
                    "State transition S_IDLE",
                    "Lost connection to the CIB service",
                    "Connection to the CIB terminated...",
                    "crmd: .*Input I_TERMINATE from do_recover",
                    "crmd: .*I_ERROR.*crmd_cib_connection_destroy",
                    "crmd:.*do_exit: Could not recover from internal error",
                    "crmd .*exited with return code 2.",
                    "attrd .*exited with return code 1.",
                    ], [], common_ignore, self.fastfail, self)

        lrmd = Process("lrmd", 0, [
                    "State transition S_IDLE",
                    "LRM Connection failed",
                    "crmd: .*I_ERROR.*lrm_connection_destroy",
                    "State transition S_STARTING -> S_PENDING",
                    "crmd: .*Input I_TERMINATE from do_recover",
                    "crmd:.*do_exit: Could not recover from internal error",
                    "crmd .*exited with return code 2.",
                    ], [], common_ignore, self.fastfail, self)

        crmd = Process("crmd", 0, [
#                    "WARN: determine_online_status: Node .* is unclean",
#                    "Scheduling Node .* for STONITH",
#                    "Executing .* fencing operation",
#                    "tengine_stonith_callback: .*result=0",
                    "State transition .* S_IDLE",
                    "State transition S_STARTING -> S_PENDING",
                    ], [
                    ], common_ignore, self.fastfail, self)

        pengine = Process("pengine", 1, [
                    "State transition S_IDLE",
                    "crmd .*exited with return code 2.",
                    "crmd: .*Input I_TERMINATE from do_recover",
                    "crmd: .*do_exit: Could not recover from internal error",
                    "crmd: .*CRIT: pe_connection_destroy: Connection to the Policy Engine failed",
                    "crmd: .*I_ERROR.*save_cib_contents",
                    "crmd .*exited with return code 2.",
                    ], [], common_ignore, self.fastfail, self)

        if self.Env["DoFencing"] == 1 :
            complist.append(Process("stonithd", 0, [], [
                        "crmd: .*CRIT: tengine_stonith_connection_destroy: Fencing daemon connection failed",
                        "Attempting connection to fencing daemon",
                        "te_connect_stonith: Connected",
                        ], stonith_ignore, 0, self))
#        complist.append(Process("heartbeat", 0, [], [], [], None, self))

        if self.fastfail == 0:
            ccm.pats.extend([
                "attrd .* exited with return code 1",
                "ERROR: Respawning client .*attrd",
                "cib .* exited with return code 2",
                "ERROR: Respawning client .*cib",
                "crmd .* exited with return code 2",
                "ERROR: Respawning client .*crmd"
                ])
            cib.pats.extend([
                "attrd .* exited with return code 1",
                "ERROR: Respawning client .*attrd",
                "crmd .* exited with return code 2",
                "ERROR: Respawning client .*crmd"
                ])
            lrmd.pats.extend([
                "crmd .* exited with return code 2",
                "ERROR: Respawning client .*crmd"
                ])
            pengine.pats.extend([
                "ERROR: Respawning client .*crmd"
                ])

        complist.append(ccm)
        complist.append(cib)
        complist.append(lrmd)
        complist.append(crmd)
        complist.append(pengine)

        return complist

    def NodeUUID(self, node):
        lines = self.rsh(node, self["UUIDQueryCmd"], 1)
        for line in lines:
            self.debug("UUIDLine:"+ line)
            m = re.search(r'%s.+\((.+)\)' % node, line)
            if m:
                return m.group(1)
        return ""

    def StandbyStatus(self, node):
        out=self.rsh(node, self["StandbyQueryCmd"]%node, 1)
        if not out:
            return "off"
        out = out[:-1]
        self.debug("Standby result: "+out)
        return out

    # status == "on" : Enter Standby mode
    # status == "off": Enter Active mode
    def SetStandbyMode(self, node, status):
        current_status = self.StandbyStatus(node)
        cmd = self["StandbyCmd"] % (node, status)
        ret = self.rsh(node, cmd)
        return True

#######################################################################
#
#   A little test code...
#
#   Which you are advised to completely ignore...
#
#######################################################################
if __name__ == '__main__':
    pass
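[Reviewer note -- illustration, not part of the patch]
The "strange error codes from remote_py" handled in ResourceLocation() above (65024, 2048, 1792 alongside the plain 0, 7 and 8) look like raw wait()-style status words: the child's exit code sits in the high byte, so shifting right by 8 recovers the familiar OCF-style codes (254, 8, 7). That is a guess at intent, not something the patch states, but it explains why each branch checks both forms. A runnable sketch:

def decode_status(status):
    """Recover an exit code from what may be a wait()-style status word."""
    if status > 255:
        return status >> 8   # high byte; same as os.WEXITSTATUS() on POSIX
    return status            # already a plain exit code

for raw in (0, 1792, 2048, 65024):
    print(raw, "->", decode_status(raw))
# prints: 0 -> 0, 1792 -> 7, 2048 -> 8, 65024 -> 254
[/Reviewer note]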
diff --git a/cts/CTSlab.py.in b/cts/CTSlab.py.in
index 4cc62f414f..f4d5f37673 100755
--- a/cts/CTSlab.py.in
+++ b/cts/CTSlab.py.in
@@ -1,662 +1,668 @@
#!@PYTHON@
'''CTS: Cluster Testing System: Lab environment module
'''
__copyright__='''
Copyright (C) 2001,2005 Alan Robertson
Licensed under the GNU GPL.
'''

#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

from UserDict import UserDict
import sys, time, types, string, syslog, random, os, string, signal, traceback
from CTS import ClusterManager, RemoteExec
from CTStests import BSC_AddResource
from socket import gethostbyname_ex
from CM_ais import crm_ais
-from CM_lha import LinuxHAv2
+from CM_lha import crm_lha

tests = None
cm = None
old_handler = None
DefaultFacility = "daemon"

def sig_handler(signum, frame) :
    if cm != None:
        cm.log("Interrupted by signal %d"%signum)
    if signum == 10 and tests != None :
        tests.summarize()
    if signum == 15 :
        sys.exit(1)

class Logger:
    TimeFormat = "%b %d %H:%M:%S\t"

    def __call__(self, lines):
        raise ValueError("Abstract class member (__call__)")
    def write(self, line):
        return self(line.rstrip())
    def writelines(self, lines):
        for s in lines:
            self.write(s)
        return 1
    def flush(self):
        return 1
    def isatty(self):
        return None

class SysLog(Logger):
    # http://docs.python.org/lib/module-syslog.html
    defaultsource="CTS"
    map = {
        "kernel":   syslog.LOG_KERN,
        "user":     syslog.LOG_USER,
        "mail":     syslog.LOG_MAIL,
        "daemon":   syslog.LOG_DAEMON,
        "auth":     syslog.LOG_AUTH,
        "lpr":      syslog.LOG_LPR,
        "news":     syslog.LOG_NEWS,
        "uucp":     syslog.LOG_UUCP,
        "cron":     syslog.LOG_CRON,
        "local0":   syslog.LOG_LOCAL0,
        "local1":   syslog.LOG_LOCAL1,
        "local2":   syslog.LOG_LOCAL2,
        "local3":   syslog.LOG_LOCAL3,
        "local4":   syslog.LOG_LOCAL4,
        "local5":   syslog.LOG_LOCAL5,
        "local6":   syslog.LOG_LOCAL6,
        "local7":   syslog.LOG_LOCAL7,
    }

    def __init__(self, labinfo):
        if labinfo.has_key("syslogsource"):
            self.source=labinfo["syslogsource"]
        else:
            self.source=SysLog.defaultsource

        self.facility=DefaultFacility

        if labinfo.has_key("SyslogFacility") \
                and labinfo["SyslogFacility"]:
            if SysLog.map.has_key(labinfo["SyslogFacility"]):
                self.facility=labinfo["SyslogFacility"]
            else:
                raise ValueError("%s: bad syslog facility"%labinfo["SyslogFacility"])

        self.facility=SysLog.map[self.facility]
        syslog.openlog(self.source, 0, self.facility)

    def setfacility(self, facility):
        self.facility = facility
        if SysLog.map.has_key(self.facility):
            self.facility=SysLog.map[self.facility]
        syslog.closelog()
        syslog.openlog(self.source, 0, self.facility)

    def __call__(self, lines):
        if isinstance(lines, types.StringType):
            syslog.syslog(lines)
        else:
            for line in lines:
                syslog.syslog(line)

    def name(self):
        return "Syslog"

class StdErrLog(Logger):
    def __init__(self, labinfo):
        pass

    def __call__(self, lines):
        t = time.strftime(Logger.TimeFormat, time.localtime(time.time()))
        if isinstance(lines, types.StringType):
            sys.__stderr__.writelines([t, lines, "\n"])
        else:
            for line in lines:
                sys.__stderr__.writelines([t, line, "\n"])
        sys.__stderr__.flush()

    def name(self):
        return "StdErrLog"

class FileLog(Logger):
    def __init__(self, labinfo, filename=None):
        if filename == None:
            filename=labinfo["LogFileName"]
        self.logfile=filename
        import os
        self.hostname = os.uname()[1]+" "
        self.source = "CTS: "

    def __call__(self, lines):
        fd = open(self.logfile, "a")
        t = time.strftime(Logger.TimeFormat, time.localtime(time.time()))
        if isinstance(lines, types.StringType):
            fd.writelines([t, self.hostname, self.source, lines, "\n"])
        else:
            for line in lines:
                fd.writelines([t, self.hostname, self.source, line, "\n"])
        fd.close()

    def name(self):
        return "FileLog"
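[Reviewer note -- illustration, not part of the patch]
The Logger subclasses above form a duck-typed protocol rather than a real interface: anything with __call__, write(), flush() and name() can sit in Environment["logger"], and CtsLab.log() simply calls each entry in turn. A minimal in-memory logger in the same shape (hypothetical, Python 3 flavoured for brevity, not part of CTS):

class ListLog:
    """Collects log lines in a list; matches the Logger call protocol."""
    def __init__(self):
        self.lines = []
    def __call__(self, lines):
        if isinstance(lines, str):
            self.lines.append(lines)
        else:
            self.lines.extend(lines)       # a sequence of lines
    def write(self, line):
        return self(line.rstrip())         # file-like entry point
    def flush(self):
        return 1
    def name(self):
        return "ListLog"

log = ListLog()
log("hello")
log.write("world\n")
print(log.lines)   # ['hello', 'world']

The write()/flush() pair is what lets traceback.print_exc(50, logmethod), used at the bottom of this file, treat each logger as a file object.
[/Reviewer note]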
class CtsLab(UserDict):
    '''This class defines the Lab Environment for the Cluster Test System.
    It defines those things which are expected to change from test
    environment to test environment for the same cluster manager.

    It is where you define the set of nodes that are in your test lab,
    what kind of reset mechanism you use, etc.

    This class is derived from a UserDict because we hold many
    different parameters of different kinds, and this provides
    a uniform and extensible interface useful for any kind of
    communication between the user/administrator/tester and CTS.

    At this point in time, it is the intent of this class to model static
    configuration and/or environmental data about the environment which
    doesn't change as the tests proceed.

    Well-known names (keys) are an important concept in this class.
    The HasMinimalKeys member function knows the minimal set of
    well-known names for the class.

    The following names are standard (well-known) at this time:

        nodes       An array of the nodes in the cluster
        reset       A ResetMechanism object
        logger      An array of objects that log strings...
        CMclass     The type of ClusterManager we are running
                    (This is a class object, not a class instance)
        RandSeed    Random seed.  It is a triple of bytes. (optional)

    The CTS code ignores names it doesn't know about/need.
    The individual tests have access to this information, and it is
    perfectly acceptable to provide hints, tweaks, fine-tuning
    directions or other information to the tests through this mechanism.
    '''

    def __init__(self):
        self.data = {}
        self.rsh = RemoteExec(self)
        self.RandomGen = random.Random()

        #  Get a random seed for the random number generator.
        self["LogFileName"] = "@HA_VARLOGDIR@/ha-log"
        self["SyslogFacility"] = None
        self["DoStonith"] = 1
        self["DoStandby"] = 1
        self["DoFencing"] = 1
        self["XmitLoss"] = "0.0"
        self["RecvLoss"] = "0.0"
        self["IPBase"] = "127.0.0.10"
        self["ClobberCIB"] = 0
        self["CIBfilename"] = None
        self["CIBResource"] = 0
        self["DoBSC"] = 0
        self["use_logd"] = 0
        self["oprofile"] = []
        self["warn-inactive"] = 0
        self["ListTests"] = 0
        self["CMclass"] = crm_ais
        self["logrestartcmd"] = "@INITDIR@/syslog restart 2>&1 > /dev/null"
        self["Schema"] = "pacemaker-0.6"
        self["Stack"] = "openais"
        self["stonith-type"] = "external/ssh"
        self["stonith-params"] = "hostlist=all"
        self["at-boot"] = 1  # Does the cluster software start automatically when the node boots
        self["logger"] = ([StdErrLog(self)])
        self["experimental-tests"] = 0
        self["loop-tests"] = 0
        self.SeedRandom()

    def SeedRandom(self, seed=None):
        if not seed:
            seed = int(time.time())

        if self.has_key("RandSeed"):
            self.log("New random seed is: " + str(seed))
        else:
            self.log("Random seed is: " + str(seed))

        self["RandSeed"] = seed
        self.RandomGen.seed(str(seed))

    def HasMinimalKeys(self):
        'Return TRUE if our object has the minimal set of keys/values in it'
        result = 1
        for key in self.MinimalKeys:
            if not self.has_key(key):
                result = None
        return result

    def log(self, args):
        "Log using each of the supplied logging methods"
        for logfcn in self._logfunctions:
            logfcn(string.strip(args))

    def debug(self, args):
        "Log using each of the supplied logging methods"
        for logfcn in self._logfunctions:
            if logfcn.name() != "StdErrLog":
                logfcn("debug: %s" % string.strip(args))

    def __setitem__(self, key, value):
        '''Since this function gets called whenever we modify the
        dictionary (object), we can (and do) validate those keys that we
        know how to validate.  For the most part, we know how to validate
        the "MinimalKeys" elements.
        '''

        #
        #        List of nodes in the system
        #
        if key == "nodes":
            self.Nodes = {}
            for node in value:
                # I don't think I need the IP address, etc. but this validates
                # the node name against /etc/hosts and/or DNS, so it's a
                # GoodThing(tm).
                try:
                    self.Nodes[node] = gethostbyname_ex(node)
                except:
                    print node+" not found in DNS... aborting"
                    raise

        #
        #        List of Logging Mechanism(s)
        #
        elif key == "logger":
            if len(value) < 1:
                raise ValueError("Must have at least one logging mechanism")
            for logger in value:
                if not callable(logger):
                    raise ValueError("'logger' elements must be callable")
            self._logfunctions = value

        #
        #        Cluster Manager Class
        #
        elif key == "CMclass":
            if not issubclass(value, ClusterManager):
                raise ValueError("'CMclass' must be a subclass of"
                                 " ClusterManager")

        #
        #        Initial Random seed...
        #
        #elif key == "RandSeed":
        #    if len(value) != 3:
        #        raise ValueError("'Randseed' must be a 3-element list/tuple")
        #    for elem in value:
        #        if not isinstance(elem, types.IntType):
        #            raise ValueError("'Randseed' list must all be ints")

        self.data[key] = value

    def IsValidNode(self, node):
        'Return TRUE if the given node is valid'
        return self.Nodes.has_key(node)

    def __CheckNode(self, node):
        "Raise a ValueError if the given node isn't valid"
        if not self.IsValidNode(node):
            raise ValueError("Invalid node [%s] in CheckNode" % node)

    def RandomNode(self):
        '''Choose a random node from the cluster'''
        return self.RandomGen.choice(self["nodes"])

def usage(arg):
    print "Illegal argument " + arg
    print "usage: " + sys.argv[0] +" [options] number-of-iterations"
    print "\nCommon options: "
    print "\t [(-L | --logfile) system-logfile-name]"
    print "\t [--(syslog-)facility syslog-facility]"
    print "\t [--limit-nodes max-number-of-nodes]"
    print "\t [--test-ip-base ip]"
    print "\t [--lha | --ais | --stack (heartbeat|openais)] "
    print "\t [--schema (pacemaker-0.6|pacemaker-1.0)] "
    print "\t [--populate-resources | -r]"
    print "\t [--list-tests]"
    print "\t [--choose testcase-name]"
    print "\t [--nodes 'list of cluster nodes separated by whitespace']"
    print "\t [--(cluster-starts-)at-boot (1|0)]"
    print "\t "
    print "\nAdditional (less common) options: "
    print "\t [--trunc (truncate logfile before starting)]"
    print "\t [--xmit-loss lost-rate(0.0-1.0)]"
    print "\t [--recv-loss lost-rate(0.0-1.0)]"
    print "\t [--oprofile \"whitespace separated list of nodes to oprofile\"]"
    print "\t [--standby (1 | 0 | yes | no)]"
    print "\t [--fencing (1 | 0 | yes | no)]"
    print "\t [--stonith (1 | 0 | yes | no)]"
    print "\t [--stonith-type type]"
    print "\t [--stonith-args name=value]"
    print "\t [--bsc]"
    print "\t [--expreimental]"
    print "\t [--loops]"
    print "\t [--seed]"
+    print "\t [--set option=value]"
    sys.exit(1)

#
#   A little test code...
#
if __name__ == '__main__':

    from CTSaudits import AuditList
    from CTStests import TestList,RandomTests
    from CTS import Scenario, InitClusterManager, PingFest, PacketLoss, BasicSanityCheck

    Environment = CtsLab()
    NumIter = 500
    Version = 1
    LimitNodes = 0
    TestCase = None
    TruncateLog = 0
    ListTests = 0
    HaveSeed = 0
    StonithType = "external/ssh"
    StonithParams = None
    StonithParams = "hostlist=dynamic".split('=')
    node_list = ''

    #
    # The values of the rest of the parameters are now properly derived from
    # the configuration files.
    #
    # Stonith is configurable because it's slow, I have a few machines which
    # don't reboot very reliably, and it can do mild damage to your machine if
    # you're using a real power switch.
    #
    # Standby is configurable because the test is very heartbeat specific
    # and I haven't written the code to set it properly yet.  Patches are
    # being accepted...

    # Set the signal handler
    signal.signal(15, sig_handler)
    signal.signal(10, sig_handler)

    # Process arguments...
    skipthis=None
    args=sys.argv[1:]
    for i in range(0, len(args)):
        if skipthis:
            skipthis=None
            continue

        elif args[i] == "-l" or args[i] == "--limit-nodes":
            skipthis=1
            LimitNodes = int(args[i+1])

        elif args[i] == "-r" or args[i] == "--populate-resources":
            Environment["CIBResource"] = 1

        elif args[i] == "-L" or args[i] == "--logfile":
            skipthis=1
            Environment["LogFileName"] = args[i+1]

        elif args[i] == "--test-ip-base":
            skipthis=1
            Environment["IPBase"] = args[i+1]

        elif args[i] == "--oprofile":
            skipthis=1
            Environment["oprofile"] = args[i+1].split(' ')

        elif args[i] == "--trunc":
            Environment["TruncateLog"]=1

        elif args[i] == "--list-tests":
            Environment["ListTests"]=1

        elif args[i] == "--bsc":
            Environment["DoBSC"] = 1

        elif args[i] == "--fencing":
            skipthis=1
            if args[i+1] == "1" or args[i+1] == "yes":
                Environment["DoFencing"] = 1
            elif args[i+1] == "0" or args[i+1] == "no":
                Environment["DoFencing"] = 0
            else:
                usage(args[i+1])

        elif args[i] == "--stonith":
            skipthis=1
            if args[i+1] == "1" or args[i+1] == "yes":
                Environment["DoStonith"]=1
            elif args[i+1] == "0" or args[i+1] == "no":
                Environment["DoStonith"]=0
            else:
                usage(args[i+1])

        elif args[i] == "--stonith-type":
            StonithType = args[i+1]
            skipthis=1

        elif args[i] == "--stonith-args":
            StonithParams = args[i+1].split('=')
            skipthis=1

        elif args[i] == "--standby":
            skipthis=1
            if args[i+1] == "1" or args[i+1] == "yes":
                Environment["DoStandby"] = 1
            elif args[i+1] == "0" or args[i+1] == "no":
                Environment["DoStandby"] = 0
            else:
                usage(args[i+1])

        elif args[i] == "--clobber-cib" or args[i] == "-c":
            Environment["ClobberCIB"] = 1

        elif args[i] == "--cib-filename":
            skipthis=1
            Environment["CIBfilename"] = args[i+1]

        elif args[i] == "--xmit-loss":
            try:
                float(args[i+1])
            except ValueError:
                print ("--xmit-loss parameter should be float")
                usage(args[i+1])
            skipthis=1
            Environment["XmitLoss"] = args[i+1]

        elif args[i] == "--recv-loss":
            try:
                float(args[i+1])
            except ValueError:
                print ("--recv-loss parameter should be float")
                usage(args[i+1])
            skipthis=1
            Environment["RecvLoss"] = args[i+1]

        elif args[i] == "--choose":
            skipthis=1
            TestCase = args[i+1]

        elif args[i] == "--nodes":
            skipthis=1
            node_list = args[i+1].split(' ')

        elif args[i] == "--syslog-facility" or args[i] == "--facility":
            skipthis=1
            Environment["SyslogFacility"] = args[i+1]

        elif args[i] == "--seed":
            skipthis=1
            Environment.SeedRandom(args[i+1])

        elif args[i] == "--warn-inactive":
            Environment["warn-inactive"] = 1

        elif args[i] == "--schema":
            skipthis=1
            Environment["Schema"] = args[i+1]

        elif args[i] == "--ais":
            Environment["Stack"] = "openais"

        elif args[i] == "--at-boot" or args[i] == "--cluster-starts-at-boot":
            skipthis=1
            if args[i+1] == "1" or args[i+1] == "yes":
                Environment["at-boot"] = 1
            elif args[i+1] == "0" or args[i+1] == "no":
                Environment["at-boot"] = 0
            else:
                usage(args[i+1])

        elif args[i] == "--heartbeat" or args[i] == "--lha":
            Environment["Stack"] = "heartbeat"

        elif args[i] == "--stack":
            skipthis=1
            Environment["Stack"] = args[i+1]

        elif args[i] == "--expreimental":
            Environment["experimental-tests"] = 1

        elif args[i] == "--loops":
            Environment["loop-tests"] = 1

+        elif args[i] == "--set":
+            skipthis=1
+            (name, value) = args[i+1].split('=')
+            Environment[name] = value
+
        else:
            try:
                NumIter=int(args[i])
            except ValueError:
                usage(args[i])

    if Environment["DoBSC"]:
        NumIter = 2
        LimitNodes = 1
        Environment["ClobberCIB"] = 1
        Environment["CIBResource"] = 0
        Environment["logger"].append(FileLog(Environment))
    else:
        Environment["logger"].append(SysLog(Environment))

    if LimitNodes > 0:
        if len(node_list) > LimitNodes:
            print("Limiting the number of nodes configured=%d (max=%d)"
                  %(len(node_list), LimitNodes))
            while len(node_list) > LimitNodes:
                node_list.pop(len(node_list)-1)

    if len(node_list) < 1:
        print "No nodes specified!"
        sys.exit(1)

    Environment["nodes"] = node_list

    if Environment["Stack"] == "heartbeat":
-        Environment['CMclass'] = LinuxHAv2
+        Environment['CMclass'] = crm_lha
    elif Environment["Stack"] == "openais":
        Environment['CMclass'] = crm_ais
        Environment["use_logd"] = 0

    # Scenario selection
    if Environment["DoBSC"]:
        scenario = Scenario([ BasicSanityCheck(Environment) ])
    else:
        scenario = Scenario(
            [ InitClusterManager(Environment), PacketLoss(Environment)])

    # Your basic start up the world type of test scenario...
    #scenario = Scenario(
    #[ InitClusterManager(Environment)
    #, PingFest(Environment)])

    # Create the Cluster Manager object
    cm = Environment['CMclass'](Environment)
    if TruncateLog:
        cm.log("Truncating %s" % Environment["LogFileName"])
        lf = open(Environment["LogFileName"], "w");
        if lf != None:
            lf.truncate(0)
            lf.close()

    keys = []
    for key in Environment.keys():
        keys.append(key)
    keys.sort()
    for key in keys:
        cm.debug("Environment["+key+"]:\t"+str(Environment[key]))

    cm.log(">>>>>>>>>>>>>>>> BEGINNING " + repr(NumIter) + " TESTS ")
    cm.log("System log files: " + Environment["LogFileName"])
    cm.log("Schema: %s" % Environment["Schema"])
    cm.log("Stack: %s" % Environment["Stack"])
    cm.log("Enable Stonith: %d" % Environment["DoStonith"])
    cm.log("Enable Fencing: %d" % Environment["DoFencing"])
    cm.log("Enable Standby: %d" % Environment["DoStandby"])
    cm.log("Enable Resources: %d" % Environment["CIBResource"])
    cm.ns.WaitForAllNodesToComeUp(Environment["nodes"])
    cm.log("Cluster nodes: ")
    for node in Environment["nodes"]:
        cm.log("    * %s" % (node))

    Audits = AuditList(cm)
    Tests = []

    if Environment["DoBSC"]:
        test = BSC_AddResource(cm)
        Tests.append(test)
    elif TestCase != None:
        for test in TestList(cm, Audits):
            if test.name == TestCase:
                Tests.append(test)
        if Tests == []:
            usage("--choose: No applicable/valid tests chosen")
    else:
        Tests = TestList(cm, Audits)

    if Environment["ListTests"] == 1 :
        cm.log("Total %d tests"%len(Tests))
        for test in Tests :
            cm.log(str(test.name));
        sys.exit(0)

    tests = RandomTests(scenario, cm, Tests, Audits)
    Environment.RandomTests = tests
    try :
        overall, detailed = tests.run(NumIter)
    except :
        cm.Env.log("Exception by %s" % sys.exc_info()[0])
        for logmethod in Environment["logger"]:
            traceback.print_exc(50, logmethod)

    tests.summarize()
    if tests.Stats["failure"] > 0:
        sys.exit(tests.Stats["failure"])
    elif tests.Stats["success"] != NumIter:
        cm.Env.log("No failure count but success != requested iterations")
        sys.exit(1)
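[Reviewer note -- illustration, not part of the patch]
The only functional addition in CTSlab.py.in is the new --set flag, which writes an arbitrary option=value pair straight into the Environment dictionary. Two behaviours worth knowing: values always arrive as strings (so a numeric option set this way no longer matches its int default), and the patch's split('=') has no maxsplit, so a value that itself contains '=' (e.g. --set stonith-params=hostlist=all) raises ValueError when unpacked into two names. A standalone sketch of the flag's semantics, with a maxsplit of 1 to show the more forgiving parse:

def apply_set_args(env, argv):
    """Apply every --set option=value pair in argv to the env dict."""
    it = iter(argv)
    for arg in it:
        if arg == "--set":
            name, value = next(it).split("=", 1)  # patch uses split('='), no maxsplit
            env[name] = value                     # note: always stored as a string

env = {"Schema": "pacemaker-0.6", "DoFencing": 1}
apply_set_args(env, ["--set", "Schema=pacemaker-1.0", "--set", "DoFencing=0"])
print(env["Schema"])     # pacemaker-1.0
print(env["DoFencing"])  # the *string* "0", which is truthy, unlike the int default

Callers that read such options numerically may need to convert, since e.g. 'if Environment["DoFencing"]' treats the string "0" as true.
[/Reviewer note]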