diff --git a/cts/CM_LinuxHAv2.py.in b/cts/CM_LinuxHAv2.py.in index 8f01a43904..ca9820f293 100755 --- a/cts/CM_LinuxHAv2.py.in +++ b/cts/CM_LinuxHAv2.py.in @@ -1,610 +1,610 @@ #!@PYTHON@ '''CTS: Cluster Testing System: LinuxHA v2 dependent modules... ''' __copyright__=''' Author: Huang Zhen Copyright (C) 2004 International Business Machines Additional Audits, Revised Start action, Default Configuration: Copyright (C) 2004 Andrew Beekhof ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. import os,sys,CTS,CTSaudits,CTStests, warnings from CTS import * from CM_hb import HeartbeatCM from CTSaudits import ClusterAudit from CTStests import * from CIB import * try: from xml.dom.minidom import * except ImportError: sys.__stdout__.write("Python module xml.dom.minidom not found\n") sys.__stdout__.write("Please install python-xml or similar before continuing\n") sys.__stdout__.flush() sys.exit(1) ####################################################################### # # LinuxHA v2 dependent modules # ####################################################################### class LinuxHAv2(ClusterManager): ''' The linux-ha version 2 cluster manager class. 
It implements the things we need to talk to and manipulate linux-ha version 2 clusters ''' def __init__(self, Environment, randseed=None): ClusterManager.__init__(self, Environment, randseed=randseed) #HeartbeatCM.__init__(self, Environment, randseed=randseed) self.fastfail = 0 self.clear_cache = 0 self.cib_installed = 0 self.config = None self.cluster_monitor = 0 self.use_short_names = 1 self.update({ "Name" : "crm-lha", "DeadTime" : 300, "StartTime" : 300, # Max time to start up "StableTime" : 30, "StartCmd" : "@INITDIR@/heartbeat@INIT_EXT@ start > /dev/null 2>&1", "StopCmd" : "@INITDIR@/heartbeat@INIT_EXT@ stop > /dev/null 2>&1", "ElectionCmd" : "@sbindir@/crmadmin -E %s", "StatusCmd" : "@sbindir@/crmadmin -t 60000 -S %s 2>/dev/null", - "EpocheCmd" : "@sbindir@/ccm_tool -H -e", - "QuorumCmd" : "@sbindir@/ccm_tool -H -q", - "ParitionCmd" : "@sbindir@/ccm_tool -H -p", + "EpocheCmd" : "@sbindir@/crm_node -H -e", + "QuorumCmd" : "@sbindir@/crm_node -H -q", + "ParitionCmd" : "@sbindir@/crm_node -H -p", "CibQuery" : "@sbindir@/cibadmin -Ql", "ExecuteRscOp" : "@libdir@/heartbeat/lrmadmin -n %s -E %s %s 0 %d EVERYTIME 2>&1", "CIBfile" : "%s:@HA_VARLIBDIR@/heartbeat/crm/cib.xml", "TmpDir" : "/tmp", "BreakCommCmd" : "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1", "FixCommCmd" : "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1", # tc qdisc add dev lo root handle 1: cbq avpkt 1000 bandwidth 1000mbit # tc class add dev lo parent 1: classid 1:1 cbq rate "$RATE"kbps allot 17000 prio 5 bounded isolated # tc filter add dev lo parent 1: protocol ip prio 16 u32 match ip dst 127.0.0.1 match ip sport $PORT 0xFFFF flowid 1:1 # tc qdisc add dev lo parent 1: netem delay "$LATENCY"msec "$(($LATENCY/4))"msec 10% 2> /dev/null > /dev/null "ReduceCommCmd" : "", "RestoreCommCmd" : "tc qdisc del dev lo root", "LogFileName" : Environment["LogFileName"], "StandbyCmd" : "@sbindir@/crm_standby -U %s -v %s 2>/dev/null", "UUIDQueryCmd" : "@sbindir@/crmadmin -N", "StandbyQueryCmd" : "@sbindir@/crm_standby -GQ -U %s 2>/dev/null", # Patterns to look for in the log files for various occasions... "Pat:DC_IDLE" : "crmd.*State transition.*-> S_IDLE", # This wont work if we have multiple partitions "Pat:Local_started" : "%s crmd:.*The local CRM is operational", "Pat:Slave_started" : "%s crmd:.*State transition.*-> S_NOT_DC", "Pat:Master_started" : "%s crmd:.* State transition.*-> S_IDLE", "Pat:We_stopped" : "heartbeat.*%s.*Heartbeat shutdown complete", "Pat:Logd_stopped" : "%s logd:.*Exiting write process", "Pat:They_stopped" : "%s crmd:.*LOST:.* %s ", "Pat:All_stopped" : "heartbeat.*%s.*Heartbeat shutdown complete", "Pat:They_dead" : "node %s.*: is dead", "Pat:TransitionComplete" : "Transition status: Complete: complete", "Pat:ChildKilled" : "%s heartbeat.*%s.*killed by signal 9", "Pat:ChildRespawn" : "%s heartbeat.*Respawning client.*%s", "Pat:ChildExit" : "ERROR: Client .* exited with return code", # Bad news Regexes. Should never occur. 
"BadRegexes" : ( r"ERROR:", r"CRIT:", r"Shutting down\.", r"Forcing shutdown\.", r"Timer I_TERMINATE just popped", r"input=I_ERROR", r"input=I_FAIL", r"input=I_INTEGRATED cause=C_TIMER_POPPED", r"input=I_FINALIZED cause=C_TIMER_POPPED", r"input=I_ERROR", r", exiting\.", r"WARN.*Ignoring HA message.*vote.*not in our membership list", r"pengine.*Attempting recovery of resource", r"is taking more than 2x its timeout", r"Confirm not received from", r"Welcome reply not received from", r"Attempting to schedule .* after a stop", r"Resource .* was active at shutdown", r"duplicate entries for call_id", r"Search terminated:", r"No need to invoke the TE", r"global_timer_callback:", r"Faking parameter digest creation", r"Parameters to .* action changed:", r"Parameters to .* changed", ), }) if self.Env["DoBSC"]: del self["Pat:They_stopped"] del self["Pat:Logd_stopped"] self.Env["use_logd"] = 0 self._finalConditions() self.check_transitions = 0 self.check_elections = 0 self.CIBsync = {} self.CibFactory = ConfigFactory(self) self.cib = self.CibFactory.createConfig(self.Env["Schema"]) def errorstoignore(self): # At some point implement a more elegant solution that # also produces a report at the end '''Return list of errors which are known and very noisey should be ignored''' if 1: return [ "ERROR: crm_abort: crm_glib_handler: ", "ERROR: Message hist queue is filling up", "stonithd: .*CRIT: external_hostlist: 'vmware gethosts' returned an empty hostlist", "stonithd: .*ERROR: Could not list nodes for stonith RA external/vmware.", "pengine: Preventing .* from re-starting", ] return [] def install_config(self, node): if not self.ns.WaitForNodeToComeUp(node): self.log("Node %s is not up." % node) return None if not self.CIBsync.has_key(node) and self.Env["ClobberCIB"] == 1: self.CIBsync[node] = 1 self.rsh.remote_py(node, "os", "system", "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml") self.rsh.remote_py(node, "os", "system", "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml.sig") self.rsh.remote_py(node, "os", "system", "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml.last") self.rsh.remote_py(node, "os", "system", "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml.sig.last") # Only install the CIB on the first node, all the other ones will pick it up from there if self.cib_installed == 1: return None self.cib_installed = 1 if self.Env["CIBfilename"] == None: self.debug("Installing Generated CIB on node %s" %(node)) warnings.filterwarnings("ignore") cib_file=os.tmpnam() warnings.resetwarnings() os.system("rm -f "+cib_file) self.debug("Creating new CIB for " + node + " in: " + cib_file) os.system("echo \'" + self.cib.contents() + "\' > " + cib_file) if 0!=self.rsh.echo_cp(None, cib_file, node, "@HA_VARLIBDIR@/heartbeat/crm/cib.xml"): raise ValueError("Can not create CIB on %s "%node) os.system("rm -f "+cib_file) else: self.debug("Installing CIB (%s) on node %s" %(self.Env["CIBfilename"], node)) if 0!=self.rsh.cp(self.Env["CIBfilename"], "root@" + (self["CIBfile"]%node)): raise ValueError("Can not scp file to %s "%node) self.rsh.remote_py(node, "os", "system", "chown @HA_CCMUSER@ @HA_VARLIBDIR@/heartbeat/crm/cib.xml") def prepare(self): '''Finish the Initialization process. 
Prepare to test...''' self.partitions_expected = 1 for node in self.Env["nodes"]: self.ShouldBeStatus[node] = "" self.unisolate_node(node) self.StataCM(node) def test_node_CM(self, node): '''Report the status of the cluster manager on a given node''' watchpats = [ ] watchpats.append("Current ping state: (S_IDLE|S_NOT_DC)") watchpats.append(self["Pat:Slave_started"]%node) idle_watch = CTS.LogWatcher(self["LogFileName"], watchpats) idle_watch.setwatch() out=self.rsh.readaline(node, self["StatusCmd"]%node) self.debug("Node %s status: '%s'" %(node, out)) if not out or string.find(out, 'ok') < 0: if self.ShouldBeStatus[node] == "up": self.log( "Node status for %s is %s but we think it should be %s" %(node, "down", self.ShouldBeStatus[node])) self.ShouldBeStatus[node]="down" return 0 if self.ShouldBeStatus[node] == "down": self.log( "Node status for %s is %s but we think it should be %s: %s" %(node, "up", self.ShouldBeStatus[node], out)) self.ShouldBeStatus[node]="up" # check the output first - because syslog-ng looses messages if string.find(out, 'S_NOT_DC') != -1: # Up and stable return 2 if string.find(out, 'S_IDLE') != -1: # Up and stable return 2 # fall back to syslog-ng and wait if not idle_watch.look(): # just up self.debug("Warn: Node %s is unstable: %s" %(node, out)) return 1 # Up and stable return 2 # Is the node up or is the node down def StataCM(self, node): '''Report the status of the cluster manager on a given node''' if self.test_node_CM(node) > 0: return 1 return None # Being up and being stable is not the same question... def node_stable(self, node): '''Report the status of the cluster manager on a given node''' if self.test_node_CM(node) == 2: return 1 self.log("Warn: Node %s not stable" %(node)) return None def partition_stable(self, nodes, timeout=None): watchpats = [ ] watchpats.append("Current ping state: S_IDLE") watchpats.append(self["Pat:DC_IDLE"]) self.debug("Waiting for cluster stability...") if timeout == None: timeout = self["DeadTime"] idle_watch = CTS.LogWatcher(self["LogFileName"], watchpats, timeout) idle_watch.setwatch() any_up = 0 for node in self.Env["nodes"]: # have each node dump its current state if self.ShouldBeStatus[node] == "up": self.rsh.readaline(node, (self["StatusCmd"] %node) ) any_up = 1 if any_up == 0: self.debug("Cluster is inactive") return 1 ret = idle_watch.look() while ret: self.debug(ret) for node in nodes: if re.search(node, ret): return 1 ret = idle_watch.look() self.debug("Warn: Partition %s not IDLE after %ds" % (repr(nodes), timeout)) return None def cluster_stable(self, timeout=None): partitions = self.find_partitions() for partition in partitions: if not self.partition_stable(partition, timeout): return None return 1 def is_node_dc(self, node, status_line=None): rc = 0 if not status_line: status_line = self.rsh.readaline(node, self["StatusCmd"]%node) if not status_line: rc = 0 elif string.find(status_line, 'S_IDLE') != -1: rc = 1 elif string.find(status_line, 'S_INTEGRATION') != -1: rc = 1 elif string.find(status_line, 'S_FINALIZE_JOIN') != -1: rc = 1 elif string.find(status_line, 'S_POLICY_ENGINE') != -1: rc = 1 elif string.find(status_line, 'S_TRANSITION_ENGINE') != -1: rc = 1 return rc def active_resources(self, node): # [SM].* {node} matches Started, Slave, Master # Stopped wont be matched as it wont include {node} (rc, output) = self.rsh.remote_py( node, "os", "system", """@sbindir@/crm_mon -1 | grep -e "[SM].* %s" """ % node) resources = [] for line in output: fields = line.split() resources.append(fields[0]) return resources 
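The return codes that ResourceOp/ResourceLocation below compare against (2048, 1792, 65024) look strange only because os.system() on Unix returns a raw 16-bit wait status, with the child's exit code in the high byte. A minimal standalone sketch of the decoding (not part of the patch; the OCF meanings in the trailing comment follow the usual convention, and treating 254 as "agent not installed" mirrors the assumption the code itself makes):

    import os
    # Wait statuses as seen via remote_py / os.system on Unix:
    for status in (0, 2048, 1792, 65024):
        # os.WEXITSTATUS() recovers the real exit code: (status >> 8) & 0xff
        print("%5d -> %3d" % (status, os.WEXITSTATUS(status)))
    # 0 -> 0 (resource active), 2048 -> 8 (running master),
    # 1792 -> 7 (not running), 65024 -> 254 (agent not installed)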
def ResourceOp(self, resource, op, node, interval=0, app="lrmadmin"): ''' Execute an operation on a resource ''' cmd = self["ExecuteRscOp"] % (app, resource, op, interval) (rc, lines) = self.rsh.remote_py(node, "os", "system", cmd) #self.debug("RscOp '%s' on %s: %d" % (cmd, node, rc)) #for line in lines: # self.debug("RscOp: "+line) return rc def ResourceLocation(self, rid): ResourceNodes = [] for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == "up": dummy = 0 rc = self.ResourceOp(rid, "monitor", node) # Strange error codes from remote_py # 65024 == not installed # 2048 == 8 # 1792 == 7 # 0 == 0 if rc == 65024: dummy = 1 #self.debug("%s is not installed on %s: %d" % (rid, node, rc)) elif rc == 0 or rc == 2048 or rc == 8: ResourceNodes.append(node) elif rc == 7 or rc == 1792: dummy = 1 #self.debug("%s is not running on %s: %d" % (rid, node, rc)) else: # not active on this node? self.log("Unknown rc code for %s on %s: %d" % (rid, node, rc)) return ResourceNodes def find_partitions(self): ccm_partitions = [] for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == "up": partition = self.rsh.readaline(node, self["ParitionCmd"]) if not partition: self.log("no partition details for %s" %node) elif len(partition) > 2: partition = partition[:-1] found=0 for a_partition in ccm_partitions: if partition == a_partition: found = 1 if found == 0: self.debug("Adding partition from %s: %s" %(node, partition)) ccm_partitions.append(partition) else: self.debug("Partition '%s' from %s is consistent with existing entries" %(partition, node)) else: self.log("bad partition details for %s" %node) else: self.debug("Node %s is down... skipping" %node) return ccm_partitions def HasQuorum(self, node_list): # If we are auditing a partition, then one side will # have quorum and the other not. # So the caller needs to tell us which we are checking # If no value for node_list is specified... assume all nodes if not node_list: node_list = self.Env["nodes"] for node in node_list: if self.ShouldBeStatus[node] == "up": quorum = self.rsh.readaline(node, self["QuorumCmd"]) if string.find(quorum, "1") != -1: return 1 elif string.find(quorum, "0") != -1: return 0 else: self.log("WARN: Unexpected quorum test result from "+ node +":"+ quorum) return 0 def Components(self): complist = [] common_ignore = [ "Pending action:", "ERROR: crm_log_message_adv:", "ERROR: MSG: No message to dump", "pending LRM operations at shutdown", "Lost connection to the CIB service", "Connection to the CIB terminated...", "Sending message to CIB service FAILED", "crmd: .*Action A_RECOVER .* not supported", "ERROR: stonithd_op_result_ready: not signed on", "pingd: .*ERROR: send_update: Could not send update", "send_ipc_message: IPC Channel to .* is not connected", "unconfirmed_actions: Waiting on .* unconfirmed actions", "cib_native_msgready: Message pending on command channel", "crmd:.*do_exit: Performing A_EXIT_1 - forcefully exiting the CRMd", "verify_stopped: Resource .* was active at shutdown. 
You may ignore this error if it is unmanaged.", ] stonith_ignore = [ "ERROR: stonithd_signon: ", "update_failcount: Updating failcount for child_DoFencing", "ERROR: te_connect_stonith: Sign-in failed: triggered a retry", ] stonith_ignore.extend(common_ignore) ccm = Process("ccm", 0, [ "State transition S_IDLE", "CCM connection appears to have failed", "crmd: .*Action A_RECOVER .* not supported", "crmd: .*Input I_TERMINATE from do_recover", "Exiting to recover from CCM connection failure", "crmd:.*do_exit: Could not recover from internal error", "crmd: .*I_ERROR.*(ccm_dispatch|crmd_cib_connection_destroy)", "crmd .*exited with return code 2.", "attrd .*exited with return code 1.", "cib .*exited with return code 2.", "crmd:.*get_channel_token: No reply message - disconnected", # "WARN: determine_online_status: Node .* is unclean", # "Scheduling Node .* for STONITH", # "Executing .* fencing operation", # "tengine_stonith_callback: .*result=0", "A new node joined the cluster", # "Processing I_NODE_JOIN:.* cause=C_HA_MESSAGE", # "State transition S_.* -> S_INTEGRATION.*input=I_NODE_JOIN", "State transition S_STARTING -> S_PENDING", ], [], common_ignore, self.fastfail, self) cib = Process("cib", 0, [ "State transition S_IDLE", "Lost connection to the CIB service", "Connection to the CIB terminated...", "crmd: .*Input I_TERMINATE from do_recover", "crmd: .*I_ERROR.*crmd_cib_connection_destroy", "crmd:.*do_exit: Could not recover from internal error", "crmd .*exited with return code 2.", "attrd .*exited with return code 1.", ], [], common_ignore, self.fastfail, self) lrmd = Process("lrmd", 0, [ "State transition S_IDLE", "LRM Connection failed", "crmd: .*I_ERROR.*lrm_connection_destroy", "State transition S_STARTING -> S_PENDING", "crmd: .*Input I_TERMINATE from do_recover", "crmd:.*do_exit: Could not recover from internal error", "crmd .*exited with return code 2.", ], [], common_ignore, self.fastfail, self) crmd = Process("crmd", 0, [ # "WARN: determine_online_status: Node .* is unclean", # "Scheduling Node .* for STONITH", # "Executing .* fencing operation", # "tengine_stonith_callback: .*result=0", "State transition .* S_IDLE", "State transition S_STARTING -> S_PENDING", ], [ ], common_ignore, self.fastfail, self) pengine = Process("pengine", 1, [ "State transition S_IDLE", "crmd .*exited with return code 2.", "crmd: .*Input I_TERMINATE from do_recover", "crmd: .*do_exit: Could not recover from internal error", "crmd: .*CRIT: pe_connection_destroy: Connection to the Policy Engine failed", "crmd: .*I_ERROR.*save_cib_contents", "crmd .*exited with return code 2.", ], [], common_ignore, self.fastfail, self) if self.Env["DoFencing"] == 1 : complist.append(Process("stonithd", 0, [], [ "crmd: .*CRIT: tengine_stonith_connection_destroy: Fencing daemon connection failed", "Attempting connection to fencing daemon", "te_connect_stonith: Connected", ], stonith_ignore, 0, self)) # complist.append(Process("heartbeat", 0, [], [], [], None, self)) if self.fastfail == 0: ccm.pats.extend([ "attrd .* exited with return code 1", "ERROR: Respawning client .*attrd", "cib .* exited with return code 2", "ERROR: Respawning client .*cib", "crmd .* exited with return code 2", "ERROR: Respawning client .*crmd" ]) cib.pats.extend([ "attrd .* exited with return code 1", "ERROR: Respawning client .*attrd", "crmd .* exited with return code 2", "ERROR: Respawning client .*crmd" ]) lrmd.pats.extend([ "crmd .* exited with return code 2", "ERROR: Respawning client .*crmd" ]) pengine.pats.extend([ "ERROR: Respawning client 
.*crmd" ]) complist.append(ccm) complist.append(cib) complist.append(lrmd) complist.append(crmd) complist.append(pengine) return complist def NodeUUID(self, node): lines = self.rsh.readlines(node, self["UUIDQueryCmd"]) for line in lines: self.debug("UUIDLine:"+ line) m = re.search(r'%s.+\((.+)\)' % node, line) if m: return m.group(1) return "" def StandbyStatus(self, node): out=self.rsh.readaline(node, self["StandbyQueryCmd"]%node) if not out: return "off" out = out[:-1] self.debug("Standby result: "+out) return out # status == "on" : Enter Standby mode # status == "off": Enter Active mode def SetStandbyMode(self, node, status): current_status = self.StandbyStatus(node) cmd = self["StandbyCmd"] % (node, status) ret = self.rsh(node, cmd) return True ####################################################################### # # A little test code... # # Which you are advised to completely ignore... # ####################################################################### if __name__ == '__main__': pass diff --git a/cts/CM_ais.py.in b/cts/CM_ais.py.in index f177eb9767..6fed03978f 100644 --- a/cts/CM_ais.py.in +++ b/cts/CM_ais.py.in @@ -1,276 +1,276 @@ #!@PYTHON@ '''CTS: Cluster Testing System: AIS dependent modules... ''' __copyright__=''' Copyright (C) 2007 Andrew Beekhof ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. import os,sys,CTS,CTSaudits,CTStests, warnings from CTS import * from CM_hb import HeartbeatCM from CM_LinuxHAv2 import LinuxHAv2 from CTSaudits import ClusterAudit from CTStests import * from CIB import * try: from xml.dom.minidom import * except ImportError: sys.__stdout__.write("Python module xml.dom.minidom not found\n") sys.__stdout__.write("Please install python-xml or similar before continuing\n") sys.__stdout__.flush() sys.exit(1) ####################################################################### # # LinuxHA v2 dependent modules # ####################################################################### class crm_ais(LinuxHAv2): ''' The crm version 3 cluster manager class. 
It implements the things we need to talk to and manipulate crm clusters running on top of openais ''' def __init__(self, Environment, randseed=None): LinuxHAv2.__init__(self, Environment, randseed=randseed) self.update({ "Name" : "crm-ais", "StartCmd" : "@INITDIR@/openais start > /dev/null 2>&1", "StopCmd" : "@INITDIR@/openais stop > /dev/null 2>&1", "UUIDQueryCmd" : "@sbindir@/crmadmin -N", - "EpocheCmd" : "@sbindir@/ccm_tool -e", - "QuorumCmd" : "@sbindir@/ccm_tool -q", - "ParitionCmd" : "@sbindir@/ccm_tool -p", + "EpocheCmd" : "@sbindir@/crm_node -e", + "QuorumCmd" : "@sbindir@/crm_node -q", + "ParitionCmd" : "@sbindir@/crm_node -p", "Pat:We_stopped" : "%s.*openais.*crm_exec_exit_fn: Shutdown complete", "Pat:They_stopped" : "%s crmd:.*Node %s: .* state=lost .new", "Pat:All_stopped" : "%s.*openais.*crm_exec_exit_fn: Shutdown complete", "Pat:They_dead" : "openais:.*Node %s is now: lost", "Pat:ChildKilled" : "%s openais.*Child process %s terminated with signal 9", "Pat:ChildRespawn" : "%s openais.*Respawning failed child process: %s", "Pat:ChildExit" : "Child process .* exited", # Bad news Regexes. Should never occur. "BadRegexes" : ( r"ERROR:", r"CRIT:", r"Shutting down\.", r"Forcing shutdown\.", r"Timer I_TERMINATE just popped", r"input=I_ERROR", r"input=I_FAIL", r"input=I_INTEGRATED cause=C_TIMER_POPPED", r"input=I_FINALIZED cause=C_TIMER_POPPED", r"input=I_ERROR", r", exiting\.", r"WARN.*Ignoring HA message.*vote.*not in our membership list", r"pengine.*Attempting recovery of resource", r"is taking more than 2x its timeout", r"Confirm not received from", r"Welcome reply not received from", r"Attempting to schedule .* after a stop", r"Resource .* was active at shutdown", r"duplicate entries for call_id", r"Search terminated:", r"No need to invoke the TE", r":global_timer_callback", r"Faking parameter digest creation", r"Parameters to .* action changed:", r"Parameters to .* changed", r"Child process .* terminated with signal 11", ), }) def errorstoignore(self): # At some point implement a more elegant solution that # also produces a report at the end '''Return list of errors which are known and very noisey should be ignored''' if 1: return [ "crmadmin:", "async_notify: strange, client not found", "ERROR: Message hist queue is filling up" ] return [] def find_partitions(self): ccm_partitions = [] for node in self.Env["nodes"]: self.debug("Retrieving partition details for %s" %node) if self.ShouldBeStatus[node] == "up": partition = self.rsh.readaline(node, self["ParitionCmd"]) if not partition: self.log("no partition details for %s" %node) elif len(partition) > 2: partition = partition[:-1] found=0 for a_partition in ccm_partitions: if partition == a_partition: found = 1 if found == 0: self.debug("Adding partition from %s: %s" %(node, partition)) ccm_partitions.append(partition) else: self.log("ERROR: Bad partition details for %s: '%s'" % (node, partition)) return ccm_partitions def HasQuorum(self, node_list): # If we are auditing a partition, then one side will # have quorum and the other not. # So the caller needs to tell us which we are checking # If no value for node_list is specified... 
assume all nodes if not node_list: node_list = self.Env["nodes"] for node in node_list: if self.ShouldBeStatus[node] == "up": quorum = self.rsh.readaline(node, self["QuorumCmd"]) if string.find(quorum, "1") != -1: return 1 elif string.find(quorum, "0") != -1: return 0 else: self.log("WARN: Unexpected quorum test result from "+ node +":"+ quorum) return 0 def Components(self): complist = [] common_ignore = [ "Pending action:", "ERROR: crm_log_message_adv:", "ERROR: MSG: No message to dump", "pending LRM operations at shutdown", "Lost connection to the CIB service", "Connection to the CIB terminated...", "Sending message to CIB service FAILED", "apply_xml_diff: Diff application failed!", "crmd: .*Action A_RECOVER .* not supported", "pingd: .*ERROR: send_update: Could not send update", "send_ipc_message: IPC Channel to .* is not connected", "unconfirmed_actions: Waiting on .* unconfirmed actions", "cib_native_msgready: Message pending on command channel", "crmd:.*do_exit: Performing A_EXIT_1 - forcefully exiting the CRMd", "verify_stopped: Resource .* was active at shutdown. You may ignore this error if it is unmanaged.", "ERROR: stonithd_op_result_ready: not signed on", ] complist.append(Process("cib", 0, [ "State transition S_IDLE", "Respawning .* crmd", "Respawning .* attrd", "Lost connection to the CIB service", "Connection to the CIB terminated...", "Child process crmd exited .* rc=2", "Child process attrd exited .* rc=1", "State transition S_STARTING -> S_PENDING", "crmd: .*Input I_TERMINATE from do_recover", "crmd: .*I_ERROR.*crmd_cib_connection_destroy", "crmd:.*do_exit: Could not recover from internal error", ], [], common_ignore, 0, self)) complist.append(Process("lrmd", 0, [ "State transition S_IDLE", "LRM Connection failed", "Respawning .* crmd", "crmd: .*I_ERROR.*lrm_connection_destroy", "State transition S_STARTING -> S_PENDING", "Child process crmd exited .* rc=2", "crmd: .*Input I_TERMINATE from do_recover", "crmd:.*do_exit: Could not recover from internal error", ], [], common_ignore, 0, self)) complist.append(Process("crmd", 0, [ # "WARN: determine_online_status: Node .* is unclean", # "Scheduling Node .* for STONITH", # "Executing .* fencing operation", # Only if the node wasn't the DC: "State transition S_IDLE", "State transition .* -> S_IDLE", "State transition S_STARTING -> S_PENDING", ], [], common_ignore, 0, self)) complist.append(Process("attrd", 0, [ ], [], common_ignore, 0, self)) aisexec_ignore = [ "ERROR: ais_dispatch: Receiving message header failed", "crmd: .*I_ERROR.*crmd_cib_connection_destroy", "cib: .*ERROR: cib_ais_destroy: AIS connection terminated", "attrd: .*CRIT: attrd_ais_destroy: Lost connection to OpenAIS service!", "stonithd: .*ERROR: AIS connection terminated", ] aisexec_ignore.extend(common_ignore) complist.append(Process("aisexec", 0, [ "ERROR: ais_dispatch: AIS connection failed", "crmd: .*I_TERMINATE.*do_recover", "crmd: .*ERROR: do_exit: Could not recover from internal error", "crmd: .*State transition S_STARTING -> S_PENDING", "pengine: .*Scheduling Node .* for STONITH", "stonithd: .*requests a STONITH operation RESET on node", "stonithd: .*Succeeded to STONITH the node", ], [], aisexec_ignore, 0, self)) complist.append(Process("pengine", 0, [ ], [ "State transition S_IDLE", "Respawning .* crmd", "Child process crmd exited .* rc=2", "crmd: .*pe_connection_destroy: Connection to the Policy Engine failed", "crmd: .*I_ERROR.*save_cib_contents", "crmd: .*Input I_TERMINATE from do_recover", "crmd:.*do_exit: Could not recover from internal error", 
], common_ignore, 0, self)) if self.Env["DoFencing"] == 1 : stonith_ignore = [ "ERROR: stonithd_signon: ", "update_failcount: Updating failcount for child_DoFencing", "ERROR: te_connect_stonith: Sign-in failed: triggered a retry", ] stonith_ignore.extend(common_ignore) complist.append(Process("stonithd", 0, [], [ "tengine_stonith_connection_destroy: Fencing daemon connection failed", "Attempting connection to fencing daemon", "te_connect_stonith: Connected", ], stonith_ignore, 0, self)) return complist def NodeUUID(self, node): return node ####################################################################### # # A little test code... # # Which you are advised to completely ignore... # ####################################################################### if __name__ == '__main__': pass diff --git a/tools/Makefile.am b/tools/Makefile.am index 90303c6f71..190cfb581c 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -1,123 +1,123 @@ # # heartbeat: Linux-HA heartbeat code # # Copyright (C) 2001 Michael Moerz # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # MAINTAINERCLEANFILES = Makefile.in ccdv INCLUDES = -I$(top_builddir)/include -I$(top_srcdir)/include \ -I$(top_builddir)/libltdl -I$(top_srcdir)/libltdl \ -I$(AISPREFIX)/include/openais COMMONLIBS = \ $(top_builddir)/lib/common/libcrmcommon.la \ $(top_builddir)/lib/cib/libcib.la \ -lplumb \ $(CURSESLIBS) $(LIBRT) \ $(CLUSTERLIBS) EXTRA_DIST = ccdv.c attrd.h halibdir = $(libdir)/@HB_PKG@ halib_SCRIPTS = haresources2cib.py hb2openais.sh halib_PROGRAMS = attrd pingd halib_PYTHON = crm_primitive.py hb2openais-helper.py -sbin_PROGRAMS = crmadmin cibadmin ccm_tool crm_diff crm_mon iso8601 \ +sbin_PROGRAMS = crmadmin cibadmin crm_node crm_diff crm_mon iso8601 \ crm_master crm_standby crm_failcount crm_attribute \ crm_resource crm_verify crm_uuid crm_shadow attrd_updater sbin_SCRIPTS = crm ## SOURCES ccdv: $(top_srcdir)/tools/ccdv.c gcc $(AM_CFLAGS) $(CFLAGS) -o ccdv $(top_srcdir)/tools/ccdv.c ## SOURCES #noinst_HEADERS = config.h control.h crmd.h noinst_HEADERS = crmadmin_SOURCES = crmadmin.c crmadmin_LDADD = $(COMMONLIBS) $(CLUSTERLIBS) \ $(top_builddir)/lib/pengine/libpe_status.la crm_uuid_SOURCES = crm_uuid.c crm_uuid_LDADD = $(top_builddir)/lib/common/libcrmcluster.la cibadmin_SOURCES = cibadmin.c cibadmin_LDADD = $(COMMONLIBS) crm_shadow_SOURCES = cib_shadow.c crm_shadow_LDADD = $(COMMONLIBS) -ccm_tool_SOURCES = ccm_epoche.c -ccm_tool_LDADD = $(COMMONLIBS) $(CLUSTERLIBS) \ +crm_node_SOURCES = ccm_epoche.c +crm_node_LDADD = $(COMMONLIBS) $(CLUSTERLIBS) \ $(top_builddir)/lib/common/libcrmcluster.la crm_diff_SOURCES = xml_diff.c crm_diff_LDADD = $(COMMONLIBS) crm_mon_SOURCES = crm_mon.c crm_mon_LDADD = $(COMMONLIBS) -llrm \ $(top_builddir)/lib/pengine/libpe_status.la # Arguments could be made that this should live in crm/pengine crm_verify_SOURCES = crm_verify.c crm_verify_LDADD = 
$(COMMONLIBS) \ $(top_builddir)/lib/pengine/libpe_status.la \ $(top_builddir)/pengine/libpengine.la crm_master_SOURCES = crm_attribute.c crm_master_LDADD = $(COMMONLIBS) crm_standby_SOURCES = crm_attribute.c crm_standby_LDADD = $(COMMONLIBS) crm_attribute_SOURCES = crm_attribute.c crm_attribute_LDADD = $(COMMONLIBS) crm_failcount_SOURCES = crm_attribute.c crm_failcount_LDADD = $(COMMONLIBS) crm_resource_SOURCES = crm_resource.c crm_resource_LDADD = $(COMMONLIBS) \ $(top_builddir)/lib/pengine/libpe_rules.la \ $(top_builddir)/lib/pengine/libpe_status.la iso8601_SOURCES = test.iso8601.c iso8601_LDADD = $(COMMONLIBS) # A little trick. Now ccdv can be auto-built but not auto-cleaned. attrd_DEPENDENCIES = ccdv attrd_SOURCES = attrd.c attrd_LDADD = $(COMMONLIBS) $(top_builddir)/lib/common/libcrmcluster.la pingd_SOURCES = pingd.c pingd_LDADD = $(COMMONLIBS) attrd_updater_SOURCES = attrd_updater.c attrd_updater_LDADD = $(COMMONLIBS) clean-generic: rm -f *.log *.debug *.xml *~ install-exec-local: uninstall-local: .PHONY: install-exec-hook diff --git a/tools/README.hb_report b/tools/README.hb_report index 45aaca3294..5d8f5d6757 100644 --- a/tools/README.hb_report +++ b/tools/README.hb_report @@ -1,308 +1,308 @@ Heartbeat reporting =================== Dejan Muhamedagic v1.0 `hb_report` is a utility to collect all information relevant to Heartbeat over the given period of time. Quick start ----------- Run `hb_report` on one of the nodes or on the host which serves as a central log server. Run `hb_report` without parameters to see usage. A few examples: 1. Last night during the backup there were several warnings encountered (logserver is the log host): + logserver# hb_report -f 3:00 -t 4:00 /tmp/report + collects everything from all nodes from 3am to 4am last night. The files are stored in /tmp/report and compressed to a tarball /tmp/report.tar.gz. 2. Just found a problem during testing: node1# date : note the current time node1# /etc/init.d/heartbeat start node1# nasty_command_that_breaks_things node1# sleep 120 : wait for the cluster to settle node1# hb_report -f time /tmp/hb1 Introduction ------------ Managing clusters is cumbersome. Heartbeat v2, with its numerous configuration files and multi-node clusters, just adds to the complexity. No wonder, then, that most problem reports were less than optimal. This is an attempt to rectify that situation and make life easier for both the users and the developers. On security ----------- `hb_report` is a fairly complex program. As some of you are probably going to run it as `root`, let us state a few important things you should keep in mind: 1. Don't run `hb_report` as `root`! It is fairly simple to set things up in such a way that root access is not needed. I won't go into details, just to stress that all information collected should be readable by accounts belonging to the haclient group. 2. If you still have to run this as root, then at least don't use the `-C` option. 3. Of course, every possible precaution has been taken not to disturb processes, or touch or remove files outside of the given destination directory. If you (by mistake) specify an existing directory, `hb_report` will bail out soon. If you specify a relative path, it won't work either. The final product of `hb_report` is a tarball. However, the destination directory is not removed on any node, unless the user specifies `-C`. If you're too lazy to clean up the previous run, do yourself a favour and just supply a new destination directory. You've been warned.
If you worry about the space used, just put all your directories under `/tmp` and set up a cronjob to remove those directories once a week: .......... for d in /tmp/*; do test -d $d || continue test -f $d/description.txt || test -f $d/.env || continue grep -qs 'By: hb_report' $d/description.txt || grep -qs '^UNIQUE_MSG=Mark' $d/.env || continue rm -r $d done .......... Mode of operation ----------------- Cluster data collection is straightforward: just run the same procedure on all nodes and collect the reports. There is, apart from many small ones, one large complication: a central syslog destination. So, in order to allow this to be fully automated, we should sometimes run the procedure on the log host too. Actually, if there is a log host, then the best way is to run `hb_report` there. We use `ssh` for the remote program invocation. Even though it is possible to run `hb_report` without ssh by doing a more menial job, the overall user experience is much better if ssh works. Anyway, how else do you manage your cluster? Another ssh-related point: in case your security policy proscribes loghost-to-cluster-over-ssh communications, you'll have to copy the log file to one of the nodes and point `hb_report` to it. Prerequisites ------------- 1. ssh + This is not strictly required, but you won't regret having a password-less ssh. It is not too difficult to set up and will save you a lot of time. If you can't have it, for example because your security policy does not allow such a thing, or you just prefer menial work, then you will have to resort to the semi-manual, semi-automated report generation. See below for instructions. + If you need to supply a passphrase or password to log in, then please use the `-u` option. 2. Times + In order to find files and messages in the given period and to parse the `-f` and `-t` options, `hb_report` uses perl and one of the `Date::Parse` or `Date::Manip` perl modules. Note that you need only one of these. Furthermore, on nodes which have no logs and where you don't run `hb_report` directly, no date parsing is necessary. In other words, if you run this on a loghost then you don't need these perl modules on the cluster nodes. + On rpm based distributions, you can find `Date::Parse` in `perl-TimeDate`, and on Debian and its derivatives in `libtimedate-perl`. 3. Core dumps + To backtrace core dumps, `gdb` is needed along with the Heartbeat packages carrying the debugging info. The debug info packages may be installed at the time the report is created. Let's hope that you will need this only rarely. What is in the report --------------------- 1. Heartbeat related - heartbeat version/release information - heartbeat configuration (CIB, ha.cf, logd.cf) -- heartbeat status (output from crm_mon, crm_verify, ccm_tool) +- heartbeat status (output from crm_mon, crm_verify, crm_node) - pengine transition graphs (if any) - backtraces of core dumps (if any) - heartbeat logs (if any) 2. System related - general platform information (`uname`, `arch`, `distribution`) - system statistics (`uptime`, `top`, `ps`, `netstat -i`, `arp`) 3. User created :) - problem description (template to be edited) 4. Generated - problem analysis (generated) It is preferred that Heartbeat is running at the time of the report, but it is not absolutely required. `hb_report` will also do a quick analysis of the collected information. Times ----- Specifying times can at times be a nuisance. That is why we have chosen to use one of the perl modules: they do allow a certain freedom when talking dates.
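To check quickly which of the two modules is present, you can run by hand the same kind of eval probe that str2time in utillib.sh uses internally (a diagnostic sketch only, not something `hb_report` asks you to do):

    perl -e 'use Date::Parse' 2>/dev/null && echo "Date::Parse installed"
    perl -e 'use Date::Manip' 2>/dev/null && echo "Date::Manip installed"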
You can either read the instructions at the http://search.cpan.org/dist/TimeDate/lib/Date/Parse.pm#EXAMPLE_DATES[Date::Parse examples page], or just rely on common sense and try stuff like: 3:00 (today at 3am) 15:00 (today at 3pm) 2007/9/1 2pm (September 1st at 2pm) `hb_report` will (probably) complain if it can't figure out what you mean. Try to delimit the event as closely as possible in order to reduce the size of the report, while still leaving a minute or two around for good measure. Note that the `-f` option is not optional. And don't forget to quote dates when they contain spaces. It is also possible to extract a CTS test. Just prefix the test number with `cts:` in the `-f` option. Should I send all this to the rest of the Internet? ----------------------------------------------- We make an effort to remove sensitive data from the Heartbeat configuration (CIB, ha.cf, and transition graphs). However, you _have_ to tell us what is sensitive! Use the `-p` option to specify additional regular expressions to match variable names which may contain information you don't want to leak. For example: # hb_report -f 18:00 -p "user.*" -p "secret.*" /var/tmp/report By default we look for variable names matching "pass.*" and the stonith_host ha.cf directive. Logs and other files are not filtered. Please filter them yourself if necessary. Logs ---- It may be tricky to find syslog logs. The scheme used is to log a unique message on all nodes and then look it up in the usual syslog locations. This procedure is not foolproof, in particular if the syslog files are in a non-standard directory. We look in /var/log /var/logs /var/syslog /var/adm /var/log/ha /var/log/cluster. In case we can't find the logs, please supply their location: # hb_report -f 5pm -l /var/log/cluster1/ha-log -S /tmp/report_node1 If you have different log locations on different nodes, well, perhaps you'd like to make them the same and make life easier for everybody. The log files are collected from all hosts where they are found. In case your syslog is configured to log to both the log server and local files, and `hb_report` is run on the log server, you will end up with multiple logs with the same content. Files starting with "ha-" are preferred: in case syslog sends messages to more than one file, and one of them is named ha-log or ha-debug, it will be favoured over syslog or messages. If there is no separate log for Heartbeat, possibly unrelated messages from other programs are included. We don't filter logs, we just pick a segment for the period you specified. NB: Don't have a central log host? Read the CTS README and set one up. Manual report collection ------------------------ So, your ssh doesn't work. In that case, you will have to run this procedure on all nodes. Use `-S` so that we don't bother with ssh: # hb_report -f 5:20pm -t 5:30pm -S /tmp/report_node1 If you also have a log host which is not in the cluster, then you'll have to copy the log to one of the nodes and tell us where it is: # hb_report -f 5:20pm -t 5:30pm -l /var/tmp/ha-log -S /tmp/report_node1 Furthermore, to prevent `hb_report` from asking you to edit the report to describe the problem on every node, use `-D` on all but one: # hb_report -f 5:20pm -t 5:30pm -DS /tmp/report_node1 If you reconsider and want the ssh setup, take a look at the CTS README file for instructions. Analysis -------- The point of the analysis is to extract the most important information from probably several thousand lines worth of text.
Perhaps this should be more properly named as report review as it is rather simple, but let's pretend that we are doing something utterly sophisticated. The analysis consists of the following: - compare files coming from different nodes; if they are equal, make one copy in the top level directory, remove duplicates, and create soft links instead - print errors, warnings, and lines matching `-L` patterns from logs - report if there were coredumps and by whom - report crm_verify results The goods --------- 1. Common + - ha-log (if found on the log host) - description.txt (template and user report) - analysis.txt 2. Per node + - ha.cf - logd.cf - ha-log (if found) - cib.xml (`cibadmin -Ql` or `cp` if Heartbeat is not running) -- ccm_tool.txt (`ccm_tool -p`) +- ccm_tool.txt (`crm_node -p`) - crm_mon.txt (`crm_mon -1`) - crm_verify.txt (`crm_verify -V`) - pengine/ (only on DC, directory with pengine transitions) - sysinfo.txt (static info) - sysstats.txt (dynamic info) - backtraces.txt (if coredumps found) - DC (well...) - RUNNING or STOPPED diff --git a/tools/utillib.sh b/tools/utillib.sh index 5a2d12b6cc..1648204192 100644 --- a/tools/utillib.sh +++ b/tools/utillib.sh @@ -1,389 +1,389 @@ # Copyright (C) 2007 Dejan Muhamedagic # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This software is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # # ha.cf/logd.cf parsing # getcfvar() { [ -f $HA_CF ] || return sed 's/#.*//' < $HA_CF | grep -w "^$1" | sed 's/^[^[:space:]]*[[:space:]]*//' } iscfvarset() { test "`getcfvar \"$1\"`" } iscfvartrue() { getcfvar "$1" | egrep -qsi "^(true|y|yes|on|1)" } getnodes() { getcfvar node } # # logging # syslogmsg() { severity=$1 shift 1 logtag="" [ "$HA_LOGTAG" ] && logtag="-t $HA_LOGTAG" logger -p ${HA_LOGFACILITY:-"daemon"}.$severity $logtag $* } # # find log destination # uselogd() { iscfvartrue use_logd && return 0 # if use_logd true iscfvarset logfacility || iscfvarset logfile || iscfvarset debugfile || return 0 # or none of the log options set false } findlogdcf() { for f in \ `which strings > /dev/null 2>&1 && strings $HA_BIN/ha_logd | grep 'logd\.cf'` \ `for d; do echo $d/logd.cf $d/ha_logd.cf; done` do if [ -f "$f" ]; then echo $f return 0 fi done return 1 } getlogvars() { savecf=$HA_CF if uselogd; then [ -f "$LOGD_CF" ] || fatal "could not find logd.cf or ha_logd.cf" HA_CF=$LOGD_CF fi HA_LOGFACILITY=`getcfvar logfacility` [ none = "$HA_LOGFACILITY" ] && HA_LOGFACILITY="" HA_LOGFILE=`getcfvar logfile` HA_DEBUGFILE=`getcfvar debugfile` HA_SYSLOGMSGFMT="" iscfvartrue syslogmsgfmt && HA_SYSLOGMSGFMT=1 HA_CF=$savecf } findmsg() { # this is tricky, we try a few directories syslogdir="/var/log /var/logs /var/syslog /var/adm /var/log/ha /var/log/cluster" favourites="ha-*" mark=$1 log="" for d in $syslogdir; do [ -d $d ] || continue log=`fgrep -l "$mark" $d/$favourites` && break log=`fgrep -l "$mark" $d/*` && break done 2>/dev/null echo $log } # # print a segment of a log file # 
str2time() { perl -e "\$time='$*';" -e ' eval "use Date::Parse"; if (!$@) { print str2time($time); } else { eval "use Date::Manip"; if (!$@) { print UnixDate(ParseDateString($time), "%s"); } } ' } getstamp() { if [ "$HA_SYSLOGMSGFMT" -o "$HA_LOGFACILITY" ]; then awk '{print $1,$2,$3}' else awk '{print $2}' | sed 's/_/ /' fi } linetime() { l=`tail -n +$2 $1 | head -1 | getstamp` str2time "$l" } findln_by_time() { logf=$1 tm=$2 first=1 last=`wc -l < $logf` while [ $first -le $last ]; do mid=$(((last+first)/2)) trycnt=10 while [ $trycnt -gt 0 ]; do tmid=`linetime $logf $mid` [ "$tmid" ] && break warning "cannot extract time: $logf:$mid; will try the next one" trycnt=$((trycnt-1)) mid=$((mid+1)) done if [ -z "$tmid" ]; then warning "giving up on log..." return fi if [ $tmid -gt $tm ]; then last=$((mid-1)) elif [ $tmid -lt $tm ]; then first=$((mid+1)) else break fi done echo $mid } dumplog() { logf=$1 from_line=$2 to_line=$3 [ "$from_line" ] || return tail -n +$from_line $logf | if [ "$to_line" ]; then head -$((to_line-from_line+1)) else cat fi } # # find files newer than a and older than b # isnumber() { echo "$*" | grep -qs '^[0-9][0-9]*$' } touchfile() { t=`maketempfile` && perl -e "\$file=\"$t\"; \$tm=$1;" -e 'utime $tm, $tm, $file;' && echo $t } find_files() { dir=$1 from_time=$2 to_time=$3 isnumber "$from_time" && [ "$from_time" -gt 0 ] || { warning "sorry, can't find files based on time if you don't supply time" return } from_stamp=`touchfile $from_time` findexp="-newer $from_stamp" if isnumber "$to_time" && [ "$to_time" -gt 0 ]; then to_stamp=`touchfile $to_time` findexp="$findexp ! -newer $to_stamp" fi find $dir -type f $findexp rm -f $from_stamp $to_stamp } # # coredumps # findbinary() { random_binary=`which cat 2>/dev/null` # suppose we are lucky binary=`gdb $random_binary $1 < /dev/null 2>/dev/null | grep 'Core was generated' | awk '{print $5}' | sed "s/^.//;s/[.']*$//"` [ x = x"$binary" ] && return fullpath=`which $binary 2>/dev/null` if [ x = x"$fullpath" ]; then [ -x $HA_BIN/$binary ] && echo $HA_BIN/$binary else echo $fullpath fi } getbt() { which gdb > /dev/null 2>&1 || { warning "please install gdb to get backtraces" return } for corefile; do absbinpath=`findbinary $corefile` [ x = x"$absbinpath" ] && return 1 echo "====================== start backtrace ======================" ls -l $corefile gdb -batch -n -quiet -ex ${BT_OPTS:-"thread apply all bt full"} -ex quit \ $absbinpath $corefile 2>/dev/null echo "======================= end backtrace =======================" done } # # heartbeat configuration/status # iscrmrunning() { crmadmin -D >/dev/null 2>&1 & pid=$! 
maxwait=10 while kill -0 $pid 2>/dev/null && [ $maxwait -gt 0 ]; do sleep 1 maxwait=$((maxwait-1)) done if kill -0 $pid 2>/dev/null; then kill $pid false else wait $pid fi } dumpstate() { crm_mon -1 | grep -v '^Last upd' > $1/crm_mon.txt cibadmin -Ql > $1/cib.xml - ccm_tool -p > $1/ccm_tool.txt 2>&1 + crm_node -p > $1/ccm_tool.txt 2>&1 } getconfig() { [ -f $HA_CF ] && cp -p $HA_CF $1/ [ -f $LOGD_CF ] && cp -p $LOGD_CF $1/ if iscrmrunning; then dumpstate $1 touch $1/RUNNING else cp -p $HA_VARLIB/crm/cib.xml $1/ 2>/dev/null touch $1/STOPPED fi [ -f "$1/cib.xml" ] && crm_verify -V -x $1/cib.xml >$1/crm_verify.txt 2>&1 } # # remove values of sensitive attributes # # this is not proper xml parsing, but it will work under the # circumstances sanitize_xml_attrs() { sed $( for patt in $SANITIZE; do echo "-e /name=\"$patt\"/s/value=\"[^\"]*\"/value=\"****\"/" done ) } sanitize_hacf() { awk ' $1=="stonith_host"{ for( i=5; i<=NF; i++ ) $i="****"; } {print} ' } sanitize_one() { file=$1 compress="" echo $file | grep -qs 'gz$' && compress=gzip echo $file | grep -qs 'bz2$' && compress=bzip2 if [ "$compress" ]; then decompress="$compress -dc" else compress=cat decompress=cat fi tmp=`maketempfile` && ref=`maketempfile` || fatal "cannot create temporary files" touch -r $file $ref # save the mtime if [ "`basename $file`" = ha.cf ]; then sanitize_hacf else $decompress | sanitize_xml_attrs | $compress fi < $file > $tmp mv $tmp $file touch -r $ref $file rm -f $ref } # # keep the user posted # fatal() { echo "`uname -n`: ERROR: $*" >&2 exit 1 } warning() { echo "`uname -n`: WARN: $*" >&2 } info() { echo "`uname -n`: INFO: $*" >&2 } pickfirst() { for x; do which $x >/dev/null 2>&1 && { echo $x return 0 } done return 1 } # # get some system info # distro() { which lsb_release >/dev/null 2>&1 && { lsb_release -d return } relf=`ls /etc/debian_version 2>/dev/null` || relf=`ls /etc/slackware-version 2>/dev/null` || relf=`ls -d /etc/*-release 2>/dev/null` && { for f in $relf; do test -f $f && { echo "`ls $f` `cat $f`" return } done } warning "no lsb_release no /etc/*-release no /etc/debian_version" } hb_ver() { # for Linux .deb based systems which dpkg > /dev/null 2>&1 && { for pkg in heartbeat heartbeat-2; do dpkg-query -f '${Version}' -W $pkg 2>/dev/null && break done [ $? -eq 0 ] && debsums -s $pkg 2>/dev/null return } # for Linux .rpm based systems which rpm > /dev/null 2>&1 && { rpm -q --qf '%{version}' heartbeat && rpm --verify heartbeat return } # for OpenBSD which pkg_info > /dev/null 2>&1 && { pkg_info | grep heartbeat | cut -d "-" -f 2- | cut -d " " -f 1 return } # for Solaris which pkginfo > /dev/null 2>&1 && { pkginfo | awk '{print $3}' } # more packagers? } crm_info() { $HA_BIN/crmd version 2>&1 }
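The log helpers in utillib.sh are designed to be composed: str2time turns a human-readable date into epoch seconds, findln_by_time binary-searches a log for the line closest to that time, and dumplog prints the segment between two line numbers. A minimal sketch of how a caller might extract a one-hour window (illustrative only: hb_report's actual driver logic is not part of this patch, the log path here is assumed, and getlogvars should have run first so that getstamp knows the log's timestamp format):

    . ./utillib.sh
    logf=/var/log/ha-log                       # assumed log location
    from_time=`str2time "2007/9/1 2pm"`
    to_time=`str2time "2007/9/1 3pm"`
    from_line=`findln_by_time $logf $from_time`
    to_line=`findln_by_time $logf $to_time`
    dumplog $logf $from_line $to_line > /tmp/ha-log-segment

If findln_by_time cannot extract a timestamp it prints nothing, and dumplog's leading [ "$from_line" ] guard then makes the whole pipeline a quiet no-op.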