#!@PYTHON@

'''CTS: Cluster Testing System: Tests module

There are a few things we want to do here:

'''

__copyright__='''
Copyright (C) 2000, 2001 Alan Robertson
Licensed under the GNU GPL.

Add RecourceRecover testcase Zhao Kai
'''

# Reconstructed post-image of:
#   diff --git a/cts/CTStests.py.in b/cts/CTStests.py.in
#   index 3fe92e891b..8952178f5e

#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

#
#        SPECIAL NOTE:
#
#        Tests may NOT implement any cluster-manager-specific code in them.
#        EXTEND the ClusterManager object to provide the base capabilities
#        the test needs if you need to do something that the current CM classes
#        do not.  Otherwise you screw up the whole point of the object structure
#        in CTS.
#
#                Thank you.
#

import CTS
from CM_hb import HBConfig
import CTSaudits
import time, os, re, types, string, tempfile, sys
from CTSaudits import *
from stat import *

#        List of all class objects for tests which we ought to
#        consider running.

class RandomTests:
    '''
    A collection of tests which are run at random.
    '''
    def __init__(self, scenario, cm, tests, Audits):
        '''Keep the applicable tests and audits for the given scenario.

        Raises ValueError if an entry of `tests` is not a CTSTest, if an
        entry of `Audits` is not a ClusterAudit, or if the scenario reports
        that it is not applicable.
        '''
        self.CM = cm
        self.Env = cm.Env
        self.Scenario = scenario
        self.Tests = []
        self.Audits = []
        self.ns = CTS.NodeStatus(self.Env)

        for test in tests:
            if not isinstance(test, CTSTest):
                raise ValueError("Init value must be a subclass of CTSTest")
            if test.is_applicable():
                self.Tests.append(test)

        if not scenario.IsApplicable():
            raise ValueError("Scenario not applicable in"
                             " given Environment")

        self.Stats = {"success":0, "failure":0, "BadNews":0}
        self.IndividualStats = {}

        for audit in Audits:
            if not isinstance(audit, ClusterAudit):
                raise ValueError("Init value must be a subclass of ClusterAudit")
            if audit.is_applicable():
                self.Audits.append(audit)

    def incr(self, name):
        '''Increment (or initialize) the value associated with the given name'''
        if name not in self.Stats:
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name] + 1

    def audit(self, BadNews, test):
        '''Drain the BadNews log watcher, then run every registered audit.

        Log lines matching the ignore patterns of `test` (when given), the
        CTS-internal markers, or the cluster manager's ignore patterns are
        dropped; everything else is logged and counted as "BadNews".
        After 1000 counted errors the cluster is stopped and ValueError
        is raised.
        '''
        errcount = 0
        BadNewsDebug = 0
        #BadNews.debug=1
        while errcount < 1000:
            if BadNewsDebug: print("Looking for BadNews")
            match = BadNews.look(0)
            if not match:
                break

            if BadNewsDebug: print("BadNews found: "+match)
            add_err = 1
            ignorelist = []
            if test:
                # Copy: the markers appended below must not leak into a
                # list that the test object may hand out again later.
                ignorelist = list(test.errorstoignore())
            ignorelist.append(" CTS: ")
            ignorelist.append("BadNews:")

            for ignore in ignorelist:
                if re.search(ignore, match):
                    if BadNewsDebug: print("Ignoring:"+match+" (pattern: "+ignore+")")
                    add_err = 0

            if add_err == 1:
                # Not ignored by the test; give the cluster manager a say.
                for ignore in self.CM.errorstoignore():
                    if re.search(ignore, match):
                        if BadNewsDebug: print("Ignoring:"+match+" (pattern: "+ignore+")")
                        add_err = 0

            if add_err == 1:
                self.CM.log("BadNews: " + match)
                self.incr("BadNews")
                errcount = errcount + 1
        else:
            # while/else: only reached when the error limit was hit,
            # never when the loop ended through "break".
            self.CM.log("Big problems. Shutting down.")
            self.CM.stopall()
            self.summarize()
            raise ValueError("Looks like we hit the jackpot! :-)")

        for audit in self.Audits:
            if not audit():
                self.CM.log("Audit " + audit.name() + " FAILED.")
                self.incr("auditfail")
                if test:
                    test.incr("auditfail")

    def summarize(self):
        '''Log the overall statistics and the statistics of each test.'''
        self.CM.log("****************")
        self.CM.log("Overall Results:" + repr(self.Stats))
        self.CM.log("****************")
        self.CM.log("Detailed Results")
        for test in self.Tests:
            self.CM.log("Test %s: \t%s" %(test.name, repr(test.Stats)))
        self.CM.log("<<<<<<<<<<<<<<<< TESTS COMPLETED")

    def run(self, max=1):
        '''
        Set up the given scenario, then run the selected tests at
        random for the selected number of iterations.

        Returns None when the scenario set-up fails, otherwise the tuple
        (overall stats, per-test stats).
        '''
        BadNews = CTS.LogWatcher(self.CM["LogFileName"], self.CM["BadRegexes"],
                                 timeout=0)
        BadNews.setwatch()

        self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"])
        for node in self.CM.Env["nodes"]:
            if node in self.CM.Env["oprofile"]:
                self.CM.log("Enabling oprofile on %s" % node)
                self.CM.rsh.remote_py(node, "os", "system", "opcontrol --init")
                self.CM.rsh.remote_py(node, "os", "system", "opcontrol --start")

        if not self.Scenario.SetUp(self.CM):
            return None

        for node in self.CM.Env["nodes"]:
            if node in self.CM.Env["oprofile"]:
                self.CM.rsh.remote_py(
                    node, "os", "system", "opcontrol --save=cts.setup")

        testcount = 1
        time.sleep(30)

        # This makes sure everything is stabilized before starting...
        self.audit(BadNews, None)

        while testcount <= max:
            test = self.Env.RandomGen.choice(self.Tests)

            # Some tests want a node as an argument.
            nodechoice = self.Env.RandomNode()

            #logsize = os.stat(self.CM["LogFileName"])[ST_SIZE]
            #self.CM.log("Running test %s (%s) \t[%d : %d]"
            #            % (test.name, nodechoice, testcount, logsize))
            self.CM.log("Running test %s (%s) \t[%d]"
                        % (test.name, nodechoice, testcount))
            testcount = testcount + 1
            starttime = time.time()
            # A test may overwrite its own starttime once its set-up phase
            # is done, so test_time below can be shorter than elapsed_time.
            test.starttime = starttime
            ret = test(nodechoice)

            for node in self.CM.Env["nodes"]:
                if node in self.CM.Env["oprofile"]:
                    self.CM.rsh.remote_py(
                        node, "os", "system",
                        "opcontrol --save=cts.%d" % (testcount-1))

            if ret:
                self.incr("success")
            else:
                self.incr("failure")
                self.CM.log("Test %s (%s) \t[FAILED]" %(test.name,nodechoice))
                # Better get the current info from the cluster...
                self.CM.statall()
                # Make sure logging is working and we have enough disk space...
                # NOTE(review): nesting reconstructed from whitespace-collapsed
                # text; confirm these checks belong to the failure branch.
                if not self.CM.Env["DoBSC"]:
                    if not self.CM.TestLogging():
                        sys.exit(1)
                    if not self.CM.CheckDf():
                        sys.exit(1)

            stoptime = time.time()
            elapsed_time = stoptime - starttime
            test_time = stoptime - test.starttime
            if not test.has_key("min_time"):
                test["elapsed_time"] = elapsed_time
                test["min_time"] = test_time
                test["max_time"] = test_time
            else:
                test["elapsed_time"] = test["elapsed_time"] + elapsed_time
                if test_time < test["min_time"]:
                    test["min_time"] = test_time
                if test_time > test["max_time"]:
                    test["max_time"] = test_time

            self.audit(BadNews, test)

        self.Scenario.TearDown(self.CM)

        for node in self.CM.Env["nodes"]:
            if node in self.CM.Env["oprofile"]:
                self.CM.log("Disabling oprofile on %s" % node)
                self.CM.rsh.remote_py(node, "os", "system", "opcontrol --shutdown")

        self.audit(BadNews, None)

        for test in self.Tests:
            self.IndividualStats[test.name] = test.Stats

        return self.Stats, self.IndividualStats

AllTestClasses = [ ]

class CTSTest:
    '''
    A Cluster test.
    We implement the basic set of properties and behaviors for a generic
    cluster test.

    Cluster tests track their own statistics.
    We keep each of the kinds of counts we track as separate {name,value}
    pairs.
    '''

    def __init__(self, cm):
        #self.name="the unnamed test"
        self.Stats = {"calls":0
        ,        "success":0
        ,        "failure":0
        ,        "skipped":0
        ,        "auditfail":0}

#        if not issubclass(cm.__class__, ClusterManager):
#            raise ValueError("Must be a ClusterManager object")
        self.CM = cm
        self.timeout = 120
        self.starttime = 0

    def has_key(self, key):
        '''Return true if a statistic named `key` has been recorded.'''
        return key in self.Stats

    def __setitem__(self, key, value):
        self.Stats[key] = value

    def __getitem__(self, key):
        return self.Stats[key]

    def incr(self, name):
        '''Increment (or initialize) the value associated with the given name'''
        if name not in self.Stats:
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name] + 1

    def failure(self, reason="none"):
        '''Increment the failure count, log the reason and return None'''
        self.incr("failure")
        self.CM.log("Test " + self.name + " failed [reason:" + reason + "]")
        return None

    def success(self):
        '''Increment the success count and return 1'''
        self.incr("success")
        return 1

    def skipped(self):
        '''Increment the skipped count and return 1'''
        self.incr("skipped")
        return 1

    def __call__(self, node):
        '''Perform the given test (abstract: subclasses must override)'''
        raise ValueError("Abstract Class member (__call__)")

    def is_applicable(self):
        '''Return TRUE if we are applicable in the current test configuration
        (abstract: subclasses must override)'''
        raise ValueError("Abstract Class member (is_applicable)")

    def canrunnow(self):
        '''Return TRUE if we can meaningfully run right now'''
        return 1

    def errorstoignore(self):
        '''Return list of errors which are 'normal' and should be ignored'''
        return []

###################################################################
class StopTest(CTSTest):
###################################################################
    '''Stop (deactivate) the cluster manager on a node'''
    def __init__(self, cm):
        CTSTest.__init__(self, cm)
        self.name = "Stop"
        self.uspat = self.CM["Pat:We_stopped"]
        self.thempat = self.CM["Pat:They_stopped"]

    def __call__(self, node):
        '''Perform the 'stop' test.

        Skipped unless the node is expected to be up.  Succeeds when every
        shutdown pattern was seen in the log (or no node is left up);
        any missing pattern is logged as an ERROR and fails the test.
        '''
        self.incr("calls")
        if self.CM.ShouldBeStatus[node] != self.CM["up"]:
            return self.skipped()

        patterns = []
        # Technically we should always be able to notice ourselves stopping
        patterns.append(self.CM["Pat:We_stopped"] % node)

        if self.CM.Env["use_logd"]:
            patterns.append(self.CM["Pat:Logd_stopped"] % node)

        # Any active node needs to notice this one left
        # NOTE: This wont work if we have multiple partitions
        for other in self.CM.Env["nodes"]:
            if self.CM.ShouldBeStatus[other] == self.CM["up"] and other != node:
                patterns.append(self.CM["Pat:They_stopped"] %(other, node))
                #self.debug("Checking %s will notice %s left"%(other, node))

        watch = CTS.LogWatcher(
            self.CM["LogFileName"], patterns, self.CM["DeadTime"])
        watch.setwatch()

        if node == self.CM.OurNode:
            self.incr("us")
        else:
            if self.CM.upcount() <= 1:
                self.incr("all")
            else:
                self.incr("them")

        self.CM.StopaCM(node)
        # Called for its effect on watch.unmatched; the return value
        # itself is not needed.
        watch.lookforall()

        UnmatchedList = "||"
        if watch.unmatched:
            for regex in watch.unmatched:
                self.CM.log ("ERROR: Shutdown pattern not found: %s" % (regex))
                UnmatchedList += regex + "||"

        self.CM.cluster_stable(self.CM["DeadTime"])

        if not watch.unmatched or self.CM.upcount() == 0:
            return self.success()

        if len(watch.unmatched) >= self.CM.upcount():
            return self.failure("no match against (%s)" % UnmatchedList)

        # At least one pattern is missing here, which is always a failure.
        return self.failure("Missing shutdown pattern")

#
# We don't register StopTest because it's better when called by
# another test...
# ################################################################### class StartTest(CTSTest): ################################################################### '''Start (activate) the cluster manager on a node''' def __init__(self, cm, debug=None): CTSTest.__init__(self,cm) self.name="start" self.debug = debug self.uspat = self.CM["Pat:We_started"] self.thempat = self.CM["Pat:They_started"] def __call__(self, node): '''Perform the 'start' test. ''' self.incr("calls") if self.CM.upcount() == 0: self.incr("us") else: self.incr("them") if self.CM.ShouldBeStatus[node] != self.CM["down"]: return self.skipped() elif self.CM.StartaCM(node): return self.success() else: return self.failure("Startup %s on node %s failed" %(self.CM["Name"], node)) def is_applicable(self): '''StartTest is always applicable''' return 1 # # We don't register StartTest because it's better when called by # another test... # ################################################################### class FlipTest(CTSTest): ################################################################### '''If it's running, stop it. If it's stopped start it. Overthrow the status quo... ''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Flip" self.start = StartTest(cm) self.stop = StopTest(cm) def __call__(self, node): '''Perform the 'Flip' test. ''' self.incr("calls") if self.CM.ShouldBeStatus[node] == self.CM["up"]: self.incr("stopped") ret = self.stop(node) type="up->down" # Give the cluster time to recognize it's gone... 
time.sleep(self.CM["StableTime"]) elif self.CM.ShouldBeStatus[node] == self.CM["down"]: self.incr("started") ret = self.start(node) type="down->up" else: return self.skipped() self.incr(type) if ret: return self.success() else: return self.failure("%s failure" % type) def is_applicable(self): '''FlipTest is always applicable''' return 1 # Register FlipTest as a good test to run AllTestClasses.append(FlipTest) ################################################################### class RestartTest(CTSTest): ################################################################### '''Stop and restart a node''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Restart" self.start = StartTest(cm) self.stop = StopTest(cm) def __call__(self, node): '''Perform the 'restart' test. ''' self.incr("calls") self.incr("node:" + node) ret1 = 1 if self.CM.StataCM(node): self.incr("WasStopped") if not self.start(node): return self.failure("start (setup) failure: "+node) self.starttime=time.time() if not self.stop(node): return self.failure("stop failure: "+node) if not self.start(node): return self.failure("start failure: "+node) return self.success() def is_applicable(self): '''RestartTest is always applicable''' return 1 # Register RestartTest as a good test to run AllTestClasses.append(RestartTest) ################################################################### class StonithTest(CTSTest): ################################################################### '''Reboot a node by whacking it with stonith.''' def __init__(self, cm, timeout=900): CTSTest.__init__(self,cm) self.name="Stonith" self.theystopped = self.CM["Pat:They_dead"] self.allstopped = self.CM["Pat:All_stopped"] self.usstart = self.CM["Pat:We_started"] self.themstart = self.CM["Pat:They_started"] self.timeout = timeout self.ssherror = False def _reset(self, node): StonithWorked=False for tries in 1,2,3,4,5: if self.CM.Env.ResetNode(node): StonithWorked=True break return StonithWorked def setup(self, 
target_node): # nothing to do return 1 def __call__(self, node): '''Perform the 'stonith' test. (whack the node)''' self.incr("calls") stopwatch = 0 rc = 0 if not self.setup(node): return self.failure("Setup failed") # Figure out what log message to look for when/if it goes down # # Any active node needs to notice this one left # NOTE: This wont work if we have multiple partitions stop_patterns = [] for other in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[other] == self.CM["up"] and other != node: stop_patterns.append(self.CM["Pat:They_stopped"] %(other, node)) stopwatch = 1 #self.debug("Checking %s will notice %s left"%(other, node)) if self.CM.ShouldBeStatus[node] == self.CM["down"]: # actually no-one will notice this node die since HA isnt running stopwatch = 0 # Figure out what log message to look for when it comes up if self.CM.upcount() == 1 and self.CM.ShouldBeStatus[node] == self.CM["up"]: uppat = (self.usstart % node) else: uppat = (self.themstart % node) upwatch = CTS.LogWatcher(self.CM["LogFileName"], [uppat] , timeout=self.timeout) if stopwatch == 1: watch = CTS.LogWatcher(self.CM["LogFileName"], stop_patterns , timeout=self.CM["DeadTime"]+10) watch.setwatch() # Reset (stonith) the node self.CM.debug("Resetting: "+node) StonithWorked = self._reset(node) if not StonithWorked: return self.failure("Stonith didn't work") if self.ssherror == True: self.CM.log("NOTE: Stonith command reported success but node %s did not restart (atd, reboot or ssh error)" % node) return self.success() upwatch.setwatch() # Look() and see if the machine went down if stopwatch == 0: # Allow time for the node to die time.sleep(self.CM["DeadTime"]+10) elif not watch.lookforall(): if watch.unmatched: for regex in watch.unmatched: self.CM.log("Warn: STONITH pattern not found: %s"%regex) # !!no-one!! 
saw this node die if len(watch.unmatched) == len(stop_patterns): return self.failure("No-one saw %s die" %node) # else: syslog* lost a message # Alas I dont think this check is plausable (beekhof) # # Check it really stopped... #self.CM.ShouldBeStatus[node] = self.CM["down"] #if self.CM.StataCM(node) == 1: # ret1=0 # Look() and see if the machine came back up rc=0 if upwatch.look(): self.CM.debug("Startup pattern found: %s" %uppat) rc=1 else: self.CM.log("Warn: Startup pattern not found: %s" %uppat) # Check it really started... self.CM.ShouldBeStatus[node] = self.CM["up"] if rc == 0 and self.CM.StataCM(node) == 1: rc=1 # wait for the cluster to stabilize self.CM.cluster_stable() if node in self.CM.Env["oprofile"]: self.CM.log("Enabling oprofile on %s" % node) self.CM.rsh.remote_py(node, "os", "system", "opcontrol --init") self.CM.rsh.remote_py(node, "os", "system", "opcontrol --start") # return case processing if rc == 0: return self.failure("Node %s did not restart" %node) else: return self.success() def is_applicable(self): '''StonithTest is applicable unless suppressed by CM.Env["DoStonith"] == FALSE''' # for v2, stonithd test is a better test to run. 
if self.CM["Name"] == "linux-ha-v2": return None if self.CM.Env.has_key("DoStonith"): return self.CM.Env["DoStonith"] return 1 # Register StonithTest as a good test to run AllTestClasses.append(StonithTest) ################################################################### class StonithdTest(StonithTest): ################################################################### def __init__(self, cm, timeout=600): StonithTest.__init__(self, cm, timeout=600) self.name="Stonithd" self.startall = SimulStartLite(cm) self.start = StartTest(cm) self.stop = StopTest(cm) self.init_node = None def _reset(self, target_node): if len(self.CM.Env["nodes"]) < 2: return self.skipped() StonithWorked = False SshNotWork = 0 for tries in range(1,5): # For some unknown reason, every now and then the ssh plugin just # can't kill the target_node - everything works fine with stonithd # and the plugin, but atd, reboot or ssh (or maybe something else) # doesn't do its job and target_node remains alive. So look for # the indicative messages and bubble-up the error via ssherror watchpats = [] watchpats.append("Initiating ssh-reset") watchpats.append("CRIT: still able to ping") watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats , timeout=self.CM["DeadTime"]+30) watch.setwatch() fail_reasons = [] if self.CM.Env.ResetNode2(self.init_node, target_node, fail_reasons): StonithWorked = True break if watch.lookforall(): SshNotWork = SshNotWork + 1 continue for reason in fail_reasons: self.CM.log(reason) if StonithWorked == False and SshNotWork == tries: StonithWorked = True self.ssherror = True return StonithWorked def setup(self, target_node): if len(self.CM.Env["nodes"]) < 2: return 1 self.init_node = self.CM.Env.RandomNode() while self.init_node == target_node: self.init_node = self.CM.Env.RandomNode() if not self.startall(None): return self.failure("Test setup failed") return 1 def is_applicable(self): if not self.CM["Name"] == "linux-ha-v2": return 0 if self.CM.Env.has_key("DoStonith"): return 
self.CM.Env["DoStonith"] return 1 AllTestClasses.append(StonithdTest) ################################################################### class IPaddrtest(CTSTest): ################################################################### '''Find the machine supporting a particular IP address, and knock it down. [Hint: This code isn't finished yet...] ''' def __init__(self, cm, IPaddrs): CTSTest.__init__(self,cm) self.name="IPaddrtest" self.IPaddrs = IPaddrs self.start = StartTest(cm) self.stop = StopTest(cm) def __call__(self, IPaddr): ''' Perform the IPaddr test... ''' self.incr("calls") node = self.CM.Env.RandomNode() self.incr("node:" + node) if self.CM.ShouldBeStatus[node] == self.CM["down"]: self.incr("WasStopped") self.start(node) ret1 = self.stop(node) # Give the cluster time to recognize we're gone... time.sleep(self.CM["StableTime"]) ret2 = self.start(node) if not ret1: return self.failure("Could not stop") if not ret2: return self.failure("Could not start") return self.success() def is_applicable(self): '''IPaddrtest is always applicable (but shouldn't be)''' return 1 ################################################################### class StartOnebyOne(CTSTest): ################################################################### '''Start all the nodes ~ one by one''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="StartOnebyOne" self.stopall = SimulStopLite(cm) self.start = StartTest(cm) self.ns=CTS.NodeStatus(cm.Env) def __call__(self, dummy): '''Perform the 'StartOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Shut down all the nodes... 
ret = self.stopall(None) if not ret: return self.failure("Test setup failed") failed=[] self.starttime=time.time() for node in self.CM.Env["nodes"]: if not self.start(node): failed.append(node) if len(failed) > 0: return self.failure("Some node failed to start: " + repr(failed)) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [] def is_applicable(self): '''StartOnebyOne is always applicable''' return 1 # Register StartOnebyOne as a good test to run AllTestClasses.append(StartOnebyOne) ################################################################### class SimulStart(CTSTest): ################################################################### '''Start all the nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStart" self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) def __call__(self, dummy): '''Perform the 'SimulStart' test. ''' self.incr("calls") # We ignore the "node" parameter... # Shut down all the nodes... ret = self.stopall(None) if not ret: return self.failure("Setup failed") self.CM.clear_all_caches() if not self.startall(None): return self.failure("Startall failed") return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [] def is_applicable(self): '''SimulStart is always applicable''' return 1 # Register SimulStart as a good test to run AllTestClasses.append(SimulStart) ################################################################### class SimulStop(CTSTest): ################################################################### '''Stop all the nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStop" self.startall = SimulStartLite(cm) self.stopall = SimulStopLite(cm) def __call__(self, dummy): '''Perform the 'SimulStop' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... 
ret = self.startall(None) if not ret: return self.failure("Setup failed") if not self.stopall(None): return self.failure("Stopall failed") return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [] def is_applicable(self): '''SimulStop is always applicable''' return 1 # Register SimulStop as a good test to run AllTestClasses.append(SimulStop) ################################################################### class StopOnebyOne(CTSTest): ################################################################### '''Stop all the nodes in order''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="StopOnebyOne" self.startall = SimulStartLite(cm) self.stop = StopTest(cm) def __call__(self, dummy): '''Perform the 'StopOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... ret = self.startall(None) if not ret: return self.failure("Setup failed") failed=[] self.starttime=time.time() for node in self.CM.Env["nodes"]: if not self.stop(node): failed.append(node) if len(failed) > 0: return self.failure("Some node failed to stop: " + repr(failed)) self.CM.clear_all_caches() return self.success() def is_applicable(self): '''StopOnebyOne is always applicable''' return 1 # Register StopOnebyOne as a good test to run AllTestClasses.append(StopOnebyOne) ################################################################### class RestartOnebyOne(CTSTest): ################################################################### '''Restart all the nodes in order''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="RestartOnebyOne" self.startall = SimulStartLite(cm) def __call__(self, dummy): '''Perform the 'RestartOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... 
ret = self.startall(None) if not ret: return self.failure("Setup failed") did_fail=[] self.starttime=time.time() self.restart = RestartTest(self.CM) for node in self.CM.Env["nodes"]: if not self.restart(node): did_fail.append(node) if did_fail: return self.failure("Could not restart %d nodes: %s" %(len(did_fail), repr(did_fail))) return self.success() def is_applicable(self): '''RestartOnebyOne is always applicable''' return 1 # Register StopOnebyOne as a good test to run AllTestClasses.append(RestartOnebyOne) ################################################################### class PartialStart(CTSTest): ################################################################### '''Start a node - but tell it to stop before it finishes starting up''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="PartialStart" self.startall = SimulStartLite(cm) self.stopall = SimulStopLite(cm) def __call__(self, node): '''Perform the 'PartialStart' test. ''' self.incr("calls") ret = self.stopall(None) if not ret: return self.failure("Setup failed") # FIXME! 
This should use the CM class to get the pattern # then it would be applicable in general watchpats = [] watchpats.append("Starting crmd") watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats, timeout=self.CM["DeadTime"]+10) watch.setwatch() self.CM.StartaCMnoBlock(node) ret = watch.lookforall() if not ret: self.CM.log("Patterns not found: " + repr(watch.unmatched)) return self.failure("Setup of %s failed" % node) ret = self.stopall(None) if not ret: return self.failure("%s did not stop in time" % node) return self.success() def is_applicable(self): '''Partial is always applicable''' if self.CM["Name"] == "linux-ha-v2": return 1 else: return 0 # Register StopOnebyOne as a good test to run AllTestClasses.append(PartialStart) ################################################################### class StandbyTest(CTSTest): ################################################################### '''Put a node in standby mode''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="standby" self.successpat = self.CM["Pat:StandbyOK"] self.nostandbypat = self.CM["Pat:StandbyNONE"] self.transient = self.CM["Pat:StandbyTRANSIENT"] def __call__(self, node): '''Perform the 'standby' test. ''' self.incr("calls") if self.CM.ShouldBeStatus[node] == self.CM["down"]: return self.skipped() if self.CM.upcount() < 2: self.incr("nostandby") pat = self.nostandbypat else: self.incr("standby") pat = self.successpat # # You could make a good argument that the cluster manager # ought to give us good clues on when its a bad time to # switch over to the other side, but heartbeat doesn't... # It could also queue the request. 
But, heartbeat # doesn't do that either :-) # retrycount=0 while (retrycount < 10): watch = CTS.LogWatcher(self.CM["LogFileName"] , [pat, self.transient] , timeout=self.CM["DeadTime"]+10) watch.setwatch() self.CM.rsh(node, self.CM["Standby"]) match = watch.look() if match: if re.search(self.transient, match): self.incr("retries") time.sleep(2) retrycount=retrycount+1 else: return self.success() else: break # No point in retrying... return self.failure("did not find pattern " + pat) def is_applicable(self): '''StandbyTest is applicable when the CM has a Standby command''' if not self.CM.has_key("Standby"): return None else: #if self.CM.Env.has_key("DoStandby"): #flag=self.CM.Env["DoStandby"] #if type(flag) == types.IntType: #return flag #if not re.match("[yt]", flag, re.I): #return None # # We need to strip off everything after the first blank # cmd=self.CM["Standby"] cmd = cmd.split()[0] if not os.access(cmd, os.X_OK): return None cf = self.CM.cf if not cf.Parameters.has_key("auto_failback"): return None elif cf.Parameters["auto_failback"][0] == "legacy": return None return 1 # Register StandbyTest as a good test to run AllTestClasses.append(StandbyTest) ####################################################################### class StandbyTest2(CTSTest): ####################################################################### '''Standby with CRM of HA release 2''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="standby2" self.start = StartTest(cm) self.startall = SimulStartLite(cm) # make sure the node is active # set the node to standby mode # check resources, none resource should be running on the node # set the node to active mode # check resouces, resources should have been migrated back (SHOULD THEY?) 
# Methods of StandbyTest2; the class statement and __init__ precede this span.
def __call__(self, node):
    '''Run the 'standby2' test against the given node.

    Sequence: start all nodes, make sure the node is active, remember the
    resources running on it, switch it to standby (no resource may still
    be running there), switch it back to active (every remembered
    resource must be running there again).

    Returns self.success(), or self.failure(...) for the first step that
    does not turn out as expected.
    '''
    self.incr("calls")
    if not self.startall(None):
        return self.failure("Start all nodes failed")

    self.CM.debug("Make sure node %s is active" % node)
    # The mode change is only requested when the node is not already active.
    if self.CM.StandbyStatus(node) != "off" and not self.CM.SetStandbyMode(node, "off"):
        return self.failure("can't set node %s to active mode" % node)

    self.CM.cluster_stable()

    mode = self.CM.StandbyStatus(node)
    if mode != "off":
        return self.failure("standby status of %s is [%s] but we expect [off]" % (node, mode))

    self.CM.debug("Getting resources running on node %s" % node)
    hosted = [res for res in self.CM.Resources() if res.IsRunningOn(node)]

    self.CM.debug("Setting node %s to standby mode" % node)
    if not self.CM.SetStandbyMode(node, "on"):
        return self.failure("can't set node %s to standby mode" % node)

    # Give the update time to be applied and to trigger a reaction.
    time.sleep(30)
    self.CM.cluster_stable()

    mode = self.CM.StandbyStatus(node)
    if mode != "on":
        return self.failure("standby status of %s is [%s] but we expect [on]" % (node, mode))

    self.CM.debug("Checking resources")
    for res in self.CM.Resources():
        if res.IsRunningOn(node):
            return self.failure("%s set to standby, %s is still running on it" % (node, res.rid))

    self.CM.debug("Setting node %s to active mode" % node)
    if not self.CM.SetStandbyMode(node, "off"):
        return self.failure("can't set node %s to active mode" % node)

    # Same grace period as after the switch to standby.
    time.sleep(30)
    self.CM.cluster_stable()

    mode = self.CM.StandbyStatus(node)
    if mode != "off":
        return self.failure("standby status of %s is [%s] but we expect [off]" % (node, mode))

    self.CM.debug("Checking resources")
    for res in hosted:
        if not res.IsRunningOn(node):
            return self.failure("%s set to active but %s is NOT back" % (node, res.rid))

    return self.success()

def is_applicable(self):
    '''Return 1 when the cluster manager is "linux-ha-v2", otherwise 0.'''
    if self.CM["Name"] != "linux-ha-v2":
        return 0
    return 1

# Register StandbyTest2 as a good test to run
AllTestClasses.append(StandbyTest2)
####################################################################### class Fastdetection(CTSTest): ####################################################################### '''Test the time which one node find out the other node is killed very quickly''' def __init__(self,cm,timeout=60): CTSTest.__init__(self, cm) self.name = "DetectionTime" self.they_stopped = self.CM["Pat:They_stopped"] self.timeout = timeout self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.standby = StandbyTest(cm) self.__setitem__("min", 0) self.__setitem__("max", 0) self.__setitem__("totaltime", 0) def __call__(self, node): '''Perform the fastfailureDetection test''' self.incr("calls") ret=self.startall(None) if not ret: return self.failure("Test setup failed") if self.CM.upcount() < 2: return self.skipped() # Make sure they're not holding any resources ret = self.standby(node) if not ret: return ret stoppat = (self.they_stopped % ("", node)) stopwatch = CTS.LogWatcher(self.CM["LogFileName"], [stoppat], timeout=self.timeout) stopwatch.setwatch() # # This test is CM-specific - FIXME!! # if self.CM.rsh(node, "killall -9 heartbeat")==0: Starttime = os.times()[4] if stopwatch.look(): Stoptime = os.times()[4] # This test is CM-specific - FIXME!! self.CM.rsh(node, "killall -9 @libdir@/heartbeat/ccm @libdir@/heartbeat/ipfail >/dev/null 2>&1; true") Detectiontime = Stoptime-Starttime detectms = int(Detectiontime*1000+0.5) self.CM.log("...failure detection time: %d ms" % detectms) self.Stats["totaltime"] = self.Stats["totaltime"] + Detectiontime if self.Stats["min"] == 0: self.Stats["min"] = Detectiontime if Detectiontime > self.Stats["max"]: self.Stats["max"] = Detectiontime if Detectiontime < self.Stats["min"]: self.Stats["min"] = Detectiontime self.CM.ShouldBeStatus[node] = self.CM["down"] self.start(node) return self.success() else: # This test is CM-specific - FIXME!! 
self.CM.rsh(node, "killall -9 @libdir@/heartbeat/ccm @libdir@/heartbeat/ipfail >/dev/null 2>&1; true") self.CM.ShouldBeStatus[node] = self.CM["down"] ret=self.start(node) return self.failure("Didn't find the log message") else: return self.failure("Couldn't kill cluster manager") def is_applicable(self): '''This test is applicable when auto_failback != legacy''' return self.standby.is_applicable() # This test is CM-specific - FIXME!! def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [ "ccm.*ERROR: ccm_control_process:failure to send protoversion request" , "ccm.*ERROR: Lost connection to heartbeat service. Need to bail out" ] AllTestClasses.append(Fastdetection) ############################################################################## class BandwidthTest(CTSTest): ############################################################################## # Tests should not be cluster-manager-specific # If you need to find out cluster manager configuration to do this, then # it should be added to the generic cluster manager API. '''Test the bandwidth which heartbeat uses''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name = "Bandwidth" self.start = StartTest(cm) self.__setitem__("min",0) self.__setitem__("max",0) self.__setitem__("totalbandwidth",0) self.tempfile = tempfile.mktemp(".cts") self.startall = SimulStartLite(cm) def __call__(self, node): '''Perform the Bandwidth test''' self.incr("calls") if self.CM.upcount()<1: return self.skipped() Path = self.CM.InternalCommConfig() if "ip" not in Path["mediatype"]: return self.skipped() port = Path["port"][0] port = int(port) ret = self.startall(None) if not ret: return self.failure("Test setup failed") time.sleep(5) # We get extra messages right after startup. 
fstmpfile = "/var/run/band_estimate" dumpcmd = "tcpdump -p -n -c 102 -i any udp port %d > %s 2>&1" \ % (port, fstmpfile) rc = self.CM.rsh(node, dumpcmd) if rc == 0: farfile = "root@%s:%s" % (node, fstmpfile) self.CM.rsh.cp(farfile, self.tempfile) Bandwidth = self.countbandwidth(self.tempfile) if not Bandwidth: self.CM.log("Could not compute bandwidth.") return self.success() intband = int(Bandwidth + 0.5) self.CM.log("...bandwidth: %d bits/sec" % intband) self.Stats["totalbandwidth"] = self.Stats["totalbandwidth"] + Bandwidth if self.Stats["min"] == 0: self.Stats["min"] = Bandwidth if Bandwidth > self.Stats["max"]: self.Stats["max"] = Bandwidth if Bandwidth < self.Stats["min"]: self.Stats["min"] = Bandwidth self.CM.rsh(node, "rm -f %s" % fstmpfile) os.unlink(self.tempfile) return self.success() else: return self.failure("no response from tcpdump command [%d]!" % rc) def countbandwidth(self, file): fp = open(file, "r") fp.seek(0) count = 0 sum = 0 while 1: line = fp.readline() if not line: return None if re.search("udp",line) or re.search("UDP,", line): count=count+1 linesplit = string.split(line," ") for j in range(len(linesplit)-1): if linesplit[j]=="udp": break if linesplit[j]=="length:": break try: sum = sum + int(linesplit[j+1]) except ValueError: self.CM.log("Invalid tcpdump line: %s" % line) return None T1 = linesplit[0] timesplit = string.split(T1,":") time2split = string.split(timesplit[2],".") time1 = (long(timesplit[0])*60+long(timesplit[1]))*60+long(time2split[0])+long(time2split[1])*0.000001 break while count < 100: line = fp.readline() if not line: return None if re.search("udp",line) or re.search("UDP,", line): count = count+1 linessplit = string.split(line," ") for j in range(len(linessplit)-1): if linessplit[j] =="udp": break if linesplit[j]=="length:": break try: sum=int(linessplit[j+1])+sum except ValueError: self.CM.log("Invalid tcpdump line: %s" % line) return None T2 = linessplit[0] timesplit = string.split(T2,":") time2split = 
string.split(timesplit[2],".") time2 = (long(timesplit[0])*60+long(timesplit[1]))*60+long(time2split[0])+long(time2split[1])*0.000001 time = time2-time1 if (time <= 0): return 0 return (sum*8)/time def is_applicable(self): '''BandwidthTest is always applicable''' return 0 AllTestClasses.append(BandwidthTest) ########################################################################## class RedundantpathTest(CTSTest): ########################################################################## '''In heartbeat, it has redundant path to communicate between the cluster''' # # Tests should not be cluster-manager specific # One needs to isolate what you need from the cluster manager and then # add a (new) API to do it. # def __init__(self,cm,timeout=60): CTSTest.__init__(self,cm) self.name = "RedundantpathTest" self.timeout = timeout def PathCount(self): '''Return number of communication paths''' Path = self.CM.InternalCommConfig() cf = self.CM.cf eths = [] serials = [] num = 0 for interface in Path["interface"]: if re.search("eth",interface): eths.append(interface) num = num + 1 if re.search("/dev",interface): serials.append(interface) num = num + 1 return (num, eths, serials) def __call__(self,node): '''Perform redundant path test''' self.incr("calls") if self.CM.ShouldBeStatus[node]!=self.CM["up"]: return self.skipped() (num, eths, serials) = self.PathCount() for eth in eths: if self.CM.rsh(node,"ifconfig %s down" % eth)==0: PathDown = "OK" break if PathDown != "OK": for serial in serials: if self.CM.rsh(node,"setserial %s uart none" % serial)==0: PathDown = "OK" break if PathDown != "OK": return self.failure("Cannot break the path") time.sleep(self.timeout) for audit in CTSaudits.AuditList(self.CM): if not audit(): for eth in eths: self.CM.rsh(node,"ifconfig %s up" % eth) for serial in serials: self.CM.rsh(node,"setserial %s uart 16550" % serial) return self.failure("Redundant path fail") for eth in eths: self.CM.rsh(node,"ifconfig %s up" % eth) for serial in serials: 
self.CM.rsh(node,"setserial %s uart 16550" % serial) return self.success() def is_applicable(self): '''It is applicable when you have more than one connection''' return self.PathCount()[0] > 1 # FIXME!! Why is this one commented out? #AllTestClasses.append(RedundantpathTest) ########################################################################## class DRBDTest(CTSTest): ########################################################################## '''In heartbeat, it provides replicated storage.''' def __init__(self,cm, timeout=10): CTSTest.__init__(self,cm) self.name = "DRBD" self.timeout = timeout def __call__(self, dummy): '''Perform the 'DRBD' test.''' self.incr("calls") for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["down"]: return self.skipped() # Note: All these special cases with Start/Stop/StatusDRBD # should be reworked to use resource objects instead of # being hardwired to bypass the objects here. for node in self.CM.Env["nodes"]: done=time.time()+self.timeout+1 while (time.time()done: return self.failure("Can't start drbd, please check it") device={} for node in self.CM.Env["nodes"]: device[node]=self.getdevice(node) node = self.CM.Env["nodes"][0] done=time.time()+self.timeout+1 while 1: if (time.time()>done): return self.failure("the drbd could't sync") self.CM.rsh(node,"cp /proc/drbd /var/run >/dev/null 2>&1") if self.CM.rsh.cp("%s:/var/run/drbd" % node,"/var/run"): line = open("/tmp/var/run").readlines()[2] p = line.find("Primary") s1 = line.find("Secondary") s2 = line.rfind("Secondary") if s1!=s2: if self.CM.rsh(node,"drbdsetup %s primary" % device[node]): pass if p!=-1: if p/dev/null" % (self.rid, node)) watch.lookforall() self.CM.cluster_stable() recovernode=self.CM.ResourceLocation(self.rid) if len(recovernode)==1: self.CM.debug("Recovered: %s is running on %s" %(self.rid, recovernode[0])) if not watch.unmatched: return self.success() else: return self.failure("Patterns not found: %s" % repr(watch.unmatched)) elif 
len(recovernode)==0: return self.failure("%s was not recovered and is inactive" % self.rid) else: return self.failure("%s is now active on more than one node: %s" %(self.rid, str(recovernode))) def is_applicable(self): '''ResourceRecover is applicable only when there are resources running on our cluster and environment is linux-ha-v2''' if self.CM["Name"] == "linux-ha-v2": resourcelist=self.CM.Resources() if len(resourcelist)==0: self.CM.log("No resources on this cluster") return 0 else: return 1 return 0 def errorstoignore(self): '''Return list of errors which should be ignored''' return [ """Updating failcount for %s""" % self.rid, """Unknown operation: fail""", """ERROR: sending stonithRA op to stonithd failed.""", """ERROR: process_lrm_event: LRM operation %s_%s_%d""" % (self.rid, self.action, self.interval), """ERROR: process_graph_event: Action %s_%s_%d initiated outside of a transition""" % (self.rid, self.action, self.interval), ] AllTestClasses.append(ResourceRecover) ################################################################### class ComponentFail(CTSTest): ################################################################### def __init__(self, cm): CTSTest.__init__(self,cm) self.name="ComponentFail" self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.stopall = SimulStopLite(cm) self.complist = cm.Components() self.theystart = cm["Pat:They_started"] def __call__(self, node): '''Perform the 'ComponentFail' test. 
''' self.incr("calls") # start all nodes if not self.CM.cluster_stable(): self.stopall(None) ret = self.startall(None) if not ret: return self.failure("Setup failed") # select a component to kill component = self.CM.Env.RandomGen.choice(self.complist) self.CM.log("choose %s to kill"%component.name) patterns = [] patterns.append("%s heartbeat.*Respawning.*%s" %(node, component.name)) patterns.append(self.theystart%node) # set the watch for stable watch = CTS.LogWatcher( self.CM["LogFileName"], patterns, self.CM["DeadTime"]+10) watch.setwatch() # kill the component component.kill(node) # check to see Heartbeat noticed match = watch.look() if match: self.CM.log("Found match: %s"%(match)) # now watch it recover... if self.CM.cluster_stable(): return self.success() else: self.failure("Cluster not stable") else: return self.failure("Heartbeat didnt notice %s die" %component) def is_applicable(self): if self.CM["Name"] == "linux-ha-v2": return 1 return 0 def errorstoignore(self): '''Return list of errors which should be ignored''' return ["""heartbeat.*killed by signal 9""", """heartbeat.*Respawning"""] #AllTestClasses.append(ComponentFail) #################################################################### class Split_brainTest2(CTSTest): #################################################################### '''It is used to test split-brain. 
when the path between the two nodes break check the two nodes both take over the resource''' def __init__(self,cm): CTSTest.__init__(self,cm) self.name = "Split_brain2" self.start = StartTest(cm) self.startall = SimulStartLite(cm) def __call__(self, node): '''Perform split-brain test''' self.incr("calls") ret = self.startall(None) if not ret: return self.failure("Setup failed") count1 = self.CM.Env.RandomGen.randint(1,len(self.CM.Env["nodes"])-1) partition1 = [] while len(partition1) < count1: select = self.CM.Env.RandomGen.choice(self.CM.Env["nodes"]) if not select in partition1: partition1.append(select) partition2 = [] for member in self.CM.Env["nodes"]: if not member in partition1: partition2.append(member) allownodes1 = "" for member in partition1: allownodes1 += member + " " allownodes2 = "" for member in partition2: allownodes2 += member + " " self.CM.log("Partition1: " + str(partition1)) self.CM.log("Partition2: " + str(partition2)) '''isolate nodes, Look for node is dead message''' watchdeadpats = [ ] deadpat = self.CM["Pat:They_dead"] for member in self.CM.Env["nodes"]: thispat = (deadpat % member) watchdeadpats.append(thispat) watchdead = CTS.LogWatcher(self.CM["LogFileName"], watchdeadpats\ , timeout=self.CM["DeadTime"]+60) watchdead.ReturnOnlyMatch() watchdead.setwatch() for member in partition1: if float(self.CM.Env["XmitLoss"])!=0 or float(self.CM.Env["RecvLoss"])!=0 : self.CM.savecomm_node(node) if not self.CM.isolate_node(member,allownodes1): return self.failure("Could not isolate the nodes") for member in partition2: if float(self.CM.Env["XmitLoss"])!=0 or float(self.CM.Env["RecvLoss"])!=0 : self.CM.savecomm_node(node) if not self.CM.isolate_node(member,allownodes2): return self.failure("Could not isolate the nodes") if not watchdead.lookforall(): for member in self.CM.Env["nodes"]: self.CM.unisolate_node(member) self.CM.log("Patterns not found: " + repr(watchdead.unmatched)) return self.failure("Didn't find the log 'dead' message") dcnum=0 while 
dcnum < 2: dcnum = 0 for member in self.CM.Env["nodes"]: if self.CM.is_node_dc(member): dcnum += 1 time.sleep(1) ''' Unisolate the node, look for the return partition message and check whether they restart ''' watchpartitionpats = [self.CM["Pat:DC_IDLE"]] partitionpat = self.CM["Pat:Return_partition"] for member in self.CM.Env["nodes"]: thispat = (partitionpat % member) watchpartitionpats.append(thispat) watchpartition = CTS.LogWatcher(self.CM["LogFileName"], watchpartitionpats\ , timeout=self.CM["DeadTime"]+60) watchpartition.setwatch() for member in self.CM.Env["nodes"]: if float(self.CM.Env["XmitLoss"])!=0 or float(self.CM.Env["RecvLoss"])!=0 : self.CM.restorecomm_node(node) self.CM.unisolate_node(member) if not watchpartition.lookforall(): self.CM.log("Patterns not found: " + repr(watchpartition.unmatched)) return self.failure("Didn't find return from partition messages") return self.success() def is_applicable(self): if self.CM["Name"] == "linux-ha-v2": return 1 return 0 def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [ "ERROR:.*Both machines own.*resources" , "ERROR:.*lost a lot of packets!" 
, "ERROR: Cannot rexmit pkt .*: seqno too low" , "ERROR: Irretrievably lost packet: node" ] #AllTestClasses.append(Split_brainTest2) #################################################################### class MemoryTest(CTSTest): #################################################################### '''Check to see if anyone is leaking memory''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Memory" # self.test = ElectionMemoryTest(cm) self.test = ResourceRecover(cm) self.startall = SimulStartLite(cm) self.before = {} self.after = {} def __call__(self, node): ps_command='''ps -eo ucomm,pid,pmem,tsiz,dsiz,rss,vsize | grep -e ccm -e ha_logd -e cib -e crmd -e lrmd -e tengine -e pengine''' memory_error = [ "", "", "", "Code", "Data", "Resident", "Total" ] ret = self.startall(None) if not ret: return self.failure("Test setup failed") time.sleep(10) for node in self.CM.Env["nodes"]: self.before[node] = {} rsh_pipe = self.CM.rsh.popen(node, ps_command) rsh_pipe.tochild.close() result = rsh_pipe.fromchild.readline() while result: tokens = result.split() self.before[node][tokens[1]] = result result = rsh_pipe.fromchild.readline() rsh_pipe.fromchild.close() self.lastrc = rsh_pipe.wait() # do something... 
if not self.test(node): return self.failure("Underlying test failed") time.sleep(10) for node in self.CM.Env["nodes"]: self.after[node] = {} rsh_pipe = self.CM.rsh.popen(node, ps_command) rsh_pipe.tochild.close() result = rsh_pipe.fromchild.readline() while result: tokens = result.split() self.after[node][tokens[1]] = result result = rsh_pipe.fromchild.readline() rsh_pipe.fromchild.close() self.lastrc = rsh_pipe.wait() failed_nodes = [] for node in self.CM.Env["nodes"]: failed = 0 for process in self.before[node]: messages = [] before_line = self.before[node][process] after_line = self.after[node][process] if not after_line: self.CM.log("%s %s[%s] exited during the test" %(node, before_tokens[0], before_tokens[1])) continue before_tokens = before_line.split() after_tokens = after_line.split() # 3 : Code size # 4 : Data size # 5 : Resident size # 6 : Total size for index in [ 3, 4, 6 ]: mem_before = int(before_tokens[index]) mem_after = int(after_tokens[index]) mem_diff = mem_after - mem_before mem_allow = mem_before * 0.01 # for now... 
mem_allow = 0 if mem_diff > mem_allow: failed = 1 messages.append("%s size grew by %dkB (%dkB)" %(memory_error[index], mem_diff, mem_after)) elif mem_diff < 0: messages.append("%s size shrank by %dkB (%dkB)" %(memory_error[index], mem_diff, mem_after)) if len(messages) > 0: self.CM.log("Process %s[%s] on %s: %s" %(before_tokens[0], before_tokens[1], node, repr(messages))) self.CM.debug("%s Before: %s[%s] (%s%%):\tcode=%skB, data=%skB, resident=%skB, total=%skB" %(node, before_tokens[0], before_tokens[1], before_tokens[2], before_tokens[3], before_tokens[4], before_tokens[5], before_tokens[6])) self.CM.debug("%s After: %s[%s] (%s%%):\tcode=%skB, data=%skB, resident=%skB, total=%skB" %(node, after_tokens[0], after_tokens[1], after_tokens[2], after_tokens[3], after_tokens[4], after_tokens[5], after_tokens[6])) if failed == 1: failed_nodes.append(node) if len(failed_nodes) > 0: return self.failure("Memory leaked on: " + repr(failed_nodes)) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [ """ERROR: .* LRM operation.*monitor on .*: not running""", """pengine:.*Handling failed """] def is_applicable(self): if self.CM["Name"] == "linux-ha-v2": return 1 return 0 #AllTestClasses.append(MemoryTest) #################################################################### class ElectionMemoryTest(CTSTest): #################################################################### '''Check to see if anyone is leaking memory''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Election" def __call__(self, node): self.rsh.readaline(node, self.CM["ElectionCmd"]%node) if self.CM.cluster_stable(): return self.success() return self.failure("Cluster not stable") def errorstoignore(self): '''Return list of errors which should be ignored''' return [] def is_applicable(self): '''Never applicable, only for use by the memory test''' return 0 AllTestClasses.append(ElectionMemoryTest) 
#################################################################### class SpecialTest1(CTSTest): #################################################################### '''Set up a custom test to cause quorum failure issues for Andrew''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SpecialTest1" self.startall = SimulStartLite(cm) self.restart1 = RestartTest(cm) self.stopall = SimulStopLite(cm) def __call__(self, node): '''Perform the 'SpecialTest1' test for Andrew. ''' self.incr("calls") # Shut down all the nodes... ret = self.stopall(None) if not ret: return ret # Start the selected node ret = self.restart1(node) if not ret: return ret # Start all remaining nodes ret = self.startall(None) return ret def errorstoignore(self): '''Return list of errors which should be ignored''' return [] def is_applicable(self): return 1 AllTestClasses.append(SpecialTest1) ################################################################### class NearQuorumPointTest(CTSTest): ################################################################### ''' This test brings larger clusters near the quorum point (50%). In addition, it will test doing starts and stops at the same time. Here is how I think it should work: - loop over the nodes and decide randomly which will be up and which will be down Use a 50% probability for each of up/down. - figure out what to do to get into that state from the current state - in parallel, bring up those going up and bring those going down. ''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="NearQuorumPoint" def __call__(self, dummy): '''Perform the 'NearQuorumPoint' test. 
''' self.incr("calls") startset = [] stopset = [] #decide what to do with each node for node in self.CM.Env["nodes"]: action = self.CM.Env.RandomGen.choice(["start","stop"]) #action = self.CM.Env.RandomGen.choice(["start","stop","no change"]) if action == "start" : startset.append(node) elif action == "stop" : stopset.append(node) self.CM.debug("start nodes:" + repr(startset)) self.CM.debug("stop nodes:" + repr(stopset)) #add search patterns watchpats = [ ] for node in stopset: if self.CM.ShouldBeStatus[node] == self.CM["up"]: watchpats.append(self.CM["Pat:We_stopped"] % node) for node in startset: if self.CM.ShouldBeStatus[node] == self.CM["down"]: watchpats.append(self.CM["Pat:They_started"] % node) if len(watchpats) == 0: return self.skipped() watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats , timeout=self.CM["DeadTime"]+10) watch.setwatch() #begin actions for node in stopset: if self.CM.ShouldBeStatus[node] == self.CM["up"]: self.CM.StopaCMnoBlock(node) for node in startset: if self.CM.ShouldBeStatus[node] == self.CM["down"]: self.CM.StartaCMnoBlock(node) #get the result if watch.lookforall(): self.CM.cluster_stable() return self.success() self.CM.log("Warn: Patterns not found: " + repr(watch.unmatched)) #get the "bad" nodes upnodes = [] for node in stopset: if self.CM.StataCM(node) == 1: upnodes.append(node) downnodes = [] for node in startset: if self.CM.StataCM(node) == 0: downnodes.append(node) if upnodes == [] and downnodes == []: self.CM.cluster_stable() return self.success() if len(upnodes) > 0: self.CM.log("Warn: Unstoppable nodes: " + repr(upnodes)) if len(downnodes) > 0: self.CM.log("Warn: Unstartable nodes: " + repr(downnodes)) return self.failure() def errorstoignore(self): '''Return list of errors which should be ignored''' return [] def is_applicable(self): if self.CM["Name"] == "linux-ha-v2": return 1 return 0 AllTestClasses.append(NearQuorumPointTest) ################################################################### class 
BSC_AddResource(CTSTest): ################################################################### '''Add a resource to the cluster''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name="AddResource" self.resource_offset = 0 self.cib_cmd="""@sbindir@/cibadmin -C -o %s -X '%s' """ def __call__(self, node): self.resource_offset = self.resource_offset + 1 r_id = "bsc-rsc-%s-%d" % (node, self.resource_offset) start_pat = "crmd.*%s_start_0.*complete" patterns = [] patterns.append(start_pat % r_id) watch = CTS.LogWatcher( self.CM["LogFileName"], patterns, self.CM["DeadTime"]) watch.setwatch() fields = string.split(self.CM.Env["IPBase"], '.') fields[3] = str(int(fields[3])+1) ip = string.join(fields, '.') self.CM.Env["IPBase"] = ip if not self.make_ip_resource(node, r_id, "ocf", "IPaddr", ip): return self.failure("Make resource %s failed" % r_id) failed = 0 watch_result = watch.lookforall() if watch.unmatched: for regex in watch.unmatched: self.CM.log ("Warn: Pattern not found: %s" % (regex)) failed = 1 if failed: return self.failure("Resource pattern(s) not found") if not self.CM.cluster_stable(self.CM["DeadTime"]): return self.failure("Unstable cluster") return self.success() def make_ip_resource(self, node, id, rclass, type, ip): self.CM.log("Creating %s::%s:%s (%s) on %s" % (rclass,type,id,ip,node)) rsc_xml=""" """ % (id, rclass, type, id, id, ip) node_constraint=""" """ % (id, id, id, id, node) rc = 0 (rc, lines) = self.CM.rsh.remote_py(node, "os", "system", self.cib_cmd % ("constraints", node_constraint)) if rc != 0: self.CM.log("Constraint creation failed: %d" % rc) return None (rc, lines) = self.CM.rsh.remote_py(node, "os", "system", self.cib_cmd % ("resources", rsc_xml)) if rc != 0: self.CM.log("Resource creation failed: %d" % rc) return None return 1 def is_applicable(self): if self.CM["Name"] == "linux-ha-v2" and self.CM.Env["DoBSC"]: return 1 return None def TestList(cm): result = [] for testclass in AllTestClasses: bound_test = testclass(cm) if 
bound_test.is_applicable(): result.append(bound_test) return result class SimulStopLite(CTSTest): ################################################################### '''Stop any active nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStopLite" def __call__(self, dummy): '''Perform the 'SimulStopLite' setup work. ''' self.incr("calls") self.CM.debug("Setup: " + self.name) # We ignore the "node" parameter... watchpats = [ ] for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["up"]: self.incr("WasStarted") watchpats.append(self.CM["Pat:All_stopped"] % node) if self.CM.Env["use_logd"]: watchpats.append(self.CM["Pat:Logd_stopped"] % node) if len(watchpats) == 0: self.CM.clear_all_caches() return self.skipped() # Stop all the nodes - at about the same time... watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats , timeout=self.CM["DeadTime"]+10) watch.setwatch() self.starttime=time.time() for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["up"]: self.CM.StopaCMnoBlock(node) if watch.lookforall(): self.CM.clear_all_caches() return self.success() did_fail=0 up_nodes = [] for node in self.CM.Env["nodes"]: if self.CM.StataCM(node) == 1: did_fail=1 up_nodes.append(node) if did_fail: return self.failure("Active nodes exist: " + repr(up_nodes)) self.CM.log("Warn: All nodes stopped but CTS didnt detect: " + repr(watch.unmatched)) self.CM.clear_all_caches() return self.failure("Missing log message: "+repr(watch.unmatched)) def is_applicable(self): '''SimulStopLite is a setup test and never applicable''' return 0 ################################################################### class SimulStartLite(CTSTest): ################################################################### '''Start any stopped nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStartLite" def __call__(self, dummy): '''Perform the 'SimulStartList' setup work. 
''' self.incr("calls") self.CM.debug("Setup: " + self.name) # We ignore the "node" parameter... watchpats = [ ] for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["down"]: self.incr("WasStopped") watchpats.append(self.CM["Pat:They_started"] % node) if len(watchpats) == 0: return self.skipped() # Start all the nodes - at about the same time... watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats , timeout=self.CM["DeadTime"]+10) watch.setwatch() self.starttime=time.time() for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["down"]: self.CM.StartaCMnoBlock(node) if watch.lookforall(): for attempt in (1, 2, 3, 4, 5): if self.CM.cluster_stable(): return self.success() return self.failure("Cluster did not stabilize") did_fail=0 unstable = [] for node in self.CM.Env["nodes"]: if self.CM.StataCM(node) == 0: did_fail=1 unstable.append(node) if did_fail: return self.failure("Unstarted nodes exist: " + repr(unstable)) unstable = [] for node in self.CM.Env["nodes"]: if not self.CM.node_stable(node): did_fail=1 unstable.append(node) if did_fail: return self.failure("Unstable cluster nodes exist: " + repr(unstable)) self.CM.log("ERROR: All nodes started but CTS didnt detect: " + repr(watch.unmatched)) return self.failure() def is_applicable(self): '''SimulStartLite is a setup test and never applicable''' return 0 ################################################################### class LoggingTest(CTSTest): ################################################################### def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Logging" def __call__(self, dummy): '''Perform the 'Logging' test. ''' self.incr("calls") # Make sure logging is working and we have enough disk space... 
if not self.CM.TestLogging(): sys.exit(1) if not self.CM.CheckDf(): sys.exit(1) def is_applicable(self): '''ResourceRecover is applicable only when there are resources running on our cluster and environment is linux-ha-v2''' return self.CM.Env["DoBSC"] def errorstoignore(self): '''Return list of errors which should be ignored''' return [] #AllTestClasses.append(LoggingTest) diff --git a/lib/crm/common/utils.c b/lib/crm/common/utils.c index 3837512dbf..718a73a0ab 100644 --- a/lib/crm/common/utils.c +++ b/lib/crm/common/utils.c @@ -1,1728 +1,1655 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #ifndef _GNU_SOURCE # define _GNU_SOURCE #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef MAXLINE # define MAXLINE 512 #endif static uint ref_counter = 0; gboolean crm_assert_failed = FALSE; unsigned int crm_log_level = LOG_INFO; gboolean crm_config_error = FALSE; gboolean crm_config_warning = FALSE; void crm_set_env_options(void); gboolean check_time(const char *value) { if(crm_get_msec(value) < 5000) { return FALSE; } return TRUE; } gboolean check_timer(const char *value) { if(crm_get_msec(value) < 0) { return FALSE; } return TRUE; } gboolean check_boolean(const char *value) { int tmp = FALSE; if(crm_str_to_boolean(value, &tmp) != 1) { return FALSE; } return TRUE; } gboolean check_number(const char *value) { errno = 0; if(value == NULL) { return FALSE; } else if(safe_str_eq(value, MINUS_INFINITY_S)) { } else if(safe_str_eq(value, INFINITY_S)) { } else { crm_int_helper(value, NULL); } if(errno != 0) { return FALSE; } return TRUE; } int char2score(const char *score) { int score_f = 0; if(score == NULL) { } else if(safe_str_eq(score, MINUS_INFINITY_S)) { score_f = -INFINITY; } else if(safe_str_eq(score, INFINITY_S)) { score_f = INFINITY; } else if(safe_str_eq(score, "+"INFINITY_S)) { score_f = INFINITY; } else { score_f = crm_parse_int(score, NULL); if(score_f > 0 && score_f > INFINITY) { score_f = INFINITY; } else if(score_f < 0 && score_f < -INFINITY) { score_f = -INFINITY; } } return score_f; } char * score2char(int score) { if(score >= INFINITY) { return crm_strdup("+"INFINITY_S); } else if(score <= -INFINITY) { return crm_strdup("-"INFINITY_S); } 
return crm_itoa(score); } const char * cluster_option(GHashTable* options, gboolean(*validate)(const char*), const char *name, const char *old_name, const char *def_value) { const char *value = NULL; CRM_ASSERT(name != NULL); if(options != NULL) { value = g_hash_table_lookup(options, name); } if(value == NULL && old_name && options != NULL) { value = g_hash_table_lookup(options, old_name); if(value != NULL) { crm_config_warn("Using deprecated name '%s' for" " cluster option '%s'", old_name, name); g_hash_table_insert( options, crm_strdup(name), crm_strdup(value)); value = g_hash_table_lookup(options, old_name); } } if(value == NULL) { crm_notice("Using default value '%s' for cluster option '%s'", def_value, name); if(options == NULL) { return def_value; } g_hash_table_insert( options, crm_strdup(name), crm_strdup(def_value)); value = g_hash_table_lookup(options, name); } if(validate && validate(value) == FALSE) { crm_config_err("Value '%s' for cluster option '%s' is invalid." " Defaulting to %s", value, name, def_value); g_hash_table_replace(options, crm_strdup(name), crm_strdup(def_value)); value = g_hash_table_lookup(options, name); } return value; } const char * get_cluster_pref(GHashTable *options, pe_cluster_option *option_list, int len, const char *name) { int lpc = 0; const char *value = NULL; gboolean found = FALSE; for(lpc = 0; lpc < len; lpc++) { if(safe_str_eq(name, option_list[lpc].name)) { found = TRUE; value = cluster_option(options, option_list[lpc].is_valid, option_list[lpc].name, option_list[lpc].alt_name, option_list[lpc].default_value); } } CRM_CHECK(found, crm_err("No option named: %s", name)); CRM_ASSERT(value != NULL); return value; } void config_metadata(const char *name, const char *version, const char *desc_short, const char *desc_long, pe_cluster_option *option_list, int len) { int lpc = 0; fprintf(stdout, "" "\n" "\n" " %s\n" " %s\n" " %s\n" " \n", name, version, desc_long, desc_short); for(lpc = 0; lpc < len; lpc++) { 
if(option_list[lpc].description_long == NULL && option_list[lpc].description_short == NULL) { continue; } fprintf(stdout, " \n" " %s\n" " \n" " %s%s%s\n" " \n", option_list[lpc].name, option_list[lpc].description_short, option_list[lpc].type, option_list[lpc].default_value, option_list[lpc].description_long?option_list[lpc].description_long:option_list[lpc].description_short, option_list[lpc].values?" Allowed values: ":"", option_list[lpc].values?option_list[lpc].values:""); } fprintf(stdout, " \n\n"); } void verify_all_options(GHashTable *options, pe_cluster_option *option_list, int len) { int lpc = 0; for(lpc = 0; lpc < len; lpc++) { cluster_option(options, option_list[lpc].is_valid, option_list[lpc].name, option_list[lpc].alt_name, option_list[lpc].default_value); } } char * generateReference(const char *custom1, const char *custom2) { const char *local_cust1 = custom1; const char *local_cust2 = custom2; int reference_len = 4; char *since_epoch = NULL; reference_len += 20; /* too big */ reference_len += 40; /* too big */ if(local_cust1 == NULL) { local_cust1 = "_empty_"; } reference_len += strlen(local_cust1); if(local_cust2 == NULL) { local_cust2 = "_empty_"; } reference_len += strlen(local_cust2); crm_malloc0(since_epoch, reference_len); if(since_epoch != NULL) { sprintf(since_epoch, "%s-%s-%ld-%u", local_cust1, local_cust2, (unsigned long)time(NULL), ref_counter++); } return since_epoch; } gboolean decodeNVpair(const char *srcstring, char separator, char **name, char **value) { int lpc = 0; int len = 0; const char *temp = NULL; CRM_ASSERT(name != NULL && value != NULL); *name = NULL; *value = NULL; crm_debug_4("Attempting to decode: [%s]", srcstring); if (srcstring != NULL) { len = strlen(srcstring); while(lpc <= len) { if (srcstring[lpc] == separator) { crm_malloc0(*name, lpc+1); if(*name == NULL) { break; /* and return FALSE */ } strncpy(*name, srcstring, lpc); (*name)[lpc] = '\0'; /* this sucks but as the strtok manpage says.. 
* it *is* a bug */ len = len-lpc; len--; if(len <= 0) { *value = NULL; } else { crm_malloc0(*value, len+1); if(*value == NULL) { crm_free(*name); break; /* and return FALSE */ } temp = srcstring+lpc+1; strncpy(*value, temp, len); (*value)[len] = '\0'; } return TRUE; } lpc++; } } if(*name != NULL) { crm_free(*name); } *name = NULL; *value = NULL; return FALSE; } char * crm_concat(const char *prefix, const char *suffix, char join) { int len = 0; char *new_str = NULL; CRM_ASSERT(prefix != NULL); CRM_ASSERT(suffix != NULL); len = strlen(prefix) + strlen(suffix) + 2; crm_malloc0(new_str, (len)); sprintf(new_str, "%s%c%s", prefix, join, suffix); new_str[len-1] = 0; return new_str; } char * generate_hash_key(const char *crm_msg_reference, const char *sys) { char *hash_key = crm_concat(sys?sys:"none", crm_msg_reference, '_'); crm_debug_3("created hash key: (%s)", hash_key); return hash_key; } char * generate_hash_value(const char *src_node, const char *src_subsys) { char *hash_value = NULL; if (src_node == NULL || src_subsys == NULL) { return NULL; } if (strcasecmp(CRM_SYSTEM_DC, src_subsys) == 0) { hash_value = crm_strdup(src_subsys); if (!hash_value) { crm_err("memory allocation failed in " "generate_hash_value()"); } return hash_value; } hash_value = crm_concat(src_node, src_subsys, '_'); crm_info("created hash value: (%s)", hash_value); return hash_value; } char * crm_itoa(int an_int) { int len = 32; char *buffer = NULL; crm_malloc0(buffer, (len+1)); if(buffer != NULL) { snprintf(buffer, len, "%d", an_int); } return buffer; } extern int LogToLoggingDaemon(int priority, const char * buf, int bstrlen, gboolean use_pri_str); gboolean crm_log_init(const char *entity, gboolean coredir) { /* const char *test = "Testing log daemon connection"; */ /* Redirect messages from glib functions to our handler */ /* cl_malloc_forced_for_glib(); */ g_log_set_handler(NULL, G_LOG_LEVEL_ERROR | G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING | G_LOG_LEVEL_MESSAGE | G_LOG_LEVEL_INFO | 
G_LOG_LEVEL_DEBUG | G_LOG_FLAG_RECURSION | G_LOG_FLAG_FATAL, cl_glib_msg_handler, NULL); /* and for good measure... - this enum is a bit field (!) */ g_log_set_always_fatal((GLogLevelFlags)0); /*value out of range*/ cl_log_set_entity(entity); - cl_log_set_facility(LOG_LOCAL7); + cl_log_set_facility(LOG_DAEMON); if(coredir) { cl_set_corerootdir(HA_COREDIR); cl_cdtocoredir(); } - crm_set_env_options(); + cl_inherit_logging_environment(500); CL_SIGNAL(DEBUG_INC, alter_debug); CL_SIGNAL(DEBUG_DEC, alter_debug); return TRUE; } /* returns the old value */ unsigned int set_crm_log_level(unsigned int level) { unsigned int old = crm_log_level; while(crm_log_level < 100 && crm_log_level < level) { alter_debug(DEBUG_INC); } while(crm_log_level > 0 && crm_log_level > level) { alter_debug(DEBUG_DEC); } return old; } unsigned int get_crm_log_level(void) { return crm_log_level; } void crm_log_message_adv(int level, const char *prefix, const HA_Message *msg) { if((int)crm_log_level >= level) { do_crm_log(level, "#========= %s message start ==========#", prefix?prefix:""); if(level > LOG_DEBUG) { cl_log_message(LOG_DEBUG, msg); } else { cl_log_message(level, msg); } } } int compare_version(const char *version1, const char *version2) { int rc = 0; int lpc = 0; char *step1 = NULL, *step2 = NULL; char *rest1 = NULL, *rest2 = NULL; if(version1 == NULL && version2 == NULL) { return 0; } else if(version1 == NULL) { return -1; } else if(version2 == NULL) { return 1; } rest1 = crm_strdup(version1); rest2 = crm_strdup(version2); while(1) { int cmp = 0; int step1_i = 0; int step2_i = 0; char *tmp1 = NULL, *tmp2 = NULL; decodeNVpair(rest1, '.', &step1, &tmp1); decodeNVpair(rest2, '.', &step2, &tmp2); if(step1 == NULL && step2 == NULL) { CRM_CHECK(tmp1 == tmp2 && tmp1 == NULL, crm_err("Leftover data: %s, %s", crm_str(tmp1), crm_str(tmp2))); crm_free(tmp1); crm_free(tmp2); break; } if(step1 != NULL) { step1_i = crm_parse_int(step1, NULL); } if(step2 != NULL) { step2_i = crm_parse_int(step2, 
NULL); } if(step1_i < step2_i){ cmp = -1; } else if (step1_i > step2_i){ cmp = 1; } crm_debug_4("compare[%d (%d)]: %d(%s) %d(%s)", lpc++, cmp, step1_i, crm_str(step1), step2_i, crm_str(step2)); crm_free(rest1); crm_free(rest2); crm_free(step1); crm_free(step2); rest1 = tmp1; rest2 = tmp2; if(cmp < 0) { rc = -1; break; } else if(cmp > 0) { rc = 1; break; } } crm_free(rest1); crm_free(rest2); if(rc == 0) { crm_debug_3("%s == %s", version1, version2); } else if(rc < 0) { crm_debug_3("%s < %s", version1, version2); } else if(rc > 0) { crm_debug_3("%s > %s", version1, version2); } return rc; } gboolean do_stderr = FALSE; void alter_debug(int nsig) { CL_SIGNAL(DEBUG_INC, alter_debug); CL_SIGNAL(DEBUG_DEC, alter_debug); switch(nsig) { case DEBUG_INC: if (crm_log_level < 100) { crm_log_level++; } break; case DEBUG_DEC: if (crm_log_level > 0) { crm_log_level--; } break; default: fprintf(stderr, "Unknown signal %d\n", nsig); cl_log(LOG_ERR, "Unknown signal %d", nsig); break; } } void g_hash_destroy_str(gpointer data) { crm_free(data); } int crm_int_helper(const char *text, char **end_text) { int atoi_result = -1; char *local_end_text = NULL; errno = 0; if(text != NULL) { if(end_text != NULL) { atoi_result = (int)strtol(text, end_text, 10); } else { atoi_result = (int)strtol(text, &local_end_text, 10); } /* CRM_CHECK(errno != EINVAL); */ if(errno == EINVAL) { crm_err("Conversion of %s failed", text); atoi_result = -1; } else { if(errno == ERANGE) { crm_err("Conversion of %s was clipped", text); } if(end_text == NULL && local_end_text[0] != '\0') { crm_err("Characters left over after parsing " "\"%s\": \"%s\"", text, local_end_text); } } } return atoi_result; } int crm_parse_int(const char *text, const char *default_text) { int atoi_result = -1; if(text != NULL) { atoi_result = crm_int_helper(text, NULL); if(errno == 0) { return atoi_result; } } if(default_text != NULL) { atoi_result = crm_int_helper(default_text, NULL); if(errno == 0) { return atoi_result; } } else { 
crm_err("No default conversion value supplied"); } return -1; } gboolean crm_str_eq(const char *a, const char *b, gboolean use_case) { if(a == NULL || b == NULL) { /* shouldn't be comparing NULLs */ CRM_CHECK(a != b, return TRUE); return FALSE; } else if(use_case && a[0] != b[0]) { return FALSE; } else if(a == b) { return TRUE; } else if(strcasecmp(a, b) == 0) { return TRUE; } return FALSE; } gboolean safe_str_neq(const char *a, const char *b) { if(a == b) { return FALSE; } else if(a==NULL || b==NULL) { return TRUE; } else if(strcasecmp(a, b) == 0) { return FALSE; } return TRUE; } char * crm_strdup_fn(const char *src, const char *file, const char *fn, int line) { char *dup = NULL; CRM_CHECK(src != NULL, return NULL); crm_malloc0(dup, strlen(src) + 1); return strcpy(dup, src); } static GHashTable *crm_uuid_cache = NULL; static GHashTable *crm_uname_cache = NULL; void empty_uuid_cache(void) { if(crm_uuid_cache != NULL) { g_hash_table_destroy(crm_uuid_cache); crm_uuid_cache = NULL; } } void unget_uuid(const char *uname) { if(crm_uuid_cache == NULL) { return; } g_hash_table_remove(crm_uuid_cache, uname); } const char * get_uuid(ll_cluster_t *hb, const char *uname) { cl_uuid_t uuid_raw; char *uuid_calc = NULL; const char *unknown = "00000000-0000-0000-0000-000000000000"; if(crm_uuid_cache == NULL) { crm_uuid_cache = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); } CRM_CHECK(uname != NULL, return NULL); /* avoid blocking calls where possible */ uuid_calc = g_hash_table_lookup(crm_uuid_cache, uname); if(uuid_calc != NULL) { return uuid_calc; } if(hb->llc_ops->get_uuid_by_name(hb, uname, &uuid_raw) == HA_FAIL) { crm_err("get_uuid_by_name() call failed for host %s", uname); crm_free(uuid_calc); return NULL; } crm_malloc0(uuid_calc, 50); if(uuid_calc == NULL) { return NULL; } cl_uuid_unparse(&uuid_raw, uuid_calc); if(safe_str_eq(uuid_calc, unknown)) { crm_warn("Could not calculate UUID for %s", uname); crm_free(uuid_calc); return 
NULL; } g_hash_table_insert(crm_uuid_cache, crm_strdup(uname), uuid_calc); uuid_calc = g_hash_table_lookup(crm_uuid_cache, uname); return uuid_calc; } const char * get_uname(ll_cluster_t *hb, const char *uuid) { char *uname = NULL; if(crm_uuid_cache == NULL) { crm_uname_cache = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); } CRM_CHECK(uuid != NULL, return NULL); /* avoid blocking calls where possible */ uname = g_hash_table_lookup(crm_uname_cache, uuid); if(uname != NULL) { return uname; } if(uuid != NULL) { cl_uuid_t uuid_raw; char *uuid_copy = crm_strdup(uuid); cl_uuid_parse(uuid_copy, &uuid_raw); if(hb->llc_ops->get_name_by_uuid( hb, &uuid_raw, uname, 256) == HA_FAIL) { crm_err("Could not calculate UUID for %s", uname); uname = NULL; crm_free(uuid_copy); } else { g_hash_table_insert( crm_uuid_cache, uuid_copy, crm_strdup(uname)); uname = g_hash_table_lookup(crm_uname_cache, uuid); } return uname; } return NULL; } void set_uuid(ll_cluster_t *hb,crm_data_t *node,const char *attr,const char *uname) { const char *uuid_calc = get_uuid(hb, uname); crm_xml_add(node, attr, uuid_calc); return; } #define ENV_PREFIX "HA_" void crm_set_env_options(void) { - char *param_val = NULL; - const char *param_name = NULL; - - /* apparently we're not allowed to free the result of getenv */ - - param_name = ENV_PREFIX "" KEY_DEBUGLEVEL; - param_val = getenv(param_name); - if(param_val != NULL) { - int debug_level = crm_parse_int(param_val, NULL); - if(debug_level > 0 && (debug_level+LOG_INFO) > (int)crm_log_level) { - set_crm_log_level(LOG_INFO + debug_level); - } - crm_debug("%s = %s", param_name, param_val); - param_val = NULL; - } - - param_name = ENV_PREFIX "" KEY_FACILITY; - param_val = getenv(param_name); - crm_debug("%s = %s", param_name, param_val); - if(param_val != NULL) { - int facility = cl_syslogfac_str2int(param_val); - if(facility >= 0) { - cl_log_set_facility(facility); - } - param_val = NULL; - } - - param_name = ENV_PREFIX 
"" KEY_LOGFILE; - param_val = getenv(param_name); - crm_debug("%s = %s", param_name, param_val); - if(param_val != NULL) { - if(safe_str_eq("/dev/null", param_val)) { - param_val = NULL; - } - cl_log_set_logfile(param_val); - param_val = NULL; - } - - param_name = ENV_PREFIX "" KEY_DBGFILE; - param_val = getenv(param_name); - crm_debug("%s = %s", param_name, param_val); - if(param_val != NULL) { - if(safe_str_eq("/dev/null", param_val)) { - param_val = NULL; - } - cl_log_set_debugfile(param_val); - param_val = NULL; - } - - param_name = ENV_PREFIX "" KEY_LOGDAEMON; - param_val = getenv(param_name); - crm_debug("%s = %s", param_name, param_val); - if(param_val != NULL) { - int uselogd; - cl_str_to_boolean(param_val, &uselogd); - cl_log_set_uselogd(uselogd); - if(uselogd) { - cl_set_logging_wqueue_maxlen(500); - cl_log_set_logd_channel_source(NULL, NULL); - } - param_val = NULL; - } - - param_name = ENV_PREFIX "" KEY_CONNINTVAL; - param_val = getenv(param_name); - crm_debug("%s = %s", param_name, param_val); - if(param_val != NULL) { - int logdtime; - logdtime = crm_get_msec(param_val); - cl_log_set_logdtime(logdtime); - param_val = NULL; - } - - inherit_compress(); + cl_inherit_logging_environment(500); } gboolean crm_is_true(const char * s) { gboolean ret = FALSE; if(s != NULL) { cl_str_to_boolean(s, &ret); } return ret; } int crm_str_to_boolean(const char * s, int * ret) { if(s == NULL) { return -1; } else if (strcasecmp(s, "true") == 0 || strcasecmp(s, "on") == 0 || strcasecmp(s, "yes") == 0 || strcasecmp(s, "y") == 0 || strcasecmp(s, "1") == 0){ *ret = TRUE; return 1; } else if (strcasecmp(s, "false") == 0 || strcasecmp(s, "off") == 0 || strcasecmp(s, "no") == 0 || strcasecmp(s, "n") == 0 || strcasecmp(s, "0") == 0){ *ret = FALSE; return 1; } return -1; } #ifndef NUMCHARS # define NUMCHARS "0123456789." 
#endif #ifndef WHITESPACE # define WHITESPACE " \t\n\r\f" #endif long crm_get_msec(const char * input) { const char * cp = input; const char * units; long multiplier = 1000; long divisor = 1; long ret = -1; double dret; if(input == NULL) { return 0; } cp += strspn(cp, WHITESPACE); units = cp + strspn(cp, NUMCHARS); units += strspn(units, WHITESPACE); if (strchr(NUMCHARS, *cp) == NULL) { return ret; } if (strncasecmp(units, "ms", 2) == 0 || strncasecmp(units, "msec", 4) == 0) { multiplier = 1; divisor = 1; }else if (strncasecmp(units, "us", 2) == 0 || strncasecmp(units, "usec", 4) == 0) { multiplier = 1; divisor = 1000; }else if (strncasecmp(units, "s", 1) == 0 || strncasecmp(units, "sec", 3) == 0) { multiplier = 1000; divisor = 1; }else if (strncasecmp(units, "m", 1) == 0 || strncasecmp(units, "min", 3) == 0) { multiplier = 60*1000; divisor = 1; }else if (strncasecmp(units, "h", 1) == 0 || strncasecmp(units, "hr", 2) == 0) { multiplier = 60*60*1000; divisor = 1; }else if (*units != EOS && *units != '\n' && *units != '\r') { return ret; } dret = atof(cp); dret *= (double)multiplier; dret /= (double)divisor; dret += 0.5; ret = (long)dret; return(ret); } gboolean ccm_have_quorum(oc_ed_t event) { if(event==OC_EV_MS_NEW_MEMBERSHIP) { return TRUE; } return FALSE; } const char * ccm_event_name(oc_ed_t event) { if(event==OC_EV_MS_NEW_MEMBERSHIP) { return "NEW MEMBERSHIP"; } else if(event==OC_EV_MS_NOT_PRIMARY) { return "NOT PRIMARY"; } else if(event==OC_EV_MS_PRIMARY_RESTORED) { return "PRIMARY RESTORED"; } else if(event==OC_EV_MS_EVICTED) { return "EVICTED"; } else if(event==OC_EV_MS_INVALID) { return "INVALID"; } return "NO QUORUM MEMBERSHIP"; } const char * op_status2text(op_status_t status) { switch(status) { case LRM_OP_PENDING: return "pending"; break; case LRM_OP_DONE: return "complete"; break; case LRM_OP_ERROR: return "Error"; break; case LRM_OP_TIMEOUT: return "Timed Out"; break; case LRM_OP_NOTSUPPORTED: return "NOT SUPPORTED"; break; case LRM_OP_CANCELLED: 
return "Cancelled"; break; } CRM_CHECK(status >= LRM_OP_PENDING && status <= LRM_OP_CANCELLED, crm_err("Unknown status: %d", status)); return "UNKNOWN!"; } char * generate_op_key(const char *rsc_id, const char *op_type, int interval) { int len = 35; char *op_id = NULL; CRM_CHECK(rsc_id != NULL, return NULL); CRM_CHECK(op_type != NULL, return NULL); len += strlen(op_type); len += strlen(rsc_id); crm_malloc0(op_id, len); CRM_CHECK(op_id != NULL, return NULL); sprintf(op_id, "%s_%s_%d", rsc_id, op_type, interval); return op_id; } gboolean parse_op_key(const char *key, char **rsc_id, char **op_type, int *interval) { char *mutable_key = NULL; char *mutable_key_ptr = NULL; int len = 0, offset = 0, ch = 0; CRM_CHECK(key != NULL, return FALSE); *interval = 0; len = strlen(key); offset = len-1; crm_debug_3("Source: %s", key); while(offset > 0 && isdigit(key[offset])) { int digits = len-offset; ch = key[offset] - '0'; CRM_CHECK(ch < 10, return FALSE); CRM_CHECK(ch >= 0, return FALSE); while(digits > 1) { digits--; ch = ch * 10; } *interval += ch; offset--; } crm_debug_3(" Interval: %d", *interval); CRM_CHECK(key[offset] == '_', return FALSE); mutable_key = crm_strdup(key); mutable_key_ptr = mutable_key_ptr; mutable_key[offset] = 0; offset--; while(offset > 0 && key[offset] != '_') { offset--; } CRM_CHECK(key[offset] == '_', crm_free(mutable_key); return FALSE); mutable_key_ptr = mutable_key+offset+1; crm_debug_3(" Action: %s", mutable_key_ptr); *op_type = crm_strdup(mutable_key_ptr); mutable_key[offset] = 0; offset--; CRM_CHECK(mutable_key != mutable_key_ptr, crm_free(mutable_key); return FALSE); crm_debug_3(" Resource: %s", mutable_key); *rsc_id = crm_strdup(mutable_key); crm_free(mutable_key); return TRUE; } char * generate_notify_key(const char *rsc_id, const char *notify_type, const char *op_type) { int len = 12; char *op_id = NULL; CRM_CHECK(rsc_id != NULL, return NULL); CRM_CHECK(op_type != NULL, return NULL); CRM_CHECK(notify_type != NULL, return NULL); len += 
strlen(op_type); len += strlen(rsc_id); len += strlen(notify_type); crm_malloc0(op_id, len); if(op_id != NULL) { sprintf(op_id, "%s_%s_notify_%s_0", rsc_id, notify_type, op_type); } return op_id; } char * generate_transition_magic_v202(const char *transition_key, int op_status) { int len = 80; char *fail_state = NULL; CRM_CHECK(transition_key != NULL, return NULL); len += strlen(transition_key); crm_malloc0(fail_state, len); if(fail_state != NULL) { snprintf(fail_state, len, "%d:%s", op_status,transition_key); } return fail_state; } char * generate_transition_magic(const char *transition_key, int op_status, int op_rc) { int len = 80; char *fail_state = NULL; CRM_CHECK(transition_key != NULL, return NULL); len += strlen(transition_key); crm_malloc0(fail_state, len); if(fail_state != NULL) { snprintf(fail_state, len, "%d:%d;%s", op_status, op_rc, transition_key); } return fail_state; } gboolean decode_transition_magic( const char *magic, char **uuid, int *transition_id, int *action_id, int *op_status, int *op_rc) { char *rc = NULL; char *key = NULL; char *magic2 = NULL; char *status = NULL; gboolean result = TRUE; if(decodeNVpair(magic, ':', &status, &magic2) == FALSE) { crm_err("Couldn't find ':' in: %s", magic); result = FALSE; goto bail; } if(decodeNVpair(magic2, ';', &rc, &key) == FALSE) { crm_err("Couldn't find ';' in: %s", magic2); result = FALSE; goto bail; } CRM_CHECK(decode_transition_key(key, uuid, transition_id, action_id), result = FALSE; goto bail; ); *op_rc = crm_parse_int(rc, NULL); *op_status = crm_parse_int(status, NULL); bail: crm_free(rc); crm_free(key); crm_free(magic2); crm_free(status); return result; } char * generate_transition_key(int transition_id, int action_id, const char *node) { int len = 40; char *fail_state = NULL; CRM_CHECK(node != NULL, return NULL); len += strlen(node); crm_malloc0(fail_state, len); if(fail_state != NULL) { snprintf(fail_state, len, "%d:%d:%s", action_id, transition_id, node); } return fail_state; } gboolean 
decode_transition_key( const char *key, char **uuid, int *transition_id, int *action_id) { char *tmp = NULL; char *action = NULL; char *transition = NULL; *uuid = NULL; *action_id = -1; *transition_id = -1; if(decodeNVpair(key, ':', &action, &tmp) == FALSE) { crm_err("Couldn't find ':' in: %s", key); return FALSE; } *action_id = crm_parse_int(action, NULL); crm_free(action); if(decodeNVpair(tmp, ':', &transition, uuid) == FALSE) { /* this would be an error but some versions dont * have the action */ *transition_id = *action_id; *action_id = -1; *uuid = tmp; } else { *transition_id = crm_parse_int(transition, NULL); crm_free(transition); crm_free(tmp); } return TRUE; } void filter_action_parameters(crm_data_t *param_set, const char *version) { const char *timeout = NULL; const char *interval = NULL; #if CRM_DEPRECATED_SINCE_2_0_5 const char *filter_205[] = { XML_ATTR_TE_TARGET_RC, XML_ATTR_LRM_PROBE, XML_RSC_ATTR_START, XML_RSC_ATTR_NOTIFY, XML_RSC_ATTR_UNIQUE, XML_RSC_ATTR_MANAGED, XML_RSC_ATTR_PRIORITY, XML_RSC_ATTR_MULTIPLE, XML_RSC_ATTR_STICKINESS, XML_RSC_ATTR_FAIL_STICKINESS, XML_RSC_ATTR_TARGET_ROLE, /* ignore clone fields */ XML_RSC_ATTR_INCARNATION, XML_RSC_ATTR_INCARNATION_MAX, XML_RSC_ATTR_INCARNATION_NODEMAX, XML_RSC_ATTR_MASTER_MAX, XML_RSC_ATTR_MASTER_NODEMAX, /* old field names */ "role", "crm_role", "te-target-rc", /* ignore notify fields */ "notify_stop_resource", "notify_stop_uname", "notify_start_resource", "notify_start_uname", "notify_active_resource", "notify_active_uname", "notify_inactive_resource", "notify_inactive_uname", "notify_promote_resource", "notify_promote_uname", "notify_demote_resource", "notify_demote_uname", "notify_master_resource", "notify_master_uname", "notify_slave_resource", "notify_slave_uname" }; #endif const char *attr_filter[] = { XML_ATTR_ID, XML_ATTR_CRM_VERSION, XML_LRM_ATTR_OP_DIGEST, }; gboolean do_delete = FALSE; int lpc = 0; static int meta_len = 0; if(meta_len == 0) { meta_len = strlen(CRM_META); } if(param_set 
== NULL) { return; } #if CRM_DEPRECATED_SINCE_2_0_5 if(version == NULL || compare_version("1.0.5", version)) { for(lpc = 0; lpc < DIMOF(filter_205); lpc++) { xml_remove_prop(param_set, filter_205[lpc]); } } #endif for(lpc = 0; lpc < DIMOF(attr_filter); lpc++) { xml_remove_prop(param_set, attr_filter[lpc]); } timeout = crm_element_value(param_set, CRM_META"_timeout"); interval = crm_element_value(param_set, CRM_META"_interval"); xml_prop_iter(param_set, prop_name, prop_value, do_delete = FALSE; if(strncasecmp(prop_name, CRM_META, meta_len) == 0) { do_delete = TRUE; } if(do_delete) { /* remove it */ xml_remove_prop(param_set, prop_name); /* unwind the counetr */ __counter--; } ); if(crm_get_msec(interval) && compare_version(version, "1.0.8")) { /* Re-instate the operation's timeout value */ if(timeout != NULL) { crm_xml_add(param_set, CRM_META"_timeout", timeout); } } } void filter_reload_parameters(crm_data_t *param_set, const char *restart_string) { int len = 0; char *name = NULL; char *match = NULL; if(param_set == NULL) { return; } xml_prop_iter(param_set, prop_name, prop_value, name = NULL; len = strlen(prop_name) + 3; crm_malloc0(name, len); sprintf(name, " %s ", prop_name); name[len-1] = 0; match = strstr(restart_string, name); if(match == NULL) { /* remove it */ crm_debug_3("%s not found in %s", prop_name, restart_string); xml_remove_prop(param_set, prop_name); /* unwind the counetr */ __counter--; } crm_free(name); ); } void crm_abort(const char *file, const char *function, int line, const char *assert_condition, gboolean do_fork) { int rc = 0; int pid = 0; int status = 0; if(do_fork == FALSE) { do_crm_log(LOG_ERR, "%s: Triggered fatal assert at %s:%d : %s", function, file, line, assert_condition); } else if(crm_log_level < LOG_DEBUG) { do_crm_log(LOG_ERR, "%s: Triggered non-fatal assert at %s:%d : %s", function, file, line, assert_condition); return; } else { pid=fork(); } switch(pid) { case -1: crm_err("Cannot fork!"); return; default: /* Parent */ 
do_crm_log(LOG_ERR, "%s: Forked child %d to record non-fatal assert at %s:%d : %s", function, pid, file, line, assert_condition); do { rc = waitpid(pid, &status, 0); if(rc < 0 && errno != EINTR) { cl_perror("%s: Cannot wait on forked child %d", function, pid); } } while(rc < 0 && errno == EINTR); return; case 0: /* Child */ abort(); break; } } char * generate_series_filename( const char *directory, const char *series, int sequence, gboolean bzip) { int len = 40; char *filename = NULL; const char *ext = "raw"; CRM_CHECK(directory != NULL, return NULL); CRM_CHECK(series != NULL, return NULL); len += strlen(directory); len += strlen(series); crm_malloc0(filename, len); CRM_CHECK(filename != NULL, return NULL); if(bzip) { ext = "bz2"; } sprintf(filename, "%s/%s-%d.%s", directory, series, sequence, ext); return filename; } int get_last_sequence(const char *directory, const char *series) { FILE *file_strm = NULL; int start = 0, length = 0, read_len = 0; char *series_file = NULL; char *buffer = NULL; int seq = 0; int len = 36; CRM_CHECK(directory != NULL, return 0); CRM_CHECK(series != NULL, return 0); len += strlen(directory); len += strlen(series); crm_malloc0(series_file, len); CRM_CHECK(series_file != NULL, return 0); sprintf(series_file, "%s/%s.last", directory, series); file_strm = fopen(series_file, "r"); if(file_strm == NULL) { crm_debug("Series file %s does not exist", series_file); crm_free(series_file); return 0; } /* see how big the file is */ start = ftell(file_strm); fseek(file_strm, 0L, SEEK_END); length = ftell(file_strm); fseek(file_strm, 0L, start); CRM_ASSERT(start == ftell(file_strm)); crm_debug_3("Reading %d bytes from file", length); crm_malloc0(buffer, (length+1)); read_len = fread(buffer, 1, length, file_strm); if(read_len != length) { crm_err("Calculated and read bytes differ: %d vs. 
%d", length, read_len); crm_free(buffer); buffer = NULL; } else if(length <= 0) { crm_info("%s was not valid", series_file); crm_free(buffer); buffer = NULL; } crm_free(series_file); seq = crm_parse_int(buffer, "0"); crm_free(buffer); fclose(file_strm); return seq; } void write_last_sequence( const char *directory, const char *series, int sequence, int max) { int rc = 0; int len = 36; char *buffer = NULL; FILE *file_strm = NULL; char *series_file = NULL; CRM_CHECK(directory != NULL, return); CRM_CHECK(series != NULL, return); if(max == 0) { return; } while(max > 0 && sequence > max) { sequence -= max; } buffer = crm_itoa(sequence); len += strlen(directory); len += strlen(series); crm_malloc0(series_file, len); sprintf(series_file, "%s/%s.last", directory, series); file_strm = fopen(series_file, "w"); if(file_strm == NULL) { crm_err("Cannout open series file %s for writing", series_file); goto bail; } rc = fprintf(file_strm, "%s", buffer); if(rc < 0) { cl_perror("Cannot write to series file %s", series_file); } bail: if(file_strm != NULL) { fflush(file_strm); fclose(file_strm); } crm_free(series_file); crm_free(buffer); } void crm_make_daemon(const char *name, gboolean daemonize, const char *pidfile) { long pid; const char *devnull = "/dev/null"; if(daemonize == FALSE) { return; } pid = fork(); if (pid < 0) { fprintf(stderr, "%s: could not start daemon\n", name); cl_perror("fork"); exit(LSB_EXIT_GENERIC); } else if (pid > 0) { exit(LSB_EXIT_OK); } if (cl_lock_pidfile(pidfile) < 0 ) { pid = cl_read_pidfile_no_checking(pidfile); crm_warn("%s: already running [pid %ld] (%s).\n", name, pid, pidfile); exit(LSB_EXIT_OK); } umask(022); close(FD_STDIN); (void)open(devnull, O_RDONLY); /* Stdin: fd 0 */ close(FD_STDOUT); (void)open(devnull, O_WRONLY); /* Stdout: fd 1 */ close(FD_STDERR); (void)open(devnull, O_WRONLY); /* Stderr: fd 2 */ } gboolean crm_is_writable(const char *dir, const char *file, const char *user, const char *group, gboolean need_both) { int s_res = -1; 
struct stat buf; char *full_file = NULL; const char *target = NULL; gboolean pass = TRUE; gboolean readwritable = FALSE; CRM_ASSERT(dir != NULL); if(file != NULL) { full_file = crm_concat(dir, file, '/'); target = full_file; s_res = stat(full_file, &buf); if( s_res == 0 && S_ISREG(buf.st_mode) == FALSE ) { crm_err("%s must be a regular file", target); pass = FALSE; goto out; } } if (s_res != 0) { target = dir; s_res = stat(dir, &buf); if(s_res != 0) { crm_err("%s must exist and be a directory", dir); pass = FALSE; goto out; } else if( S_ISDIR(buf.st_mode) == FALSE ) { crm_err("%s must be a directory", dir); pass = FALSE; } } if(user) { struct passwd *sys_user = NULL; sys_user = getpwnam(user); readwritable = (sys_user != NULL && buf.st_uid == sys_user->pw_uid && (buf.st_mode & (S_IRUSR|S_IWUSR))); if(readwritable == FALSE) { crm_err("%s must be owned and r/w by user %s", target, user); if(need_both) { pass = FALSE; } } } if(group) { struct group *sys_grp = getgrnam(group); readwritable = ( sys_grp != NULL && buf.st_gid == sys_grp->gr_gid && (buf.st_mode & (S_IRGRP|S_IWGRP))); if(readwritable == FALSE) { if(need_both || user == NULL) { pass = FALSE; crm_err("%s must be owned and r/w by group %s", target, group); } else { crm_warn("%s should be owned and r/w by group %s", target, group); } } } out: crm_free(full_file); return pass; }