diff --git a/GNUmakefile b/GNUmakefile index d877bc3965..9a2aab2e55 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -1,171 +1,170 @@ # # Copyright (C) 2008 Andrew Beekhof # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # -include Makefile PACKAGE ?= pacemaker # Force 'make dist' to be consistent with 'make export' distdir = $(PACKAGE)-$(TAG) TARFILE = $(distdir).tar.bz2 DIST_ARCHIVES = $(TARFILE) LAST_RELEASE = $(firstword $(shell hg tags| grep Pacemaker | head -n 1)) RPM_ROOT = $(shell pwd) RPM_OPTS = --define "_sourcedir $(RPM_ROOT)" \ --define "_specdir $(RPM_ROOT)" \ --define "_srcrpmdir $(RPM_ROOT)" \ MOCK_OPTIONS ?= --resultdir=$(RPM_ROOT)/mock --no-cleanup-after # Default to fedora compliant spec files # SLES: /etc/SuSE-release # openSUSE: /etc/SuSE-release # RHEL: /etc/redhat-release # Fedora: /etc/fedora-release, /etc/redhat-release, /etc/system-release getdistro = $(shell test -e /etc/SuSE-release || echo fedora; test -e /etc/SuSE-release && echo suse) PROFILE ?= $(shell rpm --eval fedora-%{fedora}-%{_arch}) DISTRO ?= $(call getdistro) TAG ?= $(firstword $(shell hg id -i | tr '+' ' ')) WITH ?= BUILD_COUNTER ?= build.counter COUNT = $(shell test ! -e $(BUILD_COUNTER) || echo $(shell expr 1 + $(shell cat $(BUILD_COUNTER)))) initialize: ./autogen.sh echo "Now run configure with any arguments (eg. --prefix) specific to your system" export: rm -f $(PACKAGE)-scratch.tar.* $(PACKAGE)-tip.tar.* if [ ! -f $(TARFILE) ]; then \ if [ $(TAG) = scratch ]; then \ hg commit -m "DO-NOT-PUSH"; \ hg archive --prefix $(distdir) -t tbz2 -r tip $(TARFILE); \ hg rollback; \ else \ hg archive --prefix $(distdir) -t tbz2 -r $(TAG) $(TARFILE); \ fi; \ echo `date`: Rebuilt $(TARFILE); \ else \ echo `date`: Using existing tarball: $(TARFILE); \ fi $(PACKAGE)-opensuse.spec: $(PACKAGE)-suse.spec $(PACKAGE)-suse.spec: $(PACKAGE).spec.in rm -f $@ cp $(PACKAGE).spec.in $@ sed -i.sed s:%{_docdir}/%{name}:%{_docdir}/%{name}-%{version}:g $@ sed -i.sed s:corosynclib:libcorosync:g $@ - sed -i.sed s:pacemaker-libs:lib%{name}3:g $@ sed -i.sed 's:%{name}-libs:lib%{name}3:g' $@ sed -i.sed s:heartbeat-libs:heartbeat:g $@ sed -i.sed s:cluster-glue-libs:libglue:g $@ sed -i.sed s:libselinux-devel:automake:g $@ sed -i.sed s:lm_sensors-devel:automake:g $@ sed -i.sed s:bzip2-devel:libbz2-devel:g $@ sed -i.sed s:Development/Libraries:Development/Libraries/C\ and\ C++:g $@ sed -i.sed s:System\ Environment/Daemons:Productivity/Clustering/HA:g $@ sed -i.sed s:bcond_without\ publican:bcond_with\ publican:g $@ sed -i.sed s:\#global\ py_sitedir:\%global\ py_sitedir:g $@ sed -i.sed s:docbook-style-xsl:docbook-xsl-stylesheets:g $@ sed -i.sed s:libtool-ltdl-devel::g $@ @echo Rebuilt $@ # Works for all fedora based distros $(PACKAGE)-%.spec: $(PACKAGE).spec.in rm -f $@ cp $(PACKAGE).spec.in $(PACKAGE)-$*.spec @echo Rebuilt $@ srpm-%: export $(PACKAGE)-%.spec rm -f *.src.rpm $(PACKAGE).spec cp $(PACKAGE)-$*.spec $(PACKAGE).spec if [ -e $(BUILD_COUNTER) ]; then \ echo $(COUNT) > $(BUILD_COUNTER); \ sed -i.sed 's/global\ specversion.*/global\ specversion\ $(COUNT)/' $(PACKAGE).spec; \ fi sed -i.sed 's/global\ upstream_version.*/global\ upstream_version\ $(TAG)/' $(PACKAGE).spec rpmbuild -bs --define "dist .$*" $(RPM_OPTS) $(WITH) $(PACKAGE).spec # eg. WITH="--with cman" make rpm mock-%: make srpm-$(firstword $(shell echo $(@:mock-%=%) | tr '-' ' ')) -rm -rf $(RPM_ROOT)/mock @echo "mock --root=$* --rebuild $(WITH) $(MOCK_OPTIONS) $(RPM_ROOT)/*.src.rpm" mock --root=$* --rebuild $(WITH) $(MOCK_OPTIONS) $(RPM_ROOT)/*.src.rpm srpm: srpm-$(DISTRO) mock: mock-$(PROFILE) rpm: srpm @echo To create custom builds, edit the flags and options in $(PACKAGE).spec first rpmbuild $(RPM_OPTS) $(WITH) --rebuild $(RPM_ROOT)/*.src.rpm scratch: make TAG=scratch mock COVERITY_DIR = $(shell pwd)/coverity-$(TAG) COVHOST ?= coverity.example.com COVPASS ?= password coverity: test -e configure || ./autogen.sh test -e Makefile || ./configure make clean rm -rf $(COVERITY_DIR) cov-build --dir $(COVERITY_DIR) make core @echo "Waiting for a Coverity license..." cov-analyze --dir $(COVERITY_DIR) --wait-for-license cov-format-errors --dir $(COVERITY_DIR) --emacs-style > $(TAG).coverity cov-format-errors --dir $(COVERITY_DIR) rsync -avzxlSD --progress $(COVERITY_DIR)/c/output/errors/ root@www.clusterlabs.org:/var/www/html/coverity/$(PACKAGE)/$(TAG) make clean # rm -rf $(COVERITY_DIR) #cov-commit-defects --host $(COVHOST) --dir $(COVERITY_DIR) --stream $(PACKAGE) --user auto --password $(COVPASS) deb: echo To make create custom builds, edit the configure flags in debian/rules first dpkg-buildpackage -rfakeroot -us -uc global: clean-generic gtags -q global-html: global htags -sanhIT global-www: global-html rsync -avzxlSD --progress HTML/ root@www.clusterlabs.org:/var/www/html/global/$(PACKAGE) changes: @printf "\n* `date +"%a %b %d %Y"` `hg showconfig ui.username` $(VERSION)-1" @printf "\n- Update source tarball to revision: `hg id`" @printf "\n- Statistics:\n" @printf " Changesets: `hg log -M --template "{desc|firstline|strip}\n" -r $(LAST_RELEASE):tip | wc -l`\n" @printf " Diff: " @hg diff -r $(LAST_RELEASE):tip | diffstat | tail -n 1 @printf "\n- Changes since $(LAST_RELEASE)\n" @hg log -M --template " + {desc|firstline|strip}\n" -r $(LAST_RELEASE):tip | grep -v -e Dev: -e Low: -e Hg: -e "Added tag.*for changeset" | sort -uf @printf "\n" rel-tags: tags find . -name TAGS -exec sed -i.sed 's:\(.*\)/\(.*\)/TAGS:\2/TAGS:g' \{\} \; diff --git a/cts/CTS.py b/cts/CTS.py index 30273ef421..870314fce4 100644 --- a/cts/CTS.py +++ b/cts/CTS.py @@ -1,1441 +1,1451 @@ '''CTS: Cluster Testing System: Main module Classes related to testing high-availability clusters... ''' __copyright__=''' Copyright (C) 2000, 2001 Alan Robertson Licensed under the GNU GPL. ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. import types, string, select, sys, time, re, os, struct, signal import time, syslog, random, traceback, base64, pickle, binascii, fcntl from socket import gethostbyname_ex from UserDict import UserDict from subprocess import Popen,PIPE from cts.CTSvars import * class CtsLab(UserDict): '''This class defines the Lab Environment for the Cluster Test System. It defines those things which are expected to change from test environment to test environment for the same cluster manager. It is where you define the set of nodes that are in your test lab what kind of reset mechanism you use, etc. This class is derived from a UserDict because we hold many different parameters of different kinds, and this provides provide a uniform and extensible interface useful for any kind of communication between the user/administrator/tester and CTS. At this point in time, it is the intent of this class to model static configuration and/or environmental data about the environment which doesn't change as the tests proceed. Well-known names (keys) are an important concept in this class. The HasMinimalKeys member function knows the minimal set of well-known names for the class. The following names are standard (well-known) at this time: nodes An array of the nodes in the cluster reset A ResetMechanism object logger An array of objects that log strings... CMclass The type of ClusterManager we are running (This is a class object, not a class instance) RandSeed Random seed. It is a triple of bytes. (optional) The CTS code ignores names it doesn't know about/need. The individual tests have access to this information, and it is perfectly acceptable to provide hints, tweaks, fine-tuning directions or other information to the tests through this mechanism. ''' def __init__(self): self.data = {} self.rsh = RemoteExec(self) self.RandomGen = random.Random() self.Scenario = None # Get a random seed for the random number generator. self["LogWatcher"] = "any" self["LogFileName"] = "/var/log/messages" self["OutputFile"] = None self["SyslogFacility"] = "daemon" self["CMclass"] = None self["logger"] = ([StdErrLog(self)]) self.SeedRandom() def SeedRandom(self, seed=None): if not seed: seed = int(time.time()) if self.has_key("RandSeed"): self.log("New random seed is: " + str(seed)) else: self.log("Random seed is: " + str(seed)) self["RandSeed"] = seed self.RandomGen.seed(str(seed)) def HasMinimalKeys(self): 'Return TRUE if our object has the minimal set of keys/values in it' result = 1 for key in self.MinimalKeys: if not self.has_key(key): result = None return result def log(self, args): "Log using each of the supplied logging methods" for logfcn in self._logfunctions: logfcn(string.strip(args)) def debug(self, args): "Log using each of the supplied logging methods" for logfcn in self._logfunctions: if logfcn.name() != "StdErrLog": logfcn("debug: %s" % string.strip(args)) def dump(self): keys = [] for key in self.keys(): keys.append(key) keys.sort() for key in keys: self.debug("Environment["+key+"]:\t"+str(self[key])) def run(self, Scenario, Iterations): if not Scenario: self.log("No scenario was defined") return 1 self.log("Cluster nodes: ") for node in self["nodes"]: self.log(" * %s" % (node)) if not Scenario.SetUp(): return 1 try : Scenario.run(Iterations) except : self.log("Exception by %s" % sys.exc_info()[0]) for logmethod in self["logger"]: traceback.print_exc(50, logmethod) Scenario.summarize() Scenario.TearDown() return 1 #ClusterManager.oprofileSave(Iterations) Scenario.TearDown() Scenario.summarize() if Scenario.Stats["failure"] > 0: return Scenario.Stats["failure"] elif Scenario.Stats["success"] != Iterations: self.log("No failure count but success != requested iterations") return 1 return 0 def __setitem__(self, key, value): '''Since this function gets called whenever we modify the dictionary (object), we can (and do) validate those keys that we know how to validate. For the most part, we know how to validate the "MinimalKeys" elements. ''' # # List of nodes in the system # if key == "nodes": self.Nodes = {} for node in value: # I don't think I need the IP address, etc. but this validates # the node name against /etc/hosts and/or DNS, so it's a # GoodThing(tm). try: self.Nodes[node] = gethostbyname_ex(node) except: print node+" not found in DNS... aborting" raise # # List of Logging Mechanism(s) # elif key == "logger": if len(value) < 1: raise ValueError("Must have at least one logging mechanism") for logger in value: if not callable(logger): raise ValueError("'logger' elements must be callable") self._logfunctions = value # # Cluster Manager Class # elif key == "CMclass": if value and not issubclass(value, ClusterManager): raise ValueError("'CMclass' must be a subclass of" " ClusterManager") # # Initial Random seed... # #elif key == "RandSeed": # if len(value) != 3: # raise ValueError("'Randseed' must be a 3-element list/tuple") # for elem in value: # if not isinstance(elem, types.IntType): # raise ValueError("'Randseed' list must all be ints") self.data[key] = value def IsValidNode(self, node): 'Return TRUE if the given node is valid' return self.Nodes.has_key(node) def __CheckNode(self, node): "Raise a ValueError if the given node isn't valid" if not self.IsValidNode(node): raise ValueError("Invalid node [%s] in CheckNode" % node) def RandomNode(self): '''Choose a random node from the cluster''' return self.RandomGen.choice(self["nodes"]) class Logger: TimeFormat = "%b %d %H:%M:%S\t" def __call__(self, lines): raise ValueError("Abstract class member (__call__)") def write(self, line): return self(line.rstrip()) def writelines(self, lines): for s in lines: self.write(s) return 1 def flush(self): return 1 def isatty(self): return None class SysLog(Logger): # http://docs.python.org/lib/module-syslog.html defaultsource="CTS" map = { "kernel": syslog.LOG_KERN, "user": syslog.LOG_USER, "mail": syslog.LOG_MAIL, "daemon": syslog.LOG_DAEMON, "auth": syslog.LOG_AUTH, "lpr": syslog.LOG_LPR, "news": syslog.LOG_NEWS, "uucp": syslog.LOG_UUCP, "cron": syslog.LOG_CRON, "local0": syslog.LOG_LOCAL0, "local1": syslog.LOG_LOCAL1, "local2": syslog.LOG_LOCAL2, "local3": syslog.LOG_LOCAL3, "local4": syslog.LOG_LOCAL4, "local5": syslog.LOG_LOCAL5, "local6": syslog.LOG_LOCAL6, "local7": syslog.LOG_LOCAL7, } def __init__(self, labinfo): if labinfo.has_key("syslogsource"): self.source=labinfo["syslogsource"] else: self.source=SysLog.defaultsource self.facility="daemon" if labinfo.has_key("SyslogFacility") and labinfo["SyslogFacility"]: if SysLog.map.has_key(labinfo["SyslogFacility"]): self.facility=labinfo["SyslogFacility"] else: raise ValueError("%s: bad syslog facility"%labinfo["SyslogFacility"]) self.facility=SysLog.map[self.facility] syslog.openlog(self.source, 0, self.facility) def setfacility(self, facility): self.facility = facility if SysLog.map.has_key(self.facility): self.facility=SysLog.map[self.facility] syslog.closelog() syslog.openlog(self.source, 0, self.facility) def __call__(self, lines): if isinstance(lines, types.StringType): syslog.syslog(lines) else: for line in lines: syslog.syslog(line) def name(self): return "Syslog" class StdErrLog(Logger): def __init__(self, labinfo): pass def __call__(self, lines): t = time.strftime(Logger.TimeFormat, time.localtime(time.time())) if isinstance(lines, types.StringType): sys.__stderr__.writelines([t, lines, "\n"]) else: for line in lines: sys.__stderr__.writelines([t, line, "\n"]) sys.__stderr__.flush() def name(self): return "StdErrLog" class FileLog(Logger): def __init__(self, labinfo, filename=None): if filename == None: filename=labinfo["LogFileName"] self.logfile=filename import os self.hostname = os.uname()[1]+" " self.source = "CTS: " def __call__(self, lines): fd = open(self.logfile, "a") t = time.strftime(Logger.TimeFormat, time.localtime(time.time())) if isinstance(lines, types.StringType): fd.writelines([t, self.hostname, self.source, lines, "\n"]) else: for line in lines: fd.writelines([t, self.hostname, self.source, line, "\n"]) fd.close() def name(self): return "FileLog" class RemoteExec: '''This is an abstract remote execution class. It runs a command on another machine - somehow. The somehow is up to us. This particular class uses ssh. Most of the work is done by fork/exec of ssh or scp. ''' def __init__(self, Env=None, silent=False): self.Env = Env self.silent = silent # -n: no stdin, -x: no X11, # -o ServerAliveInterval=5 disconnect after 3*5s if the server stops responding self.Command = "ssh -l root -n -x -o ServerAliveInterval=5 -o ConnectTimeout=10 -o TCPKeepAlive=yes -o ServerAliveCountMax=3 " # -B: batch mode, -q: no stats (quiet) self.CpCommand = "scp -B -q" self.OurNode=string.lower(os.uname()[1]) def enable_qarsh(self): # http://nstraz.wordpress.com/2008/12/03/introducing-qarsh/ self.log("Using QARSH for connections to cluster nodes") self.Command = "qarsh -l root" self.CpCommand = "qacp" def _fixcmd(self, cmd): return re.sub("\'", "'\\''", cmd) def _cmd(self, *args): '''Compute the string that will run the given command on the given remote system''' args= args[0] sysname = args[0] command = args[1] #print "sysname: %s, us: %s" % (sysname, self.OurNode) if sysname == None or string.lower(sysname) == self.OurNode or sysname == "localhost": ret = command else: ret = self.Command + " " + sysname + " '" + self._fixcmd(command) + "'" #print ("About to run %s\n" % ret) return ret def log(self, args): if not self.silent: if not self.Env: print (args) else: self.Env.log(args) def debug(self, args): if not self.silent: if not self.Env: print (args) else: self.Env.debug(args) def __call__(self, node, command, stdout=0, synchronous=1, silent=False, blocking=True): '''Run the given command on the given remote system If you call this class like a function, this is the function that gets called. It just runs it roughly as though it were a system() call on the remote machine. The first argument is name of the machine to run it on. ''' rc = 0 result = None if not synchronous: proc = Popen(self._cmd([node, command]), stdout = PIPE, stderr = PIPE, close_fds = True, shell = True) if not silent: self.debug("cmd: async: target=%s, rc=%d: %s" % (node, proc.pid, command)) if proc.pid > 0: return 0 return -1 proc = Popen(self._cmd([node, command]), stdout = PIPE, stderr = PIPE, close_fds = True, shell = True) #if not blocking: # fcntl.fcntl(proc.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK) if proc.stdout: if stdout == 1: result = proc.stdout.readline() else: result = proc.stdout.readlines() proc.stdout.close() else: self.log("No stdout stream") rc = proc.wait() if not silent: self.debug("cmd: target=%s, rc=%d: %s" % (node, rc, command)) if stdout == 1: return result if proc.stderr: errors = proc.stderr.readlines() proc.stderr.close() if not silent: for err in errors: self.debug("cmd: stderr: %s" % err) if stdout == 0: if not silent and result: for line in result: self.debug("cmd: stdout: %s" % line) return rc return (rc, result) def cp(self, source, target, silent=False): '''Perform a remote copy''' cpstring = self.CpCommand + " \'" + source + "\'" + " \'" + target + "\'" rc = os.system(cpstring) if not silent: self.debug("cmd: rc=%d: %s" % (rc, cpstring)) return rc has_log_watcher = {} log_watcher_bin = "/tmp/cts_log_watcher.py" log_watcher = """ import sys, os, fcntl ''' Remote logfile reader for CTS Reads a specified number of lines from the supplied offset Returns the current offset Contains logic for handling truncation ''' -limit = 100 +limit = 0 offset = 0 prefix = '' filename = '/var/log/messages' skipthis=None args=sys.argv[1:] for i in range(0, len(args)): if skipthis: skipthis=None continue elif args[i] == '-l' or args[i] == '--limit': skipthis=1 limit = int(args[i+1]) elif args[i] == '-f' or args[i] == '--filename': skipthis=1 filename = args[i+1] elif args[i] == '-o' or args[i] == '--offset': skipthis=1 offset = args[i+1] elif args[i] == '-p' or args[i] == '--prefix': skipthis=1 prefix = args[i+1] logfile=open(filename, 'r') logfile.seek(0, os.SEEK_END) newsize=logfile.tell() if offset != 'EOF': offset = int(offset) if newsize >= offset: logfile.seek(offset) else: print prefix + ('File truncated from %d to %d' % (offset, newsize)) if (newsize*1.05) < offset: logfile.seek(0) # else: we probably just lost a few logs after a fencing op # continue from the new end # TODO: accept a timestamp and discard all messages older than it # Don't block when we reach EOF fcntl.fcntl(logfile.fileno(), fcntl.F_SETFL, os.O_NONBLOCK) -count = limit -while count > 0: - count -= 1 +count = 0 +while True: + if logfile.tell() > newsize: break + elif limit and count >= limit: break + line = logfile.readline() - if line: print line.strip() - else: break + if not line: break + + print line.strip() + count += 1 -print prefix + 'Last read: %d %d' % (logfile.tell(), limit - count) +print prefix + 'Last read: %d, limit=%d, count=%d' % (logfile.tell(), limit, count) logfile.close() """ class SearchObj: def __init__(self, Env, filename, host=None): self.Env = Env self.host = host self.filename = filename self.cache = [] self.offset = "EOF" if host == None: host = "localhost" global has_log_watcher if not has_log_watcher.has_key(host): global log_watcher global log_watcher_bin self.debug("Installing %s on %s" % (log_watcher_bin, host)) self.Env.rsh(host, '''echo "%s" > %s''' % (log_watcher, log_watcher_bin), silent=True) has_log_watcher[host] = 1 self.next() def __str__(self): if self.host: return "%s:%s" % (self.host, self.filename) return self.filename def log(self, args): message = "lw: %s: %s" % (self, args) if not self.Env: print (message) else: self.Env.log(message) def debug(self, args): message = "lw: %s: %s" % (self, args) if not self.Env: print (message) else: self.Env.debug(message) def next(self): cache = [] if not len(self.cache): global log_watcher_bin (rc, lines) = self.Env.rsh( self.host, "python %s -p CTSwatcher: -f %s -o %s" % (log_watcher_bin, self.filename, self.offset), stdout=None, silent=True, blocking=False) for line in lines: match = re.search("^CTSwatcher:Last read: (\d+)", line) if match: last_offset = self.offset self.offset = match.group(1) #if last_offset == "EOF": self.debug("Got %d lines, new offset: %s" % (len(lines), self.offset)) elif re.search("^CTSwatcher:.*truncated", line): self.log(line) elif re.search("^CTSwatcher:", line): self.debug("Got control line: "+ line) else: cache.append(line) return cache class LogWatcher(RemoteExec): '''This class watches logs for messages that fit certain regular expressions. Watching logs for events isn't the ideal way to do business, but it's better than nothing :-) On the other hand, this class is really pretty cool ;-) The way you use this class is as follows: Construct a LogWatcher object Call setwatch() when you want to start watching the log Call look() to scan the log looking for the patterns ''' def __init__(self, Env, log, regexes, name="Anon", timeout=10, debug_level=None, silent=False): '''This is the constructor for the LogWatcher class. It takes a log name to watch, and a list of regular expressions to watch for." ''' RemoteExec.__init__(self, Env) # Validate our arguments. Better sooner than later ;-) for regex in regexes: assert re.compile(regex) self.name = name self.regexes = regexes self.filename = log self.debug_level = debug_level self.whichmatch = -1 self.unmatched = None self.file_list = [] self.line_cache = [] if not silent: for regex in self.regexes: self.debug("Looking for regex: "+regex) self.Timeout = int(timeout) self.returnonlymatch = None def debug(self, args): message = "lw: %s: %s" % (self.name, args) if not self.Env: print (message) else: self.Env.debug(message) def setwatch(self): '''Mark the place to start watching the log from. ''' if self.Env["LogWatcher"] == "remote": for node in self.Env["nodes"]: self.file_list.append(SearchObj(self.Env, self.filename, node)) else: self.file_list.append(SearchObj(self.Env, self.filename)) def __del__(self): if self.debug_level > 1: self.debug("Destroy") def ReturnOnlyMatch(self, onlymatch=1): '''Specify one or more subgroups of the match to return rather than the whole string http://www.python.org/doc/2.5.2/lib/match-objects.html ''' self.returnonlymatch = onlymatch - def __get_line(self): + def __get_lines(self): + if not len(self.file_list): + raise ValueError("No sources to read from") - if not len(self.line_cache): - if not len(self.file_list): - raise ValueError("No sources to read from") - - for f in self.file_list: - lines = f.next() + for f in self.file_list: + lines = f.next() + if len(lines): self.line_cache.extend(lines) - - if self.line_cache: - line = self.line_cache[0] - self.line_cache.remove(line) - return line - - return None - + def look(self, timeout=None, silent=False): '''Examine the log looking for the given patterns. It starts looking from the place marked by setwatch(). This function looks in the file in the fashion of tail -f. It properly recovers from log file truncation, but not from removing and recreating the log. It would be nice if it recovered from this as well :-) We return the first line which matches any of our patterns. ''' if timeout == None: timeout = self.Timeout - now=time.time() - done=now+timeout+1 - if self.debug_level > 2: self.debug("starting single search: timeout=%d, now=%d, limit=%d" % (timeout, now, done)) + lines=0 + begin=time.time() + end=begin+timeout+1 + if self.debug_level > 2: self.debug("starting single search: timeout=%d, begin=%d, end=%d" % (timeout, begin, end)) + + self.__get_lines() + while True: + + if len(self.line_cache): + lines += 1 + line = self.line_cache[0] + self.line_cache.remove(line) - while (timeout <= 0 or time.time() <= done): - line = self.__get_line() - if line: which=-1 if re.search("CTS:", line): continue if self.debug_level > 2: self.debug("Processing: "+ line) for regex in self.regexes: which=which+1 if self.debug_level > 2: self.debug("Comparing line to: "+ regex) #matchobj = re.search(string.lower(regex), string.lower(line)) matchobj = re.search(regex, line) if matchobj: self.whichmatch=which if self.returnonlymatch: return matchobj.group(self.returnonlymatch) else: self.debug("Matched: "+line) if self.debug_level > 1: self.debug("With: "+ regex) return line - elif timeout > 0: + elif timeout > 0 and end > time.time(): time.sleep(1) - #time.sleep(0.025) + self.__get_lines() + + elif timeout > 0: + # Grab any relevant messages that might have arrived since + # the last time the buffer was populated + self.__get_lines() + + # Don't come back here again + timeout = 0 else: - self.debug("End of file") + self.debug("Single search terminated: start=%d, end=%d, now=%d, lines=%d" % (begin, end, time.time(), lines)) return None - self.debug("Single search timed out: timeout=%d, start=%d, limit=%d, now=%d" % (timeout, now, done, time.time())) + self.debug("How did we get here") return None def lookforall(self, timeout=None, allow_multiple_matches=None, silent=False): '''Examine the log looking for ALL of the given patterns. It starts looking from the place marked by setwatch(). We return when the timeout is reached, or when we have found ALL of the regexes that were part of the watch ''' if timeout == None: timeout = self.Timeout save_regexes = self.regexes returnresult = [] if not silent: self.debug("starting search: timeout=%d" % timeout) for regex in self.regexes: if self.debug_level > 2: self.debug("Looking for regex: "+regex) while (len(self.regexes) > 0): oneresult = self.look(timeout) if not oneresult: self.unmatched = self.regexes self.matched = returnresult self.regexes = save_regexes return None returnresult.append(oneresult) if not allow_multiple_matches: del self.regexes[self.whichmatch] else: # Allow multiple regexes to match a single line tmp_regexes = self.regexes self.regexes = [] which = 0 for regex in tmp_regexes: matchobj = re.search(regex, oneresult) if not matchobj: self.regexes.append(regex) self.unmatched = None self.matched = returnresult self.regexes = save_regexes return returnresult class NodeStatus: def __init__(self, Env): self.Env = Env def IsNodeBooted(self, node): '''Return TRUE if the given node is booted (responds to pings)''' return self.Env.rsh("localhost", "ping -nq -c1 -w1 %s" % node, silent=True) == 0 def IsSshdUp(self, node): rc = self.Env.rsh(node, "true", silent=True) return rc == 0 def WaitForNodeToComeUp(self, node, Timeout=300): '''Return TRUE when given node comes up, or None/FALSE if timeout''' timeout=Timeout anytimeouts=0 while timeout > 0: if self.IsNodeBooted(node) and self.IsSshdUp(node): if anytimeouts: # Fudge to wait for the system to finish coming up time.sleep(30) self.Env.debug("Node %s now up" % node) return 1 time.sleep(30) if (not anytimeouts): self.Env.debug("Waiting for node %s to come up" % node) anytimeouts=1 timeout = timeout - 1 self.Env.log("%s did not come up within %d tries" % (node, Timeout)) answer = raw_input('Continue? [nY]') if answer and answer == "n": raise ValueError("%s did not come up within %d tries" % (node, Timeout)) def WaitForAllNodesToComeUp(self, nodes, timeout=300): '''Return TRUE when all nodes come up, or FALSE if timeout''' for node in nodes: if not self.WaitForNodeToComeUp(node, timeout): return None return 1 class ClusterManager(UserDict): '''The Cluster Manager class. This is an subclass of the Python dictionary class. (this is because it contains lots of {name,value} pairs, not because it's behavior is that terribly similar to a dictionary in other ways.) This is an abstract class which class implements high-level operations on the cluster and/or its cluster managers. Actual cluster managers classes are subclassed from this type. One of the things we do is track the state we think every node should be in. ''' def __InitialConditions(self): #if os.geteuid() != 0: # raise ValueError("Must Be Root!") None def _finalConditions(self): for key in self.keys(): if self[key] == None: raise ValueError("Improper derivation: self[" + key + "] must be overridden by subclass.") def __init__(self, Environment, randseed=None): self.Env = Environment self.__InitialConditions() self.clear_cache = 0 self.TestLoggingLevel=0 self.data = { "up" : "up", # Status meaning up "down" : "down", # Status meaning down "StonithCmd" : "stonith -t baytech -p '10.10.10.100 admin admin' %s", "DeadTime" : 30, # Max time to detect dead node... "StartTime" : 90, # Max time to start up # # These next values need to be overridden in the derived class. # "Name" : None, "StartCmd" : None, "StopCmd" : None, "StatusCmd" : None, #"RereadCmd" : None, "BreakCommCmd" : None, "FixCommCmd" : None, #"TestConfigDir" : None, "LogFileName" : None, #"Pat:Master_started" : None, #"Pat:Slave_started" : None, "Pat:We_stopped" : None, "Pat:They_stopped" : None, "BadRegexes" : None, # A set of "bad news" regexes # to apply to the log } self.rsh = self.Env.rsh self.ShouldBeStatus={} self.ns = NodeStatus(self.Env) self.OurNode=string.lower(os.uname()[1]) def key_for_node(self, node): return node def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [] def log(self, args): self.Env.log(args) def debug(self, args): self.Env.debug(args) def prepare(self): '''Finish the Initialization process. Prepare to test...''' for node in self.Env["nodes"]: if self.StataCM(node): self.ShouldBeStatus[node]="up" else: self.ShouldBeStatus[node]="down" self.unisolate_node(node) def upcount(self): '''How many nodes are up?''' count=0 for node in self.Env["nodes"]: if self.ShouldBeStatus[node]=="up": count=count+1 return count def install_helper(self, filename, nodes=None): file_with_path="%s/%s" % (CTSvars.CTS_home, filename) if not nodes: nodes = self.Env["nodes"] self.debug("Installing %s to %s on %s" % (filename, CTSvars.CTS_home, repr(self.Env["nodes"]))) for node in nodes: self.rsh(node, "mkdir -p %s" % CTSvars.CTS_home) self.rsh.cp(file_with_path, "root@%s:%s" % (node, file_with_path)) return file_with_path def install_config(self, node): return None def clear_all_caches(self): if self.clear_cache: for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == "down": self.debug("Removing cache file on: "+node) self.rsh(node, "rm -f "+CTSvars.HA_VARLIBHBDIR+"/hostcache") else: self.debug("NOT Removing cache file on: "+node) def prepare_fencing_watcher(self, node): # If we don't have quorum now but get it as a result of starting this node, # then a bunch of nodes might get fenced if self.HasQuorum(None): return None if not self.has_key("Pat:They_fenced"): return None if not self.has_key("Pat:They_fenced_offset"): return None stonith = None stonithPats = [] for peer in self.Env["nodes"]: if peer != node and self.ShouldBeStatus[peer] != "up": stonithPats.append(self["Pat:They_fenced"] % peer) # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status stonith = LogWatcher(self.Env, self["LogFileName"], stonithPats, "StartaCM", 0) stonith.setwatch() return stonith def fencing_cleanup(self, node, stonith): peer_list = [] # If we just started a node, we may now have quorum (and permission to fence) # Make sure everyone is online before continuing self.ns.WaitForAllNodesToComeUp(self.Env["nodes"]) if not stonith: return peer_list if not self.HasQuorum(None): # We didn't gain quorum - we shouldn't have shot anyone return peer_list # Now see if any states need to be updated self.debug("looking for: " + repr(stonith.regexes)) - shot = stonith.look(60) + shot = stonith.look(0) while shot: line = repr(shot) self.debug("Found: "+ line) # Extract node name start = line.find(self["Pat:They_fenced_offset"]) + len(self["Pat:They_fenced_offset"]) peer = line[start:].split("' ")[0] self.debug("Found peer: "+ peer) peer_list.append(peer) self.ShouldBeStatus[peer]="down" self.log(" Peer %s was fenced as a result of %s starting" % (peer, node)) # Get the next one shot = stonith.look(60) # Poll until it comes up if self.Env["at-boot"]: if not self.StataCM(peer): time.sleep(self["StartTime"]) if not self.StataCM(peer): self.log("ERROR: Peer %s failed to restart after being fenced" % peer) return None self.ShouldBeStatus[peer]="up" return peer_list def StartaCM(self, node, verbose=False): '''Start up the cluster manager on a given node''' if verbose: self.log("Starting %s on node %s" %(self["Name"], node)) else: self.debug("Starting %s on node %s" %(self["Name"], node)) ret = 1 if not self.ShouldBeStatus.has_key(node): self.ShouldBeStatus[node] = "down" if self.ShouldBeStatus[node] != "down": return 1 patterns = [] # Technically we should always be able to notice ourselves starting patterns.append(self["Pat:Local_started"] % node) if self.upcount() == 0: patterns.append(self["Pat:Master_started"] % node) else: patterns.append(self["Pat:Slave_started"] % node) watch = LogWatcher( self.Env, self["LogFileName"], patterns, "StartaCM", self["StartTime"]+10) watch.setwatch() self.install_config(node) self.ShouldBeStatus[node] = "any" if self.StataCM(node) and self.cluster_stable(self["DeadTime"]): self.log ("%s was already started" %(node)) return 1 # Clear out the host cache so autojoin can be exercised if self.clear_cache: self.debug("Removing cache file on: "+node) self.rsh(node, "rm -f "+CTSvars.HA_VARLIBHBDIR+"/hostcache") if not(self.Env["valgrind-tests"]): startCmd = self["StartCmd"] else: if self.Env["valgrind-prefix"]: prefix = self.Env["valgrind-prefix"] else: prefix = "cts" startCmd = """G_SLICE=always-malloc HA_VALGRIND_ENABLED='%s' VALGRIND_OPTS='%s --log-file=/tmp/%s-%s.valgrind' %s""" % ( self.Env["valgrind-procs"], self.Env["valgrind-opts"], prefix, """%p""", self["StartCmd"]) stonith = self.prepare_fencing_watcher(node) if self.rsh(node, startCmd) != 0: self.log ("Warn: Start command failed on node %s" %(node)) return None self.ShouldBeStatus[node]="up" watch_result = watch.lookforall() self.fencing_cleanup(node, stonith) if watch.unmatched: for regex in watch.unmatched: self.log ("Warn: Startup pattern not found: %s" %(regex)) if watch_result and self.cluster_stable(self["DeadTime"]): #self.debug("Found match: "+ repr(watch_result)) return 1 elif self.StataCM(node) and self.cluster_stable(self["DeadTime"]): return 1 self.log ("Warn: Start failed for node %s" %(node)) return None def StartaCMnoBlock(self, node, verbose=False): '''Start up the cluster manager on a given node with none-block mode''' if verbose: self.log("Starting %s on node %s" %(self["Name"], node)) else: self.debug("Starting %s on node %s" %(self["Name"], node)) # Clear out the host cache so autojoin can be exercised if self.clear_cache: self.debug("Removing cache file on: "+node) self.rsh(node, "rm -f "+CTSvars.HA_VARLIBHBDIR+"/hostcache") if not(self.Env["valgrind-tests"]): startCmd = self["StartCmd"] else: if self.Env["valgrind-prefix"]: prefix = self.Env["valgrind-prefix"] else: prefix = "cts" startCmd = """G_SLICE=always-malloc HA_VALGRIND_ENABLED='%s' VALGRIND_OPTS='%s --log-file=/tmp/%s-%s.valgrind' %s""" % ( self.Env["valgrind-procs"], self.Env["valgrind-opts"], prefix, """%p""", self["StartCmd"]) self.rsh(node, startCmd, synchronous=0) self.ShouldBeStatus[node]="up" return 1 def StopaCM(self, node, verbose=False): '''Stop the cluster manager on a given node''' if verbose: self.log("Stopping %s on node %s" %(self["Name"], node)) else: self.debug("Stopping %s on node %s" %(self["Name"], node)) if self.ShouldBeStatus[node] != "up": return 1 if self.rsh(node, self["StopCmd"]) == 0: # Make sure we can continue even if corosync leaks self.rsh(node, "rm -f /dev/shm/fdata-*") self.ShouldBeStatus[node]="down" self.cluster_stable(self["DeadTime"]) return 1 else: self.log ("Could not stop %s on node %s" %(self["Name"], node)) return None def StopaCMnoBlock(self, node): '''Stop the cluster manager on a given node with none-block mode''' self.debug("Stopping %s on node %s" %(self["Name"], node)) self.rsh(node, self["StopCmd"], synchronous=0) self.ShouldBeStatus[node]="down" return 1 def cluster_stable(self, timeout = None): time.sleep(self["StableTime"]) return 1 def node_stable(self, node): return 1 def RereadCM(self, node): '''Force the cluster manager on a given node to reread its config This may be a no-op on certain cluster managers. ''' rc=self.rsh(node, self["RereadCmd"]) if rc == 0: return 1 else: self.log ("Could not force %s on node %s to reread its config" % (self["Name"], node)) return None def StataCM(self, node): '''Report the status of the cluster manager on a given node''' out=self.rsh(node, self["StatusCmd"], 1) ret= (string.find(out, 'stopped') == -1) try: if ret: if self.ShouldBeStatus[node] == "down": self.log( "Node status for %s is %s but we think it should be %s" % (node, "up", self.ShouldBeStatus[node])) else: if self.ShouldBeStatus[node] == "up": self.log( "Node status for %s is %s but we think it should be %s" % (node, "down", self.ShouldBeStatus[node])) except KeyError: pass if ret: self.ShouldBeStatus[node]="up" else: self.ShouldBeStatus[node]="down" return ret def startall(self, nodelist=None, verbose=False): '''Start the cluster manager on every node in the cluster. We can do it on a subset of the cluster if nodelist is not None. ''' ret = 1 map = {} if not nodelist: nodelist=self.Env["nodes"] for node in nodelist: if self.ShouldBeStatus[node] == "down": if not self.StartaCM(node, verbose=verbose): ret = 0 return ret def stopall(self, nodelist=None, verbose=False): '''Stop the cluster managers on every node in the cluster. We can do it on a subset of the cluster if nodelist is not None. ''' ret = 1 map = {} if not nodelist: nodelist=self.Env["nodes"] for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == "up": if not self.StopaCM(node, verbose=verbose): ret = 0 return ret def rereadall(self, nodelist=None): '''Force the cluster managers on every node in the cluster to reread their config files. We can do it on a subset of the cluster if nodelist is not None. ''' map = {} if not nodelist: nodelist=self.Env["nodes"] for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == "up": self.RereadCM(node) def statall(self, nodelist=None): '''Return the status of the cluster managers in the cluster. We can do it on a subset of the cluster if nodelist is not None. ''' result={} if not nodelist: nodelist=self.Env["nodes"] for node in nodelist: if self.StataCM(node): result[node] = "up" else: result[node] = "down" return result def isolate_node(self, target, nodes=None): '''isolate the communication between the nodes''' if not nodes: nodes = self.Env["nodes"] for node in nodes: if node != target: rc = self.rsh(target, self["BreakCommCmd"] % self.key_for_node(node)) if rc != 0: self.log("Could not break the communication between %s and %s: %d" % (target, node, rc)) return None else: self.debug("Communication cut between %s and %s" % (target, node)) return 1 def unisolate_node(self, target, nodes=None): '''fix the communication between the nodes''' if not nodes: nodes = self.Env["nodes"] for node in nodes: if node != target: restored = 0 # Limit the amount of time we have asynchronous connectivity for # Restore both sides as simultaneously as possible self.rsh(target, self["FixCommCmd"] % self.key_for_node(node), synchronous=0) self.rsh(node, self["FixCommCmd"] % self.key_for_node(target), synchronous=0) self.debug("Communication restored between %s and %s" % (target, node)) def reducecomm_node(self,node): '''reduce the communication between the nodes''' rc = self.rsh(node, self["ReduceCommCmd"]%(self.Env["XmitLoss"],self.Env["RecvLoss"])) if rc == 0: return 1 else: self.log("Could not reduce the communication between the nodes from node: %s" % node) return None def restorecomm_node(self,node): '''restore the saved communication between the nodes''' rc = 0 if float(self.Env["XmitLoss"])!=0 or float(self.Env["RecvLoss"])!=0 : rc = self.rsh(node, self["RestoreCommCmd"]); if rc == 0: return 1 else: self.log("Could not restore the communication between the nodes from node: %s" % node) return None def HasQuorum(self, node_list): "Return TRUE if the cluster currently has quorum" # If we are auditing a partition, then one side will # have quorum and the other not. # So the caller needs to tell us which we are checking # If no value for node_list is specified... assume all nodes raise ValueError("Abstract Class member (HasQuorum)") def Components(self): raise ValueError("Abstract Class member (Components)") def oprofileStart(self, node=None): if not node: for n in self.Env["oprofile"]: self.oprofileStart(n) elif node in self.Env["oprofile"]: self.debug("Enabling oprofile on %s" % node) self.rsh(node, "opcontrol --init") self.rsh(node, "opcontrol --setup --no-vmlinux --separate=lib --callgraph=20 --image=all") self.rsh(node, "opcontrol --start") self.rsh(node, "opcontrol --reset") def oprofileSave(self, test, node=None): if not node: for n in self.Env["oprofile"]: self.oprofileSave(test, n) elif node in self.Env["oprofile"]: self.rsh(node, "opcontrol --dump") self.rsh(node, "opcontrol --save=cts.%d" % test) # Read back with: opreport -l session:cts.0 image:/usr/lib/heartbeat/c* if None: self.rsh(node, "opcontrol --reset") else: self.oprofileStop(node) self.oprofileStart(node) def oprofileStop(self, node=None): if not node: for n in self.Env["oprofile"]: self.oprofileStop(n) elif node in self.Env["oprofile"]: self.debug("Stopping oprofile on %s" % node) self.rsh(node, "opcontrol --reset") self.rsh(node, "opcontrol --shutdown 2>&1 > /dev/null") class Resource: ''' This is an HA resource (not a resource group). A resource group is just an ordered list of Resource objects. ''' def __init__(self, cm, rsctype=None, instance=None): self.CM = cm self.ResourceType = rsctype self.Instance = instance self.needs_quorum = 1 def Type(self): return self.ResourceType def Instance(self, nodename): return self.Instance def IsRunningOn(self, nodename): ''' This member function returns true if our resource is running on the given node in the cluster. It is analagous to the "status" operation on SystemV init scripts and heartbeat scripts. FailSafe calls it the "exclusive" operation. ''' raise ValueError("Abstract Class member (IsRunningOn)") return None def IsWorkingCorrectly(self, nodename): ''' This member function returns true if our resource is operating correctly on the given node in the cluster. Heartbeat does not require this operation, but it might be called the Monitor operation, which is what FailSafe calls it. For remotely monitorable resources (like IP addresses), they *should* be monitored remotely for testing. ''' raise ValueError("Abstract Class member (IsWorkingCorrectly)") return None def Start(self, nodename): ''' This member function starts or activates the resource. ''' raise ValueError("Abstract Class member (Start)") return None def Stop(self, nodename): ''' This member function stops or deactivates the resource. ''' raise ValueError("Abstract Class member (Stop)") return None def __repr__(self): if (self.Instance and len(self.Instance) > 1): return "{" + self.ResourceType + "::" + self.Instance + "}" else: return "{" + self.ResourceType + "}" class Component: def kill(self, node): None class Process(Component): def __init__(self, cm, name, process=None, dc_only=0, pats=[], dc_pats=[], badnews_ignore=[], triggersreboot=0): self.name = str(name) self.dc_only = dc_only self.pats = pats self.dc_pats = dc_pats self.CM = cm self.badnews_ignore = badnews_ignore self.triggersreboot = triggersreboot if process: self.proc = str(process) else: self.proc = str(name) self.KillCmd = "killall -9 " + self.proc def kill(self, node): if self.CM.rsh(node, self.KillCmd) != 0: self.CM.log ("ERROR: Kill %s failed on node %s" %(self.name,node)) return None return 1 diff --git a/fencing/admin.c b/fencing/admin.c index 59b13411a7..da7638a904 100644 --- a/fencing/admin.c +++ b/fencing/admin.c @@ -1,343 +1,347 @@ /* * Copyright (C) 2009 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct crm_option long_options[] = { {"help", 0, 0, '?', "\tThis text"}, {"version", 0, 0, '$', "\tVersion information" }, {"verbose", 0, 0, 'V', "\tIncrease debug output"}, {"quiet", 0, 0, 'q', "\tPrint only essential output"}, {"list", 1, 0, 'l', "List devices that can terminate the specified host"}, {"list-registered", 0, 0, 'L', "List all registered devices"}, {"list-installed", 0, 0, 'I', "List all installed devices"}, {"metadata", 0, 0, 'M', "Check the device's metadata"}, {"query", 1, 0, 'Q', "Check the device's status"}, {"fence", 1, 0, 'F', "Fence the named host"}, {"unfence", 1, 0, 'U', "Unfence the named host"}, {"reboot", 1, 0, 'B', "Reboot the named host"}, {"confirm", 1, 0, 'C', "Confirm the named host is now safely down"}, {"history", 1, 0, 'H', "Retrieve last fencing operation"}, {"register", 1, 0, 'R', "Register a stonith device"}, {"deregister", 1, 0, 'D', "De-register a stonith device"}, {"env-option", 1, 0, 'e'}, {"option", 1, 0, 'o'}, {"agent", 1, 0, 'a'}, {"list-all", 0, 0, 'L', "legacy alias for --list-registered"}, {0, 0, 0, 0} }; int st_opts = st_opt_sync_call; int main(int argc, char ** argv) { int flag; int rc = 0; int quiet = 0; int verbose = 0; int argerr = 0; int option_index = 0; char name[512]; char value[512]; const char *agent = NULL; const char *device = NULL; const char *target = NULL; char action = 0; stonith_t *st = NULL; stonith_key_value_t *params = NULL; stonith_key_value_t *devices = NULL; stonith_key_value_t *dIter = NULL; crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv); crm_set_options(NULL, "mode [options]", long_options, "Provides access to the stonith-ng API.\n"); while (1) { flag = crm_get_option(argc, argv, &option_index); if (flag == -1) break; switch(flag) { case 'V': verbose = 1; alter_debug(DEBUG_INC); cl_log_enable_stderr(1); break; case '$': case '?': crm_help(flag, LSB_EXIT_OK); break; case 'L': case 'I': action = flag; break; case 'q': quiet = 1; break; case 'Q': case 'R': case 'D': action = flag; device = optarg; break; case 'a': agent = optarg; break; case 'l': target = optarg; action = 'L'; break; case 'M': action = flag; break; case 'B': case 'F': case 'U': case 'C': case 'H': cl_log_enable_stderr(1); target = optarg; action = flag; break; case 'o': crm_info("Scanning: -o %s", optarg); rc = sscanf(optarg, "%[^=]=%[^=]", name, value); if(rc != 2) { crm_err("Invalid option: -o %s", optarg); ++argerr; } else { crm_info("Got: '%s'='%s'", name, value); stonith_key_value_add(params, name, value); } break; case 'e': { char *key = crm_concat("OCF_RESKEY", optarg, '_'); const char *env = getenv(key); if(env == NULL) { crm_err("Invalid option: -e %s", optarg); ++argerr; } else { crm_info("Got: '%s'='%s'", optarg, env); stonith_key_value_add( params, optarg, env); } } break; default: ++argerr; break; } } if (optind > argc) { ++argerr; } if (argerr) { crm_help('?', LSB_EXIT_GENERIC); } crm_debug("Create"); st = stonith_api_new(); if(action != 'M' && action != 'I') { rc = st->cmds->connect(st, crm_system_name, NULL); crm_debug("Connect: %d", rc); if(rc < 0) { goto done; } } switch(action) { case 'I': rc = st->cmds->list(st, st_opt_sync_call, NULL, &devices, 0); for(dIter = devices; dIter; dIter = dIter->next ) { fprintf( stdout, " %s\n", dIter->value ); } if(rc == 0) { fprintf(stderr, "No devices found\n"); } else if(rc > 0) { fprintf(stderr, "%d devices found\n", rc); rc = 0; } stonith_key_value_freeall(devices, 1, 1); break; case 'L': rc = st->cmds->query(st, st_opts, target, &devices, 10); for(dIter = devices; dIter; dIter = dIter->next ) { fprintf( stdout, " %s\n", dIter->value ); crm_free(dIter->value); } if(rc == 0) { fprintf(stderr, "No devices found\n"); } else if(rc > 0) { fprintf(stderr, "%d devices found\n", rc); rc = 0; } stonith_key_value_freeall(devices, 1, 1); break; case 'Q': rc = st->cmds->call(st, st_opts, device, "monitor", NULL, 10); if(rc < 0) { rc = st->cmds->call(st, st_opts, device, "list", NULL, 10); } break; case 'R': rc = st->cmds->register_device(st, st_opts, device, "stonith-ng", agent, params); break; case 'D': rc = st->cmds->remove_device(st, st_opts, device); break; case 'M': - { + if (agent == NULL) { + printf("Please specify an agent to query using -a,--agent [value]\n"); + return -1; + + } else { char *buffer = NULL; st->cmds->metadata(st, st_opt_sync_call, agent, NULL, &buffer, 0); printf("%s\n", buffer); crm_free(buffer); } break; case 'C': rc = st->cmds->confirm(st, st_opts, target); break; case 'B': rc = st->cmds->fence(st, st_opts, target, "reboot", 120); break; case 'F': rc = st->cmds->fence(st, st_opts, target, "off", 120); break; case 'U': rc = st->cmds->fence(st, st_opts, target, "on", 120); break; case 'H': { stonith_history_t *history, *hp, *latest = NULL; rc = st->cmds->history(st, st_opts, target, &history, 120); for(hp = history; hp; hp = hp->next) { char *action_s = NULL; time_t complete = hp->completed; if(hp->state == st_done) { latest = hp; } if(quiet || !verbose) { continue; } else if(hp->action == NULL) { action_s = crm_strdup("unknown"); } else if(hp->action[0] != 'r') { action_s = crm_concat("turn", hp->action, ' '); } else { action_s = crm_strdup(hp->action); } if(hp->state == st_failed) { printf("%s failed to %s node %s on behalf of %s at %s\n", hp->delegate?hp->delegate:"We", action_s, hp->target, hp->origin, ctime(&complete)); } else if(hp->state == st_done) { printf("%s was able to %s node %s on behalf of %s at %s\n", hp->delegate?hp->delegate:"We", action_s, hp->target, hp->origin, ctime(&complete)); } else { printf("%s wishes to %s node %s - %d %d\n", hp->origin, action_s, hp->target, hp->state, hp->completed); } crm_free(action_s); } if(latest) { if(quiet) { printf("%d\n", latest->completed); } else { char *action_s = NULL; time_t complete = latest->completed; if(latest->action == NULL) { action_s = crm_strdup("unknown"); } else if(latest->action[0] != 'r') { action_s = crm_concat("turn", latest->action, ' '); } else { action_s = crm_strdup(latest->action); } printf("%s was able to %s node %s on behalf of %s at %s\n", latest->delegate?latest->delegate:"We", action_s, latest->target, latest->origin, ctime(&complete)); crm_free(action_s); } } } break; } done: if(rc < 0) { printf("Command failed: %s\n", stonith_error2string(rc)); } stonith_key_value_freeall(params, 1, 1); st->cmds->disconnect(st); crm_debug("Disconnect: %d", rc); crm_debug("Destroy"); stonith_api_delete(st); return rc; } diff --git a/fencing/commands.c b/fencing/commands.c index 8267ce93db..ed013dd92a 100644 --- a/fencing/commands.c +++ b/fencing/commands.c @@ -1,954 +1,1010 @@ /* * Copyright (C) 2009 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include GHashTable *device_list = NULL; static int active_children = 0; +static gboolean stonith_device_dispatch(gpointer user_data); static void exec_child_done(ProcTrack* proc, int status, int signo, int rc, int waslogged); static void exec_child_new(ProcTrack* p) { active_children++; } static const char *exec_child_name(ProcTrack* p) { async_command_t *cmd = proctrack_data(p); return cmd->client?cmd->client:cmd->remote; } static ProcTrack_ops StonithdProcessTrackOps = { exec_child_done, exec_child_new, exec_child_name, }; static void free_async_command(async_command_t *cmd) { + crm_free(cmd->device); crm_free(cmd->action); crm_free(cmd->victim); crm_free(cmd->remote); crm_free(cmd->client); crm_free(cmd->origin); crm_free(cmd->op); crm_free(cmd); } -static async_command_t *create_async_command(xmlNode *msg, const char *action) +static async_command_t *create_async_command(xmlNode *msg) { async_command_t *cmd = NULL; + xmlNode *op = get_xpath_object("//@"F_STONITH_ACTION, msg, LOG_ERR); + const char *action = crm_element_value(op, F_STONITH_ACTION); + CRM_CHECK(action != NULL, crm_log_xml_warn(msg, "NoAction"); return NULL); crm_malloc0(cmd, sizeof(async_command_t)); crm_element_value_int(msg, F_STONITH_CALLID, &(cmd->id)); crm_element_value_int(msg, F_STONITH_CALLOPTS, &(cmd->options)); crm_element_value_int(msg, F_STONITH_TIMEOUT, &(cmd->timeout)); + cmd->timeout *= 1000; cmd->origin = crm_element_value_copy(msg, F_ORIG); cmd->remote = crm_element_value_copy(msg, F_STONITH_REMOTE); cmd->client = crm_element_value_copy(msg, F_STONITH_CLIENTID); cmd->op = crm_element_value_copy(msg, F_STONITH_OPERATION); cmd->action = crm_strdup(action); - cmd->victim = crm_element_value_copy(msg, F_STONITH_TARGET); + cmd->victim = crm_element_value_copy(op, F_STONITH_TARGET); cmd->pt_ops = &StonithdProcessTrackOps; CRM_CHECK(cmd->op != NULL, crm_log_xml_warn(msg, "NoOp"); free_async_command(cmd); return NULL); CRM_CHECK(cmd->client != NULL || cmd->remote != NULL, crm_log_xml_warn(msg, "NoClient")); - return cmd; } +static gboolean stonith_device_execute(stonith_device_t *device) +{ + int rc = 0; + int exec_rc = 0; + async_command_t *cmd = NULL; + CRM_CHECK(device != NULL, return FALSE); + + if(device->active_pid) { + crm_trace("%s is still active with pid %u", device->id, device->active_pid); + return TRUE; + } + + if(device->pending_ops) { + GList *first = device->pending_ops; + device->pending_ops = g_list_remove_link(device->pending_ops, first); + cmd = first->data; + g_list_free_1(first); + } + + if(cmd == NULL) { + crm_info("Nothing to do for %s", device->id); + return TRUE; + } + + cmd->device = crm_strdup(device->id); + exec_rc = run_stonith_agent(device->agent, cmd->action, cmd->victim, + device->params, device->aliases, &rc, NULL, cmd); + + if(exec_rc > 0) { + crm_debug("Operation %s%s%s on %s is active with pid: %d", + cmd->action, cmd->victim?" for node ":"", cmd->victim?cmd->victim:"", + device->id, exec_rc); + device->active_pid = exec_rc; + + } else { + struct _ProcTrack p; + memset(&p, 0, sizeof(struct _ProcTrack)); + p.privatedata = cmd; + + crm_warn("Operation %s%s%s on %s failed (%d/%d)", + cmd->action, cmd->victim?" for node ":"", cmd->victim?cmd->victim:"", + device->id, exec_rc, rc); + exec_child_done(&p, 0, 0, rc<0?rc:exec_rc, 0); + } + return TRUE; +} + +static gboolean stonith_device_dispatch(gpointer user_data) +{ + return stonith_device_execute(user_data); +} + +static void schedule_stonith_command(async_command_t *cmd, stonith_device_t *device) +{ + CRM_CHECK(cmd != NULL, return); + CRM_CHECK(device != NULL, return); + + crm_trace("Scheduling %s on %s", cmd->action, device->id); + device->pending_ops = g_list_append(device->pending_ops, cmd); + mainloop_set_trigger(device->work); +} + static void free_device(gpointer data) { + GListPtr gIter = NULL; stonith_device_t *device = data; g_hash_table_destroy(device->params); g_hash_table_destroy(device->aliases); + + for(gIter = device->pending_ops; gIter != NULL; gIter = gIter->next) { + struct _ProcTrack p; + async_command_t *cmd = gIter->data; + + memset(&p, 0, sizeof(struct _ProcTrack)); + p.privatedata = cmd; + + crm_warn("Removal of device '%s' purged operation %s", device->id, cmd->action); + exec_child_done(&p, 0, 0, st_err_unknown_device, 0); + free_async_command(cmd); + } + g_list_free(device->pending_ops); + slist_basic_destroy(device->targets); crm_free(device->namespace); crm_free(device->agent); crm_free(device->id); crm_free(device); } static GHashTable *build_port_aliases(const char *hostmap, GListPtr *targets) { char *name = NULL; int last = 0, lpc = 0, max = 0, added = 0; GHashTable *aliases = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); if(hostmap == NULL) { return aliases; } max = strlen(hostmap); for(; lpc <= max; lpc++) { switch(hostmap[lpc]) { /* Assignment chars */ case '=': case ':': if(lpc > last) { crm_free(name); crm_malloc0(name, 1 + lpc - last); strncpy(name, hostmap + last, lpc - last); } last = lpc + 1; break; /* Delimeter chars */ /* case ',': Potentially used to specify multiple ports */ case 0: case ';': case ' ': case '\t': if(name) { char *value = NULL; crm_malloc0(value, 1 + lpc - last); strncpy(value, hostmap + last, lpc - last); crm_debug("Adding alias '%s'='%s'", name, value); g_hash_table_replace(aliases, name, value); if(targets) { *targets = g_list_append(*targets, crm_strdup(value)); } value=NULL; name=NULL; added++; } else if(lpc > last) { crm_debug("Parse error at offset %d near '%s'", lpc-last, hostmap+last); } last = lpc + 1; break; } if(hostmap[lpc] == 0) { break; } } if(added == 0) { crm_info("No host mappings detected in '%s'", hostmap); } crm_free(name); return aliases; } static void parse_host_line(const char *line, GListPtr *output) { int lpc = 0; int max = 0; int last = 0; if(line) { max = strlen(line); } else { return; } /* Check for any complaints about additional parameters that the device doesn't understand */ if(strstr(line, "invalid") || strstr(line, "variable")) { crm_debug("Skipping: %s", line); return; } crm_debug_2("Processing: %s", line); /* Skip initial whitespace */ for(lpc = 0; lpc <= max && isspace(line[lpc]); lpc++) { last = lpc+1; } /* Now the actual content */ for(lpc = 0; lpc <= max; lpc++) { gboolean a_space = isspace(line[lpc]); if(a_space && lpc < max && isspace(line[lpc+1])) { /* fast-forward to the end of the spaces */ } else if(a_space || line[lpc] == ',' || line[lpc] == 0) { int rc = 0; char *entry = NULL; crm_malloc0(entry, 1 + lpc - last); rc = sscanf(line+last, "%[a-zA-Z0-9_-.]", entry); if(rc != 1) { crm_warn("Could not parse (%d %d): %s", last, lpc, line+last); } else if(safe_str_neq(entry, "on") && safe_str_neq(entry, "off")) { crm_debug_2("Adding '%s'", entry); *output = g_list_append(*output, entry); entry = NULL; } crm_free(entry); last = lpc + 1; } } } static GListPtr parse_host_list(const char *hosts) { int lpc = 0; int max = 0; int last = 0; GListPtr output = NULL; if(hosts == NULL) { return output; } max = strlen(hosts); for(lpc = 0; lpc <= max; lpc++) { if(hosts[lpc] == '\n' || hosts[lpc] == 0) { char *line = NULL; crm_malloc0(line, 2 + lpc - last); snprintf(line, 1 + lpc - last, "%s", hosts+last); parse_host_line(line, &output); crm_free(line); last = lpc + 1; } } return output; } static stonith_device_t *build_device_from_xml(xmlNode *msg) { xmlNode *dev = get_xpath_object("//"F_STONITH_DEVICE, msg, LOG_ERR); stonith_device_t *device = NULL; crm_malloc0(device, sizeof(stonith_device_t)); device->id = crm_element_value_copy(dev, XML_ATTR_ID); device->agent = crm_element_value_copy(dev, "agent"); device->namespace = crm_element_value_copy(dev, "namespace"); device->params = xml2list(dev); + device->work = mainloop_add_trigger(G_PRIORITY_HIGH, stonith_device_dispatch, device); /* TODO: Hook up priority */ return device; } static int stonith_device_register(xmlNode *msg) { const char *value = NULL; stonith_device_t *device = build_device_from_xml(msg); value = g_hash_table_lookup(device->params, STONITH_ATTR_HOSTLIST); if(value) { device->targets = parse_host_list(value); } value = g_hash_table_lookup(device->params, STONITH_ATTR_HOSTMAP); device->aliases = build_port_aliases(value, &(device->targets)); g_hash_table_replace(device_list, device->id, device); crm_info("Added '%s' to the device list (%d active devices)", device->id, g_hash_table_size(device_list)); return stonith_ok; } static int stonith_device_remove(xmlNode *msg) { xmlNode *dev = get_xpath_object("//"F_STONITH_DEVICE, msg, LOG_ERR); const char *id = crm_element_value(dev, XML_ATTR_ID); if(g_hash_table_remove(device_list, id)) { crm_info("Removed '%s' from the device list (%d active devices)", id, g_hash_table_size(device_list)); } else { crm_info("Device '%s' not found (%d active devices)", id, g_hash_table_size(device_list)); } return stonith_ok; } static gboolean string_in_list(GListPtr list, const char *item) { int lpc = 0; int max = g_list_length(list); for(lpc = 0; lpc < max; lpc ++) { const char *value = g_list_nth_data(list, lpc); if(safe_str_eq(item, value)) { return TRUE; } } return FALSE; } static int stonith_device_action(xmlNode *msg, char **output) { int rc = stonith_ok; xmlNode *dev = get_xpath_object("//"F_STONITH_DEVICE, msg, LOG_ERR); const char *id = crm_element_value(dev, F_STONITH_DEVICE); - const char *action = crm_element_value(dev, F_STONITH_ACTION); async_command_t *cmd = NULL; stonith_device_t *device = NULL; if(id) { crm_debug_2("Looking for '%s'", id); device = g_hash_table_lookup(device_list, id); - - } else { - CRM_CHECK(safe_str_eq(action, "metadata"), crm_log_xml_warn(msg, "StrangeOp")); - - device = build_device_from_xml(msg); - if(device != NULL && device->id == NULL) { - device->id = crm_strdup(device->agent); - } } if(device) { - int exec_rc = 0; - const char *victim = NULL; - - cmd = create_async_command(msg, action); + cmd = create_async_command(msg); if(cmd == NULL) { free_device(device); return st_err_internal; } - cmd->device = crm_strdup(device->id); - crm_debug("Calling '%s' with action '%s'%s%s", - device->id, action, victim?" on port ":"", victim?victim:""); - - exec_rc = run_stonith_agent( - device->agent, action, cmd->victim, device->params, device->aliases, &rc, output, cmd); - if(exec_rc < 0 || rc != 0) { - crm_warn("Operation %s on %s failed (%d/%d): %.100s", - action, device->id, exec_rc, rc, *output); - - } else if(exec_rc > 0) { - crm_debug("Operation %s on %s active with pid: %d", action, device->id, exec_rc); - rc = exec_rc; - - } else { - crm_info("Operation %s on %s passed: %.100s", action, device->id, *output); - } + schedule_stonith_command(cmd, device); + rc = stonith_pending; } else { - crm_notice("Device %s not found", id); + crm_notice("Device %s not found", id?id:""); rc = st_err_unknown_device; } - - if(id == NULL) { - free_device(device); - } return rc; } static gboolean can_fence_host_with_device(stonith_device_t *dev, const char *host) { gboolean can = FALSE; const char *alias = host; const char *check_type = NULL; if(dev == NULL) { return FALSE; } else if(host == NULL) { return TRUE; } if(g_hash_table_lookup(dev->aliases, host)) { alias = g_hash_table_lookup(dev->aliases, host); } check_type = g_hash_table_lookup(dev->params, STONITH_ATTR_HOSTCHECK); if(check_type == NULL) { if(g_hash_table_lookup(dev->params, STONITH_ATTR_HOSTLIST)) { check_type = "static-list"; } else { check_type = "dynamic-list"; } } if(safe_str_eq(check_type, "none")) { can = TRUE; } else if(safe_str_eq(check_type, "static-list")) { /* Presence in the hostmap is sufficient * Only use if all hosts on which the device can be active can always fence all listed hosts */ if(string_in_list(dev->targets, host)) { can = TRUE; } } else if(safe_str_eq(check_type, "dynamic-list")) { time_t now = time(NULL); /* Host/alias must be in the list output to be eligable to be fenced * * Will cause problems if down'd nodes aren't listed or (for virtual nodes) * if the guest is still listed despite being moved to another machine */ if(dev->targets_age < 0) { crm_trace("Port list queries disabled for %s", dev->id); } else if(dev->targets == NULL || dev->targets_age + 60 < now) { char *output = NULL; int rc = stonith_ok; int exec_rc = stonith_ok; /* Check for the target's presence in the output of the 'list' command */ slist_basic_destroy(dev->targets); dev->targets = NULL; exec_rc = run_stonith_agent(dev->agent, "list", NULL, dev->params, NULL, &rc, &output, NULL); if(exec_rc < 0 || rc != 0) { crm_notice("Disabling port list queries for %s (%d/%d): %s", dev->id, exec_rc, rc, output); dev->targets_age = -1; } else { crm_info("Refreshing port list for %s", dev->id); dev->targets = parse_host_list(output); dev->targets_age = now; } crm_free(output); } if(string_in_list(dev->targets, alias)) { can = TRUE; } } else if(safe_str_eq(check_type, "status")) { int rc = 0; int exec_rc = 0; /* Run the status operation for the device/target combination * Will cause problems if the device doesn't return 2 for down'd nodes or * (for virtual nodes) if the device doesn't return 1 for guests that * have been moved to another host */ exec_rc = run_stonith_agent( dev->agent, "status", host, dev->params, dev->aliases, &rc, NULL, NULL); if(exec_rc != 0) { crm_err("Could not invoke %s: rc=%d", dev->id, exec_rc); } else if(rc == 1 /* unkown */) { crm_debug_2("Host %s is not known by %s", host, dev->id); } else if(rc == 0 /* active */ || rc == 2 /* inactive */) { can = TRUE; } else { crm_err("Unkown result calling %s for %s with %s: rc=%d", "status", host, dev->id, rc); } } else { crm_err("Unknown check type: %s", check_type); } if(safe_str_eq(host, alias)) { crm_info("%s can%s fence %s: %s", dev->id, can?"":" not", host, check_type); } else { crm_info("%s can%s fence %s (aka. '%s'): %s", dev->id, can?"":" not", host, alias, check_type); } return can; } struct device_search_s { const char *host; GListPtr capable; }; static void search_devices( gpointer key, gpointer value, gpointer user_data) { stonith_device_t *dev = value; struct device_search_s *search = user_data; if(can_fence_host_with_device(dev, search->host)) { search->capable = g_list_append(search->capable, value); } } static int stonith_query(xmlNode *msg, xmlNode **list) { struct device_search_s search; int available_devices = 0; xmlNode *dev = get_xpath_object("//@"F_STONITH_TARGET, msg, LOG_DEBUG_3); search.host = NULL; search.capable = NULL; if(dev) { search.host = crm_element_value(dev, F_STONITH_TARGET); } crm_log_xml_debug(msg, "Query"); g_hash_table_foreach(device_list, search_devices, &search); available_devices = g_list_length(search.capable); if(search.host) { crm_debug("Found %d matching devices for '%s'", available_devices, search.host); } else { crm_debug("%d devices installed", available_devices); } /* Pack the results into data */ if(list) { GListPtr lpc = NULL; *list = create_xml_node(NULL, __FUNCTION__); crm_xml_add(*list, F_STONITH_TARGET, search.host); crm_xml_add_int(*list, "st-available-devices", available_devices); for(lpc = search.capable; lpc != NULL; lpc = lpc->next) { stonith_device_t *device = (stonith_device_t*)lpc->data; dev = create_xml_node(*list, F_STONITH_DEVICE); crm_xml_add(dev, XML_ATTR_ID, device->id); crm_xml_add(dev, "namespace", device->namespace); crm_xml_add(dev, "agent", device->agent); if(search.host == NULL) { xmlNode *attrs = create_xml_node(dev, XML_TAG_ATTRS); g_hash_table_foreach(device->params, hash2field, attrs); } } } g_list_free(search.capable); return available_devices; } static void log_operation(async_command_t *cmd, int rc, int pid, const char *next, const char *output) { if(rc == 0) { next = NULL; } if(cmd->victim != NULL) { do_crm_log(rc==0?LOG_INFO:LOG_ERR, "Operation '%s' [%d] (call %d from %s) for host '%s' with device '%s' returned: %d%s%s", cmd->action, pid, cmd->id, cmd->client, cmd->victim, cmd->device, rc, next?". Trying: ":"", next?next:""); } else { do_crm_log(rc==0?LOG_DEBUG:LOG_NOTICE, "Operation '%s' [%d] for device '%s' returned: %d%s%s", cmd->action, pid, cmd->device, rc, next?". Trying: ":"", next?next:""); } if(output) { /* Logging the whole string confuses syslog when the string is xml */ char *local_copy = crm_strdup(output); int lpc = 0, last = 0, more = strlen(local_copy); for(lpc = 0; lpc < more; lpc++) { if(local_copy[lpc] == '\n' || local_copy[lpc] == 0) { local_copy[lpc] = 0; do_crm_log(rc==0?LOG_INFO:LOG_ERR, "%s: %s", cmd->device, local_copy+last); last = lpc+1; } } crm_debug("%s output: %s (total %d bytes)", cmd->device, local_copy+last, more); crm_free(local_copy); } } #define READ_MAX 500 static void exec_child_done(ProcTrack* proc, int status, int signum, int rc, int waslogged) { int len = 0; int more = 0; gboolean bcast = FALSE; char *output = NULL; xmlNode *data = NULL; xmlNode *reply = NULL; int pid = proctrack_pid(proc); + stonith_device_t *device = NULL; async_command_t *cmd = proctrack_data(proc); CRM_CHECK(cmd != NULL, return); active_children--; + /* The device is ready to do something else now */ + device = g_hash_table_lookup(device_list, cmd->device); + if(device) { + device->active_pid = 0; + mainloop_set_trigger(device->work); + } + if( signum ) { rc = st_err_signal; if( proctrack_timedout(proc) ) { crm_warn("Child '%d' performing action '%s' with '%s' timed out", pid, cmd->action, cmd->device); rc = st_err_timeout; } } do { char buffer[READ_MAX]; errno = 0; - memset(&buffer, 0, READ_MAX); - more = read(cmd->stdout, buffer, READ_MAX-1); - do_crm_log(status!=0?LOG_DEBUG:LOG_DEBUG_2, - "Got %d more bytes: %s", more, buffer); - + if(cmd->stdout > 0) { + memset(&buffer, 0, READ_MAX); + more = read(cmd->stdout, buffer, READ_MAX-1); + crm_trace("Got %d more bytes: %s", more, buffer); + } + if(more > 0) { crm_realloc(output, len + more + 1); sprintf(output+len, "%s", buffer); len += more; } } while (more == (READ_MAX-1) || (more < 0 && errno == EINTR)); if(cmd->stdout) { close(cmd->stdout); cmd->stdout = 0; } - while(rc != 0 && cmd->device_next) { - int exec_rc = 0; + if(rc != 0 && cmd->device_next) { stonith_device_t *dev = cmd->device_next->data; log_operation(cmd, rc, pid, dev->id, output); - cmd->device = dev->id; cmd->device_next = cmd->device_next->next; - - exec_rc = run_stonith_agent(dev->agent, cmd->action, cmd->victim, dev->params, dev->aliases, &rc, NULL, cmd); - if(exec_rc > 0) { - goto done; - } - pid = exec_rc; + schedule_stonith_command(cmd, dev); + goto done; } if(rc > 0) { rc = st_err_generic; } reply = stonith_construct_async_reply(cmd, output, data, rc); if(safe_str_eq(cmd->action, "metadata")) { /* Too verbose to log */ crm_free(output); output = NULL; } else if(crm_str_eq(cmd->action, "reboot", TRUE) || crm_str_eq(cmd->action, "poweroff", TRUE) || crm_str_eq(cmd->action, "poweron", TRUE) || crm_str_eq(cmd->action, "off", TRUE) || crm_str_eq(cmd->action, "on", TRUE)) { bcast = TRUE; } log_operation(cmd, rc, pid, NULL, output); crm_log_xml_debug_3(reply, "Reply"); - if(bcast) { + if(bcast && !stand_alone) { /* Send reply as T_STONITH_NOTIFY so everyone does notifications * Potentially limit to unsucessful operations to the originator? */ crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); } else if(cmd->origin) { send_cluster_message(cmd->origin, crm_msg_stonith_ng, reply, FALSE); } else { do_local_reply(reply, cmd->client, cmd->options & st_opt_sync_call, FALSE); } free_async_command(cmd); done: reset_proctrack_data(proc); crm_free(output); free_xml(reply); free_xml(data); } static gint sort_device_priority(gconstpointer a, gconstpointer b) { const stonith_device_t *dev_a = a; const stonith_device_t *dev_b = a; if(dev_a->priority > dev_b->priority) { return -1; } else if(dev_a->priority < dev_b->priority) { return 1; } return 0; } static int stonith_fence(xmlNode *msg) { - int rc = 0; struct device_search_s search; stonith_device_t *device = NULL; - async_command_t *cmd = create_async_command(msg, crm_element_value(msg, F_STONITH_ACTION)); + async_command_t *cmd = create_async_command(msg); xmlNode *dev = get_xpath_object("//@"F_STONITH_TARGET, msg, LOG_ERR); if(cmd == NULL) { return st_err_internal; } search.capable = NULL; search.host = crm_element_value(dev, F_STONITH_TARGET); crm_log_xml_info(msg, "Exec"); g_hash_table_foreach(device_list, search_devices, &search); crm_info("Found %d matching devices for '%s'", g_list_length(search.capable), search.host); if(g_list_length(search.capable) == 0) { free_async_command(cmd); return st_err_none_available; } /* Order based on priority */ search.capable = g_list_sort(search.capable, sort_device_priority); device = search.capable->data; cmd->device = device->id; if(g_list_length(search.capable) > 1) { cmd->device_list = search.capable; } - - return run_stonith_agent(device->agent, cmd->action, cmd->victim, device->params, device->aliases, &rc, NULL, cmd); + + schedule_stonith_command(cmd, device); + return stonith_pending; } xmlNode *stonith_construct_reply(xmlNode *request, char *output, xmlNode *data, int rc) { int lpc = 0; xmlNode *reply = NULL; const char *name = NULL; const char *value = NULL; const char *names[] = { F_STONITH_OPERATION, F_STONITH_CALLID, F_STONITH_CLIENTID, F_STONITH_REMOTE, F_STONITH_CALLOPTS }; crm_debug_4("Creating a basic reply"); reply = create_xml_node(NULL, T_STONITH_REPLY); crm_xml_add(reply, "st_origin", __FUNCTION__); crm_xml_add(reply, F_TYPE, T_STONITH_NG); crm_xml_add(reply, "st_output", output); crm_xml_add_int(reply, F_STONITH_RC, rc); CRM_CHECK(request != NULL, crm_warn("Can't create a sane reply"); return reply); for(lpc = 0; lpc < DIMOF(names); lpc++) { name = names[lpc]; value = crm_element_value(request, name); crm_xml_add(reply, name, value); } if(data != NULL) { crm_debug_4("Attaching reply output"); add_message_xml(reply, F_STONITH_CALLDATA, data); } return reply; } xmlNode *stonith_construct_async_reply(async_command_t *cmd, char *output, xmlNode *data, int rc) { xmlNode *reply = NULL; crm_debug_4("Creating a basic reply"); reply = create_xml_node(NULL, T_STONITH_REPLY); crm_xml_add(reply, "st_origin", __FUNCTION__); crm_xml_add(reply, F_TYPE, T_STONITH_NG); crm_xml_add(reply, F_STONITH_OPERATION, cmd->op); crm_xml_add(reply, F_STONITH_REMOTE, cmd->remote); crm_xml_add(reply, F_STONITH_CLIENTID, cmd->client); crm_xml_add_int(reply, F_STONITH_CALLID, cmd->id); crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options); crm_xml_add_int(reply, F_STONITH_RC, rc); crm_xml_add(reply, "st_output", output); if(data != NULL) { crm_info("Attaching reply output"); add_message_xml(reply, F_STONITH_CALLDATA, data); } return reply; } void stonith_command(stonith_client_t *client, xmlNode *request, const char *remote) { int call_options = 0; int rc = st_err_generic; gboolean is_reply = FALSE; gboolean always_reply = FALSE; xmlNode *reply = NULL; xmlNode *data = NULL; char *output = NULL; const char *op = crm_element_value(request, F_STONITH_OPERATION); const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); if(get_xpath_object("//"T_STONITH_REPLY, request, LOG_DEBUG_3)) { is_reply = TRUE; } if(device_list == NULL) { device_list = g_hash_table_new_full( crm_str_hash, g_str_equal, NULL, free_device); } crm_debug("Processing %s%s from %s", op, is_reply?" reply":"", client?client->name:remote); if(crm_str_eq(op, CRM_OP_REGISTER, TRUE)) { return; } else if(crm_str_eq(op, STONITH_OP_DEVICE_ADD, TRUE)) { rc = stonith_device_register(request); do_stonith_notify(call_options, op, rc, request, NULL); } else if(crm_str_eq(op, STONITH_OP_DEVICE_DEL, TRUE)) { rc = stonith_device_remove(request); do_stonith_notify(call_options, op, rc, request, NULL); } else if(crm_str_eq(op, STONITH_OP_CONFIRM, TRUE)) { - async_command_t *cmd = create_async_command(request, crm_element_value(request, F_STONITH_ACTION)); + async_command_t *cmd = create_async_command(request); xmlNode *reply = stonith_construct_async_reply(cmd, NULL, NULL, 0); crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); crm_notice("Broadcasting manual fencing confirmation for node %s", cmd->victim); send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); free_async_command(cmd); free_xml(reply); } else if(crm_str_eq(op, STONITH_OP_EXEC, TRUE)) { rc = stonith_device_action(request, &output); } else if(is_reply && crm_str_eq(op, STONITH_OP_QUERY, TRUE)) { process_remote_stonith_query(request); return; } else if(crm_str_eq(op, STONITH_OP_QUERY, TRUE)) { create_remote_stonith_op(client_id, request, TRUE); /* Record it for the future notification */ rc = stonith_query(request, &data); always_reply = TRUE; } else if(is_reply && crm_str_eq(op, T_STONITH_NOTIFY, TRUE)) { process_remote_stonith_exec(request); return; } else if(crm_str_eq(op, T_STONITH_NOTIFY, TRUE)) { const char *flag_name = NULL; flag_name = crm_element_value(request, F_STONITH_NOTIFY_ACTIVATE); if(flag_name) { crm_debug("Setting %s callbacks for %s (%s): ON", flag_name, client->name, client->id); client->flags |= get_stonith_flag(flag_name); } flag_name = crm_element_value(request, F_STONITH_NOTIFY_DEACTIVATE); if(flag_name) { crm_debug("Setting %s callbacks for %s (%s): off", flag_name, client->name, client->id); client->flags |= get_stonith_flag(flag_name); } return; /* } else if(is_reply && crm_str_eq(op, STONITH_OP_FENCE, TRUE)) { */ /* process_remote_stonith_exec(request); */ /* return; */ } else if(is_reply == FALSE && crm_str_eq(op, STONITH_OP_FENCE, TRUE)) { - if(remote) { + if(remote || stand_alone) { rc = stonith_fence(request); } else if(call_options & st_opt_local_first) { rc = stonith_fence(request); if(rc < 0) { initiate_remote_stonith_op(client, request); + return; } } else { initiate_remote_stonith_op(client, request); + return; } - return; } else if (crm_str_eq(op, STONITH_OP_FENCE_HISTORY, TRUE)) { rc = stonith_fence_history(request, &data); always_reply = TRUE; } else { crm_err("Unknown %s%s from %s", op, is_reply?" reply":"", client?client->name:remote); crm_log_xml_warn(request, "UnknownOp"); } do_crm_log(rc>0?LOG_DEBUG:LOG_INFO,"Processed %s%s from %s: rc=%d", op, is_reply?" reply":"", client?client->name:remote, rc); - if(is_reply) { - /* Nothing */ + if(is_reply || rc == stonith_pending) { + /* Nothing (yet) */ } else if(remote) { reply = stonith_construct_reply(request, output, data, rc); send_cluster_message(remote, crm_msg_stonith_ng, reply, FALSE); free_xml(reply); } else if(rc <= 0 || always_reply) { reply = stonith_construct_reply(request, output, data, rc); do_local_reply(reply, client_id, call_options & st_opt_sync_call, remote!=NULL); free_xml(reply); } crm_free(output); free_xml(data); } diff --git a/fencing/internal.h b/fencing/internal.h index 94ac294119..335c2352e4 100644 --- a/fencing/internal.h +++ b/fencing/internal.h @@ -1,60 +1,64 @@ typedef struct stonith_device_s { char *id; char *agent; char *namespace; GListPtr targets; time_t targets_age; gboolean has_attr_map; guint priority; + guint active_pid; GHashTable *params; GHashTable *aliases; + GList *pending_ops; + crm_trigger_t *work; } stonith_device_t; typedef struct stonith_client_s { char *id; char *name; char *callback_id; const char *channel_name; IPC_Channel *channel; GCHSource *source; long long flags; } stonith_client_t; extern long long get_stonith_flag(const char *name); extern void stonith_command( stonith_client_t *client, xmlNode *op_request, const char *remote); extern void do_local_reply( xmlNode *notify_src, const char *client_id, gboolean sync_reply, gboolean from_peer); extern xmlNode *stonith_construct_reply( xmlNode *request, char *output, xmlNode *data, int rc); extern xmlNode *stonith_construct_async_reply( async_command_t *cmd, char *output, xmlNode *data, int rc);; extern void do_stonith_notify( int options, const char *type, enum stonith_errors result, xmlNode *data, const char *remote); extern void initiate_remote_stonith_op(stonith_client_t *client, xmlNode *request); extern int process_remote_stonith_exec(xmlNode *msg); extern int process_remote_stonith_query(xmlNode *msg); extern void *create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer); extern int stonith_fence_history(xmlNode *msg, xmlNode **output); extern char *stonith_our_uname; +extern gboolean stand_alone; diff --git a/fencing/main.c b/fencing/main.c index ad48c4311d..13be53f454 100644 --- a/fencing/main.c +++ b/fencing/main.c @@ -1,679 +1,679 @@ /* * Copyright (C) 2009 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include char *channel1 = NULL; char *channel2 = NULL; char *stonith_our_uname = NULL; GMainLoop *mainloop = NULL; GHashTable *client_list = NULL; +gboolean stand_alone = FALSE; gboolean stonith_shutdown_flag = FALSE; #if SUPPORT_HEARTBEAT ll_cluster_t *hb_conn = NULL; #endif static gboolean stonith_client_disconnect( IPC_Channel *channel, stonith_client_t *stonith_client) { if (channel == NULL) { CRM_LOG_ASSERT(stonith_client == NULL); } else if (stonith_client == NULL) { crm_err("No client"); } else { CRM_LOG_ASSERT(channel->ch_status != IPC_CONNECT); crm_debug_2("Cleaning up after client disconnect: %s/%s/%s", crm_str(stonith_client->name), stonith_client->channel_name, stonith_client->id); if(stonith_client->id != NULL) { if(!g_hash_table_remove(client_list, stonith_client->id)) { crm_err("Client %s not found in the hashtable", stonith_client->name); } } } return FALSE; } static gboolean stonith_client_callback(IPC_Channel *channel, gpointer user_data) { int lpc = 0; const char *value = NULL; xmlNode *request = NULL; gboolean keep_channel = TRUE; stonith_client_t *stonith_client = user_data; CRM_CHECK(stonith_client != NULL, crm_err("Invalid client"); return FALSE); CRM_CHECK(stonith_client->id != NULL, crm_err("Invalid client: %p", stonith_client); return FALSE); if(IPC_ISRCONN(channel) && channel->ops->is_message_pending(channel)) { lpc++; request = xmlfromIPC(channel, MAX_IPC_DELAY); if (request == NULL) { goto bail; } if(stonith_client->name == NULL) { value = crm_element_value(request, F_STONITH_CLIENTNAME); if(value == NULL) { stonith_client->name = crm_itoa(channel->farside_pid); } else { stonith_client->name = crm_strdup(value); } } crm_xml_add(request, F_STONITH_CLIENTID, stonith_client->id); crm_xml_add(request, F_STONITH_CLIENTNAME, stonith_client->name); if(stonith_client->callback_id == NULL) { value = crm_element_value(request, F_STONITH_CALLBACK_TOKEN); if(value != NULL) { stonith_client->callback_id = crm_strdup(value); } else { stonith_client->callback_id = crm_strdup(stonith_client->id); } } crm_log_xml(LOG_MSG, "Client[inbound]", request); stonith_command(stonith_client, request, NULL); free_xml(request); } bail: if(channel->ch_status != IPC_CONNECT) { crm_debug_2("Client disconnected"); keep_channel = stonith_client_disconnect(channel, stonith_client); } return keep_channel; } static void stonith_client_destroy(gpointer user_data) { stonith_client_t *stonith_client = user_data; if(stonith_client == NULL) { crm_debug_4("Destroying %p", user_data); return; } if(stonith_client->source != NULL) { crm_debug_4("Deleting %s (%p) from mainloop", stonith_client->name, stonith_client->source); G_main_del_IPC_Channel(stonith_client->source); stonith_client->source = NULL; } crm_debug_3("Destroying %s (%p)", stonith_client->name, user_data); crm_free(stonith_client->name); crm_free(stonith_client->callback_id); crm_free(stonith_client->id); crm_free(stonith_client); crm_debug_4("Freed the cib client"); return; } static gboolean stonith_client_connect(IPC_Channel *channel, gpointer user_data) { cl_uuid_t client_id; xmlNode *reg_msg = NULL; stonith_client_t *new_client = NULL; char uuid_str[UU_UNPARSE_SIZEOF]; const char *channel_name = user_data; crm_debug_3("Connecting channel"); CRM_CHECK(channel_name != NULL, return FALSE); if (channel == NULL) { crm_err("Channel was NULL"); return FALSE; } else if (channel->ch_status != IPC_CONNECT) { crm_err("Channel was disconnected"); return FALSE; } else if(stonith_shutdown_flag) { crm_info("Ignoring new client [%d] during shutdown", channel->farside_pid); return FALSE; } crm_malloc0(new_client, sizeof(stonith_client_t)); new_client->channel = channel; new_client->channel_name = channel_name; crm_debug_3("Created channel %p for channel %s", new_client, new_client->channel_name); channel->ops->set_recv_qlen(channel, 1024); channel->ops->set_send_qlen(channel, 1024); new_client->source = G_main_add_IPC_Channel( G_PRIORITY_DEFAULT, channel, FALSE, stonith_client_callback, new_client, stonith_client_destroy); crm_debug_3("Channel %s connected for client %s", new_client->channel_name, new_client->id); cl_uuid_generate(&client_id); cl_uuid_unparse(&client_id, uuid_str); CRM_CHECK(new_client->id == NULL, crm_free(new_client->id)); new_client->id = crm_strdup(uuid_str); /* make sure we can find ourselves later for sync calls * redirected to the master instance */ g_hash_table_insert(client_list, new_client->id, new_client); reg_msg = create_xml_node(NULL, "callback"); crm_xml_add(reg_msg, F_STONITH_OPERATION, CRM_OP_REGISTER); crm_xml_add(reg_msg, F_STONITH_CLIENTID, new_client->id); send_ipc_message(channel, reg_msg); free_xml(reg_msg); return TRUE; } static void stonith_peer_callback(xmlNode * msg, void* private_data) { const char *remote = crm_element_value(msg, F_ORIG); crm_log_xml(LOG_MSG, "Peer[inbound]", msg); stonith_command(NULL, msg, remote); } static void stonith_peer_hb_callback(HA_Message * msg, void* private_data) { xmlNode *xml = convert_ha_message(NULL, msg, __FUNCTION__); stonith_peer_callback(xml, private_data); free_xml(xml); } #if SUPPORT_COROSYNC static gboolean stonith_peer_ais_callback( AIS_Message *wrapper, char *data, int sender) { xmlNode *xml = NULL; if(wrapper->header.id == crm_class_cluster) { xml = string2xml(data); if(xml == NULL) { goto bail; } crm_xml_add(xml, F_ORIG, wrapper->sender.uname); crm_xml_add_int(xml, F_SEQ, wrapper->id); stonith_peer_callback(xml, NULL); } free_xml(xml); return TRUE; bail: crm_err("Invalid XML: '%.120s'", data); return TRUE; } static void stonith_peer_ais_destroy(gpointer user_data) { crm_err("AIS connection terminated"); ais_fd_sync = -1; exit(1); } #endif static void stonith_peer_hb_destroy(gpointer user_data) { if(stonith_shutdown_flag) { crm_info("Heartbeat disconnection complete... exiting"); } else { crm_err("Heartbeat connection lost! Exiting."); } crm_info("Exiting..."); if (mainloop != NULL && g_main_is_running(mainloop)) { g_main_quit(mainloop); } else { exit(LSB_EXIT_OK); } } static int send_via_callback_channel(xmlNode *msg, const char *token) { stonith_client_t *hash_client = NULL; enum stonith_errors rc = stonith_ok; crm_debug_3("Delivering msg %p to client %s", msg, token); if(token == NULL) { crm_err("No client id token, cant send message"); if(rc == stonith_ok) { rc = -1; } } else if(msg == NULL) { crm_err("No message to send"); rc = -1; } else { /* A client that left before we could reply is not really * _our_ error. Warn instead. */ hash_client = g_hash_table_lookup(client_list, token); if(hash_client == NULL) { crm_warn("Cannot find client for token %s", token); rc = -1; } else if (crm_str_eq(hash_client->channel_name, "remote", FALSE)) { /* just hope it's alive */ } else if(hash_client->channel == NULL) { crm_err("Cannot find channel for client %s", token); rc = -1; } } if(rc == stonith_ok) { crm_debug_3("Delivering reply to client %s (%s)", token, hash_client->channel_name); if(send_ipc_message(hash_client->channel, msg) == FALSE) { crm_warn("Delivery of reply to client %s/%s failed", hash_client->name, token); rc = -1; } } return rc; } void do_local_reply(xmlNode *notify_src, const char *client_id, gboolean sync_reply, gboolean from_peer) { /* send callback to originating child */ stonith_client_t *client_obj = NULL; enum stonith_errors local_rc = stonith_ok; crm_debug_2("Sending response"); if(client_id != NULL) { client_obj = g_hash_table_lookup(client_list, client_id); } else { crm_debug_2("No client to sent the response to." " F_STONITH_CLIENTID not set."); } crm_debug_3("Sending callback to request originator"); if(client_obj == NULL) { local_rc = -1; } else { const char *client_id = client_obj->callback_id; crm_debug_2("Sending %ssync response to %s %s", sync_reply?"":"an a-", client_obj->name, from_peer?"(originator of delegated request)":""); if(sync_reply) { client_id = client_obj->id; } local_rc = send_via_callback_channel(notify_src, client_id); } if(local_rc != stonith_ok && client_obj != NULL) { crm_warn("%sSync reply to %s failed: %s", sync_reply?"":"A-", client_obj?client_obj->name:"", stonith_error2string(local_rc)); } } long long get_stonith_flag(const char *name) { if(safe_str_eq(name, STONITH_OP_FENCE)) { return 0x01; } else if(safe_str_eq(name, STONITH_OP_DEVICE_ADD)) { return 0x04; } else if(safe_str_eq(name, STONITH_OP_DEVICE_DEL)) { return 0x10; } return 0; } static void stonith_notify_client(gpointer key, gpointer value, gpointer user_data) { IPC_Channel *ipc_client = NULL; xmlNode *update_msg = user_data; stonith_client_t *client = value; const char *type = NULL; CRM_CHECK(client != NULL, return); CRM_CHECK(update_msg != NULL, return); type = crm_element_value(update_msg, F_SUBTYPE); CRM_CHECK(type != NULL, crm_log_xml_err(update_msg, "notify"); return); if(client == NULL) { crm_warn("Skipping NULL client"); return; } else if(client->channel == NULL) { crm_warn("Skipping client with NULL channel"); return; } else if(client->name == NULL) { crm_debug_2("Skipping unnammed client / comamnd channel"); return; } ipc_client = client->channel; if(client->flags & get_stonith_flag(type)) { crm_info("Sending %s-notification to client %s/%s", type, client->name, client->id); if(ipc_client->send_queue->current_qlen >= ipc_client->send_queue->max_qlen) { /* We never want the STONITH to exit because our client is slow */ crm_crit("%s-notification of client %s/%s failed - queue saturated", type, client->name, client->id); } else if(send_ipc_message(ipc_client, update_msg) == FALSE) { crm_warn("%s-Notification of client %s/%s failed", type, client->name, client->id); } } } void do_stonith_notify( int options, const char *type, enum stonith_errors result, xmlNode *data, const char *remote) { /* TODO: Standardize the contents of data */ xmlNode *update_msg = create_xml_node(NULL, "notify"); CRM_CHECK(type != NULL, ;); crm_xml_add(update_msg, F_TYPE, T_STONITH_NOTIFY); crm_xml_add(update_msg, F_SUBTYPE, type); crm_xml_add(update_msg, F_STONITH_OPERATION, type); crm_xml_add_int(update_msg, F_STONITH_RC, result); if(data != NULL) { add_message_xml(update_msg, F_STONITH_CALLDATA, data); } crm_debug_3("Notifying clients"); g_hash_table_foreach(client_list, stonith_notify_client, update_msg); free_xml(update_msg); crm_debug_3("Notify complete"); } static void stonith_shutdown(int nsig) { stonith_shutdown_flag = TRUE; crm_info("Terminating with %d clients", g_hash_table_size(client_list)); stonith_client_disconnect(NULL, NULL); exit(0); } static void stonith_cleanup(void) { crm_peer_destroy(); g_hash_table_destroy(client_list); crm_free(stonith_our_uname); #if HAVE_LIBXML2 crm_xml_cleanup(); #endif crm_free(channel1); } static struct crm_option long_options[] = { {"stand-alone", 0, 0, 's'}, {"verbose", 0, 0, 'V'}, {"version", 0, 0, '$'}, {"help", 0, 0, '?'}, {0, 0, 0, 0} }; int main(int argc, char ** argv) { int flag; int rc = 0; int lpc = 0; int argerr = 0; int option_index = 0; - gboolean stand_alone = FALSE; const char *actions[] = { "reboot", "poweroff", "list", "monitor", "status" }; set_crm_log_level(LOG_INFO); crm_system_name = "stonith-ng"; crm_set_options(NULL, "mode [options]", long_options, "Provides a summary of cluster's current state." "\n\nOutputs varying levels of detail in a number of different formats.\n"); while (1) { flag = crm_get_option(argc, argv, &option_index); if (flag == -1) break; switch(flag) { case 'V': alter_debug(DEBUG_INC); cl_log_enable_stderr(1); break; case 's': stand_alone = TRUE; cl_log_enable_stderr(1); break; case '$': case '?': crm_help(flag, LSB_EXIT_OK); break; default: ++argerr; break; } } if(argc - optind == 1 && safe_str_eq("metadata", argv[optind])) { printf("\n"); printf("\n"); printf(" 1.0\n"); printf(" This is a fake resource that details the instance attributes handled by stonithd.\n"); printf(" Options available for all stonith resources\n"); printf(" \n"); printf(" \n"); printf(" How long to wait for the STONITH action to complete.\n"); printf(" Overrides the stonith-timeout cluster property\n"); printf(" \n"); printf(" \n"); printf(" \n"); printf(" The priority of the stonith resource. The lower the number, the higher the priority.\n"); printf(" \n"); printf(" \n"); printf(" \n", STONITH_ATTR_HOSTARG); printf(" Advanced use only: An alternate parameter to supply instead of 'port'\n"); printf(" Some devices do not support the standard 'port' parameter or may provide additional ones.\n" "Use this to specify an alternate, device-specific, parameter that should indicate the machine to be fenced.\n" "A value of 'none' can be used to tell the cluster not to supply any additional parameters.\n" " \n"); printf(" \n"); printf(" \n"); printf(" \n", STONITH_ATTR_HOSTMAP); printf(" A mapping of host names to ports numbers for devices that do not support host names.\n"); printf(" Eg. node1:1;node2:2,3 would tell the cluster to use port 1 for node1 and ports 2 and 3 for node2\n"); printf(" \n"); printf(" \n"); printf(" \n", STONITH_ATTR_HOSTLIST); printf(" A list of machines controlled by this device (Optional unless %s=static-list).\n", STONITH_ATTR_HOSTCHECK); printf(" \n"); printf(" \n"); printf(" \n", STONITH_ATTR_HOSTCHECK); printf(" How to determin which machines are controlled by the device.\n"); printf(" Allowed values: dynamic-list (query the device), static-list (check the %s attribute), none (assume every device can fence every machine)\n", STONITH_ATTR_HOSTLIST); printf(" \n"); printf(" \n"); for(lpc = 0; lpc < DIMOF(actions); lpc++) { printf(" \n", actions[lpc]); printf(" Advanced use only: An alternate command to run instead of '%s'\n", actions[lpc]); printf(" Some devices do not support the standard commands or may provide additional ones.\n" "Use this to specify an alternate, device-specific, command that implements the '%s' action.\n", actions[lpc]); printf(" \n", actions[lpc]); printf(" \n"); } printf(" \n"); printf("\n"); return 0; } if (optind != argc) { ++argerr; } if (argerr) { crm_help('?', LSB_EXIT_GENERIC); } crm_log_init("stonith-ng", crm_log_level, TRUE, TRUE, argc, argv); mainloop_add_signal(SIGTERM, stonith_shutdown); /* EnableProcLogging(); */ set_sigchld_proctrack(G_PRIORITY_HIGH,DEFAULT_MAXDISPATCHTIME); crm_peer_init(); client_list = g_hash_table_new(crm_str_hash, g_str_equal); if(stand_alone == FALSE) { void *dispatch = stonith_peer_hb_callback; void *destroy = stonith_peer_hb_destroy; if(is_openais_cluster()) { #if SUPPORT_COROSYNC destroy = stonith_peer_ais_destroy; dispatch = stonith_peer_ais_callback; #endif } if(crm_cluster_connect(&stonith_our_uname, NULL, dispatch, destroy, #if SUPPORT_HEARTBEAT &hb_conn #else NULL #endif ) == FALSE){ crm_crit("Cannot sign in to the cluster... terminating"); exit(100); } } else { stonith_our_uname = crm_strdup("localhost"); } channel1 = crm_strdup(stonith_channel); rc = init_server_ipc_comms( channel1, stonith_client_connect, default_ipc_connection_destroy); channel2 = crm_strdup(stonith_channel_callback); rc = init_server_ipc_comms( channel2, stonith_client_connect, default_ipc_connection_destroy); if(rc == 0) { /* Create the mainloop and run it... */ mainloop = g_main_new(FALSE); crm_info("Starting %s mainloop", crm_system_name); g_main_run(mainloop); } else { crm_err("Couldnt start all communication channels, exiting."); } stonith_cleanup(); #if SUPPORT_HEARTBEAT if(hb_conn) { hb_conn->llc_ops->delete(hb_conn); } #endif crm_info("Done"); return rc; } diff --git a/fencing/remote.c b/fencing/remote.c index de7a14cc04..eafbad1725 100644 --- a/fencing/remote.c +++ b/fencing/remote.c @@ -1,553 +1,553 @@ /* * Copyright (C) 2009 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef struct st_query_result_s { char *host; int devices; } st_query_result_t; typedef struct remote_fencing_op_s { char *id; char *target; char *action; guint replies; guint op_timer; guint query_timer; guint base_timeout; char *delegate; time_t completed; long long call_options; enum op_state state; char *client_id; char *originator; GListPtr query_results; xmlNode *request; } remote_fencing_op_t; GHashTable *remote_op_list = NULL; static void call_remote_stonith(remote_fencing_op_t *op, st_query_result_t *peer); extern xmlNode *stonith_create_op( int call_id, const char *token, const char *op, xmlNode *data, int call_options); static void free_remote_query(gpointer data) { if(data) { st_query_result_t *query = data; crm_free(query->host); crm_free(query); } } static void free_remote_op(gpointer data) { remote_fencing_op_t *op = data; crm_log_xml_debug(op->request, "Destroying"); crm_free(op->id); crm_free(op->action); crm_free(op->target); crm_free(op->client_id); crm_free(op->originator); if(op->query_timer) { g_source_remove(op->query_timer); } if(op->op_timer) { g_source_remove(op->op_timer); } if(op->query_results) { slist_destroy(st_query_result_t, result, op->query_results, free_remote_query(result); ); } if(op->request) { free_xml(op->request); op->request = NULL; } crm_free(op); } static void remote_op_done(remote_fencing_op_t *op, xmlNode *data, int rc) { int call = 0; xmlNode *reply = NULL; xmlNode *local_data = NULL; xmlNode *notify_data = NULL; op->completed = time(NULL); if(op->request != NULL) { crm_element_value_int(op->request, F_STONITH_CALLID, &call); /* else: keep going, make sure the details are accurate for ops that arrive late */ } if(op->query_timer) { g_source_remove(op->query_timer); op->query_timer = 0; } if(op->op_timer) { g_source_remove(op->op_timer); op->op_timer = 0; } if(data == NULL) { data = create_xml_node(NULL, "remote-op"); local_data = data; } else { op->delegate = crm_element_value_copy(data, F_ORIG); } crm_xml_add_int(data, "state", op->state); crm_xml_add(data, F_STONITH_TARGET, op->target); crm_xml_add(data, F_STONITH_OPERATION, op->action); if(op->request != NULL) { reply = stonith_construct_reply(op->request, NULL, data, rc); crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); crm_info("Notifing clients of %s (%s of %s from %s by %s): %d, rc=%d", op->id, op->action, op->target, op->client_id, op->delegate, op->state, rc); } else { crm_err("We've already notified clients of %s (%s of %s from %s by %s): %d, rc=%d", op->id, op->action, op->target, op->client_id, op->delegate, op->state, rc); return; } if(call && reply) { /* Don't bother with this if there is no callid - and thus the op originated elsewhere */ do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); } /* Do notification with a clean data object */ notify_data = create_xml_node(NULL, "st-data"); crm_xml_add_int(notify_data, "state", op->state); crm_xml_add_int(notify_data, F_STONITH_RC, rc); crm_xml_add(notify_data, F_STONITH_TARGET, op->target); crm_xml_add(notify_data, F_STONITH_OPERATION, op->action); crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate); crm_xml_add(notify_data, F_STONITH_REMOTE, op->id); crm_xml_add(notify_data, F_STONITH_ORIGIN, op->originator); do_stonith_notify(0, STONITH_OP_FENCE, rc, notify_data, NULL); free_xml(notify_data); free_xml(local_data); free_xml(reply); /* Free non-essential parts of the record * Keep the record around so we can query the history */ if(op->query_results) { slist_destroy(st_query_result_t, result, op->query_results, free_remote_query(result); ); op->query_results = NULL; } if(op->request) { free_xml(op->request); op->request = NULL; } } static gboolean remote_op_timeout(gpointer userdata) { remote_fencing_op_t *op = userdata; op->query_timer = 0; if(op->state == st_done) { crm_debug("Action %s (%s) for %s already completed", op->action, op->id, op->target); return FALSE; } crm_err("Action %s (%s) for %s timed out", op->action, op->id, op->target); remote_op_done(op, NULL, st_err_timeout); op->state = st_failed; return FALSE; } static gboolean remote_op_query_timeout(gpointer data) { remote_fencing_op_t *op = data; op->query_timer = 0; if(op->state == st_done) { crm_debug("Operation %s for %s already completed", op->id, op->target); } else if(op->state == st_exec) { crm_debug("Operation %s for %s already in progress", op->id, op->target); } else if(op->query_results) { crm_info("Query %s for %s complete: %d", op->id, op->target, op->state); call_remote_stonith(op, NULL); } else { crm_err("Query %s for %s timed out", op->id, op->target); if(op->op_timer) { g_source_remove(op->op_timer); op->op_timer = 0; } remote_op_timeout(op); } return FALSE; } void *create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer) { remote_fencing_op_t *op = NULL; xmlNode *dev = get_xpath_object("//@"F_STONITH_TARGET, request, LOG_ERR); if(remote_op_list == NULL) { remote_op_list = g_hash_table_new_full( crm_str_hash, g_str_equal, NULL, free_remote_op); } if(peer) { const char *peer_id = crm_element_value(dev, F_STONITH_REMOTE); CRM_CHECK(peer_id != NULL, return NULL); op = g_hash_table_lookup(remote_op_list, peer_id); if(op) { crm_debug("%s already exists", peer_id); return op; } } crm_malloc0(op, sizeof(remote_fencing_op_t)); crm_element_value_int(request, F_STONITH_TIMEOUT, (int*)&(op->base_timeout)); if(peer) { op->id = crm_element_value_copy(dev, F_STONITH_REMOTE); } else { cl_uuid_t new_uuid; char uuid_str[UU_UNPARSE_SIZEOF]; cl_uuid_generate(&new_uuid); cl_uuid_unparse(&new_uuid, uuid_str); op->id = crm_strdup(uuid_str); } g_hash_table_replace(remote_op_list, op->id, op); op->state = st_query; op->action = crm_element_value_copy(dev, F_STONITH_ACTION); op->originator = crm_element_value_copy(dev, "src"); if(op->originator == NULL) { /* Local request */ op->originator = crm_strdup(stonith_our_uname); } op->client_id = crm_strdup(client); op->target = crm_element_value_copy(dev, F_STONITH_TARGET); op->request = copy_xml(request); /* TODO: Figure out how to avoid this */ crm_element_value_int(request, F_STONITH_CALLOPTS, (int*)&(op->call_options)); return op; } void initiate_remote_stonith_op(stonith_client_t *client, xmlNode *request) { xmlNode *query = NULL; remote_fencing_op_t *op = NULL; crm_log_xml_debug(request, "RemoteOp"); op = create_remote_stonith_op(client->id, request, FALSE); - op->op_timer = g_timeout_add(1000*op->base_timeout, remote_op_timeout, op); + op->op_timer = g_timeout_add(1200*op->base_timeout, remote_op_timeout, op); op->query_timer = g_timeout_add(100*op->base_timeout, remote_op_query_timeout, op); query = stonith_create_op(0, op->id, STONITH_OP_QUERY, NULL, 0); crm_xml_add(query, F_STONITH_REMOTE, op->id); crm_xml_add(query, F_STONITH_TARGET, op->target); crm_xml_add(query, F_STONITH_ACTION, op->action); crm_xml_add(query, F_STONITH_CLIENTID, op->client_id); - crm_xml_add_int(query, F_STONITH_TIMEOUT, 100*op->base_timeout); + crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout); crm_info("Initiating remote operation %s for %s: %s", op->action, op->target, op->id); CRM_CHECK(op->action, return); send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE); free_xml(query); } static void call_remote_stonith(remote_fencing_op_t *op, st_query_result_t *peer) { xmlNode *query = stonith_create_op(0, op->id, STONITH_OP_FENCE, NULL, 0);; crm_xml_add(query, F_STONITH_REMOTE, op->id); crm_xml_add(query, F_STONITH_TARGET, op->target); crm_xml_add(query, F_STONITH_ACTION, op->action); crm_xml_add_int(query, F_STONITH_TIMEOUT, 900*op->base_timeout); op->state = st_exec; while(peer == NULL && op->query_results) { peer = g_list_nth_data(op->query_results, 0); op->query_results = g_list_remove(op->query_results, peer); if(peer && peer->devices < 1) { free_remote_query(peer); peer = NULL; } } if(peer) { crm_info("Requesting that %s perform op %s %s", peer->host, op->action, op->target); send_cluster_message(peer->host, crm_msg_stonith_ng, query, FALSE); } else if(op->query_timer == 0) { /* We've exhausted all available peers */ crm_info("No remaining peers capable of terminating %s", op->target); remote_op_timeout(op); } else { crm_info("Waiting for additional peers capable of terminating %s", op->target); } free_remote_query(peer); free_xml(query); } static gint sort_peers(gconstpointer a, gconstpointer b) { const st_query_result_t *peer_a = a; const st_query_result_t *peer_b = a; /* TODO: Factor in priority? */ if(peer_a->devices > peer_b->devices) { return -1; } else if(peer_a->devices > peer_b->devices) { return 1; } return 0; } int process_remote_stonith_query(xmlNode *msg) { int devices = 0; const char *id = NULL; remote_fencing_op_t *op = NULL; st_query_result_t *result = NULL; xmlNode *dev = get_xpath_object("//@"F_STONITH_REMOTE, msg, LOG_ERR); crm_log_xml_debug(msg, "QueryResult"); CRM_CHECK(dev != NULL, return st_err_internal); id = crm_element_value(dev, F_STONITH_REMOTE); CRM_CHECK(id != NULL, return st_err_internal); dev = get_xpath_object("//@st-available-devices", msg, LOG_ERR); CRM_CHECK(dev != NULL, return st_err_internal); crm_element_value_int(dev, "st-available-devices", &devices); op = g_hash_table_lookup(remote_op_list, id); if(op == NULL) { crm_debug("Unknown or expired remote op: %s", id); return st_err_unknown_operation; } op->replies++; crm_malloc0(result, sizeof(st_query_result_t)); result->host = crm_element_value_copy(msg, F_ORIG); result->devices = devices; /* TODO: Implement options * A) If we have anyone that can do the job * B) If we have someone that can do the job and some percent of the known peers * C) If all known peers have responded * * Implement A first */ /* Track A */ if(result->devices > 0) { gboolean do_queue = FALSE; gboolean do_exec = FALSE; if(op->call_options & st_opt_allow_suicide) { crm_info("Allowing %s to potentialy fence itself", op->target); } else if(safe_str_eq(result->host, op->target)) { crm_info("Ignoring reply from %s, hosts are not permitted to commit suicide", op->target); free_remote_query(result); return 0; } switch(op->state) { case st_query: if( op->call_options & st_opt_all_replies ) { do_queue = TRUE; } else { do_exec = TRUE; } break; case st_exec: do_queue = TRUE; break; case st_failed: do_exec = TRUE; break; case st_done: crm_info("Discarding query result from %s (%d deices): Operation is in state %d", result->host, result->devices, op->state); break; } if(do_exec) { call_remote_stonith(op, result); } else if(do_queue) { crm_info("Queuing query result from %s (%d devices): %s", result->host, result->devices, op->state==st_query?"Waiting for remaining replies":"Operation is pending"); op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers); } else { free_remote_query(result); } } else { free_remote_query(result); } return 0; } int process_remote_stonith_exec(xmlNode *msg) { int rc = 0; const char *id = NULL; remote_fencing_op_t *op = NULL; xmlNode *dev = get_xpath_object("//@"F_STONITH_REMOTE, msg, LOG_ERR); crm_log_xml_info(msg, "ExecResult"); CRM_CHECK(dev != NULL, return st_err_internal); id = crm_element_value(dev, F_STONITH_REMOTE); CRM_CHECK(id != NULL, return st_err_internal); dev = get_xpath_object("//@"F_STONITH_RC, msg, LOG_ERR); CRM_CHECK(dev != NULL, return st_err_internal); if(remote_op_list) { op = g_hash_table_lookup(remote_op_list, id); } if(op == NULL) { /* Could be for an event that began before we started */ /* TODO: Record the op for later querying */ crm_info("Unknown or expired remote op: %s", id); return st_err_unknown_operation; } crm_element_value_int(dev, F_STONITH_RC, &rc); if(rc == stonith_ok || op->state != st_exec) { op->state = st_done; remote_op_done(op, msg, rc); } else if(rc < stonith_ok && op->state == st_exec) { call_remote_stonith(op, NULL); } return rc; } int stonith_fence_history(xmlNode *msg, xmlNode **output) { int rc = 0; const char *target = NULL; xmlNode *dev = get_xpath_object("//@"F_STONITH_TARGET, msg, LOG_TRACE); if(dev) { target = crm_element_value(dev, F_STONITH_TARGET); } *output = create_xml_node(NULL, F_STONITH_HISTORY_LIST); if (remote_op_list) { GHashTableIter iter; remote_fencing_op_t *op = NULL; g_hash_table_iter_init(&iter, remote_op_list); while(g_hash_table_iter_next(&iter, NULL, (void**)&op)) { xmlNode *entry = NULL; if (target && strcmp(op->target, target) != 0) { continue; } rc = 0; entry = create_xml_node(*output, STONITH_OP_EXEC); crm_xml_add(entry, F_STONITH_TARGET, op->target); crm_xml_add(entry, F_STONITH_ACTION, op->action); crm_xml_add(entry, F_STONITH_ORIGIN, op->originator); crm_xml_add(entry, F_STONITH_DELEGATE, op->delegate); crm_xml_add_int(entry, F_STONITH_DATE, op->completed); crm_xml_add_int(entry, F_STONITH_STATE, op->state); } } return rc; } diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h index 2f4f40c129..aa706f31cb 100644 --- a/include/crm/stonith-ng.h +++ b/include/crm/stonith-ng.h @@ -1,205 +1,209 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef STONITH_NG__H #define STONITH_NG__H #include #include #include enum stonith_state { stonith_connected_command, stonith_connected_query, stonith_disconnected, }; enum stonith_call_options { st_opt_none = 0x00000000, st_opt_verbose = 0x00000001, st_opt_allow_suicide = 0x00000002, st_opt_local_first = 0x00000004, st_opt_discard_reply = 0x00000010, st_opt_all_replies = 0x00000020, st_opt_scope_local = 0x00000100, st_opt_sync_call = 0x00001000, }; #define stonith_default_options = stonith_none enum stonith_errors { stonith_ok = 0, - st_err_generic = -1, - st_err_internal = -2, - st_err_not_supported = -3, - st_err_connection = -4, - st_err_missing = -5, - st_err_exists = -6, - st_err_timeout = -7, - st_err_ipc = -8, - st_err_peer = -9, - st_err_unknown_operation = -10, - st_err_unknown_device = -11, - st_err_unknown_port = -12, - st_err_none_available = -13, - st_err_authentication = -14, - st_err_signal = -15, + stonith_pending = -1, + st_err_generic = -2, + st_err_internal = -3, + st_err_not_supported = -4, + st_err_connection = -5, + st_err_missing = -6, + st_err_exists = -7, + st_err_timeout = -8, + st_err_ipc = -9, + st_err_peer = -10, + st_err_unknown_operation = -11, + st_err_unknown_device = -12, + st_err_unknown_port = -13, + st_err_none_available = -14, + st_err_authentication = -15, + st_err_signal = -16, + st_err_agent_fork = -17, + st_err_agent_args = -18, + st_err_agent = -19, }; #define F_STONITH_CLIENTID "st_clientid" #define F_STONITH_CALLOPTS "st_callopt" #define F_STONITH_CALLID "st_callid" #define F_STONITH_CALLDATA "st_calldata" #define F_STONITH_OPERATION "st_op" #define F_STONITH_TARGET "st_target" #define F_STONITH_REMOTE "st_remote_op" #define F_STONITH_RC "st_rc" #define F_STONITH_TIMEOUT "st_timeout" #define F_STONITH_CALLBACK_TOKEN "st_async_id" #define F_STONITH_CLIENTNAME "st_clientname" #define F_STONITH_NOTIFY_TYPE "st_notify_type" #define F_STONITH_NOTIFY_ACTIVATE "st_notify_activate" #define F_STONITH_NOTIFY_DEACTIVATE "st_notify_deactivate" #define F_STONITH_DELEGATE "st_delegate" #define F_STONITH_ORIGIN "st_origin" #define F_STONITH_HISTORY_LIST "st_history" #define F_STONITH_DATE "st_date" #define F_STONITH_STATE "st_state" #define T_STONITH_NG "stonith-ng" #define T_STONITH_REPLY "st-reply" #define F_STONITH_DEVICE "st_device_id" #define F_STONITH_ACTION "st_device_action" #define T_STONITH_NOTIFY "st_notify" #define T_STONITH_NOTIFY_DISCONNECT "st_notify_disconnect" #define STONITH_ATTR_ARGMAP "pcmk_arg_map" #define STONITH_ATTR_HOSTARG "pcmk_host_argument" #define STONITH_ATTR_HOSTMAP "pcmk_host_map" #define STONITH_ATTR_HOSTLIST "pcmk_host_list" #define STONITH_ATTR_HOSTCHECK "pcmk_host_check" #define STONITH_ATTR_ACTION_OP "option" /* To be replaced by 'action' at some point */ #define STONITH_OP_EXEC "st_execute" #define STONITH_OP_QUERY "st_query" #define STONITH_OP_FENCE "st_fence" #define STONITH_OP_CONFIRM "st_confirm" #define STONITH_OP_DEVICE_ADD "st_device_register" #define STONITH_OP_DEVICE_DEL "st_device_remove" #define STONITH_OP_DEVICE_METADATA "st_device_metadata" #define STONITH_OP_FENCE_HISTORY "st_fence_history" #define stonith_channel "st_command" #define stonith_channel_callback "st_callback" enum op_state { st_query, st_exec, st_done, st_failed, }; typedef struct stonith_key_value_s { char *key; char *value; struct stonith_key_value_s *next; } stonith_key_value_t; typedef struct stonith_history_s { char *target; char *action; char *origin; char *delegate; int completed; int state; struct stonith_history_s *next; } stonith_history_t; typedef struct stonith_s stonith_t; typedef struct stonith_api_operations_s { int (*free) (stonith_t *st); int (*connect) (stonith_t *st, const char *name, int *stonith_fd); int (*disconnect)(stonith_t *st); int (*remove_device)( stonith_t *st, int options, const char *name); int (*register_device)( stonith_t *st, int options, const char *id, const char *namespace, const char *agent, stonith_key_value_t *params); int (*metadata)(stonith_t *st, int options, const char *device, const char *namespace, char **output, int timeout); int (*list)(stonith_t *stonith, int call_options, const char *namespace, stonith_key_value_t **devices, int timeout); int (*call)(stonith_t *st, int options, const char *id, const char *action, const char *port, int timeout); int (*query)(stonith_t *st, int options, const char *node, stonith_key_value_t **devices, int timeout); int (*fence)(stonith_t *st, int options, const char *node, const char *action, int timeout); int (*confirm)(stonith_t *st, int options, const char *node); int (*history)(stonith_t *st, int options, const char *node, stonith_history_t **output, int timeout); int (*register_notification)( stonith_t *st, const char *event, void (*notify)(stonith_t *st, const char *event, xmlNode *msg)); int (*remove_notification)(stonith_t *st, const char *event); int (*register_callback)( stonith_t *st, int call_id, int timeout, bool only_success, void *userdata, const char *callback_name, void (*callback)(stonith_t *st, const xmlNode *msg, int call, int rc, xmlNode *output, void *userdata)); int (*remove_callback)(stonith_t *st, int call_id, bool all_callbacks); } stonith_api_operations_t; struct stonith_s { enum stonith_state state; int call_id; int call_timeout; void *private; stonith_api_operations_t *cmds; }; /* Core functions */ extern stonith_t *stonith_api_new(void); extern void stonith_api_delete(stonith_t *st); extern const char *stonith_error2string(enum stonith_errors return_code); extern void stonith_dump_pending_callbacks(stonith_t *st); extern const char *get_stonith_provider(const char *agent, const char *provider); extern bool stonith_dispatch(stonith_t *st); extern stonith_key_value_t *stonith_key_value_add(stonith_key_value_t *kvp, const char *key, const char *value); extern void stonith_key_value_freeall(stonith_key_value_t *kvp, int keys, int values); #endif diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c index 21140e74b7..a2370b8d93 100644 --- a/lib/fencing/st_client.c +++ b/lib/fencing/st_client.c @@ -1,1819 +1,1817 @@ /* * Copyright (c) 2004 Andrew Beekhof * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include #include #include #include #include #include #include #include #include /* Add it for compiling on OSX */ #include #include #include #include #include #include -#define FE_AGENT_FORK -2 -#define FE_AGENT_ERROR -3 - CRM_TRACE_INIT_DATA(stonith); typedef struct stonith_private_s { char *token; IPC_Channel *command_channel; IPC_Channel *callback_channel; GCHSource *callback_source; GHashTable *stonith_op_callback_table; GList *notify_list; void (*op_callback)( stonith_t *st, const xmlNode *msg, int call, int rc, xmlNode *output, void *userdata); } stonith_private_t; typedef struct stonith_notify_client_s { const char *event; const char *obj_id; /* implement one day */ const char *obj_type; /* implement one day */ void (*notify)(stonith_t *st, const char *event, xmlNode *msg); } stonith_notify_client_t; typedef struct stonith_callback_client_s { void (*callback)( stonith_t *st, const xmlNode *msg, int call, int rc, xmlNode *output, void *userdata); const char *id; void *user_data; gboolean only_success; struct timer_rec_s *timer; } stonith_callback_client_t; struct notify_blob_s { stonith_t *stonith; xmlNode *xml; }; struct timer_rec_s { int call_id; int timeout; guint ref; stonith_t *stonith; }; typedef enum stonith_errors (*stonith_op_t)( const char *, int, const char *, xmlNode *, xmlNode*, xmlNode*, xmlNode**, xmlNode**); static const char META_TEMPLATE[] = "\n" "\n" "\n" " 1.0\n" " \n" "%s\n" " \n" " %s\n" "%s\n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " 2.0\n" " \n" "\n"; bool stonith_dispatch(stonith_t *st); gboolean stonith_dispatch_internal(IPC_Channel *channel, gpointer user_data); void stonith_perform_callback(stonith_t *stonith, xmlNode *msg, int call_id, int rc); xmlNode *stonith_create_op( int call_id, const char *token, const char *op, xmlNode *data, int call_options); int stonith_send_command( stonith_t *stonith, const char *op, xmlNode *data, xmlNode **output_data, int call_options, int timeout); static void stonith_connection_destroy(gpointer user_data); static void stonith_send_notification(gpointer data, gpointer user_data); static void stonith_connection_destroy(gpointer user_data) { stonith_t *stonith = user_data; stonith_private_t *native = NULL; struct notify_blob_s blob; blob.stonith = stonith; blob.xml = create_xml_node(NULL, "notify"); native = stonith->private; native->callback_source = NULL; stonith->state = stonith_disconnected; crm_xml_add(blob.xml, F_TYPE, T_STONITH_NOTIFY); crm_xml_add(blob.xml, F_SUBTYPE, T_STONITH_NOTIFY_DISCONNECT); g_list_foreach(native->notify_list, stonith_send_notification, &blob); free_xml(blob.xml); } static int stonith_api_register_device( stonith_t *stonith, int call_options, const char *id, const char *namespace, const char *agent, stonith_key_value_t *params) { int rc = 0; xmlNode *data = create_xml_node(NULL, F_STONITH_DEVICE); xmlNode *args = create_xml_node(data, XML_TAG_ATTRS); crm_xml_add(data, XML_ATTR_ID, id); crm_xml_add(data, "origin", __FUNCTION__); crm_xml_add(data, "agent", agent); crm_xml_add(data, "namespace", namespace); for( ; params; params = params->next ) { hash2field( (gpointer)params->key, (gpointer)params->value, args ); } rc = stonith_send_command(stonith, STONITH_OP_DEVICE_ADD, data, NULL, call_options, 0); free_xml(data); return rc; } static int stonith_api_remove_device( stonith_t *stonith, int call_options, const char *name) { int rc = 0; xmlNode *data = NULL; data = create_xml_node(NULL, F_STONITH_DEVICE); crm_xml_add(data, "origin", __FUNCTION__); crm_xml_add(data, XML_ATTR_ID, name); rc = stonith_send_command(stonith, STONITH_OP_DEVICE_DEL, data, NULL, call_options, 0); free_xml(data); return rc; } static void append_arg( gpointer key, gpointer value, gpointer user_data) { int len = 3; /* =, \n, \0 */ int last = 0; char **args = user_data; CRM_CHECK(key != NULL, return); CRM_CHECK(value != NULL, return); if(strstr(key, "pcmk_")) { return; } else if(strstr(key, CRM_META)) { return; } else if(safe_str_eq(key, "crm_feature_set")) { return; } len += strlen(key); len += strlen(value); if(*args != NULL) { last = strlen(*args); } crm_realloc(*args, last+len); crm_debug_2("Appending: %s=%s", (char *)key, (char *)value); sprintf((*args)+last, "%s=%s\n", (char *)key, (char *)value); } static void append_const_arg(const char *key, const char *value, char **arg_list) { char *glib_sucks_key = crm_strdup(key); char *glib_sucks_value = crm_strdup(value); append_arg(glib_sucks_key, glib_sucks_value, arg_list); crm_free(glib_sucks_value); crm_free(glib_sucks_key); } static void append_host_specific_args(const char *victim, const char *map, GHashTable *params, char **arg_list) { char *name = NULL; int last = 0, lpc = 0, max = 0; if(map == NULL) { /* The best default there is for now... */ crm_debug("Using default arg map: port=uname"); append_const_arg("port", victim, arg_list); return; } max = strlen(map); crm_debug("Processing arg map: %s", map); for(; lpc < max + 1; lpc++) { if(isalpha(map[lpc])) { /* keep going */ } else if(map[lpc] == '=' || map[lpc] == ':') { crm_free(name); crm_malloc0(name, 1 + lpc - last); strncpy(name, map + last, lpc - last); crm_debug("Got name: %s", name); last = lpc + 1; } else if(map[lpc] == 0 || map[lpc] == ',' || isspace(map[lpc])) { char *param = NULL; const char *value = NULL; crm_malloc0(param, 1 + lpc - last); strncpy(param, map + last, lpc - last); last = lpc + 1; crm_debug("Got key: %s", param); if(name == NULL) { crm_err("Misparsed '%s', found '%s' without a name", map, param); crm_free(param); continue; } if(safe_str_eq(param, "uname")) { value = victim; } else { char *key = crm_meta_name(param); value = g_hash_table_lookup(params, key); crm_free(key); } if(value) { crm_debug("Setting '%s'='%s' (%s) for %s", name, value, param, victim); append_const_arg(name, value, arg_list); } else { crm_err("No node attribute '%s' for '%s'", name, victim); } crm_free(name); name=NULL; crm_free(param); if(map[lpc] == 0) { break; } } else if(isspace(map[lpc])) { last = lpc; } } crm_free(name); } static char *make_args(const char *action, const char *victim, GHashTable *device_args, GHashTable *port_map) { char buffer[512]; char *arg_list = NULL; const char *value = NULL; CRM_CHECK(action != NULL, return NULL); if(device_args) { g_hash_table_foreach(device_args, append_arg, &arg_list); } buffer[511] = 0; snprintf(buffer, 511, "pcmk_%s_action", action); if(device_args) { value = g_hash_table_lookup(device_args, buffer); } if(value == NULL && device_args) { /* Legacy support for early 1.1 releases - Remove for 1.2 */ snprintf(buffer, 511, "pcmk_%s_cmd", action); value = g_hash_table_lookup(device_args, buffer); } if(value) { crm_info("Substituting action '%s' for requested operation '%s'", value, action); action = value; } append_const_arg(STONITH_ATTR_ACTION_OP, action, &arg_list); if(victim && device_args) { const char *alias = victim; const char *param = g_hash_table_lookup(device_args, STONITH_ATTR_HOSTARG); if(port_map && g_hash_table_lookup(port_map, victim)) { alias = g_hash_table_lookup(port_map, victim); } /* Always supply the node's name too: * https://fedorahosted.org/cluster/wiki/FenceAgentAPI */ append_const_arg("nodename", victim, &arg_list); /* Check if we need to supply the victim in any other form */ if(param == NULL) { const char *map = g_hash_table_lookup(device_args, STONITH_ATTR_ARGMAP); if(map == NULL) { param = "port"; value = g_hash_table_lookup(device_args, param); } else { /* Legacy handling */ append_host_specific_args(alias, map, device_args, &arg_list); value = map; /* Nothing more to do */ } } else if(safe_str_eq(param, "none")) { value = param; /* Nothing more to do */ } else { value = g_hash_table_lookup(device_args, param); } /* Don't overwrite explictly set values for $param */ if(value == NULL || safe_str_eq(value, "dynamic")) { crm_info("%s-ing node '%s' as '%s=%s'", action, victim, param, alias); append_const_arg(param, alias, &arg_list); } } crm_debug_3("Calculated: %s", arg_list); return arg_list; } /* Borrowed from libfence and extended */ int run_stonith_agent( const char *agent, const char *action, const char *victim, GHashTable *device_args, GHashTable *port_map, int *agent_result, char **output, async_command_t *track) { char *args = make_args(action, victim, device_args, port_map); - int pid, status, len, rc = -1; + int pid, status, len, rc = st_err_internal; int p_read_fd, p_write_fd; /* parent read/write file descriptors */ int c_read_fd, c_write_fd; /* child read/write file descriptors */ int fd1[2]; int fd2[2]; c_read_fd = c_write_fd = p_read_fd = p_write_fd = -1; if (args == NULL || agent == NULL) goto fail; len = strlen(args); if (pipe(fd1)) goto fail; p_read_fd = fd1[0]; c_write_fd = fd1[1]; if (pipe(fd2)) goto fail; c_read_fd = fd2[0]; p_write_fd = fd2[1]; crm_debug("forking"); pid = fork(); if (pid < 0) { - *agent_result = FE_AGENT_FORK; + rc = st_err_agent_fork; goto fail; } if (pid) { /* parent */ int ret; int total = 0; fcntl(p_read_fd, F_SETFL, fcntl(p_read_fd, F_GETFL, 0) | O_NONBLOCK); do { crm_debug("sending args"); ret = write(p_write_fd, args+total, len-total); if(ret > 0) { total += ret; } } while (errno == EINTR && total < len); if (total != len) { crm_perror(LOG_ERR, "Sent %d not %d bytes", total, len); if(ret >= 0) { - rc = st_err_generic; + rc = st_err_agent_args; } goto fail; } close(p_write_fd); if(track) { track->stdout = p_read_fd; NewTrackedProc(pid, 0, PT_LOGNORMAL, track, track->pt_ops); crm_trace("Op: %s on %s, timeout: %d", action, agent, track->timeout); if(track->timeout) { track->killseq[0].mstimeout = track->timeout; /* after timeout send TERM */ track->killseq[0].signalno = SIGTERM; track->killseq[1].mstimeout = 5000; /* after another 5s remove it */ track->killseq[1].signalno = SIGKILL; track->killseq[2].mstimeout = 5000; /* if it's still there after another 5s, complain */ track->killseq[2].signalno = 0; SetTrackedProcTimeouts(pid, track->killseq); } else { crm_err("No timeout set for stonith operation %s with device %s", action, agent); } close(c_write_fd); close(c_read_fd); crm_free(args); return pid; } else { waitpid(pid, &status, 0); if(output != NULL) { len = 0; do { char buf[500]; ret = read(p_read_fd, buf, 500); if(ret > 0) { buf[ret] = 0; crm_realloc(*output, len + ret + 1); sprintf((*output)+len, "%s", buf); crm_debug("%d: %s", ret, (*output)+len); len += ret; } } while (ret == 500 || (ret < 0 && errno == EINTR)); } - - *agent_result = FE_AGENT_ERROR; + + rc = st_err_agent; + *agent_result = st_err_agent; if (WIFEXITED(status)) { crm_debug("result = %d", WEXITSTATUS(status)); *agent_result = -WEXITSTATUS(status); rc = 0; } } } else { /* child */ close(1); if (dup(c_write_fd) < 0) goto fail; close(2); if (dup(c_write_fd) < 0) goto fail; close(0); if (dup(c_read_fd) < 0) goto fail; /* keep c_write_fd open so parent can report all errors. */ close(c_read_fd); close(p_read_fd); close(p_write_fd); execlp(agent, agent, NULL); exit(EXIT_FAILURE); } fail: crm_free(args); if(p_read_fd >= 0) { close(p_read_fd); } if(p_write_fd >= 0) { close(p_write_fd); } if(c_read_fd >= 0) { close(c_read_fd); } if(c_write_fd >= 0) { close(c_write_fd); } - + return rc; } static int stonith_api_device_list( stonith_t *stonith, int call_options, const char *namespace, stonith_key_value_t **devices, int timeout) { int count = 0; if ( devices == NULL ) { crm_err("Parameter error: stonith_api_device_list"); return -2; } /* Include Heartbeat agents */ if(namespace == NULL || safe_str_eq("heartbeat", namespace)) { char **entry = NULL; char **type_list = stonith_types(); for(entry = type_list; entry != NULL && *entry; ++entry) { crm_trace("Added: %s", *entry); *devices = stonith_key_value_add(*devices, NULL, *entry); count++; } if(type_list) { stonith_free_hostlist(type_list); } } /* Include Red Hat agents, basically: ls -1 @sbin_dir@/fence_* */ if(namespace == NULL || safe_str_eq("redhat", namespace)) { struct dirent **namelist; int file_num = scandir(RH_STONITH_DIR, &namelist, 0, alphasort); if (file_num > 0) { struct stat prop; char buffer[FILENAME_MAX+1]; while (file_num--) { if ('.' == namelist[file_num]->d_name[0]) { free(namelist[file_num]); continue; } else if(0 != strncmp(RH_STONITH_PREFIX, namelist[file_num]->d_name, strlen(RH_STONITH_PREFIX))) { free(namelist[file_num]); continue; } snprintf(buffer,FILENAME_MAX,"%s/%s", RH_STONITH_DIR, namelist[file_num]->d_name); if(stat(buffer, &prop) == 0 && S_ISREG(prop.st_mode)) { *devices = stonith_key_value_add(*devices, NULL, namelist[file_num]->d_name); count++; } free(namelist[file_num]); } free(namelist); } } return count; } static int stonith_api_device_metadata( stonith_t *stonith, int call_options, const char *agent, const char *namespace, char **output, int timeout) { int rc = 0; int bufferlen = 0; char *buffer = NULL; char *xml_meta_longdesc = NULL; char *xml_meta_shortdesc = NULL; char *meta_param = NULL; char *meta_longdesc = NULL; char *meta_shortdesc = NULL; const char *provider = get_stonith_provider(agent, namespace); Stonith *stonith_obj = NULL; static const char *no_parameter_info = ""; crm_info("looking up %s/%s metadata", agent, provider); /* By having this in a library, we can access it from stonith_admin * when neither lrmd or stonith-ng are running * Important for the crm shell's validations... */ if(safe_str_eq(provider, "redhat")) { int exec_rc = run_stonith_agent( agent, "metadata", NULL, NULL, NULL, &rc, &buffer, NULL); if(exec_rc < 0 || rc != 0 || buffer == NULL) { /* failed */ crm_debug("Query failed: %d %d: %s", exec_rc, rc, crm_str(buffer)); /* provide a fake metadata entry */ meta_longdesc = crm_strdup(no_parameter_info); meta_shortdesc = crm_strdup(no_parameter_info); meta_param = crm_strdup( " \n" " \n" " \n" " \n" " Fencing action (null, off, on, [reboot], status, hostlist, devstatus)\n" " \n" " "); goto build; } } else { stonith_obj = stonith_new(agent); meta_longdesc = crm_strdup(stonith_get_info(stonith_obj, ST_DEVICEDESCR)); if (meta_longdesc == NULL) { crm_warn("no long description in %s's metadata.", agent); meta_longdesc = crm_strdup(no_parameter_info); } meta_shortdesc = crm_strdup(stonith_get_info(stonith_obj, ST_DEVICEID)); if (meta_shortdesc == NULL) { crm_warn("no short description in %s's metadata.", agent); meta_shortdesc = crm_strdup(no_parameter_info); } meta_param = crm_strdup(stonith_get_info(stonith_obj, ST_CONF_XML)); if (meta_param == NULL) { crm_warn("no list of parameters in %s's metadata.", agent); meta_param = crm_strdup(no_parameter_info); } build: xml_meta_longdesc = (char *)xmlEncodeEntitiesReentrant(NULL, (const unsigned char *)meta_longdesc); xml_meta_shortdesc = (char *)xmlEncodeEntitiesReentrant(NULL, (const unsigned char *)meta_shortdesc); bufferlen = strlen(META_TEMPLATE) + strlen(agent) + strlen(xml_meta_longdesc) + strlen(xml_meta_shortdesc) + strlen(meta_param) + 1; crm_malloc0(buffer, bufferlen); snprintf(buffer, bufferlen-1, META_TEMPLATE, agent, xml_meta_longdesc, xml_meta_shortdesc, meta_param); xmlFree(xml_meta_longdesc); xmlFree(xml_meta_shortdesc); if(stonith_obj) { stonith_delete(stonith_obj); } crm_free(meta_shortdesc); crm_free(meta_longdesc); crm_free(meta_param); } if(output) { *output = buffer; } else { crm_free(buffer); } return rc; } static int stonith_api_confirm( stonith_t *stonith, int call_options, const char *target) { int rc = 0; xmlNode *data = NULL; data = create_xml_node(NULL, __FUNCTION__); crm_xml_add(data, F_STONITH_TARGET, target); rc = stonith_send_command(stonith, STONITH_OP_FENCE, data, NULL, call_options, 0); free_xml(data); return rc; } static int stonith_api_query( stonith_t *stonith, int call_options, const char *target, stonith_key_value_t **devices, int timeout) { int rc = 0, lpc = 0, max = 0; xmlNode *data = NULL; xmlNode *output = NULL; xmlXPathObjectPtr xpathObj = NULL; CRM_CHECK(devices != NULL, return st_err_missing); data = create_xml_node(NULL, F_STONITH_DEVICE); crm_xml_add(data, "origin", __FUNCTION__); crm_xml_add(data, F_STONITH_TARGET, target); rc = stonith_send_command(stonith, STONITH_OP_QUERY, data, &output, call_options, timeout); if(rc < 0) { return rc; } xpathObj = xpath_search(output, "//@agent"); if(xpathObj) { max = xpathObj->nodesetval->nodeNr; for(lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpathObj, lpc); CRM_CHECK(match != NULL, continue); crm_info("%s[%d] = %s", "//@agent", lpc, xmlGetNodePath(match)); *devices = stonith_key_value_add(*devices, NULL, crm_element_value(match, XML_ATTR_ID)); } } free_xml(output); free_xml(data); return max; } static int stonith_api_call( stonith_t *stonith, int call_options, const char *id, const char *action, const char *victim, int timeout) { int rc = 0; xmlNode *data = NULL; data = create_xml_node(NULL, F_STONITH_DEVICE); crm_xml_add(data, "origin", __FUNCTION__); crm_xml_add(data, F_STONITH_DEVICE, id); crm_xml_add(data, F_STONITH_ACTION, action); crm_xml_add(data, F_STONITH_TARGET, victim); rc = stonith_send_command(stonith, STONITH_OP_EXEC, data, NULL, call_options, timeout); free_xml(data); return rc; } static int stonith_api_fence( stonith_t *stonith, int call_options, const char *node, const char *action, int timeout) { int rc = 0; xmlNode *data = NULL; data = create_xml_node(NULL, __FUNCTION__); crm_xml_add(data, F_STONITH_TARGET, node); crm_xml_add(data, F_STONITH_ACTION, action); crm_xml_add_int(data, F_STONITH_TIMEOUT, timeout); rc = stonith_send_command(stonith, STONITH_OP_FENCE, data, NULL, call_options, timeout); free_xml(data); return rc; } static int stonith_api_history( stonith_t *stonith, int call_options, const char *node, stonith_history_t **history, int timeout) { int rc = 0; xmlNode *data = NULL; xmlNode *output = NULL; stonith_history_t *last = NULL; *history = NULL; if(node) { data = create_xml_node(NULL, __FUNCTION__); crm_xml_add(data, F_STONITH_TARGET, node); } rc = stonith_send_command(stonith, STONITH_OP_FENCE_HISTORY, data, &output, call_options | st_opt_sync_call, timeout); free_xml(data); if (rc == 0) { xmlNode *op = NULL; xmlNode *reply = get_xpath_object("//"F_STONITH_HISTORY_LIST, output, LOG_ERR); for(op = __xml_first_child(reply); op != NULL; op = __xml_next(op)) { stonith_history_t *kvp; crm_malloc0(kvp, sizeof(stonith_history_t)); kvp->target = crm_element_value_copy(op, F_STONITH_TARGET); kvp->action = crm_element_value_copy(op, F_STONITH_ACTION); kvp->origin = crm_element_value_copy(op, F_STONITH_ORIGIN); kvp->delegate = crm_element_value_copy(op, F_STONITH_DELEGATE); crm_element_value_int(op, F_STONITH_DATE, &kvp->completed); crm_element_value_int(op, F_STONITH_STATE, &kvp->state); if (last) { last->next = kvp; } else { *history = kvp; } last = kvp; } } return rc; } const char * stonith_error2string(enum stonith_errors return_code) { const char *error_msg = NULL; switch(return_code) { case stonith_ok: error_msg = "OK"; break; case st_err_not_supported: error_msg = "Not supported"; break; case st_err_authentication: error_msg = "Not authenticated"; break; case st_err_generic: error_msg = "Generic error"; break; case st_err_internal: error_msg = "Internal error"; break; case st_err_unknown_device: error_msg = "Unknown device"; break; case st_err_unknown_operation: error_msg = "Unknown operation"; break; case st_err_unknown_port: error_msg = "Unknown victim"; break; case st_err_none_available: error_msg = "No available fencing devices"; break; case st_err_connection: error_msg = "Not connected"; break; case st_err_missing: error_msg = "Missing input"; break; case st_err_exists: error_msg = "Device exists"; break; case st_err_timeout: error_msg = "Operation timed out"; break; case st_err_signal: error_msg = "Killed by signal"; break; case st_err_ipc: error_msg = "IPC connection failed"; break; case st_err_peer: error_msg = "Error from peer"; break; } if(error_msg == NULL) { crm_err("Unknown Stonith error code: %d", return_code); error_msg = ""; } return error_msg; } gboolean is_redhat_agent(const char *agent) { int rc = 0; struct stat prop; char buffer[FILENAME_MAX+1]; snprintf(buffer,FILENAME_MAX,"%s/%s", RH_STONITH_DIR, agent); rc = stat(buffer, &prop); if (rc >= 0 && S_ISREG(prop.st_mode)) { return TRUE; } return FALSE; } const char *get_stonith_provider(const char *agent, const char *provider) { /* This function sucks */ if(is_redhat_agent(agent)) { return "redhat"; } else { Stonith *stonith_obj = stonith_new(agent); if(stonith_obj) { stonith_delete(stonith_obj); return "heartbeat"; } } crm_err("No such device: %s", agent); return NULL; } static gint stonithlib_GCompareFunc(gconstpointer a, gconstpointer b) { int rc = 0; const stonith_notify_client_t *a_client = a; const stonith_notify_client_t *b_client = b; CRM_CHECK(a_client->event != NULL && b_client->event != NULL, return 0); rc = strcmp(a_client->event, b_client->event); if(rc == 0) { if(a_client->notify == NULL || b_client->notify == NULL) { return 0; } else if(a_client->notify == b_client->notify) { return 0; } else if(((long)a_client->notify) < ((long)b_client->notify)) { crm_err("callbacks for %s are not equal: %p vs. %p", a_client->event, a_client->notify, b_client->notify); return -1; } crm_err("callbacks for %s are not equal: %p vs. %p", a_client->event, a_client->notify, b_client->notify); return 1; } return rc; } static int get_stonith_token(IPC_Channel *ch, char **token) { int rc = stonith_ok; xmlNode *reg_msg = NULL; const char *msg_type = NULL; const char *tmp_ticket = NULL; CRM_CHECK(ch != NULL, return st_err_missing); CRM_CHECK(token != NULL, return st_err_missing); crm_debug_4("Waiting for msg on command channel"); reg_msg = xmlfromIPC(ch, MAX_IPC_DELAY); if(ch->ops->get_chan_status(ch) != IPC_CONNECT) { crm_err("No reply message - disconnected"); free_xml(reg_msg); return st_err_connection; } else if(reg_msg == NULL) { crm_err("No reply message - empty"); return st_err_ipc; } msg_type = crm_element_value(reg_msg, F_STONITH_OPERATION); tmp_ticket = crm_element_value(reg_msg, F_STONITH_CLIENTID); if(safe_str_neq(msg_type, CRM_OP_REGISTER) ) { crm_err("Invalid registration message: %s", msg_type); rc = st_err_internal; } else if(tmp_ticket == NULL) { crm_err("No registration token provided"); crm_log_xml_warn(reg_msg, "Bad reply"); rc = st_err_internal; } else { crm_debug("Obtained registration token: %s", tmp_ticket); *token = crm_strdup(tmp_ticket); } free_xml(reg_msg); return rc; } xmlNode *stonith_create_op( int call_id, const char *token, const char *op, xmlNode *data, int call_options) { int rc = HA_OK; xmlNode *op_msg = create_xml_node(NULL, "stonith_command"); CRM_CHECK(op_msg != NULL, return NULL); CRM_CHECK(token != NULL, return NULL); crm_xml_add(op_msg, F_XML_TAGNAME, "stonith_command"); crm_xml_add(op_msg, F_TYPE, T_STONITH_NG); crm_xml_add(op_msg, F_STONITH_CALLBACK_TOKEN, token); crm_xml_add(op_msg, F_STONITH_OPERATION, op); crm_xml_add_int(op_msg, F_STONITH_CALLID, call_id); crm_debug_4("Sending call options: %.8lx, %d", (long)call_options, call_options); crm_xml_add_int(op_msg, F_STONITH_CALLOPTS, call_options); if(data != NULL) { add_message_xml(op_msg, F_STONITH_CALLDATA, data); } if (rc != HA_OK) { crm_err("Failed to create STONITH operation message"); crm_log_xml(LOG_ERR, "BadOp", op_msg); free_xml(op_msg); return NULL; } return op_msg; } static void stonith_destroy_op_callback(gpointer data) { stonith_callback_client_t *blob = data; if(blob->timer && blob->timer->ref > 0) { g_source_remove(blob->timer->ref); } crm_free(blob->timer); crm_free(blob); } static int stonith_api_signoff(stonith_t* stonith) { stonith_private_t *native = stonith->private; crm_debug("Signing out of the STONITH Service"); /* close channels */ if (native->command_channel != NULL) { native->command_channel->ops->destroy( native->command_channel); native->command_channel = NULL; } if (native->callback_source != NULL) { G_main_del_IPC_Channel(native->callback_source); native->callback_source = NULL; } if (native->callback_channel != NULL) { #ifdef BUG native->callback_channel->ops->destroy( native->callback_channel); #endif native->callback_channel = NULL; } stonith->state = stonith_disconnected; return stonith_ok; } static int stonith_api_signon( stonith_t* stonith, const char *name, int *stonith_fd) { int rc = stonith_ok; xmlNode *hello = NULL; char *uuid_ticket = NULL; stonith_private_t *native = stonith->private; crm_debug_4("Connecting command channel"); stonith->state = stonith_connected_command; native->command_channel = init_client_ipc_comms_nodispatch(stonith_channel); if(native->command_channel == NULL) { crm_debug("Connection to command channel failed"); rc = st_err_connection; } else if(native->command_channel->ch_status != IPC_CONNECT) { crm_err("Connection may have succeeded," " but authentication to command channel failed"); rc = st_err_authentication; } if(rc == stonith_ok) { rc = get_stonith_token(native->command_channel, &uuid_ticket); if(rc == stonith_ok) { native->token = uuid_ticket; uuid_ticket = NULL; } else { stonith->state = stonith_disconnected; native->command_channel->ops->disconnect(native->command_channel); return rc; } } native->callback_channel = init_client_ipc_comms_nodispatch( stonith_channel_callback); if(native->callback_channel == NULL) { crm_debug("Connection to callback channel failed"); rc = st_err_connection; } else if(native->callback_channel->ch_status != IPC_CONNECT) { crm_err("Connection may have succeeded," " but authentication to command channel failed"); rc = st_err_authentication; } if(rc == stonith_ok) { native->callback_channel->send_queue->max_qlen = 500; rc = get_stonith_token(native->callback_channel, &uuid_ticket); if(rc == stonith_ok) { crm_free(native->token); native->token = uuid_ticket; } } if(rc == stonith_ok) { CRM_CHECK(native->token != NULL, ;); hello = stonith_create_op(0, native->token, CRM_OP_REGISTER, NULL, 0); crm_xml_add(hello, F_STONITH_CLIENTNAME, name); if(send_ipc_message(native->command_channel, hello) == FALSE) { rc = st_err_internal; } free_xml(hello); } if(rc == stonith_ok) { if(stonith_fd != NULL) { *stonith_fd = native->callback_channel->ops->get_recv_select_fd( native->callback_channel); } else { /* do mainloop */ crm_debug_4("Connecting callback channel"); native->callback_source = G_main_add_IPC_Channel( G_PRIORITY_HIGH, native->callback_channel, FALSE, stonith_dispatch_internal, stonith, default_ipc_connection_destroy); if(native->callback_source == NULL) { crm_err("Callback source not recorded"); rc = st_err_connection; } else { set_IPC_Channel_dnotify( native->callback_source, stonith_connection_destroy); } } } if(rc == stonith_ok) { #if HAVE_MSGFROMIPC_TIMEOUT stonith->call_timeout = MAX_IPC_DELAY; #endif crm_debug("Connection to STONITH successful"); return stonith_ok; } crm_debug("Connection to STONITH failed: %s", stonith_error2string(rc)); stonith->cmds->disconnect(stonith); return rc; } static int stonith_set_notification(stonith_t* stonith, const char *callback, int enabled) { xmlNode *notify_msg = create_xml_node(NULL, __FUNCTION__); stonith_private_t *native = stonith->private; if(stonith->state != stonith_disconnected) { crm_xml_add(notify_msg, F_STONITH_OPERATION, T_STONITH_NOTIFY); if(enabled) { crm_xml_add(notify_msg, F_STONITH_NOTIFY_ACTIVATE, callback); } else { crm_xml_add(notify_msg, F_STONITH_NOTIFY_DEACTIVATE, callback); } send_ipc_message(native->callback_channel, notify_msg); } free_xml(notify_msg); return stonith_ok; } static int stonith_api_add_notification( stonith_t *stonith, const char *event, void (*callback)(stonith_t *stonith, const char *event, xmlNode *msg)) { GList *list_item = NULL; stonith_notify_client_t *new_client = NULL; stonith_private_t *private = NULL; private = stonith->private; crm_debug_2("Adding callback for %s events (%d)", event, g_list_length(private->notify_list)); crm_malloc0(new_client, sizeof(stonith_notify_client_t)); new_client->event = event; new_client->notify = callback; list_item = g_list_find_custom( private->notify_list, new_client, stonithlib_GCompareFunc); if(list_item != NULL) { crm_warn("Callback already present"); crm_free(new_client); return st_err_exists; } else { private->notify_list = g_list_append(private->notify_list, new_client); stonith_set_notification(stonith, event, 1); crm_debug_3("Callback added (%d)", g_list_length(private->notify_list)); } return stonith_ok; } static int stonith_api_del_notification(stonith_t *stonith, const char *event) { GList *list_item = NULL; stonith_notify_client_t *new_client = NULL; stonith_private_t *private = NULL; crm_debug("Removing callback for %s events", event); private = stonith->private; crm_malloc0(new_client, sizeof(stonith_notify_client_t)); new_client->event = event; new_client->notify = NULL; list_item = g_list_find_custom( private->notify_list, new_client, stonithlib_GCompareFunc); stonith_set_notification(stonith, event, 0); if(list_item != NULL) { stonith_notify_client_t *list_client = list_item->data; private->notify_list = g_list_remove(private->notify_list, list_client); crm_free(list_client); crm_debug_3("Removed callback"); } else { crm_debug_3("Callback not present"); } crm_free(new_client); return stonith_ok; } static gboolean stonith_async_timeout_handler(gpointer data) { struct timer_rec_s *timer = data; crm_debug("Async call %d timed out after %dms", timer->call_id, timer->timeout); stonith_perform_callback(timer->stonith, NULL, timer->call_id, st_err_timeout); /* Always return TRUE, never remove the handler * We do that in stonith_del_callback() */ return TRUE; } static int stonith_api_add_callback( stonith_t *stonith, int call_id, int timeout, bool only_success, void *user_data, const char *callback_name, void (*callback)( stonith_t *st, const xmlNode *msg, int call, int rc, xmlNode *output, void *userdata)) { stonith_callback_client_t *blob = NULL; stonith_private_t *private = NULL; CRM_CHECK(stonith != NULL, return st_err_missing); CRM_CHECK(stonith->private != NULL, return st_err_missing); private = stonith->private; if(call_id == 0) { private->op_callback = callback; } else if(call_id < 0) { if(only_success == FALSE) { callback(stonith, NULL, call_id, call_id, NULL, user_data); } else { crm_warn("STONITH call failed: %s", stonith_error2string(call_id)); } return FALSE; } crm_malloc0(blob, sizeof(stonith_callback_client_t)); blob->id = callback_name; blob->only_success = only_success; blob->user_data = user_data; blob->callback = callback; if(timeout > 0) { struct timer_rec_s *async_timer = NULL; crm_malloc0(async_timer, sizeof(struct timer_rec_s)); blob->timer = async_timer; async_timer->stonith = stonith; async_timer->call_id = call_id; async_timer->timeout = timeout*1100; async_timer->ref = g_timeout_add( async_timer->timeout, stonith_async_timeout_handler, async_timer); } g_hash_table_insert(private->stonith_op_callback_table, GINT_TO_POINTER(call_id), blob); return TRUE; } static int stonith_api_del_callback(stonith_t *stonith, int call_id, bool all_callbacks) { stonith_private_t *private = stonith->private; if(all_callbacks) { private->op_callback = NULL; g_hash_table_destroy(private->stonith_op_callback_table); private->stonith_op_callback_table = g_hash_table_new_full( g_direct_hash, g_direct_equal, NULL, stonith_destroy_op_callback); } else if(call_id == 0) { private->op_callback = NULL; } else { g_hash_table_remove(private->stonith_op_callback_table, GINT_TO_POINTER(call_id)); } return stonith_ok; } static void stonith_dump_pending_op( gpointer key, gpointer value, gpointer user_data) { int call = GPOINTER_TO_INT(key); stonith_callback_client_t *blob = value; crm_debug("Call %d (%s): pending", call, crm_str(blob->id)); } void stonith_dump_pending_callbacks(stonith_t *stonith) { stonith_private_t *private = stonith->private; if(private->stonith_op_callback_table == NULL) { return; } return g_hash_table_foreach( private->stonith_op_callback_table, stonith_dump_pending_op, NULL); } void stonith_perform_callback(stonith_t *stonith, xmlNode *msg, int call_id, int rc) { xmlNode *output = NULL; stonith_private_t *private = NULL; stonith_callback_client_t *blob = NULL; stonith_callback_client_t local_blob; CRM_CHECK(stonith != NULL, return); CRM_CHECK(stonith->private != NULL, return); private = stonith->private; local_blob.id = NULL; local_blob.callback = NULL; local_blob.user_data = NULL; local_blob.only_success = FALSE; if(msg != NULL) { crm_element_value_int(msg, F_STONITH_RC, &rc); crm_element_value_int(msg, F_STONITH_CALLID, &call_id); output = get_message_xml(msg, F_STONITH_CALLDATA); } CRM_CHECK(call_id > 0, crm_warn("Strange or missing call-id")); blob = g_hash_table_lookup( private->stonith_op_callback_table, GINT_TO_POINTER(call_id)); if(blob != NULL) { local_blob = *blob; blob = NULL; stonith_api_del_callback(stonith, call_id, FALSE); } else { crm_debug_2("No callback found for call %d", call_id); local_blob.callback = NULL; } if(stonith == NULL) { crm_debug("No stonith object supplied"); } if(local_blob.callback != NULL && (rc == stonith_ok || local_blob.only_success == FALSE)) { crm_debug_2("Invoking callback %s for call %d", crm_str(local_blob.id), call_id); local_blob.callback(stonith, msg, call_id, rc, output, local_blob.user_data); } else if(private->op_callback == NULL && rc != stonith_ok) { crm_warn("STONITH command failed: %s", stonith_error2string(rc)); crm_log_xml(LOG_DEBUG, "Failed STONITH Update", msg); } if(private->op_callback != NULL) { crm_debug_2("Invoking global callback for call %d", call_id); private->op_callback(stonith, msg, call_id, rc, output, NULL); } crm_debug_4("OP callback activated."); } static void stonith_send_notification(gpointer data, gpointer user_data) { struct notify_blob_s *blob = user_data; stonith_notify_client_t *entry = data; const char *event = NULL; if(blob->xml == NULL) { crm_warn("Skipping callback - NULL message"); return; } event = crm_element_value(blob->xml, F_SUBTYPE); if(entry == NULL) { crm_warn("Skipping callback - NULL callback client"); return; } else if(entry->notify == NULL) { crm_warn("Skipping callback - NULL callback"); return; } else if(safe_str_neq(entry->event, event)) { crm_debug_4("Skipping callback - event mismatch %p/%s vs. %s", entry, entry->event, event); return; } crm_debug_4("Invoking callback for %p/%s event...", entry, event); entry->notify(blob->stonith, event, blob->xml); crm_debug_4("Callback invoked..."); } int stonith_send_command( stonith_t *stonith, const char *op, xmlNode *data, xmlNode **output_data, int call_options, int timeout) { int rc = HA_OK; xmlNode *op_msg = NULL; xmlNode *op_reply = NULL; stonith_private_t *native = stonith->private; if(stonith->state == stonith_disconnected) { return st_err_connection; } if(output_data != NULL) { *output_data = NULL; } if(op == NULL) { crm_err("No operation specified"); return st_err_missing; } stonith->call_id++; /* prevent call_id from being negative (or zero) and conflicting * with the stonith_errors enum * use 2 because we use it as (stonith->call_id - 1) below */ if(stonith->call_id < 1) { stonith->call_id = 1; } CRM_CHECK(native->token != NULL, ;); op_msg = stonith_create_op(stonith->call_id, native->token, op, data, call_options); if(op_msg == NULL) { return st_err_missing; } crm_xml_add_int(op_msg, F_STONITH_TIMEOUT, timeout); crm_debug_3("Sending %s message to STONITH service, Timeout: %d", op, timeout); if(send_ipc_message(native->command_channel, op_msg) == FALSE) { crm_err("Sending message to STONITH service FAILED"); free_xml(op_msg); return st_err_ipc; } else { crm_debug_3("Message sent"); } free_xml(op_msg); if((call_options & st_opt_discard_reply)) { crm_debug_3("Discarding reply"); return stonith_ok; } else if(!(call_options & st_opt_sync_call)) { crm_debug_3("Async call, returning"); CRM_CHECK(stonith->call_id != 0, return st_err_ipc); return stonith->call_id; } rc = IPC_OK; crm_debug_3("Waiting for a syncronous reply"); rc = stonith_ok; while(IPC_ISRCONN(native->command_channel)) { int reply_id = -1; int msg_id = stonith->call_id; op_reply = xmlfromIPC(native->command_channel, timeout); if(op_reply == NULL) { rc = st_err_peer; break; } crm_element_value_int(op_reply, F_STONITH_CALLID, &reply_id); if(reply_id <= 0) { rc = st_err_peer; break; } else if(reply_id == msg_id) { crm_debug_3("Syncronous reply received"); crm_log_xml(LOG_MSG, "Reply", op_reply); if(crm_element_value_int(op_reply, F_STONITH_RC, &rc) != 0) { rc = st_err_peer; } if(output_data != NULL && is_not_set(call_options, st_opt_discard_reply)) { *output_data = op_reply; op_reply = NULL; } goto done; } else if(reply_id < msg_id) { crm_debug("Recieved old reply: %d (wanted %d)", reply_id, msg_id); crm_log_xml(LOG_MSG, "Old reply", op_reply); } else if((reply_id - 10000) > msg_id) { /* wrap-around case */ crm_debug("Recieved old reply: %d (wanted %d)", reply_id, msg_id); crm_log_xml(LOG_MSG, "Old reply", op_reply); } else { crm_err("Received a __future__ reply:" " %d (wanted %d)", reply_id, msg_id); } free_xml(op_reply); op_reply = NULL; } if(op_reply == NULL && stonith->state == stonith_disconnected) { rc = st_err_connection; } else if(rc == stonith_ok && op_reply == NULL) { rc = st_err_peer; } done: if(IPC_ISRCONN(native->command_channel) == FALSE) { crm_err("STONITH disconnected: %d", native->command_channel->ch_status); stonith->state = stonith_disconnected; } free_xml(op_reply); return rc; } static gboolean stonith_msgready(stonith_t* stonith) { stonith_private_t *private = NULL; if (stonith == NULL) { crm_err("No STONITH!"); return FALSE; } private = stonith->private; if(private->command_channel != NULL) { /* drain the channel */ IPC_Channel *cmd_ch = private->command_channel; xmlNode *cmd_msg = NULL; while(cmd_ch->ch_status != IPC_DISCONNECT && cmd_ch->ops->is_message_pending(cmd_ch)) { /* this will happen when the STONITH exited from beneath us */ cmd_msg = xmlfromIPC(cmd_ch, MAX_IPC_DELAY); free_xml(cmd_msg); } } else { crm_err("No command channel"); } if(private->callback_channel == NULL) { crm_err("No callback channel"); return FALSE; } else if(private->callback_channel->ch_status == IPC_DISCONNECT) { crm_info("Lost connection to the STONITH service [%d].", private->callback_channel->farside_pid); return FALSE; } else if(private->callback_channel->ops->is_message_pending( private->callback_channel)) { crm_debug_4("Message pending on command channel [%d]", private->callback_channel->farside_pid); return TRUE; } crm_debug_3("No message pending"); return FALSE; } static int stonith_rcvmsg(stonith_t* stonith) { const char *type = NULL; stonith_private_t *private = NULL; struct notify_blob_s blob; if (stonith == NULL) { crm_err("No STONITH!"); return FALSE; } blob.stonith = stonith; private = stonith->private; /* if it is not blocking mode and no message in the channel, return */ if (stonith_msgready(stonith) == FALSE) { crm_debug_3("No message ready and non-blocking..."); return 0; } /* IPC_INTR is not a factor here */ blob.xml = xmlfromIPC(private->callback_channel, MAX_IPC_DELAY); if (blob.xml == NULL) { crm_warn("Received a NULL msg from STONITH service."); return 0; } /* do callbacks */ type = crm_element_value(blob.xml, F_TYPE); crm_debug_4("Activating %s callbacks...", type); if(safe_str_eq(type, T_STONITH_NG)) { stonith_perform_callback(stonith, blob.xml, 0, 0); } else if(safe_str_eq(type, T_STONITH_NOTIFY)) { g_list_foreach(private->notify_list, stonith_send_notification, &blob); } else { crm_err("Unknown message type: %s", type); crm_log_xml_warn(blob.xml, "BadReply"); } free_xml(blob.xml); return 1; } bool stonith_dispatch(stonith_t *st) { stonith_private_t *private = NULL; bool stay_connected = TRUE; CRM_CHECK(st != NULL, return FALSE); private = st->private; stay_connected = stonith_dispatch_internal(private->callback_channel, (gpointer *)st); return stay_connected; } gboolean stonith_dispatch_internal(IPC_Channel *channel, gpointer user_data) { stonith_t *stonith = user_data; stonith_private_t *private = NULL; gboolean stay_connected = TRUE; CRM_CHECK(stonith != NULL, return FALSE); private = stonith->private; CRM_CHECK(private->callback_channel == channel, return FALSE); while(stonith_msgready(stonith)) { /* invoke the callbacks but dont block */ int rc = stonith_rcvmsg(stonith); if( rc < 0) { crm_err("Message acquisition failed: %d", rc); break; } else if(rc == 0) { break; } } if(private->callback_channel && private->callback_channel->ch_status != IPC_CONNECT) { crm_crit("Lost connection to the STONITH service [%d/callback].", channel->farside_pid); private->callback_source = NULL; stay_connected = FALSE; } if(private->command_channel && private->command_channel->ch_status != IPC_CONNECT) { crm_crit("Lost connection to the STONITH service [%d/command].", channel->farside_pid); private->callback_source = NULL; stay_connected = FALSE; } return stay_connected; } static int stonith_api_free (stonith_t* stonith) { int rc = stonith_ok; if(stonith->state != stonith_disconnected) { rc = stonith->cmds->disconnect(stonith); } if(stonith->state == stonith_disconnected) { stonith_private_t *private = stonith->private; g_hash_table_destroy(private->stonith_op_callback_table); crm_free(private->token); crm_free(stonith->private); crm_free(stonith->cmds); crm_free(stonith); } return rc; } void stonith_api_delete(stonith_t *stonith) { stonith_private_t* private = stonith->private; GList *list = private->notify_list; while(list != NULL) { stonith_notify_client_t *client = g_list_nth_data(list, 0); list = g_list_remove(list, client); crm_free(client); } stonith->cmds->free(stonith); stonith = NULL; } stonith_t *stonith_api_new(void) { stonith_t* new_stonith = NULL; stonith_private_t* private = NULL; crm_malloc0(new_stonith, sizeof(stonith_t)); crm_malloc0(private, sizeof(stonith_private_t)); new_stonith->private = private; private->stonith_op_callback_table = g_hash_table_new_full( g_direct_hash, g_direct_equal, NULL, stonith_destroy_op_callback); private->notify_list = NULL; new_stonith->call_id = 1; new_stonith->state = stonith_disconnected; crm_malloc0(new_stonith->cmds, sizeof(stonith_api_operations_t)); new_stonith->cmds->free = stonith_api_free; new_stonith->cmds->connect = stonith_api_signon; new_stonith->cmds->disconnect = stonith_api_signoff; new_stonith->cmds->call = stonith_api_call; new_stonith->cmds->fence = stonith_api_fence; new_stonith->cmds->confirm = stonith_api_confirm; new_stonith->cmds->history = stonith_api_history; new_stonith->cmds->list = stonith_api_device_list; new_stonith->cmds->metadata = stonith_api_device_metadata; new_stonith->cmds->query = stonith_api_query; new_stonith->cmds->remove_device = stonith_api_remove_device; new_stonith->cmds->register_device = stonith_api_register_device; new_stonith->cmds->remove_callback = stonith_api_del_callback; new_stonith->cmds->register_callback = stonith_api_add_callback; new_stonith->cmds->remove_notification = stonith_api_del_notification; new_stonith->cmds->register_notification = stonith_api_add_notification; return new_stonith; } stonith_key_value_t *stonith_key_value_add(stonith_key_value_t *kvp, const char *key, const char *value) { stonith_key_value_t *p; crm_malloc(p, sizeof(stonith_key_value_t)); p->next = kvp; p->key = crm_strdup(key); p->value = crm_strdup(value); return( p ); } void stonith_key_value_freeall(stonith_key_value_t *kvp, int keys, int values) { stonith_key_value_t *p; while(kvp) { p = kvp->next; if(keys) { free(kvp->key); } if(values) { free(kvp->value); } free(kvp); kvp = p; } } diff --git a/pacemaker.spec.in b/pacemaker.spec.in index d64d7e3ece..58db7a3a73 100644 --- a/pacemaker.spec.in +++ b/pacemaker.spec.in @@ -1,703 +1,702 @@ %global gname haclient %global uname hacluster %global pcmk_docdir %{_docdir}/%{name} %global specversion 0 %global upstream_version tip %global upstream_prefix pacemaker # Compatibility macros for distros (fedora) that don't provide Python macros by default # Do this instead of trying to conditionally include {_rpmconfigdir}/macros.python -%{!?py_ver: %{expand: %%global py_ver %%(echo `python -c "import sys; print sys.version[:3]"`)}} -%{!?py_prefix: %{expand: %%global py_prefix %%(echo `python -c "import sys; print sys.prefix"`)}} -%{!?py_libdir: %{expand: %%global py_libdir %%{expand:%%%%{py_prefix}/%%%%{_lib}/python%%%%{py_ver}}}} -%{!?py_sitedir: %{expand: %%global py_sitedir %%{expand:%%%%{py_libdir}/site-packages}}} +%{!?py_ver: %{expand: %%global py_ver %%(echo `python -c "import sys; print sys.version[:3]"`)}} +%{!?py_prefix: %{expand: %%global py_prefix %%(echo `python -c "import sys; print sys.prefix"`)}} +%{!?py_libdir: %{expand: %%global py_libdir %%{expand:%%%%{py_prefix}/%%%%{_lib}/python%%%%{py_ver}}}} +%{!?py_sitedir: %{expand: %%global py_sitedir %%{expand:%%%%{py_libdir}/site-packages}}} # Compatibility macro wrappers for legacy RPM versions that do not # support conditional builds -%{!?bcond_without: %{expand: %%global bcond_without() %%{expand:%%%%{!?_without_%%{1}:%%%%global with_%%{1} 1}}}} -%{!?bcond_with: %{expand: %%global bcond_with() %%{expand:%%%%{?_with_%%{1}:%%%%global with_%%{1} 1}}}} -%{!?with: %{expand: %%global with() %%{expand:%%%%{?with_%%{1}:1}%%%%{!?with_%%{1}:0}}}} -%{!?without: %{expand: %%global without() %%{expand:%%%%{?with_%%{1}:0}%%%%{!?with_%%{1}:1}}}} +%{!?bcond_without: %{expand: %%global bcond_without() %%{expand:%%%%{!?_without_%%{1}:%%%%global with_%%{1} 1}}}} +%{!?bcond_with: %{expand: %%global bcond_with() %%{expand:%%%%{?_with_%%{1}:%%%%global with_%%{1} 1}}}} +%{!?with: %{expand: %%global with() %%{expand:%%%%{?with_%%{1}:1}%%%%{!?with_%%{1}:0}}}} +%{!?without: %{expand: %%global without() %%{expand:%%%%{?with_%%{1}:0}%%%%{!?with_%%{1}:1}}}} # Conditionals # Invoke "rpmbuild --without " or "rpmbuild --with " # to disable or enable specific features # Supported cluster stacks, must support at least one %bcond_with cman %bcond_without doc %bcond_without ais %bcond_without heartbeat # ESMTP is not available in RHEL, only in EPEL. Allow people to build # the RPM without ESMTP in case they choose not to use EPEL packages %bcond_without esmtp %bcond_without snmp # Build with/without support for profiling tools %bcond_with profiling %bcond_with gcov # Support additional trace logging %bcond_without tracedata # We generate some docs using Publican, but its not available everywhere %bcond_without publican # Use a different versioning scheme %bcond_with pre_release %if %{with profiling} # This disables -debuginfo package creation and also the stripping binaries/libraries # Useful if you want sane profiling data %global debug_package %{nil} %endif %if %{with pre_release} %global pcmk_release 0.%{specversion}.%{upstream_version}.hg %else %global pcmk_release %{specversion} %endif Name: pacemaker Summary: Scalable High-Availability cluster resource manager Version: 1.1.6 Release: %{pcmk_release}%{?dist} License: GPLv2+ and LGPLv2+ Url: http://www.clusterlabs.org Group: System Environment/Daemons Source0: pacemaker-%{upstream_version}.tar.bz2 BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) AutoReqProv: on Requires(pre): cluster-glue Requires: resource-agents Requires: python >= 2.4 -Conflicts: heartbeat < 2.99 +Conflicts: heartbeat < 2.99 %if 0%{?fedora} || 0%{?centos} > 4 || 0%{?rhel} > 4 -Requires: perl(:MODULE_COMPAT_%(eval "`%{__perl} -V:version`"; echo $version)) -BuildRequires: help2man libtool-ltdl-devel +Requires: perl(:MODULE_COMPAT_%(eval "`%{__perl} -V:version`"; echo $version)) +BuildRequires: help2man libtool-ltdl-devel %endif %if 0%{?suse_version} # net-snmp-devel on SLES10 does not suck in tcpd-devel automatically -BuildRequires: help2man tcpd-devel +BuildRequires: help2man tcpd-devel # Suse splits this off into a separate package -Requires: python-curses python-xml -BuildRequires: python-curses python-xml +Requires: python-curses python-xml +BuildRequires: python-curses python-xml %endif # Required for core functionality -BuildRequires: automake autoconf libtool pkgconfig python +BuildRequires: automake autoconf libtool pkgconfig python BuildRequires: glib2-devel cluster-glue-libs-devel libxml2-devel libxslt-devel BuildRequires: pkgconfig python-devel gcc-c++ bzip2-devel gnutls-devel pam-devel # Enables optional functionality BuildRequires: ncurses-devel openssl-devel libselinux-devel docbook-style-xsl resource-agents %ifarch alpha %{ix86} x86_64 -BuildRequires: lm_sensors-devel +BuildRequires: lm_sensors-devel %endif %if %{with cman} BuildRequires: clusterlib-devel %endif %if %{with esmtp} BuildRequires: libesmtp-devel %endif %if %{with snmp} BuildRequires: net-snmp-devel %endif %if %{with ais} # Do not require corosync, the admin should select which stack to use and install it BuildRequires: corosynclib-devel %endif %if %{with heartbeat} # Do not require heartbeat, the admin should select which stack to use and install it BuildRequires: heartbeat-devel heartbeat-libs >= 3.0.0 %endif %if %{with doc} BuildRequires: asciidoc help2man %if %{with publican} BuildRequires: publican inkscape %endif %endif %description Pacemaker is an advanced, scalable High-Availability cluster resource manager for Linux-HA (Heartbeat) and/or OpenAIS. It supports "n-node" clusters with significant capabilities for managing resources and dependencies. It will run scripts at initialization, when machines go up or down, when related resources fail and can be configured to periodically check resource health. Available rpmbuild rebuild options: --without : heartbeat ais snmp esmtp publican %package cli License: GPLv2+ and LGPLv2+ Summary: Command line tools for controlling the Pacemaker cluster resource manager Group: System Environment/Daemons Requires: %{name}-libs = %{version}-%{release} %description cli Pacemaker is an advanced, scalable High-Availability cluster resource manager for Linux-HA (Heartbeat) and/or OpenAIS. It supports "n-node" clusters with significant capabilities for managing resources and dependencies. It will run scripts at initialization, when machines go up or down, when related resources fail and can be configured to periodically check resource health. %package -n %{name}-libs License: GPLv2+ and LGPLv2+ Summary: Libraries used by the Pacemaker cluster resource manager and its clients Group: System Environment/Daemons %description -n %{name}-libs Pacemaker is an advanced, scalable High-Availability cluster resource manager for Linux-HA (Heartbeat) and/or OpenAIS. It supports "n-node" clusters with significant capabilities for managing resources and dependencies. It will run scripts at initialization, when machines go up or down, when related resources fail and can be configured to periodically check resource health. %package -n %{name}-libs-devel License: GPLv2+ and LGPLv2+ Summary: Pacemaker development package Group: Development/Libraries Requires: %{name}-libs = %{version}-%{release} Requires: cluster-glue-libs-devel libtool-ltdl-devel Requires: libxml2-devel libxslt-devel bzip2-devel glib2-devel %if %{with ais} Requires: corosynclib-devel %endif %if %{with heartbeat} Requires: heartbeat-devel %endif %description -n %{name}-libs-devel Headers and shared libraries for developing tools for Pacemaker. Pacemaker is an advanced, scalable High-Availability cluster resource manager for Linux-HA (Heartbeat) and/or OpenAIS. It supports "n-node" clusters with significant capabilities for managing resources and dependencies. It will run scripts at initialization, when machines go up or down, when related resources fail and can be configured to periodically check resource health. %package cts License: GPLv2+ and LGPLv2+ Summary: Test framework for cluster-related technologies like Pacemaker Group: System Environment/Daemons Requires: python %description cts Test framework for cluster-related technologies like Pacemaker %package doc License: GPLv2+ and LGPLv2+ Summary: Documentation for Pacemaker Group: Documentation %description doc Documentation for Pacemaker. Pacemaker is an advanced, scalable High-Availability cluster resource manager for Linux-HA (Heartbeat) and/or OpenAIS. It supports "n-node" clusters with significant capabilities for managing resources and dependencies. It will run scripts at initialization, when machines go up or down, when related resources fail and can be configured to periodically check resource health. %prep %setup -q -n %{upstream_prefix}-%{upstream_version} # Force the local time # # 'hg archive' sets the file date to the date of the last commit. # This can result in files having been created in the future # when building on machines in timezones 'behind' the one the # commit occurred in - which seriously confuses 'make' find . -exec touch \{\} \; %build ./autogen.sh %if %{with snmp} eval `objdump --headers --private-headers /usr/bin/perl | grep RPATH | awk '{print "export LD_LIBRARY_PATH="$2}'` %endif # RHEL <= 5 does not support --docdir docdir=%{pcmk_docdir} %{configure} \ - %{!?with_heartbeat: --without-heartbeat} \ - %{!?with_ais: --without-ais} \ - %{!?with_esmtp: --without-esmtp} \ - %{!?with_snmp: --without-snmp} \ - %{?with_cman: --with-cman} \ - %{?with_profiling: --with-profiling} \ - %{?with_gcov: --with-gcov} \ - %{?with_tracedata: --with-tracedata} \ - --with-initdir=%{_initrddir} \ + %{!?with_heartbeat: --without-heartbeat} \ + %{!?with_ais: --without-ais} \ + %{!?with_esmtp: --without-esmtp} \ + %{!?with_snmp: --without-snmp} \ + %{?with_cman: --with-cman} \ + %{?with_profiling: --with-profiling} \ + %{?with_gcov: --with-gcov} \ + %{?with_tracedata: --with-tracedata} \ + --with-initdir=%{_initrddir} \ --localstatedir=%{_var} \ --with-version=%{version}-%{release} \ --enable-fatal-warnings=no make %{_smp_mflags} docdir=%{pcmk_docdir} %install rm -rf %{buildroot} make DESTDIR=%{buildroot} docdir=%{pcmk_docdir} install mkdir -p ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig install -m 644 mcp/pacemaker.sysconfig ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig/pacemaker # Scripts that need should be executable chmod a+x %{buildroot}/%{_libdir}/heartbeat/hb2openais-helper.py chmod a+x %{buildroot}/%{_datadir}/pacemaker/tests/cts/CTSlab.py chmod a+x %{buildroot}/%{_datadir}/pacemaker/tests/cts/extracttests.py # These are not actually scripts find %{buildroot} -name '*.xml' -type f -print0 | xargs -0 chmod a-x find %{buildroot} -name '*.xsl' -type f -print0 | xargs -0 chmod a-x find %{buildroot} -name '*.rng' -type f -print0 | xargs -0 chmod a-x find %{buildroot} -name '*.dtd' -type f -print0 | xargs -0 chmod a-x # Dont package static libs find %{buildroot} -name '*.a' -type f -print0 | xargs -0 rm -f find %{buildroot} -name '*.la' -type f -print0 | xargs -0 rm -f # Do not package these either rm -f %{buildroot}/%{_libdir}/heartbeat/crm_primitive.* rm -f %{buildroot}/%{_libdir}/heartbeat/atest rm -f %{buildroot}/%{_libdir}/service_crm.so %if %{with gcov} GCOV_BASE=%{buildroot}/%{_var}/lib/pacemaker/gcov mkdir -p $GCOV_BASE find . -name '*.gcno' -type f | while read F ; do - D=`dirname $F` - mkdir -p ${GCOV_BASE}/$D - cp $F ${GCOV_BASE}/$D + D=`dirname $F` + mkdir -p ${GCOV_BASE}/$D + cp $F ${GCOV_BASE}/$D done %endif %clean rm -rf %{buildroot} %post /sbin/chkconfig --add pacemaker || : %preun if [ $1 -eq 0 ]; then - /sbin/service pacemaker stop &>/dev/null || : - /sbin/chkconfig --del pacemaker || : + /sbin/service pacemaker stop &>/dev/null || : + /sbin/chkconfig --del pacemaker || : fi %post -n %{name}-libs -p /sbin/ldconfig %postun -n %{name}-libs -p /sbin/ldconfig %files ########################################################### %defattr(-,root,root) %exclude %{_datadir}/pacemaker/tests %config(noreplace) %{_sysconfdir}/sysconfig/pacemaker %{_sbindir}/pacemakerd %{_initrddir}/pacemaker %if %{defined _unitdir} %{_unitdir}/pacemaker.service %endif %{_datadir}/pacemaker %{_datadir}/snmp/mibs/PCMK-MIB.txt %{_libdir}/heartbeat/* %{_sbindir}/crm_attribute %{_sbindir}/crm_master %{_sbindir}/attrd_updater %{_sbindir}/fence_legacy %{_sbindir}/stonith_admin %doc COPYING %doc AUTHORS %doc ChangeLog %dir %attr (750, %{uname}, %{gname}) %{_var}/lib/heartbeat/crm %dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pengine %dir %attr (750, %{uname}, %{gname}) %{_var}/run/crm %dir /usr/lib/ocf %dir /usr/lib/ocf/resource.d /usr/lib/ocf/resource.d/pacemaker %if %{with ais} %{_libexecdir}/lcrso/pacemaker.lcrso %endif %files cli %{_sbindir}/cibadmin %{_sbindir}/crm_diff %{_sbindir}/crm_failcount %{_sbindir}/crm_mon %{_sbindir}/crm %{_sbindir}/crm_resource %{_sbindir}/crm_standby %{_sbindir}/crm_verify %{_sbindir}/crmadmin %{_sbindir}/iso8601 %{_sbindir}/ptest %{_sbindir}/crm_shadow %{_sbindir}/cibpipe %{_sbindir}/crm_node %{_sbindir}/crm_simulate %{_sbindir}/crm_report %{_sbindir}/crm_ticket %{py_sitedir}/crm %doc %{_mandir} %if %{with heartbeat} %{_sbindir}/crm_uuid %else %exclude %{_sbindir}/crm_uuid %endif %doc COPYING %doc AUTHORS %doc ChangeLog %files -n %{name}-libs %defattr(-,root,root) %{_libdir}/libcib.so.* %{_libdir}/libcrmcommon.so.* %{_libdir}/libcrmcluster.so.* %{_libdir}/libpe_status.so.* %{_libdir}/libpe_rules.so.* %{_libdir}/libpengine.so.* %{_libdir}/libtransitioner.so.* %{_libdir}/libstonithd.so.* %doc COPYING.LIB %doc AUTHORS %files doc %defattr(-,root,root) %doc %{pcmk_docdir} %files cts %defattr(-,root,root) %{py_sitedir}/cts %{_datadir}/pacemaker/tests/cts %doc COPYING.LIB %doc AUTHORS %files -n %{name}-libs-devel %defattr(-,root,root) %exclude %{_datadir}/pacemaker/tests/cts %{_datadir}/pacemaker/tests %{_includedir}/pacemaker %{_libdir}/*.so %if %{with gcov} -%dir %{_var}/lib/pacemaker %{_var}/lib/pacemaker %endif %{_libdir}/pkgconfig/*.pc %doc COPYING.LIB %doc AUTHORS %changelog * Fri Feb 11 2011 Andrew Beekhof 1.1.5-1 - Update source tarball to revision: baad6636a053 - Statistics: Changesets: 184 Diff: 605 files changed, 46103 insertions(+), 26417 deletions(-) - See included ChangeLog file or http://hg.clusterlabs.org/pacemaker/1.1/file/tip/ChangeLog for details * Wed Oct 20 2010 Andrew Beekhof 1.1.4-1 - Moved all the interesting parts of the changelog into a separate file as per the Fedora policy :-/ - Update source tarball to revision: 75406c3eb2c1 tip - Significant performance enhancements to the Policy Engine and CIB - Statistics: Changesets: 169 Diff: 772 files changed, 56172 insertions(+), 39309 deletions(-) - See included ChangeLog file or http://hg.clusterlabs.org/pacemaker/1.1/file/tip/ChangeLog for details * Tue Sep 21 2010 Andrew Beekhof 1.1.3-1 - Update source tarball to revision: e3bb31c56244 tip - Statistics: Changesets: 352 Diff: 481 files changed, 14130 insertions(+), 11156 deletions(-) * Wed May 12 2010 Andrew Beekhof 1.1.2-1 - Update source tarball to revision: c25c972a25cc tip - Statistics: Changesets: 339 Diff: 708 files changed, 37918 insertions(+), 10584 deletions(-) * Tue Feb 16 2010 Andrew Beekhof - 1.1.1-1 - First public release of Pacemaker 1.1 - Package reference documentation in a doc subpackage - Move cts into a subpackage so that it can be easily consumed by others - Update source tarball to revision: 17d9cd4ee29f + New stonith daemon that supports global notifications + Service placement influenced by the physical resources + A new tool for simulating failures and the cluster’s reaction to them + Ability to serialize an otherwise unrelated a set of resource actions (eg. Xen migrations) * Wed Feb 10 2010 Andrew Beekhof - 1.0.7-4 - Rebuild for heartbeat 3.0.2-2 * Wed Feb 10 2010 Andrew Beekhof - 1.0.7-3 - Rebuild for cluster-glue 1.0.3 * Tue Jan 19 2010 Andrew Beekhof - 1.0.7-2 - Rebuild for corosync 1.2.0 * Mon Jan 18 2010 Andrew Beekhof - 1.0.7-1 - Update source tarball to revision: 2eed906f43e9 (stable-1.0) tip - Statistics: Changesets: 193 Diff: 220 files changed, 15933 insertions(+), 8782 deletions(-) * Thu Oct 29 2009 Andrew Beekhof - 1.0.5-4 - Include the fixes from CoroSync integration testing - Move the resource templates - they are not documentation - Ensure documentation is placed in a standard location - Exclude documentation that is included elsewhere in the package - Update the tarball from upstream to version ee19d8e83c2a + High: cib: Correctly clean up when both plaintext and tls remote ports are requested + High: PE: Bug bnc#515172 - Provide better defaults for lt(e) and gt(e) comparisions + High: PE: Bug lf#2197 - Allow master instances placemaker to be influenced by colocation constraints + High: PE: Make sure promote/demote pseudo actions are created correctly + High: PE: Prevent target-role from promoting more than master-max instances + High: ais: Bug lf#2199 - Prevent expected-quorum-votes from being populated with garbage + High: ais: Prevent deadlock - dont try to release IPC message if the connection failed + High: cib: For validation errors, send back the full CIB so the client can display the errors + High: cib: Prevent use-after-free for remote plaintext connections + High: crmd: Bug lf#2201 - Prevent use-of-NULL when running heartbeat * Wed Oct 13 2009 Andrew Beekhof - 1.0.5-3 - Update the tarball from upstream to version 38cd629e5c3c + High: Core: Bug lf#2169 - Allow dtd/schema validation to be disabled + High: PE: Bug lf#2106 - Not all anonymous clone children are restarted after configuration change + High: PE: Bug lf#2170 - stop-all-resources option had no effect + High: PE: Bug lf#2171 - Prevent groups from starting if they depend on a complex resource which can not + High: PE: Disable resource management if stonith-enabled=true and no stonith resources are defined + High: PE: do not include master score if it would prevent allocation + High: ais: Avoid excessive load by checking for dead children every 1s (instead of 100ms) + High: ais: Bug rh#525589 - Prevent shutdown deadlocks when running on CoroSync + High: ais: Gracefully handle changes to the AIS nodeid + High: crmd: Bug bnc#527530 - Wait for the transition to complete before leaving S_TRANSITION_ENGINE + High: crmd: Prevent use-after-free with LOG_DEBUG_3 + Medium: xml: Mask the "symmetrical" attribute on rsc_colocation constraints (bnc#540672) + Medium (bnc#520707): Tools: crm: new templates ocfs2 and clvm + Medium: Build: Invert the disable ais/heartbeat logic so that --without (ais|heartbeat) is available to rpmbuild + Medium: PE: Bug lf#2178 - Indicate unmanaged clones + Medium: PE: Bug lf#2180 - Include node information for all failed ops + Medium: PE: Bug lf#2189 - Incorrect error message when unpacking simple ordering constraint + Medium: PE: Correctly log resources that would like to start but can not + Medium: PE: Stop ptest from logging to syslog + Medium: ais: Include version details in plugin name + Medium: crmd: Requery the resource metadata after every start operation * Fri Aug 21 2009 Tomas Mraz - 1.0.5-2.1 - rebuilt with new openssl * Wed Aug 19 2009 Andrew Beekhof - 1.0.5-2 - Add versioned perl dependency as specified by https://fedoraproject.org/wiki/Packaging/Perl#Packages_that_link_to_libperl - No longer remove RPATH data, it prevents us finding libperl.so and no other libraries were being hardcoded - Compile in support for heartbeat - Conditionally add heartbeat-devel and corosynclib-devel to the -devel requirements depending on which stacks are supported * Mon Aug 17 2009 Andrew Beekhof - 1.0.5-1 - Add dependency on resource-agents - Use the version of the configure macro that supplies --prefix, --libdir, etc - Update the tarball from upstream to version 462f1569a437 (Pacemaker 1.0.5 final) + High: Tools: crm_resource - Advertise --move instead of --migrate + Medium: Extra: New node connectivity RA that uses system ping and attrd_updater + Medium: crmd: Note that dc-deadtime can be used to mask the brokeness of some switches * Tue Aug 11 2009 Ville Skyttä - 1.0.5-0.7.c9120a53a6ae.hg - Use bzipped upstream tarball. * Wed Jul 29 2009 Andrew Beekhof - 1.0.5-0.6.c9120a53a6ae.hg - Add back missing build auto* dependancies - Minor cleanups to the install directive * Tue Jul 28 2009 Andrew Beekhof - 1.0.5-0.5.c9120a53a6ae.hg - Add a leading zero to the revision when alphatag is used * Tue Jul 28 2009 Andrew Beekhof - 1.0.5-0.4.c9120a53a6ae.hg - Incorporate the feedback from the cluster-glue review - Realistically, the version is a 1.0.5 pre-release - Use the global directive instead of define for variables - Use the haclient/hacluster group/user instead of daemon - Use the _configure macro - Fix install dependancies * Fri Jul 24 2009 Andrew Beekhof - 1.0.4-3 - Initial Fedora checkin - Include an AUTHORS and license file in each package - Change the library package name to pacemaker-libs to be more Fedora compliant - Remove execute permissions from xml related files - Reference the new cluster-glue devel package name - Update the tarball from upstream to version c9120a53a6ae + High: PE: Only prevent migration if the clone dependency is stopping/starting on the target node + High: PE: Bug 2160 - Dont shuffle clones due to colocation + High: PE: New implementation of the resource migration (not stop/start) logic + Medium: Tools: crm_resource - Prevent use-of-NULL by requiring a resource name for the -A and -a options + Medium: PE: Prevent use-of-NULL in find_first_action() * Tue Jul 14 2009 Andrew Beekhof - 1.0.4-2 - Reference authors from the project AUTHORS file instead of listing in description - Change Source0 to reference the Mercurial repo - Cleaned up the summaries and descriptions - Incorporate the results of Fedora package self-review * Thu Jun 04 2009 Andrew Beekhof - 1.0.4-1 - Update source tarball to revision: 1d87d3e0fc7f (stable-1.0) - Statistics: Changesets: 209 Diff: 266 files changed, 12010 insertions(+), 8276 deletions(-) * Wed Apr 08 2009 Andrew Beekhof - 1.0.3-1 - Update source tarball to revision: b133b3f19797 (stable-1.0) tip - Statistics: Changesets: 383 Diff: 329 files changed, 15471 insertions(+), 15119 deletions(-) * Mon Feb 16 2009 Andrew Beekhof - 1.0.2-1 - Update source tarball to revision: d232d19daeb9 (stable-1.0) tip - Statistics: Changesets: 441 Diff: 639 files changed, 20871 insertions(+), 21594 deletions(-) * Tue Nov 18 2008 Andrew Beekhof - 1.0.1-1 - Update source tarball to revision: 6fc5ce8302ab (stable-1.0) tip - Statistics: Changesets: 170 Diff: 816 files changed, 7633 insertions(+), 6286 deletions(-) * Thu Oct 16 2008 Andrew Beekhof - 1.0.0-1 - Update source tarball to revision: 388654dfef8f tip - Statistics: Changesets: 261 Diff: 3021 files changed, 244985 insertions(+), 111596 deletions(-) * Mon Sep 22 2008 Andrew Beekhof - 0.7.3-1 - Update source tarball to revision: 33e677ab7764+ tip - Statistics: Changesets: 133 Diff: 89 files changed, 7492 insertions(+), 1125 deletions(-) * Wed Aug 20 2008 Andrew Beekhof - 0.7.1-1 - Update source tarball to revision: f805e1b30103+ tip - Statistics: Changesets: 184 Diff: 513 files changed, 43408 insertions(+), 43783 deletions(-) * Fri Jul 18 2008 Andrew Beekhof - 0.7.0-19 - Update source tarball to revision: 007c3a1c50f5 (unstable) tip - Statistics: Changesets: 108 Diff: 216 files changed, 4632 insertions(+), 4173 deletions(-) * Wed Jun 25 2008 Andrew Beekhof - 0.7.0-1 - Update source tarball to revision: bde0c7db74fb tip - Statistics: Changesets: 439 Diff: 676 files changed, 41310 insertions(+), 52071 deletions(-) * Thu Jun 19 2008 Andrew Beekhof - 0.6.5-1 - Update source tarball to revision: b9fe723d1ac5 tip - Statistics: Changesets: 48 Diff: 37 files changed, 1204 insertions(+), 234 deletions(-) * Thu May 22 2008 Andrew Beekhof - 0.6.4-1 - Update source tarball to revision: 226d8e356924 tip - Statistics: Changesets: 55 Diff: 199 files changed, 7103 insertions(+), 12378 deletions(-) * Wed Apr 23 2008 Andrew Beekhof - 0.6.3-1 - Update source tarball to revision: fd8904c9bc67 tip - Statistics: Changesets: 117 Diff: 354 files changed, 19094 insertions(+), 11338 deletions(-) * Thu Feb 14 2008 Andrew Beekhof - 0.6.2-1 - Update source tarball to revision: 28b1a8c1868b tip - Statistics: Changesets: 11 Diff: 7 files changed, 58 insertions(+), 18 deletions(-) * Tue Feb 12 2008 Andrew Beekhof - 0.6.1-1 - Update source tarball to revision: e7152d1be933 tip - Statistics: Changesets: 25 Diff: 37 files changed, 1323 insertions(+), 227 deletions(-) * Mon Jan 14 2008 Andrew Beekhof - 0.6.0-2 - This is the first release of the Pacemaker Cluster Resource Manager formerly part of Heartbeat. - For those looking for the GUI, mgmtd, CIM or TSA components, they are now found in the new pacemaker-pygui project. Build dependancies prevent them from being included in Heartbeat (since the built-in CRM is no longer supported) and, being non-core components, are not included with Pacemaker. - Update source tarball to revision: c94b92d550cf - Statistics: Changesets: 347 Diff: 2272 files changed, 132508 insertions(+), 305991 deletions(-) - Test hardware: + 6-node vmware cluster (sles10-sp1/256Mb/vmware stonith) on a single host (opensuse10.3/2Gb/2.66Ghz Quad Core2) + 7-node EMC Centera cluster (sles10/512Mb/2Ghz Xeon/ssh stonith) - Notes: Heartbeat Stack + All testing was performed with STONITH enabled + The CRM was enabled using the "crm respawn" directive - Notes: OpenAIS Stack + This release contains a preview of support for the OpenAIS cluster stack + The current release of the OpenAIS project is missing two important patches that we require. OpenAIS packages containing these patches are available for most major distributions at: http://download.opensuse.org/repositories/server:/ha-clustering + The OpenAIS stack is not currently recommended for use in clusters that have shared data as STONITH support is not yet implimented + pingd is not yet available for use with the OpenAIS stack + 3 significant OpenAIS issues were found during testing of 4 and 6 node clusters. We are activly working together with the OpenAIS project to get these resolved. - Pending bugs encountered during testing: + OpenAIS #1736 - Openais membership took 20s to stabilize + Heartbeat #1750 - ipc_bufpool_update: magic number in head does not match + OpenAIS #1793 - Assertion failure in memb_state_gather_enter() + OpenAIS #1796 - Cluster message corruption * Mon Dec 10 2007 Andrew Beekhof - 0.6.0-1 - Initial opensuse package check-in