diff --git a/.travis.yml b/.travis.yml index 0ef2be1..3079e44 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,49 +1,49 @@ sudo: required language: c os: - linux - linux-ppc64le env: global: - PACKAGE=sbd - BUILD_OS_TYPE=fedora BUILD_OS_DIST= BUILD_OS_VERSION=29 matrix: exclude: - os: linux - os: linux-ppc64le include: - os: linux env: OS_ARCH=x86_64 OS_TYPE=centos OS_MOCK=epel OS_DIST=centos OS_VERSION=7 - os: linux env: OS_ARCH=x86_64 OS_TYPE=centos OS_MOCK=epel OS_DIST=centos OS_VERSION=6 - os: linux env: OS_ARCH=x86_64 OS_TYPE=fedora OS_MOCK=fedora OS_DIST= OS_VERSION=29 - os: linux env: OS_ARCH=x86_64 OS_TYPE=fedora OS_MOCK=fedora OS_DIST= OS_VERSION=30 - os: linux env: OS_ARCH=x86_64 OS_TYPE=fedora OS_MOCK=fedora OS_DIST= OS_VERSION=rawhide - os: linux-ppc64le env: OS_ARCH=ppc64le OS_TYPE=fedora OS_MOCK=fedora OS_DIST= OS_VERSION=30 services: - docker install: true script: - make -f Makefile.am srpm PACKAGE=${PACKAGE} - docker pull ${BUILD_OS_TYPE}:${BUILD_OS_DIST}${BUILD_OS_VERSION} - docker run --privileged -v ${PWD}:/rpms ${BUILD_OS_TYPE}:${BUILD_OS_DIST}${BUILD_OS_VERSION} /bin/bash -c "dnf install -y mock dnf-utils && if test $OS_VERSION = rawhide; then sed -i /etc/mock/${OS_MOCK}-${OS_VERSION}-${OS_ARCH}.cfg -e s/gpgcheck.*/gpgcheck=0/g; fi && mock --no-clean -r ${OS_MOCK}-${OS_VERSION}-${OS_ARCH} --resultdir=/rpms --disable-plugin=yum_cache --disable-plugin=selinux --no-bootstrap-chroot --old-chroot /rpms/sbd*.src.rpm" - ls ${PWD}/${PACKAGE}*.${OS_ARCH}.rpm - docker pull ${OS_TYPE}:${OS_DIST}${OS_VERSION} - - docker run --privileged -v ${PWD}:/rpms -v ${PWD}/tests:/tests ${OS_TYPE}:${OS_DIST}${OS_VERSION} /bin/bash -c "if test $OS_VERSION = rawhide; then yum update -y --nogpgcheck; fi && yum install -y device-mapper /rpms/${PACKAGE}*.${OS_ARCH}.rpm && /tests/regressions.sh && touch /rpms/regressions.sh.SUCCESS" + - docker run --privileged -v ${PWD}:/rpms ${OS_TYPE}:${OS_DIST}${OS_VERSION} /bin/bash -c "if test $OS_VERSION = rawhide; then yum update -y 
--nogpgcheck; fi && yum install -y device-mapper /rpms/${PACKAGE}*.${OS_ARCH}.rpm && /usr/share/sbd/regressions.sh && touch /rpms/regressions.sh.SUCCESS" - ls ${PWD}/regressions.sh.SUCCESS addons: apt: packages: - rpm diff --git a/Makefile.am b/Makefile.am index ada021d..db1f989 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,84 +1,87 @@ -SUBDIRS = src agent man +SUBDIRS = src agent man tests # .gz because github doesn't support .xz yet :-( # this is modified # TAG ?= $(shell git log --pretty="format:%H" -n 1 || sed -n -e "s/%global commit //p" sbd.spec)$(shell test -n "$$(git status -s)" && echo -n "-mod") distdir = $(PACKAGE)-$(TAG) TARFILE = $(distdir).tar.gz DIST_ARCHIVES = $(TARFILE) KEEP_EXISTING_TAR = no INJECT_GIT_COMMIT = yes DISTCLEANFILES = sbd-* sbd-*/ CLEANFILES = *.rpm *.tar.* sbd-* RPM_ROOT = $(shell pwd) RPM_OPTS = --define "_sourcedir $(RPM_ROOT)" \ --define "_specdir $(RPM_ROOT)" \ --define "_srcrpmdir $(RPM_ROOT)" \ --define "_builddir $(RPM_ROOT)" \ --define "_rpmdir $(RPM_ROOT)" MOCK_TARGET ?= rhel-7.1-candidate-x86_64 MOCK_OPTIONS ?= --resultdir=$(RPM_ROOT)/mock --no-cleanup-after BUILD_COUNTER ?= build.counter LAST_COUNT = $(shell test ! -e $(BUILD_COUNTER) && echo 0; test -e $(BUILD_COUNTER) && cat $(BUILD_COUNTER)) COUNT = $(shell expr 1 + $(LAST_COUNT)) TESTS = tests/regressions.sh export SBD_BINARY := src/sbd +export SBD_PRELOAD := tests/.libs/libsbdtestbed.so +export SBD_USE_DM := no + EXTRA_DIST = sbd.spec tests/regressions.sh export: rm -f $(PACKAGE)-HEAD.tar.* if test "$(KEEP_EXISTING_TAR)" != "yes"; then \ rm -f $(TARFILE); \ fi; ! (git status -s | grep "??" 
&& echo "untracked files present in git-repo" ) if [ -f $(TARFILE) ]; then \ echo `date`: Using existing tarball: $(TARFILE); \ else \ rm -f $(PACKAGE).tar.*; \ (git archive --prefix=$(distdir)/ $(shell echo $(TAG)|cut -f1 -d-) || tar -c --transform="s,^,$(distdir)/," --exclude="*.tar.*" --exclude="$(distdir)" --exclude="*.o" --exclude="*.8" --exclude="config.*" --exclude="Makefile" --exclude="Makefile.in" --exclude="stamp-*" --exclude="*.service" --exclude="sbd" --exclude="*.m4" --exclude="*.cache" --exclude="configure" --exclude="*.list" --exclude="depcomp" --exclude="install-sh" --exclude="missing" --exclude="compile" --exclude="sbd.sh" --exclude="~" --exclude="*.swp" --exclude="*.patch" --exclude="*.diff" --exclude="*.orig" --exclude="*.rej" --exclude="*.rpm" --exclude=".deps" --exclude="test-driver" *) | gzip > $(TARFILE); \ if test -n "$$(git status -s)" || test "$(INJECT_GIT_COMMIT)" = "yes"; then \ if test -n "$$(git status -s)"; then git diff HEAD --name-only|grep -v "^\."|xargs -n1 git diff HEAD > uncommitted.diff; fi; \ rm -rf $(distdir); tar -xzf $(TARFILE); rm $(TARFILE); \ cd $(distdir); \ if test -n "$$(git status -s)"; then patch -p1 -i ../uncommitted.diff; fi; \ cd ..; \ sed -i 's/global\ commit.*/global\ commit\ $(TAG)/' $(distdir)/$(PACKAGE).spec; \ tar -czf $(TARFILE) $(distdir); rm -rf $(distdir); \ rm -f uncommitted.diff; \ fi; \ echo `date`: Rebuilt $(TARFILE); \ fi #replace commit id in sbd.spec spec: rm -f *.src.rpm rm -rf $(distdir) mkdir $(distdir) cp $(PACKAGE).spec $(distdir) sed -i 's/global\ commit.*/global\ commit\ $(TAG)/' $(distdir)/$(PACKAGE).spec srpm: export spec if [ -e $(BUILD_COUNTER) ]; then \ sed -i 's/global\ buildnum.*/global\ buildnum\ $(COUNT)/' $(distdir)/$(PACKAGE).spec; \ echo $(COUNT) > $(BUILD_COUNTER); \ fi rpmbuild $(RPM_OPTS) -bs $(distdir)/$(PACKAGE).spec rpm: export spec rpmbuild $(RPM_OPTS) -ba $(distdir)/$(PACKAGE).spec mock: srpm -rm -rf $(RPM_ROOT)/mock @echo "mock --root=$* --rebuild $(MOCK_OPTIONS) 
$(RPM_ROOT)/*.src.rpm" mock --root=$(MOCK_TARGET) --rebuild $(MOCK_OPTIONS) $(RPM_ROOT)/*.src.rpm beekhof: mock cluster-helper -- 'rm -f sbd-*.x86_64.rpm' cluster-helper --copy $(RPM_ROOT)/mock/sbd-*.x86_64.rpm {}: cluster-helper -- yum install -y sbd-*.x86_64.rpm diff --git a/autogen.sh b/autogen.sh index ec867a2..e599760 100755 --- a/autogen.sh +++ b/autogen.sh @@ -1,4 +1,10 @@ #!/bin/sh +am_ver=`automake --version | sed -n 1p` +case $am_ver in + *\ 1.11*|*\ 1.12*) echo 'm4_define([TESTS_OPTION], [])';; + *) echo 'm4_define([TESTS_OPTION], [serial-tests])';; +esac > tests-opt.m4 +cat tests-opt.m4 # Run this to generate all the initial makefiles, etc. autoreconf -i -v && echo Now run ./configure and make diff --git a/configure.ac b/configure.ac index 401cb93..a9ac83c 100644 --- a/configure.ac +++ b/configure.ac @@ -1,253 +1,274 @@ dnl dnl autoconf for Agents dnl dnl License: GNU General Public License (GPL) dnl =============================================== dnl Bootstrap dnl =============================================== AC_PREREQ(2.63) dnl Suggested structure: dnl information on the package dnl checks for programs dnl checks for libraries dnl checks for header files dnl checks for types dnl checks for structures dnl checks for compiler characteristics dnl checks for library functions dnl checks for system services AC_INIT([sbd], [1.4.0], [lmb@suse.com]) +m4_include([tests-opt.m4]) AC_CANONICAL_HOST AC_CONFIG_AUX_DIR(.) 
AC_CONFIG_HEADERS(config.h) m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([no])]) -AM_INIT_AUTOMAKE +AM_INIT_AUTOMAKE(1.11.1 foreign TESTS_OPTION) AM_PROG_CC_C_O PKG_CHECK_MODULES(glib, [glib-2.0]) -dnl PKG_CHECK_MODULES(libcoroipcc, [libcoroipcc]) +PKG_CHECK_MODULES(libxml, [libxml-2.0]) PKG_CHECK_MODULES(cmap, [libcmap], HAVE_cmap=1, HAVE_cmap=0) PKG_CHECK_MODULES(votequorum, [libvotequorum], HAVE_votequorum=1, HAVE_votequorum=0) dnl pacemaker > 1.1.8 PKG_CHECK_MODULES(pacemaker, [pacemaker, pacemaker-cib], HAVE_pacemaker=1, HAVE_pacemaker=0) dnl pacemaker <= 1.1.8 PKG_CHECK_MODULES(pcmk, [pcmk, pcmk-cib], HAVE_pcmk=1, HAVE_pcmk=0) + PKG_CHECK_MODULES(libqb, [libqb]) -CPPFLAGS="$CPPFLAGS -Werror" +CPPFLAGS="$CPPFLAGS -Werror $glib_CFLAGS $libxml_CFLAGS" +LIBS="$LIBS $glib_LIBS $libxml_LIBS" + if test $HAVE_pacemaker = 0 -a $HAVE_pcmk = 0; then AC_MSG_ERROR(No package 'pacemaker' found) elif test $HAVE_pacemaker = 1; then CPPFLAGS="$CPPFLAGS $glib_CFLAGS $pacemaker_CFLAGS" if test $HAVE_cmap = 0; then AC_MSG_NOTICE(No library 'cmap' found) else CPPFLAGS="$CPPFLAGS $cmap_CFLAGS" LIBS="$LIBS $cmap_LIBS" fi if test $HAVE_votequorum = 0; then AC_MSG_NOTICE(No library 'votequorum' found) else CPPFLAGS="$CPPFLAGS $votequorum_CFLAGS" LIBS="$LIBS $votequorum_LIBS" fi fi -PKG_CHECK_MODULES(libxml, [libxml-2.0]) -CPPFLAGS="$CPPFLAGS $libxml_CFLAGS $libqb_CFLAGS $pacemaker_CFLAGS $pcmk_CFLAGS" -LIBS="$LIBS $libxml_LIBS $libqb_LIBS $pacemaker_LIBS $pcmk_LIBS" +CPPFLAGS="$CPPFLAGS $libqb_CFLAGS $pacemaker_CFLAGS $pcmk_CFLAGS" +LIBS="$LIBS $libqb_LIBS $pacemaker_LIBS $pcmk_LIBS" dnl checks for libraries +AC_CHECK_LIB(c, dlopen) dnl if dlopen is in libc... 
+AC_CHECK_LIB(dl, dlopen) dnl -ldl (for Linux) AC_CHECK_LIB(aio, io_setup, , missing="yes") AC_CHECK_LIB(qb, qb_ipcs_connection_auth_set, , missing="yes") AC_CHECK_LIB(cib, cib_new, , missing="yes") AC_CHECK_LIB(crmcommon, set_crm_log_level, , missing="yes") AC_CHECK_LIB(pe_status, pe_find_node, , missing="yes") AC_CHECK_LIB(pe_rules, test_rule, , missing="yes") AC_CHECK_LIB(crmcluster, crm_peer_init, , missing="yes") AC_CHECK_LIB(uuid, uuid_unparse, , missing="yes") AC_CHECK_LIB(cmap, cmap_initialize, , HAVE_cmap=0) AC_CHECK_LIB(votequorum, votequorum_getinfo, , HAVE_votequorum=0) dnl pacemaker >= 1.1.8 AC_CHECK_HEADERS(crm/cluster.h) AC_CHECK_LIB(crmcommon, pcmk_strerror, , missing="yes") AC_CHECK_LIB(cib, cib_apply_patch_event, , missing="yes") dnl pacemaker-2.0 removed support for corosync 1 cluster layer AC_CHECK_DECLS([pcmk_cluster_classic_ais, pcmk_cluster_cman],,, [#include ]) dnl check for new pe-API AC_CHECK_FUNCS(pe_new_working_set) if test "$missing" = "yes"; then AC_MSG_ERROR([Missing required libraries or functions.]) fi AC_PATH_PROGS(POD2MAN, pod2man, pod2man) AC_ARG_ENABLE([shared-disk], [ --enable-shared-disk Turn on functionality that requires shared disk [default=yes]]) DISK=0 if test "x${enable_shared_disk}" != xno ; then DISK=1 fi AC_DEFINE_UNQUOTED(SUPPORT_SHARED_DISK, $DISK, Turn on functionality that requires shared disk) AM_CONDITIONAL(SUPPORT_SHARED_DISK, test "$DISK" = "1") if test -e /proc/$$ then echo "/proc/{pid} is supported" AC_DEFINE_UNQUOTED(HAVE_PROC_PID, 1, Define to 1 if /proc/{pid} is supported.) 
fi AC_DEFINE_UNQUOTED(CHECK_TWO_NODE, $HAVE_cmap, Turn on checking for 2-node cluster) AM_CONDITIONAL(CHECK_TWO_NODE, test "$HAVE_cmap" = "1") AC_DEFINE_UNQUOTED(CHECK_VOTEQUORUM_HANDLE, $HAVE_votequorum, Turn on periodic checking of votequorum-handle) AM_CONDITIONAL(CHECK_VOTEQUORUM_HANDLE, test "$HAVE_votequorum" = "1") CONFIGDIR="" AC_ARG_WITH(configdir, [ --with-configdir=DIR Directory for SBD configuration file [${CONFIGDIR}]], [ CONFIGDIR="$withval" ] ) +# +# Where is dlopen? +# +if test "$ac_cv_lib_c_dlopen" = yes; then + LIBADD_DL="" +elif test "$ac_cv_lib_dl_dlopen" = yes; then + LIBADD_DL=-ldl +else + LIBADD_DL=${lt_cv_dlopen_libs} +fi + + dnl ********************************************************************** dnl Check for various argv[] replacing functions on various OSs dnl dnl Borrowed from Proftpd dnl Proftpd is Licenced under the terms of the GNU General Public Licence dnl and is available from http://www.proftpd.org/ dnl AC_CHECK_FUNCS(setproctitle) AC_CHECK_HEADERS(libutil.h) AC_CHECK_LIB(util, setproctitle, [AC_DEFINE(HAVE_SETPROCTITLE,1,[ ]) ac_cv_func_setproctitle="yes" ; LIBS="$LIBS -lutil"]) if test "$ac_cv_func_setproctitle" = "yes"; then pf_argv_set="PF_ARGV_NONE" fi if test "$pf_argv_set" = ""; then AC_CHECK_HEADERS(sys/pstat.h) if test "$ac_cv_header_pstat_h" = "yes"; then AC_CHECK_FUNCS(pstat) if test "$ac_cv_func_pstat" = "yes"; then pf_argv_set="PF_ARGV_PSTAT" else pf_argv_set="PF_ARGV_WRITEABLE" fi fi if test "$pf_argv_set" = ""; then AC_EGREP_HEADER([#define.*PS_STRINGS.*],sys/exec.h, have_psstrings="yes",have_psstrings="no") if test "$have_psstrings" = "yes"; then pf_argv_set="PF_ARGV_PSSTRINGS" fi fi if test "$pf_argv_set" = ""; then AC_CACHE_CHECK(whether __progname and __progname_full are available, pf_cv_var_progname, AC_TRY_LINK([extern char *__progname, *__progname_full;], [__progname = "foo"; __progname_full = "foo bar";], pf_cv_var_progname="yes", pf_cv_var_progname="no")) if test "$pf_cv_var_progname" = "yes"; then 
AC_DEFINE(HAVE___PROGNAME,1,[ ]) fi AC_CACHE_CHECK(which argv replacement method to use, pf_cv_argv_type, AC_EGREP_CPP(yes,[ #if defined(__GNU_HURD__) yes #endif ],pf_cv_argv_type="new", pf_cv_argv_type="writeable")) if test "$pf_cv_argv_type" = "new"; then pf_argv_set="PF_ARGV_NEW" fi if test "$pf_argv_set" = ""; then pf_argv_set="PF_ARGV_WRITEABLE" fi fi fi AC_DEFINE_UNQUOTED(PF_ARGV_TYPE, $pf_argv_set, mechanism to pretty-print ps output: setproctitle-equivalent) dnl End of tests borrowed from Proftpd AC_MSG_NOTICE(Sanitizing prefix: ${prefix}) case $prefix in NONE) prefix=/usr dnl Fix default variables - "prefix" variable if not specified if test "$localstatedir" = "\${prefix}/var"; then localstatedir="/var" fi if test "$sysconfdir" = "\${prefix}/etc"; then sysconfdir="/etc" fi ;; esac AC_MSG_NOTICE(Sanitizing exec_prefix: ${exec_prefix}) case $exec_prefix in dnl For consistency with Heartbeat, map NONE->$prefix NONE) exec_prefix=$prefix;; prefix) exec_prefix=$prefix;; esac dnl Expand autoconf variables so that we dont end up with '${prefix}' dnl in #defines and python scripts dnl NOTE: Autoconf deliberately leaves them unexpanded to allow dnl make exec_prefix=/foo install dnl No longer being able to do this seems like no great loss to me... 
eval prefix="`eval echo ${prefix}`" eval exec_prefix="`eval echo ${exec_prefix}`" eval bindir="`eval echo ${bindir}`" eval sbindir="`eval echo ${sbindir}`" eval libexecdir="`eval echo ${libexecdir}`" eval datadir="`eval echo ${datadir}`" eval sysconfdir="`eval echo ${sysconfdir}`" eval sharedstatedir="`eval echo ${sharedstatedir}`" eval localstatedir="`eval echo ${localstatedir}`" eval libdir="`eval echo ${libdir}`" eval includedir="`eval echo ${includedir}`" eval oldincludedir="`eval echo ${oldincludedir}`" eval infodir="`eval echo ${infodir}`" eval mandir="`eval echo ${mandir}`" +AC_SUBST(LIBADD_DL) dnl extra flags for dynamic linking libraries + if test x"${CONFIGDIR}" = x""; then CONFIGDIR="${sysconfdir}/sysconfig" fi AC_SUBST(CONFIGDIR) dnl The Makefiles and shell scripts we output AC_CONFIG_FILES([Makefile src/Makefile agent/Makefile man/Makefile agent/sbd src/sbd.service src/sbd_remote.service src/sbd.sh]) +AC_CONFIG_SUBDIRS([tests]) + dnl Now process the entire list of files added by previous dnl calls to AC_CONFIG_FILES() AC_OUTPUT() diff --git a/man/sbd.8.pod b/man/sbd.8.pod index 8e4a676..377c579 100644 --- a/man/sbd.8.pod +++ b/man/sbd.8.pod @@ -1,668 +1,668 @@ =head1 NAME sbd - STONITH Block Device daemon =head1 SYNOPSIS sbd <-d F> [options] C =head1 SUMMARY SBD provides a node fencing mechanism (Shoot the other node in the head, STONITH) for Pacemaker-based clusters through the exchange of messages via shared block storage such as for example a SAN, iSCSI, FCoE. This isolates the fencing mechanism from changes in firmware version or dependencies on specific firmware controllers, and it can be used as a STONITH mechanism in all configurations that have reliable shared storage. SBD can also be used without any shared storage. In this mode, the watchdog device will be used to reset the node if it loses quorum, if any monitored daemon is lost and not recovered or if Pacemaker decides that the node requires fencing. 
The F binary implements both the daemon that watches the message slots as well as the management tool for interacting with the block storage device(s). This mode of operation is specified via the C parameter; some of these modes take additional parameters. To use SBD with shared storage, you must first C the messaging layout on one to three block devices. Second, configure F to list those devices (and possibly adjust other options), and restart the cluster stack on each node to ensure that C is started. Third, configure the C fencing resource in the Pacemaker CIB. Each of these steps is documented in more detail below the description of the command options. C can only be used as root. =head2 GENERAL OPTIONS =over =item B<-d> F Specify the block device(s) to be used. If you have more than one, specify this option up to three times. This parameter is mandatory for all modes, since SBD always needs a block device to interact with. This man page uses F, F, and F as example device names for brevity. However, in your production environment, you should instead always refer to them by using the long, stable device name (e.g., F). =item B<-v|-vv|-vvv> Enable verbose|debug|debug-library logging (optional) =item B<-h> Display a concise summary of C options. =item B<-n> I Set local node name; defaults to C. This should not need to be set. =item B<-R> Do B<not> enable realtime priority. By default, C<sbd> runs at realtime priority, locks itself into memory, and also acquires highest IO priority to protect itself against interference from other processes on the system. This is a debugging-only option. =item B<-I> I Async IO timeout (defaults to 3 seconds, optional). You should not need to adjust this unless your IO setup is really very slow. (In daemon mode, the watchdog is refreshed when the majority of devices could be read within this time.)
=back =head2 create Example usage: sbd -d /dev/sdc2 -d /dev/sdd3 create If you specify the I command, sbd will write a metadata header to the device(s) specified and also initialize the messaging slots for up to 255 nodes. B: This command will not prompt for confirmation. Roughly the first megabyte of the specified block device(s) will be overwritten immediately and without backup. This command accepts a few options to adjust the default timings that are written to the metadata (to ensure they are identical across all nodes accessing the device). =over =item B<-1> I Set watchdog timeout to N seconds. This depends mostly on your storage latency; the majority of devices must be successfully read within this time, or else the node will self-fence. If your sbd device(s) reside on a multipath setup or iSCSI, this should be the time required to detect a path failure. You may be able to reduce this if your device outages are independent, or if you are using the Pacemaker integration. =item B<-2> I Set slot allocation timeout to N seconds. You should not need to tune this. =item B<-3> I Set daemon loop timeout to N seconds. You should not need to tune this. =item B<-4> I Set I timeout to N seconds. This should be twice the I timeout. This is the time after which a message written to a node's slot will be considered delivered. (Or long enough for the node to detect that it needed to self-fence.) This also affects the I in Pacemaker's CIB; see below. =back =head2 list Example usage: # sbd -d /dev/sda1 list 0 hex-0 clear 1 hex-7 clear 2 hex-9 clear List all allocated slots on device, and messages. You should see all cluster nodes that have ever been started against this device. Nodes that are currently running should have a I state; nodes that have been fenced, but not yet restarted, will show the appropriate fencing message. 
=head2 dump Example usage: # sbd -d /dev/sda1 dump ==Dumping header on disk /dev/sda1 Header version : 2 Number of slots : 255 Sector size : 512 Timeout (watchdog) : 15 Timeout (allocate) : 2 Timeout (loop) : 1 Timeout (msgwait) : 30 ==Header on disk /dev/sda1 is dumped Dump meta-data header from device. =head2 watch Example usage: sbd -d /dev/sdc2 -d /dev/sdd3 -P watch This command will make C start in daemon mode. It will constantly monitor the message slot of the local node for incoming messages, reachability, and optionally take Pacemaker's state into account. C<sbd> B<must> be started on boot before the cluster stack! See below for enabling this according to your boot environment. The options for this mode are rarely specified directly on the command line, but most frequently set via F. It also constantly monitors connectivity to the storage device, and self-fences in case the partition becomes unreachable, guaranteeing that it does not disconnect from fencing messages. A node slot is automatically allocated on the device(s) the first time the daemon starts watching the device; hence, manual allocation is not usually required. If a watchdog is used together with the C as is strongly recommended, the watchdog is activated at initial start of the sbd daemon. The watchdog is refreshed every time the majority of SBD devices has been successfully read. Using a watchdog provides additional protection against C crashing. If the Pacemaker integration is activated, C<sbd> will B<not> self-fence if device majority is lost, if: =over =item 1. The partition the node is in is still quorate according to the CIB; =item 2. it is still quorate according to Corosync's node count; =item 3. the node itself is considered online and healthy by Pacemaker. =back This allows C to survive temporary outages of the majority of devices.
However, while the cluster is in such a degraded state, it can neither successfully fence nor be shutdown cleanly (as taking the cluster below the quorum threshold will immediately cause all remaining nodes to self-fence). In short, it will not tolerate any further faults. Please repair the system before continuing. There is one C process that acts as a master to which all watchers report; one per device to monitor the node's slot; and, optionally, one that handles the Pacemaker integration. =over =item B<-W> Enable or disable use of the system watchdog to protect against the sbd processes failing and the node being left in an undefined state. Specify this once to enable, twice to disable. Defaults to I. =item B<-w> F This can be used to override the default watchdog device used and should not usually be necessary. =item B<-p> F This option can be used to specify a pidfile for the main sbd process. =item B<-F> I Number of failures before a failing servant process will not be restarted immediately until the dampening delay has expired. If set to zero, servants will be restarted immediately and indefinitely. If set to one, a failed servant will be restarted once every B<-t> seconds. If set to a different value, the servant will be restarted that many times within the dampening period and then delay. Defaults to I<1>. =item B<-t> I Dampening delay before faulty servants are restarted. Combined with C<-F 1>, the most logical way to tune the restart frequency of servant processes. Default is 5 seconds. If set to zero, processes will be restarted indefinitely and immediately. =item B<-P> Enable Pacemaker integration which checks Pacemaker quorum and node health. Specify this once to enable, twice to disable. Defaults to I. =item B<-S> I Set the start mode. (Defaults to I<0>.) If this is set to zero, sbd will always start up unconditionally, regardless of whether the node was previously fenced or not. 
If set to one, sbd will only start if the node was previously shutdown cleanly (as indicated by an exit request message in the slot), or if the slot is empty. A reset, crashdump, or power-off request in any slot will halt the start up. This is useful to prevent nodes from rejoining if they were faulty. The node must be manually "unfenced" by sending an empty message to it: sbd -d /dev/sda1 message node1 clear =item B<-s> I Set the start-up wait time for devices. (Defaults to I<120>.) Dynamic block devices such as iSCSI might not be fully initialized and present yet. This allows one to set a timeout for waiting for devices to appear on start-up. If set to 0, start-up will be aborted immediately if no devices are available. =item B<-Z> Enable trace mode. B Specifying this once will turn all reboots or power-offs, be they caused by self-fence decisions or messages, into a crashdump. Specifying this twice will just log them but not continue running. =item B<-T> By default, the daemon will set the watchdog timeout as specified in the device metadata. However, this does not work for every watchdog device. In this case, you must manually ensure that the watchdog timeout used by the system correctly matches the SBD settings, and then specify this option to allow C to continue with start-up. =item B<-5> I Warn if the time interval for tickling the watchdog exceeds this many seconds. Since the node is unable to log the watchdog expiry (it reboots immediately without a chance to write its logs to disk), this is very useful for getting an indication that the watchdog timeout is too short for the IO load of the system. Default is 3 seconds, set to zero to disable. =item B<-C> I Watchdog timeout to set before crashdumping. If SBD is set to crashdump instead of reboot - either via the trace mode settings or the I fencing agent's parameter -, SBD will adjust the watchdog timeout to this setting before triggering the dump. 
Otherwise, the watchdog might trigger and prevent a successful crashdump from ever being written. -Defaults to 240 seconds. Set to zero to disable. +Set to zero (= default) to disable. =item B<-r> I Actions to be executed when the watchers don't timely report to the sbd master process or one of the watchers detects that the master process has died. Set timeout-action to comma-separated combination of noflush|flush plus reboot|crashdump|off. If only one of the two is given, the other stays at the default. This doesn't affect actions like off, crashdump, reboot explicitly triggered via message slots. Nor does it configure the action a watchdog would trigger should it expire (there is no generic interface). Defaults to flush,reboot. =back =head2 allocate Example usage: sbd -d /dev/sda1 allocate node1 Explicitly allocates a slot for the specified node name. This should rarely be necessary, as every node will automatically allocate itself a slot the first time it starts up on watch mode. =head2 message Example usage: sbd -d /dev/sda1 message node1 test Writes the specified message to node's slot. This is rarely done directly, but rather abstracted via the C fencing agent configured as a cluster resource. Supported message types are: =over =item test This only generates a log message on the receiving node and can be used to check if SBD is seeing the device. Note that this could overwrite a fencing request sent by the cluster, so should not be used during production. =item reset Reset the target upon receipt of this message. =item off Power-off the target. =item crashdump Cause the target node to crashdump. =item exit This will make the C<sbd> daemon exit cleanly on the target. You should B<not> send this message manually; this is handled properly during shutdown of the cluster stack. Manually stopping the daemon means the node is unprotected! =item clear This message indicates that no real message has been sent to the node.
You should not set this manually; C will clear the message slot automatically during start-up, and setting this manually could overwrite a fencing message by the cluster. =back =head2 query-watchdog Example usage: sbd query-watchdog Check for available watchdog devices and print some info. B: This command will arm the watchdog during query, and if your watchdog refuses disarming (for example, if its kernel module has the 'nowayout' parameter set) this will reset your system. =head2 test-watchdog Example usage: sbd test-watchdog [-w /dev/watchdog3] Test specified watchdog device (/dev/watchdog by default). B: This command will arm the watchdog and have your system reset in case your watchdog is working properly! If issued from an interactive session, it will prompt for confirmation. =head1 Base system configuration =head2 Configure a watchdog It is highly recommended that you configure your Linux system to load a watchdog driver with hardware assistance (as is available on most modern systems), such as I, I, or others. As a fall-back, you can use the I module. No other software must access the watchdog timer; it can only be accessed by one process at any given time. Some hardware vendors ship systems management software that use the watchdog for system resets (f.e. HP ASR daemon). Such software has to be disabled if the watchdog is to be used by SBD. =head2 Choosing and initializing the block device(s) First, you have to decide if you want to use one, two, or three devices. If you are using multiple ones, they should reside on independent storage setups. Putting all three of them on the same logical unit for example would not provide any additional redundancy. The SBD device can be connected via Fibre Channel, Fibre Channel over Ethernet, or even iSCSI. Thus, an iSCSI target can become a sort-of network-based quorum server; the advantage is that it does not require a smart host at your third location, just block storage. 
The SBD partitions themselves B<must not> be mirrored (via MD, DRBD, or the storage layer itself), since this could result in a split-mirror scenario. Nor can they reside on cLVM2 volume groups, since they must be accessed by the cluster stack before it has started the cLVM2 daemons; hence, these should be either raw partitions or logical units on (multipath) storage. The block device(s) must be accessible from all nodes. (While it is not necessary that they share the same path name on all nodes, this is considered a very good idea.) SBD will only use about one megabyte per device, so you can easily create a small partition, or very small logical units. (The size of the SBD device depends on the block size of the underlying device. Thus, 1MB is fine on plain SCSI devices and SAN storage with 512 byte blocks. On the IBM s390x architecture in particular, disks default to 4k blocks, and thus require roughly 4MB.) The number of devices will affect the operation of SBD as follows: =over =item One device In its most simple implementation, you use one device only. This is appropriate for clusters where all your data is on the same shared storage (with internal redundancy) anyway; the SBD device does not introduce an additional single point of failure then. If the SBD device is not accessible, the daemon will fail to start and inhibit startup of cluster services. =item Two devices This configuration is a trade-off, primarily aimed at environments where host-based mirroring is used, but no third storage device is available. SBD will not commit suicide if it loses access to one mirror leg; this allows the cluster to continue to function even in the face of one outage. However, SBD will not fence the other side while only one mirror leg is available, since it does not have enough knowledge to detect an asymmetric split of the storage. So it will not be able to automatically tolerate a second failure while one of the storage arrays is down.
(Though you can use the appropriate crm command to acknowledge the fence manually.) It will not start unless both devices are accessible on boot. =item Three devices In this most reliable and recommended configuration, SBD will only self-fence if more than one device is lost; hence, this configuration is resilient against temporary single device outages (be it due to failures or maintenance). Fencing messages can still be successfully relayed if at least two devices remain accessible. This configuration is appropriate for more complex scenarios where storage is not confined to a single array. For example, host-based mirroring solutions could have one SBD per mirror leg (not mirrored itself), and an additional tie-breaker on iSCSI. It will only start if at least two devices are accessible on boot. =back After you have chosen the devices and created the appropriate partitions and perhaps multipath alias names to ease management, use the C command described above to initialize the SBD metadata on them. =head3 Sharing the block device(s) between multiple clusters It is possible to share the block devices between multiple clusters, provided the total number of nodes accessing them does not exceed I<255> nodes, and they all must share the same SBD timeouts (since these are part of the metadata). If you are using multiple devices this can reduce the setup overhead required. However, you should B<not> share devices between clusters in different security domains. =head2 Configure SBD to start on boot On systems using C, the C or C system start-up scripts must handle starting or stopping C as required before starting the rest of the cluster stack. For C, sbd simply has to be enabled using systemctl enable sbd.service The daemon is brought online on each node before corosync and Pacemaker are started, and terminated only after all other cluster components have been shut down - ensuring that cluster resources are never activated without SBD supervision.
=head2 Configuration via sysconfig The system instance of C<sbd> is configured via F</etc/sysconfig/sbd>. In this file, you must specify the device(s) used, as well as any options to pass to the daemon: SBD_DEVICE="/dev/sda1;/dev/sdb1;/dev/sdc1" SBD_PACEMAKER="true" C<sbd> will fail to start if no C<SBD_DEVICE> is specified. See the installed template for more options that can be configured here. In general configuration done via parameters takes precedence over the configuration from the configuration file. =head2 Testing the sbd installation After a restart of the cluster stack on this node, you can now try sending a test message to it as root, from this or any other node: sbd -d /dev/sda1 message node1 test The node will acknowledge the receipt of the message in the system logs: Aug 29 14:10:00 node1 sbd: [13412]: info: Received command test from node2 This confirms that SBD is indeed up and running on the node, and that it is ready to receive messages. Make B<sure> that F</etc/sysconfig/sbd> is identical on all cluster nodes, and that all cluster nodes are running the daemon. =head1 Pacemaker CIB integration =head2 Fencing resource Pacemaker can only interact with SBD to issue a node fence if there is a configured fencing resource. This should be a primitive, not a clone, as follows: primitive fencing-sbd stonith:external/sbd \ params pcmk_delay_max=30 This will automatically use the same devices as configured in F</etc/sysconfig/sbd>. While you should not configure this as a clone (as Pacemaker will register the fencing device on each node automatically), the I<pcmk_delay_max> setting enables random fencing delay which ensures, in a scenario where a split-brain scenario did occur in a two node cluster, that one of the nodes has a better chance to survive to avoid double fencing. SBD also supports turning the reset request into a crash request, which may be helpful for debugging if you have kernel crashdumping configured; then, every fence request will cause the node to dump core. You can enable this via the C<crashdump="true"> parameter on the fencing resource.
This is B<not> recommended for production use, but only for debugging phases. =head2 General cluster properties You must also enable STONITH in general, and set the STONITH timeout to be at least twice the I<msgwait> timeout you have configured, to allow enough time for the fencing message to be delivered. If your I<msgwait> timeout is 60 seconds, this is a possible configuration: property stonith-enabled="true" property stonith-timeout="120s" B<Caution>: if I<stonith-timeout> is too low for I<msgwait> and the system overhead, sbd will never be able to successfully complete a fence request. This will create a fencing loop. Note that the sbd fencing agent will try to detect this and automatically extend the I<stonith-timeout> setting to a reasonable value, on the assumption that sbd modifying your configuration is preferable to not fencing. =head1 Management tasks =head2 Recovering from temporary SBD device outage If you have multiple devices, failure of a single device is not immediately fatal. C<sbd> will retry to restart the monitor for the device every 5 seconds by default. However, you can tune this via the options to the I<watch> command. In case you wish to immediately force a restart of all currently disabled monitor processes, you can send a I<SIGUSR1> to the SBD I<inquisitor> process. =head1 LICENSE Copyright (C) 2008-2013 Lars Marowsky-Bree This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This software is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. For details see the GNU General Public License at http://www.gnu.org/licenses/gpl-2.0.html (version 2) and/or http://www.gnu.org/licenses/gpl.html (the newest as per "any later").
diff --git a/sbd.spec b/sbd.spec index 245c2ef..372878f 100644 --- a/sbd.spec +++ b/sbd.spec @@ -1,161 +1,186 @@ # # spec file for package sbd # # Copyright (c) 2014 SUSE LINUX Products GmbH, Nuernberg, Germany. # Copyright (c) 2013 Lars Marowsky-Bree # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed # upon. The license for this file, and modifications and additions to the # file, is the same license as for the pristine package itself (unless the # license for the pristine package is not an Open Source License, in which # case the license is the MIT License). An "Open Source License" is a # license that conforms to the Open Source Definition (Version 1.9) # published by the Open Source Initiative. # Please submit bugfixes or comments via http://bugs.opensuse.org/ # %global commit 5705703da3db01bb4c34fd73ae33f24b43a16b78-mod %global shortcommit %(echo %{commit}|cut -c1-8) %global modified %(echo %{commit}-|cut -f2 -d-) %global github_owner beekhof %global buildnum 1 Name: sbd Summary: Storage-based death License: GPLv2+ Group: System Environment/Daemons Version: 1.4.0 Release: 99.%{buildnum}.%{shortcommit}.%{modified}git%{?dist} Url: https://github.com/%{github_owner}/%{name} Source0: https://github.com/%{github_owner}/%{name}/archive/%{commit}/%{name}-%{commit}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-build BuildRequires: autoconf BuildRequires: automake BuildRequires: libuuid-devel BuildRequires: glib2-devel BuildRequires: libaio-devel -BuildRequires: corosynclib-devel +BuildRequires: corosync-devel %if 0%{?suse_version} BuildRequires: libpacemaker-devel %else BuildRequires: pacemaker-libs-devel %endif BuildRequires: libtool BuildRequires: libuuid-devel BuildRequires: libxml2-devel BuildRequires: pkgconfig BuildRequires: make Conflicts: fence-agents-sbd < 4.5.0 %if 0%{?rhel} > 0 ExclusiveArch: i686 x86_64 s390x aarch64 ppc64le %endif %if %{defined 
systemd_requires} %systemd_requires %endif %description This package contains the storage-based death functionality. +%package tests +Summary: Storage-based death environment for regression tests +License: GPLv2+ +Group: System Environment/Daemons + +%description tests +This package provides an environment + testscripts for +regression-testing sbd. + %prep ########################################################### # %setup -n sbd-%{version} -q %setup -q -n %{name}-%{commit} +%ifarch s390x s390 +sed -i src/sbd.sysconfig -e "s/Default: 5/Default: 15/" +sed -i src/sbd.sysconfig -e "s/SBD_WATCHDOG_TIMEOUT=5/SBD_WATCHDOG_TIMEOUT=15/" +%endif ########################################################### %build -autoreconf -i +./autogen.sh export CFLAGS="$RPM_OPT_FLAGS -Wall -Werror" %configure make %{?_smp_mflags} ########################################################### %install ########################################################### make DESTDIR=$RPM_BUILD_ROOT LIBDIR=%{_libdir} install rm -rf ${RPM_BUILD_ROOT}%{_libdir}/stonith install -D -m 0755 src/sbd.sh $RPM_BUILD_ROOT/usr/share/sbd/sbd.sh +install -D -m 0755 tests/regressions.sh $RPM_BUILD_ROOT/usr/share/sbd/regressions.sh %if %{defined _unitdir} install -D -m 0644 src/sbd.service $RPM_BUILD_ROOT/%{_unitdir}/sbd.service install -D -m 0644 src/sbd_remote.service $RPM_BUILD_ROOT/%{_unitdir}/sbd_remote.service %endif mkdir -p ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig install -m 644 src/sbd.sysconfig ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig/sbd +# Don't package static libs +find %{buildroot} -name '*.a' -type f -print0 | xargs -0 rm -f +find %{buildroot} -name '*.la' -type f -print0 | xargs -0 rm -f + %clean rm -rf %{buildroot} %if %{defined _unitdir} %post %systemd_post sbd.service %systemd_post sbd_remote.service %preun %systemd_preun sbd.service %systemd_preun sbd_remote.service %postun %systemd_postun sbd.service %systemd_postun sbd_remote.service %endif %files 
########################################################### %defattr(-,root,root) %config(noreplace) %{_sysconfdir}/sysconfig/sbd %{_sbindir}/sbd %{_datadir}/sbd +%exclude %{_datadir}/sbd/regressions.sh %doc %{_mandir}/man8/sbd* %if %{defined _unitdir} %{_unitdir}/sbd.service %{_unitdir}/sbd_remote.service %endif %doc COPYING +%files tests +%defattr(-,root,root) +%dir %{_datadir}/sbd +%{_datadir}/sbd/regressions.sh +%{_libdir}/libsbdtestbed* + %changelog * Mon Jan 14 2019 - 1.4.0-0.1.2d595fdd.git - updated travis-CI (ppc64le-build, fedora29, remove need for alectolytic-build-container) - make watchdog-device-query easier to be handled by an SELinux-policy - configurable delay value for SBD_DELAY_START - use pacemaker's new pe api with constructors/destructors - make timeout-action executed by sbd configurable - init script for sysv systems - version bump to v1.4.0 to denote Pacemaker 2.0.0 compatibility * Fri Jun 29 2018 - 1.3.1-0.1.e102d9ed.git - removed unneeded python-devel build-requirement - changed legacy corosync-devel to corosynclib-devel * Fri Nov 3 2017 - 1.3.1-0.1.a180176c.git - Add commands to test/query watchdogs - Allow 2-node-operation with a single shared-disk - Overhaul of the command-line options & config-file - Proper handling of off instead of reboot - Refactored disk-servant for more robust communication with parent - Fix config for Debian + configurable location of config - Fixes in sbd.sh - multiple SBD devices and others * Sun Mar 27 2016 - 1.3.0-0.1.4ee36fa3.git - Changes since v1.2.0 like adding the possibility to have a watchdog-only setup without shared-block-devices legitimate a bump to v1.3.0. 
* Mon Oct 13 2014 - 1.2.1-0.4.3de531ed.git - Fixes for suitability to the el7 environment * Tue Sep 30 2014 - 1.2.1-0.3.8f912945.git - Only build on archs supported by the HA Add-on * Fri Aug 29 2014 - 1.2.1-0.2.8f912945.git - Remove some additional SUSE-isms * Fri Aug 29 2014 - 1.2.1-0.1.8f912945.git - Prepare for package review Resolves: rhbz#1134245 diff --git a/src/Makefile.am b/src/Makefile.am index ec51dde..db10c71 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,15 +1,13 @@ AM_CFLAGS = -D_GNU_SOURCE -DCHECK_AIS -DSBINDIR=\"$(sbindir)\" AM_CPPFLAGS = -I$(includedir)/pacemaker \ -I$(includedir)/heartbeat \ $(glib_CFLAGS) sbin_PROGRAMS = sbd sbd_SOURCES = sbd-common.c sbd-inquisitor.c sbd-pacemaker.c sbd-cluster.c setproctitle.c sbd.h sbd.sysconfig if SUPPORT_SHARED_DISK sbd_SOURCES += sbd-md.c endif -sbd_LDADD = $(glib_LIBS) $(libcoroipcc_LIBS) - diff --git a/src/sbd-common.c b/src/sbd-common.c index 6c6ac2e..9ec43b2 100644 --- a/src/sbd-common.c +++ b/src/sbd-common.c @@ -1,1185 +1,1193 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
*/ #include "sbd.h" #include #include #ifdef __GLIBC__ #include #endif #include #include #include #include #ifdef _POSIX_MEMLOCK # include #endif /* Tunable defaults: */ #if defined(__s390__) || defined(__s390x__) unsigned long timeout_watchdog = 15; int timeout_msgwait = 30; #else unsigned long timeout_watchdog = 5; int timeout_msgwait = 10; #endif unsigned long timeout_watchdog_warn = 3; int timeout_allocate = 2; int timeout_loop = 1; int timeout_io = 3; int timeout_startup = 120; int watchdog_use = 1; int watchdog_set_timeout = 1; -unsigned long timeout_watchdog_crashdump = 240; +unsigned long timeout_watchdog_crashdump = 0; int skip_rt = 0; int debug = 0; int debug_mode = 0; char *watchdogdev = NULL; bool watchdogdev_is_default = false; char * local_uname; /* Global, non-tunable variables: */ int sector_size = 0; int watchdogfd = -1; int servant_health = 0; /*const char *devname;*/ const char *cmdname; void usage(void) { fprintf(stderr, "Shared storage fencing tool.\n" "Syntax:\n" " %s \n" "Options:\n" "-d Block device to use (mandatory; can be specified up to 3 times)\n" "-h Display this help.\n" "-n Set local node name; defaults to uname -n (optional)\n" "\n" "-R Do NOT enable realtime priority (debugging only)\n" "-W Use watchdog (recommended) (watch only)\n" "-w Specify watchdog device (optional) (watch only)\n" "-T Do NOT initialize the watchdog timeout (watch only)\n" "-S <0|1> Set start mode if the node was previously fenced (watch only)\n" "-p Write pidfile to the specified path (watch only)\n" "-v|-vv|-vvv Enable verbose|debug|debug-library logging (optional)\n" "\n" "-1 Set watchdog timeout to N seconds (optional, create only)\n" "-2 Set slot allocation timeout to N seconds (optional, create only)\n" "-3 Set daemon loop timeout to N seconds (optional, create only)\n" "-4 Set msgwait timeout to N seconds (optional, create only)\n" "-5 Warn if loop latency exceeds threshold (optional, watch only)\n" " (default is 3, set to 0 to disable)\n" -"-C Watchdog 
timeout to set before crashdumping (def: 240s, optional)\n" +"-C Watchdog timeout to set before crashdumping\n" +" (def: 0s = disable gracefully, optional)\n" "-I Async IO read timeout (defaults to 3 * loop timeout, optional)\n" "-s Timeout to wait for devices to become available (def: 120s)\n" "-t Dampening delay before faulty servants are restarted (optional)\n" " (default is 5, set to 0 to disable)\n" "-F # of failures before a servant is considered faulty (optional)\n" " (default is 1, set to 0 to disable)\n" "-P Check Pacemaker quorum and node health (optional, watch only)\n" "-Z Enable trace mode. WARNING: UNSAFE FOR PRODUCTION!\n" "-r Set timeout-action to comma-separated combination of\n" " noflush|flush plus reboot|crashdump|off (default is flush,reboot)\n" "Commands:\n" #if SUPPORT_SHARED_DISK "create initialize N slots on - OVERWRITES DEVICE!\n" "list List all allocated slots on device, and messages.\n" "dump Dump meta-data header from device.\n" "allocate \n" " Allocate a slot for node (optional)\n" -"message (test|reset|off|clear|exit)\n" +"message (test|reset|off|crashdump|clear|exit)\n" " Writes the specified message to node's slot.\n" #endif "watch Loop forever, monitoring own slot\n" "query-watchdog Check for available watchdog-devices and print some info\n" "test-watchdog Test the watchdog-device selected.\n" " Attention: This will arm the watchdog and have your system reset\n" " in case your watchdog is working properly!\n" , cmdname); } static int watchdog_init_interval_fd(int wdfd, int timeout) { if (ioctl(wdfd, WDIOC_SETTIMEOUT, &timeout) < 0) { cl_perror( "WDIOC_SETTIMEOUT" ": Failed to set watchdog timer to %u seconds.", timeout); cl_log(LOG_CRIT, "Please validate your watchdog configuration!"); cl_log(LOG_CRIT, "Choose a different watchdog driver or specify -T to skip this if you are completely sure."); return -1; } return 0; } int watchdog_init_interval(void) { if (watchdogfd < 0) { return 0; } if (watchdog_set_timeout == 0) { 
cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!"); return 0; } if (watchdog_init_interval_fd(watchdogfd, timeout_watchdog) < 0) { return -1; } cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", (int) timeout_watchdog); return 0; } static int watchdog_tickle_fd(int wdfd, char *wddev) { if (write(wdfd, "", 1) != 1) { cl_perror("Watchdog write failure: %s!", wddev); return -1; } return 0; } int watchdog_tickle(void) { if (watchdogfd >= 0) { return watchdog_tickle_fd(watchdogfd, watchdogdev); } return 0; } static int watchdog_init_fd(char *wddev, int timeout) { int wdfd; wdfd = open(wddev, O_WRONLY); if (wdfd >= 0) { if (((timeout >= 0) && (watchdog_init_interval_fd(wdfd, timeout) < 0)) || (watchdog_tickle_fd(wdfd, wddev) < 0)) { close(wdfd); return -1; } } else { cl_perror("Cannot open watchdog device '%s'", wddev); return -1; } return wdfd; } int watchdog_init(void) { if (watchdogfd < 0 && watchdogdev != NULL) { int timeout = timeout_watchdog; if (watchdog_set_timeout == 0) { cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!"); timeout = -1; } watchdogfd = watchdog_init_fd(watchdogdev, timeout); if (watchdogfd >= 0) { cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev); if (watchdog_set_timeout) { cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", (int) timeout_watchdog); } } else { return -1; } } return 0; } static void watchdog_close_fd(int wdfd, char *wddev, bool disarm) { if (disarm) { int r; int flags = WDIOS_DISABLECARD;; /* Explicitly disarm it */ r = ioctl(wdfd, WDIOC_SETOPTIONS, &flags); if (r < 0) { cl_perror("Failed to disable hardware watchdog %s", wddev); } /* To be sure, use magic close logic, too */ for (;;) { if (write(wdfd, "V", 1) > 0) { break; } cl_perror("Cannot disable watchdog device %s", wddev); } } if (close(wdfd) < 0) { cl_perror("Watchdog close(%d) failed", wdfd); } } void watchdog_close(bool disarm) { if (watchdogfd < 0) { return; } watchdog_close_fd(watchdogfd, 
watchdogdev, disarm); watchdogfd = -1; } #define MAX_WATCHDOGS 64 #define SYS_CLASS_WATCHDOG "/sys/class/watchdog" #define SYS_CHAR_DEV_DIR "/sys/dev/char" #define WATCHDOG_NODEDIR "/dev/" #define WATCHDOG_NODEDIR_LEN 5 struct watchdog_list_item { dev_t dev; char *dev_node; char *dev_ident; char *dev_driver; struct watchdog_list_item *next; }; struct link_list_item { char *dev_node; char *link_name; struct link_list_item *next; }; static struct watchdog_list_item *watchdog_list = NULL; static int watchdog_list_items = 0; static void watchdog_populate_list(void) { dev_t watchdogs[MAX_WATCHDOGS + 1] = {makedev(10,130), 0}; int num_watchdogs = 1; struct dirent *entry; char entry_name[280]; DIR *dp; char buf[280] = ""; struct link_list_item *link_list = NULL; if (watchdog_list != NULL) { return; } /* get additional devices from /sys/class/watchdog */ dp = opendir(SYS_CLASS_WATCHDOG); if (dp) { while ((entry = readdir(dp))) { if (entry->d_type == DT_LNK) { FILE *file; snprintf(entry_name, sizeof(entry_name), SYS_CLASS_WATCHDOG "/%s/dev", entry->d_name); file = fopen(entry_name, "r"); if (file) { int major, minor; if (fscanf(file, "%d:%d", &major, &minor) == 2) { watchdogs[num_watchdogs++] = makedev(major, minor); } fclose(file); if (num_watchdogs == MAX_WATCHDOGS) { break; } } } } closedir(dp); } /* search for watchdog nodes in /dev */ dp = opendir(WATCHDOG_NODEDIR); if (dp) { /* first go for links and memorize them */ while ((entry = readdir(dp))) { if (entry->d_type == DT_LNK) { int len; snprintf(entry_name, sizeof(entry_name), WATCHDOG_NODEDIR "%s", entry->d_name); /* !realpath(entry_name, buf) unfortunately does a stat on * target so we can't really use it to check if links stay * within /dev without triggering e.g. AVC-logs (with * SELinux policy that just allows stat within /dev). * Without canonicalization that doesn't actually touch the * filesystem easily available introduce some limitations * for simplicity: * - just simple path without '..' 
* - just one level of symlinks (avoid e.g. loop-checking) */ len = readlink(entry_name, buf, sizeof(buf) - 1); if ((len < 1) || (len > sizeof(buf) - WATCHDOG_NODEDIR_LEN - 1)) { continue; } buf[len] = '\0'; if (buf[0] != '/') { memmove(&buf[WATCHDOG_NODEDIR_LEN], buf, len+1); memcpy(buf, WATCHDOG_NODEDIR, WATCHDOG_NODEDIR_LEN); len += WATCHDOG_NODEDIR_LEN; } if (strstr(buf, "/../") || strncmp(WATCHDOG_NODEDIR, buf, WATCHDOG_NODEDIR_LEN)) { continue; } else { /* just memorize to avoid statting the target - SELinux */ struct link_list_item *lli = calloc(1, sizeof(struct link_list_item)); lli->dev_node = strdup(buf); lli->link_name = strdup(entry_name); lli->next = link_list; link_list = lli; } } } rewinddir(dp); while ((entry = readdir(dp))) { if (entry->d_type == DT_CHR) { struct stat statbuf; snprintf(entry_name, sizeof(entry_name), WATCHDOG_NODEDIR "%s", entry->d_name); if(!stat(entry_name, &statbuf) && S_ISCHR(statbuf.st_mode)) { int i; for (i=0; idev = watchdogs[i]; wdg->dev_node = strdup(entry_name); wdg->next = watchdog_list; watchdog_list = wdg; watchdog_list_items++; if (wdfd >= 0) { struct watchdog_info ident; ident.identity[0] = '\0'; ioctl(wdfd, WDIOC_GETSUPPORT, &ident); watchdog_close_fd(wdfd, entry_name, true); if (ident.identity[0]) { wdg->dev_ident = strdup((char *) ident.identity); } } snprintf(entry_name, sizeof(entry_name), SYS_CHAR_DEV_DIR "/%d:%d/device/driver", major(watchdogs[i]), minor(watchdogs[i])); len = readlink(entry_name, buf, sizeof(buf) - 1); if (len > 0) { buf[len] = '\0'; wdg->dev_driver = strdup(basename(buf)); } else if ((wdg->dev_ident) && (strcmp(wdg->dev_ident, "Software Watchdog") == 0)) { wdg->dev_driver = strdup("softdog"); } /* create dupes if we have memorized links * to this node */ for (tmp_list = link_list; tmp_list; tmp_list = tmp_list->next) { if (!strcmp(tmp_list->dev_node, wdg->dev_node)) { struct watchdog_list_item *dupe_wdg = calloc(1, sizeof(struct watchdog_list_item)); /* as long as we never purge watchdog_list 
* there is no need to dupe strings */ *dupe_wdg = *wdg; dupe_wdg->dev_node = strdup(tmp_list->link_name); dupe_wdg->next = watchdog_list; watchdog_list = dupe_wdg; watchdog_list_items++; } /* for performance reasons we could remove * the link_list entry */ } break; } } } } } closedir(dp); } /* cleanup link list */ while (link_list) { struct link_list_item *tmp_list = link_list; link_list = link_list->next; free(tmp_list->dev_node); free(tmp_list->link_name); free(tmp_list); } } int watchdog_info(void) { struct watchdog_list_item *wdg; int wdg_cnt = 0; watchdog_populate_list(); printf("\nDiscovered %d watchdog devices:\n", watchdog_list_items); for (wdg = watchdog_list; wdg != NULL; wdg = wdg->next) { wdg_cnt++; printf("\n[%d] %s\nIdentity: %s\nDriver: %s\n", wdg_cnt, wdg->dev_node, wdg->dev_ident?wdg->dev_ident:"Error: Check if hogged by e.g. sbd-daemon!", wdg->dev_driver?wdg->dev_driver:""); if ((wdg->dev_driver) && (strcmp(wdg->dev_driver, "softdog") == 0)) { printf("CAUTION: Not recommended for use with sbd.\n"); } } return 0; } int watchdog_test(void) { int i; if ((watchdog_set_timeout == 0) || !watchdog_use) { printf("\nWatchdog is disabled - aborting test!!!\n"); return 0; } if (watchdogdev_is_default) { watchdog_populate_list(); if (watchdog_list_items > 1) { printf("\nError: Multiple watchdog devices discovered.\n" " Use -w or SBD_WATCHDOG_DEV to specify\n" " which device to reset the system with\n"); watchdog_info(); return -1; } } if ((isatty(fileno(stdin)))) { char buffer[16]; printf("\nWARNING: This operation is expected to force-reboot this system\n" " without following any shutdown procedures.\n\n" "Proceed? 
[NO/Proceed] "); if ((fgets(buffer, 16, stdin) == NULL) || strcmp(buffer, "Proceed\n")) { printf("\nAborting watchdog test!!!\n"); return 0; } printf("\n"); } printf("Initializing %s with a reset countdown of %d seconds ...\n", watchdogdev, (int) timeout_watchdog); if ((watchdog_init() < 0) || (watchdog_init_interval() < 0)) { printf("Failed to initialize watchdog!!!\n"); return -1; } printf("\n"); printf("NOTICE: The watchdog device is expected to reset the system\n" " in %d seconds. If system remains active beyond that time,\n" " watchdog may not be functional.\n\n", (int) timeout_watchdog); for (i=timeout_watchdog; i>1; i--) { printf("Reset countdown ... %d seconds\n", i); sleep(1); } for (i=2; i>0; i--) { printf("System expected to reset any moment ...\n"); sleep(1); } for (i=5; i>0; i--) { printf("System should have reset ...\n"); sleep(1); } printf("Error: The watchdog device has failed to reboot the system,\n" " and it may not be suitable for usage with sbd.\n"); /* test should trigger a reboot thus returning is actually bad */ return -1; } /* This duplicates some code from linux/ioprio.h since these are not included * even in linux-kernel-headers. Sucks. 
See also * /usr/src/linux/Documentation/block/ioprio.txt and ioprio_set(2) */ extern int sys_ioprio_set(int, int, int); int ioprio_set(int which, int who, int ioprio); inline int ioprio_set(int which, int who, int ioprio) { return syscall(__NR_ioprio_set, which, who, ioprio); } enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE, }; enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, }; #define IOPRIO_BITS (16) #define IOPRIO_CLASS_SHIFT (13) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) #define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) #define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) #define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) static void sbd_stack_hogger(unsigned char * inbuf, int kbytes) { unsigned char buf[1024]; if(kbytes <= 0) { return; } if (inbuf == NULL) { memset(buf, HOG_CHAR, sizeof(buf)); } else { memcpy(buf, inbuf, sizeof(buf)); } if (kbytes > 0) { sbd_stack_hogger(buf, kbytes-1); } return; } static void sbd_malloc_hogger(int kbytes) { int j; void**chunks; int chunksize = 1024; if(kbytes <= 0) { return; } /* * We could call mallopt(M_MMAP_MAX, 0) to disable it completely, * but we've already called mlockall() * * We could also call mallopt(M_TRIM_THRESHOLD, -1) to prevent malloc * from giving memory back to the system, but we've already called * mlockall(MCL_FUTURE), so there's no need. */ chunks = malloc(kbytes * sizeof(void *)); if (chunks == NULL) { cl_log(LOG_WARNING, "Could not preallocate chunk array"); return; } for (j=0; j < kbytes; ++j) { chunks[j] = malloc(chunksize); if (chunks[j] == NULL) { cl_log(LOG_WARNING, "Could not preallocate block %d", j); } else { memset(chunks[j], 0, chunksize); } } for (j=0; j < kbytes; ++j) { free(chunks[j]); } free(chunks); } static void sbd_memlock(int stackgrowK, int heapgrowK) { #ifdef _POSIX_MEMLOCK /* * We could call setrlimit(RLIMIT_MEMLOCK,...) 
with a large * number, but the mcp runs as root and mlock(2) says: * * Since Linux 2.6.9, no limits are placed on the amount of memory * that a privileged process may lock, and this limit instead * governs the amount of memory that an unprivileged process may * lock. */ if (mlockall(MCL_CURRENT|MCL_FUTURE) >= 0) { cl_log(LOG_INFO, "Locked ourselves in memory"); /* Now allocate some extra pages (MCL_FUTURE will ensure they stay around) */ sbd_malloc_hogger(heapgrowK); sbd_stack_hogger(NULL, stackgrowK); } else { cl_perror("Unable to lock ourselves into memory"); } #else cl_log(LOG_ERR, "Unable to lock ourselves into memory"); #endif } static int get_realtime_budget(void) { FILE *f; char fname[PATH_MAX]; int res = -1, lnum = 0; char *cgroup = NULL, *namespecs = NULL; snprintf(fname, PATH_MAX, "/proc/%jd/cgroup", (intmax_t)getpid()); f = fopen(fname, "rt"); if (f == NULL) { cl_log(LOG_WARNING, "Can't open cgroup file for pid=%jd", (intmax_t)getpid()); goto exit_res; } while( fscanf(f, "%d:%m[^:]:%m[^\n]", &lnum, &namespecs, &cgroup) !=EOF ) { if (namespecs && strstr(namespecs, "cpuacct")) { free(namespecs); break; } if (cgroup) { free(cgroup); cgroup = NULL; } if (namespecs) { free(namespecs); namespecs = NULL; } } fclose(f); if (cgroup == NULL) { cl_log(LOG_WARNING, "Failed getting cgroup for pid=%jd", (intmax_t)getpid()); goto exit_res; } snprintf(fname, PATH_MAX, "/sys/fs/cgroup/cpu%s/cpu.rt_runtime_us", cgroup); f = fopen(fname, "rt"); if (f == NULL) { cl_log(LOG_WARNING, "cpu.rt_runtime_us existed for root-slice but " "doesn't for '%s'", cgroup); goto exit_res; } if (fscanf(f, "%d", &res) != 1) { cl_log(LOG_WARNING, "failed reading rt-budget from %s", fname); } else { cl_log(LOG_INFO, "slice='%s' has rt-budget=%d", cgroup, res); } fclose(f); exit_res: if (cgroup) { free(cgroup); } return res; } /* stolen from corosync */ static int sbd_move_to_root_cgroup(bool enforce_root_cgroup) { FILE *f; int res = -1; /* * /sys/fs/cgroup is hardcoded, because most of Linux 
distributions are now * using systemd and systemd uses hardcoded path of cgroup mount point. * * This feature is expected to be removed as soon as systemd gets support * for managing RT configuration. */ f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt"); if (f == NULL) { cl_log(LOG_DEBUG, "cpu.rt_runtime_us doesn't exist -> " "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED"); res = 0; goto exit_res; } fclose(f); if ((!enforce_root_cgroup) && (get_realtime_budget() > 0)) { cl_log(LOG_DEBUG, "looks as if we have rt-budget in the slice we are " "-> skip moving to root-slice"); res = 0; goto exit_res; } f = fopen("/sys/fs/cgroup/cpu/tasks", "w"); if (f == NULL) { cl_log(LOG_WARNING, "Can't open cgroups tasks file for writing"); goto exit_res; } if (fprintf(f, "%jd\n", (intmax_t)getpid()) <= 0) { cl_log(LOG_WARNING, "Can't write sbd pid into cgroups tasks file"); goto close_and_exit_res; } close_and_exit_res: if (fclose(f) != 0) { cl_log(LOG_WARNING, "Can't close cgroups tasks file"); goto exit_res; } exit_res: return (res); } void sbd_make_realtime(int priority, int stackgrowK, int heapgrowK) { if(priority < 0) { return; } #ifdef SCHED_RR if (move_to_root_cgroup) { sbd_move_to_root_cgroup(enforce_moving_to_root_cgroup); } { int pcurrent = 0; int pmin = sched_get_priority_min(SCHED_RR); int pmax = sched_get_priority_max(SCHED_RR); if (priority == 0) { priority = pmax; } else if (priority < pmin) { priority = pmin; } else if (priority > pmax) { priority = pmax; } pcurrent = sched_getscheduler(0); if (pcurrent < 0) { cl_perror("Unable to get scheduler priority"); } else if(pcurrent < priority) { struct sched_param sp; memset(&sp, 0, sizeof(sp)); sp.sched_priority = priority; if (sched_setscheduler(0, SCHED_RR, &sp) < 0) { cl_perror("Unable to set scheduler priority to %d", priority); } else { cl_log(LOG_INFO, "Scheduler priority is now %d", priority); } } } #else cl_log(LOG_ERR, "System does not support updating the scheduler priority"); #endif 
sbd_memlock(heapgrowK, stackgrowK); } void maximize_priority(void) { if (skip_rt) { cl_log(LOG_INFO, "Not elevating to realtime (-R specified)."); return; } sbd_make_realtime(0, 256, 256); if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 1)) != 0) { cl_perror("ioprio_set() call failed."); } } void sysrq_init(void) { FILE* procf; int c; procf = fopen("/proc/sys/kernel/sysrq", "r"); if (!procf) { cl_perror("cannot open /proc/sys/kernel/sysrq for read."); return; } if (fscanf(procf, "%d", &c) != 1) { cl_perror("Parsing sysrq failed"); c = 0; } fclose(procf); if (c == 1) return; /* 8 for debugging dumps of processes, 128 for reboot/poweroff */ c |= 136; procf = fopen("/proc/sys/kernel/sysrq", "w"); if (!procf) { cl_perror("cannot open /proc/sys/kernel/sysrq for writing"); return; } fprintf(procf, "%d", c); fclose(procf); return; } void sysrq_trigger(char t) { FILE *procf; procf = fopen("/proc/sysrq-trigger", "a"); if (!procf) { cl_perror("Opening sysrq-trigger failed."); return; } cl_log(LOG_INFO, "sysrq-trigger: %c\n", t); fprintf(procf, "%c\n", t); fclose(procf); return; } static void do_exit(char kind, bool do_flush) { /* TODO: Turn debug_mode into a bit field? Delay + kdump for example */ const char *reason = NULL; if (kind == 'c') { cl_log(LOG_NOTICE, "Initiating kdump"); } else if (debug_mode == 1) { cl_log(LOG_WARNING, "Initiating kdump instead of panicking the node (debug mode)"); kind = 'c'; } if (debug_mode == 2) { cl_log(LOG_WARNING, "Shutting down SBD instead of panicking the node (debug mode)"); watchdog_close(true); exit(0); } if (debug_mode == 3) { /* Give the system some time to flush logs to disk before rebooting. 
*/ cl_log(LOG_WARNING, "Delaying node panic by 10s (debug mode)"); watchdog_close(true); sync(); sleep(10); } switch(kind) { case 'b': reason = "reboot"; break; case 'c': reason = "crashdump"; break; case 'o': reason = "off"; break; default: reason = "unknown"; break; } cl_log(LOG_EMERG, "Rebooting system: %s", reason); if (do_flush) { sync(); } - if(kind == 'c') { - watchdog_close(true); + if (kind == 'c') { + if (timeout_watchdog_crashdump) { + if (timeout_watchdog != timeout_watchdog_crashdump) { + timeout_watchdog = timeout_watchdog_crashdump; + watchdog_init_interval(); + } + watchdog_close(false); + } else { + watchdog_close(true); + } sysrq_trigger(kind); - } else { watchdog_close(false); sysrq_trigger(kind); if (reboot((kind == 'o')?RB_POWER_OFF:RB_AUTOBOOT) < 0) { cl_perror("%s failed", (kind == 'o')?"Poweroff":"Reboot"); } } exit(1); } void do_crashdump(void) { do_exit('c', true); } void do_reset(void) { do_exit('b', true); } void do_off(void) { do_exit('o', true); } void do_timeout_action(void) { do_exit(timeout_sysrq_char, do_flush); } /* * Change directory to the directory our core file needs to go in * Call after you establish the userid you're running under. */ int sbd_cdtocoredir(void) { int rc; static const char *dir = NULL; if (dir == NULL) { dir = CRM_CORE_DIR; } if ((rc=chdir(dir)) < 0) { int errsave = errno; cl_perror("Cannot chdir to [%s]", dir); errno = errsave; } return rc; } pid_t make_daemon(void) { pid_t pid; const char * devnull = "/dev/null"; pid = fork(); if (pid < 0) { cl_log(LOG_ERR, "%s: could not start daemon\n", cmdname); cl_perror("fork"); exit(1); }else if (pid > 0) { return pid; } qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE); /* This is the child; ensure privileges have not been lost. 
*/ maximize_priority(); sysrq_init(); umask(022); close(0); (void)open(devnull, O_RDONLY); close(1); (void)open(devnull, O_WRONLY); close(2); (void)open(devnull, O_WRONLY); sbd_cdtocoredir(); return 0; } void sbd_get_uname(void) { struct utsname uname_buf; int i; if (uname(&uname_buf) < 0) { cl_perror("uname() failed?"); exit(1); } local_uname = strdup(uname_buf.nodename); for (i = 0; i < strlen(local_uname); i++) local_uname[i] = tolower(local_uname[i]); } #define FMT_MAX 256 void sbd_set_format_string(int method, const char *daemon) { int offset = 0; char fmt[FMT_MAX]; struct utsname res; switch(method) { case QB_LOG_STDERR: break; case QB_LOG_SYSLOG: if(daemon && strcmp(daemon, "sbd") != 0) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%10s: ", daemon); } break; default: /* When logging to a file */ if (uname(&res) == 0) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%t [%d] %s %10s: ", getpid(), res.nodename, daemon); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%t [%d] %10s: ", getpid(), daemon); } } if (debug && method >= QB_LOG_STDERR) { offset += snprintf(fmt + offset, FMT_MAX - offset, "(%%-12f:%%5l %%g) %%-7p: %%n: "); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%g %%-7p: %%n: "); } if (method == QB_LOG_SYSLOG) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%b"); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "\t%%b"); } if(offset > 0) { qb_log_format_set(method, fmt); } } void notify_parent(void) { pid_t ppid; union sigval signal_value; memset(&signal_value, 0, sizeof(signal_value)); ppid = getppid(); if (ppid == 1) { /* Our parent died unexpectedly. Triggering * self-fence. 
*/ cl_log(LOG_WARNING, "Our parent is dead."); do_timeout_action(); } switch (servant_health) { case pcmk_health_pending: case pcmk_health_shutdown: case pcmk_health_transient: DBGLOG(LOG_DEBUG, "Not notifying parent: state transient (%d)", servant_health); break; case pcmk_health_unknown: case pcmk_health_unclean: case pcmk_health_noquorum: DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY (%d)", servant_health); sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); break; case pcmk_health_online: DBGLOG(LOG_DEBUG, "Notifying parent: healthy"); sigqueue(ppid, SIG_LIVENESS, signal_value); break; default: DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY %d", servant_health); sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); break; } } void set_servant_health(enum pcmk_health state, int level, char const *format, ...) { if (servant_health != state) { va_list ap; int len = 0; char *string = NULL; servant_health = state; va_start(ap, format); len = vasprintf (&string, format, ap); if(len > 0) { cl_log(level, "%s", string); } va_end(ap); free(string); } } bool sbd_is_disk(struct servants_list_item *servant) { if ((servant != NULL) && (servant->devname != NULL) && (servant->devname[0] == '/')) { return true; } return false; } bool sbd_is_cluster(struct servants_list_item *servant) { if ((servant != NULL) && (servant->devname != NULL) && (strcmp("cluster", servant->devname) == 0)) { return true; } return false; } bool sbd_is_pcmk(struct servants_list_item *servant) { if ((servant != NULL) && (servant->devname != NULL) && (strcmp("pcmk", servant->devname) == 0)) { return true; } return false; } diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c index 5f5da1d..52ede8a 100644 --- a/src/sbd-inquisitor.c +++ b/src/sbd-inquisitor.c @@ -1,1272 +1,1278 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * 
version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include #include "sbd.h" #define LOCKSTRLEN 11 static struct servants_list_item *servants_leader = NULL; int disk_priority = 1; int check_pcmk = 1; int check_cluster = 1; int disk_count = 0; int servant_count = 0; int servant_restart_interval = 5; int servant_restart_count = 1; int start_mode = 0; char* pidfile = NULL; bool do_flush = true; char timeout_sysrq_char = 'b'; bool move_to_root_cgroup = true; bool enforce_moving_to_root_cgroup = false; int parse_device_line(const char *line); void recruit_servant(const char *devname, pid_t pid) { struct servants_list_item *s = servants_leader; struct servants_list_item *newbie; if (lookup_servant_by_dev(devname)) { cl_log(LOG_DEBUG, "Servant %s already exists", devname); return; } newbie = malloc(sizeof(*newbie)); if (newbie) { memset(newbie, 0, sizeof(*newbie)); newbie->devname = strdup(devname); newbie->pid = pid; newbie->first_start = 1; } if (!newbie || !newbie->devname) { fprintf(stderr, "heap allocation failed in recruit_servant.\n"); exit(1); } /* some sanity-check on our newbie */ if (sbd_is_disk(newbie)) { cl_log(LOG_INFO, "Monitoring %s", devname); disk_count++; } else if (sbd_is_pcmk(newbie) || sbd_is_cluster(newbie)) { /* alive just after pcmk and cluster servants have shown up */ newbie->outdated = 1; } else { /* toss our newbie */ cl_log(LOG_ERR, "Refusing to recruit unrecognized servant %s", devname); free((void *) newbie->devname); free(newbie); return; } if (!s) { servants_leader = newbie; 
} else { while (s->next) s = s->next; s->next = newbie; } servant_count++; } int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp) { pid_t pid = 0; int rc = 0; pid = fork(); if (pid == 0) { /* child */ maximize_priority(); sbd_set_format_string(QB_LOG_SYSLOG, devname); rc = (*functionp)(devname, mode, argp); if (rc == -1) exit(1); else exit(0); } else if (pid != -1) { /* parent */ return pid; } else { cl_log(LOG_ERR,"Failed to fork servant"); exit(1); } } struct servants_list_item *lookup_servant_by_dev(const char *devname) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { if (strcasecmp(s->devname, devname) == 0) break; } return s; } struct servants_list_item *lookup_servant_by_pid(pid_t pid) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { if (s->pid == pid) break; } return s; } int check_all_dead(void) { struct servants_list_item *s; int r = 0; union sigval svalue; for (s = servants_leader; s; s = s->next) { if (s->pid != 0) { r = sigqueue(s->pid, 0, svalue); if (r == -1 && errno == ESRCH) continue; return 0; } } return 1; } void servant_start(struct servants_list_item *s) { int r = 0; union sigval svalue; if (s->pid != 0) { r = sigqueue(s->pid, 0, svalue); if ((r != -1 || errno != ESRCH)) return; } s->restarts++; if (sbd_is_disk(s)) { #if SUPPORT_SHARED_DISK DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname); s->pid = assign_servant(s->devname, servant_md, start_mode, s); #else cl_log(LOG_ERR, "Shared disk functionality not supported"); return; #endif } else if(sbd_is_pcmk(s)) { DBGLOG(LOG_INFO, "Starting Pacemaker servant"); s->pid = assign_servant(s->devname, servant_pcmk, start_mode, NULL); } else if(sbd_is_cluster(s)) { DBGLOG(LOG_INFO, "Starting Cluster servant"); s->pid = assign_servant(s->devname, servant_cluster, start_mode, NULL); } else { cl_log(LOG_ERR, "Unrecognized servant: %s", s->devname); } clock_gettime(CLOCK_MONOTONIC, &s->t_started); return; 
} void servants_start(void) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { s->restarts = 0; servant_start(s); } } void servants_kill(void) { struct servants_list_item *s; union sigval svalue; for (s = servants_leader; s; s = s->next) { if (s->pid != 0) sigqueue(s->pid, SIGKILL, svalue); } } static inline void cleanup_servant_by_pid(pid_t pid) { struct servants_list_item* s; s = lookup_servant_by_pid(pid); if (s) { cl_log(LOG_WARNING, "Servant for %s (pid: %i) has terminated", s->devname, s->pid); s->pid = 0; } else { /* This most likely is a stray signal from somewhere, or * a SIGCHLD for a process that has previously * explicitly disconnected. */ DBGLOG(LOG_INFO, "cleanup_servant: Nothing known about pid %i", pid); } } int inquisitor_decouple(void) { pid_t ppid = getppid(); union sigval signal_value; /* During start-up, we only arm the watchdog once we've got * quorum at least once. */ if (watchdog_use) { if (watchdog_init() < 0) { return -1; } } if (ppid > 1) { sigqueue(ppid, SIG_LIVENESS, signal_value); } return 0; } static int sbd_lock_running(long pid) { int rc = 0; long mypid; int running = 0; char proc_path[PATH_MAX], exe_path[PATH_MAX], myexe_path[PATH_MAX]; /* check if pid is running */ if (kill(pid, 0) < 0 && errno == ESRCH) { goto bail; } #ifndef HAVE_PROC_PID return 1; #endif /* check to make sure pid hasn't been reused by another process */ snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", pid); rc = readlink(proc_path, exe_path, PATH_MAX-1); if(rc < 0) { cl_perror("Could not read from %s", proc_path); goto bail; } exe_path[rc] = 0; mypid = (unsigned long) getpid(); snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", mypid); rc = readlink(proc_path, myexe_path, PATH_MAX-1); if(rc < 0) { cl_perror("Could not read from %s", proc_path); goto bail; } myexe_path[rc] = 0; if(strcmp(exe_path, myexe_path) == 0) { running = 1; } bail: return running; } static int sbd_lock_pidfile(const char *filename) { char lf_name[256], 
tf_name[256], buf[LOCKSTRLEN+1]; int fd; long pid, mypid; int rc; struct stat sbuf; if (filename == NULL) { errno = EFAULT; return -1; } mypid = (unsigned long) getpid(); snprintf(lf_name, sizeof(lf_name), "%s",filename); snprintf(tf_name, sizeof(tf_name), "%s.%lu", filename, mypid); if ((fd = open(lf_name, O_RDONLY)) >= 0) { if (fstat(fd, &sbuf) >= 0 && sbuf.st_size < LOCKSTRLEN) { sleep(1); /* if someone was about to create one, * give'm a sec to do so * Though if they follow our protocol, * this won't happen. They should really * put the pid in, then link, not the * other way around. */ } if (read(fd, buf, sizeof(buf)) < 1) { /* lockfile empty -> rm it and go on */; } else { if (sscanf(buf, "%ld", &pid) < 1) { /* lockfile screwed up -> rm it and go on */ } else { if (pid > 1 && (getpid() != pid) && sbd_lock_running(pid)) { /* is locked by existing process * -> give up */ close(fd); return -1; } else { /* stale lockfile -> rm it and go on */ } } } unlink(lf_name); close(fd); } if ((fd = open(tf_name, O_CREAT | O_WRONLY | O_EXCL, 0644)) < 0) { /* Hmmh, why did we fail? Anyway, nothing we can do about it */ return -3; } /* Slight overkill with the %*d format ;-) */ snprintf(buf, sizeof(buf), "%*lu\n", LOCKSTRLEN-1, mypid); if (write(fd, buf, LOCKSTRLEN) != LOCKSTRLEN) { /* Again, nothing we can do about this */ rc = -3; close(fd); goto out; } close(fd); switch (link(tf_name, lf_name)) { case 0: if (stat(tf_name, &sbuf) < 0) { /* something weird happened */ rc = -3; break; } if (sbuf.st_nlink < 2) { /* somehow, it didn't get through - NFS trouble? */ rc = -2; break; } rc = 0; break; case EEXIST: rc = -1; break; default: rc = -3; } out: unlink(tf_name); return rc; } /* * Unlock a file (remove its lockfile) * do we need to check, if its (still) ours? 
No, IMHO, if someone else * locked our line, it's his fault -tho * returns 0 on success * <0 if some failure occured */ static int sbd_unlock_pidfile(const char *filename) { char lf_name[256]; if (filename == NULL) { errno = EFAULT; return -1; } snprintf(lf_name, sizeof(lf_name), "%s", filename); return unlink(lf_name); } int cluster_alive(bool all) { int alive = 1; struct servants_list_item* s; if(servant_count == disk_count) { return 0; } for (s = servants_leader; s; s = s->next) { if (sbd_is_cluster(s) || sbd_is_pcmk(s)) { if(s->outdated) { alive = 0; } else if(all == false) { return 1; } } } return alive; } int quorum_read(int good_servants) { if (disk_count > 2) return (good_servants > disk_count/2); else return (good_servants > 0); } void inquisitor_child(void) { int sig, pid; sigset_t procmask; siginfo_t sinfo; int status; struct timespec timeout; int exiting = 0; int decoupled = 0; int cluster_appeared = 0; int pcmk_override = 0; time_t latency; struct timespec t_last_tickle, t_now; struct servants_list_item* s; if (debug_mode) { cl_log(LOG_ERR, "DEBUG MODE %d IS ACTIVE - DO NOT RUN IN PRODUCTION!", debug_mode); } set_proc_title("sbd: inquisitor"); if (pidfile) { if (sbd_lock_pidfile(pidfile) < 0) { exit(1); } } sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigaddset(&procmask, SIGTERM); sigaddset(&procmask, SIG_LIVENESS); sigaddset(&procmask, SIG_EXITREQ); sigaddset(&procmask, SIG_TEST); sigaddset(&procmask, SIG_PCMK_UNHEALTHY); sigaddset(&procmask, SIG_RESTART); sigaddset(&procmask, SIGUSR1); sigaddset(&procmask, SIGUSR2); sigprocmask(SIG_BLOCK, &procmask, NULL); servants_start(); timeout.tv_sec = timeout_loop; timeout.tv_nsec = 0; clock_gettime(CLOCK_MONOTONIC, &t_last_tickle); while (1) { bool tickle = 0; bool can_detach = 0; int good_servants = 0; sig = sigtimedwait(&procmask, &sinfo, &timeout); clock_gettime(CLOCK_MONOTONIC, &t_now); if (sig == SIG_EXITREQ || sig == SIGTERM) { servants_kill(); watchdog_close(true); exiting = 1; } else if (sig 
== SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } else { s = lookup_servant_by_pid(pid); if (sbd_is_disk(s)) { if (WIFEXITED(status)) { switch(WEXITSTATUS(status)) { case EXIT_MD_SERVANT_IO_FAIL: DBGLOG(LOG_INFO, "Servant for %s requests to be disowned", s->devname); break; case EXIT_MD_SERVANT_REQUEST_RESET: cl_log(LOG_WARNING, "%s requested a reset", s->devname); do_reset(); break; case EXIT_MD_SERVANT_REQUEST_SHUTOFF: cl_log(LOG_WARNING, "%s requested a shutoff", s->devname); do_off(); break; case EXIT_MD_SERVANT_REQUEST_CRASHDUMP: cl_log(LOG_WARNING, "%s requested a crashdump", s->devname); do_crashdump(); break; default: break; } } } else if (sbd_is_pcmk(s)) { if (WIFEXITED(status)) { switch(WEXITSTATUS(status)) { case EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN: DBGLOG(LOG_INFO, "PCMK-Servant has exited gracefully"); /* revert to state prior to pacemaker-detection */ s->restarts = 0; s->restart_blocked = 0; cluster_appeared = 0; s->outdated = 1; s->t_last.tv_sec = 0; break; default: break; } } } cleanup_servant_by_pid(pid); } } } else if (sig == SIG_PCMK_UNHEALTHY) { s = lookup_servant_by_pid(sinfo.si_pid); if (sbd_is_cluster(s) || sbd_is_pcmk(s)) { if (s->outdated == 0) { cl_log(LOG_WARNING, "%s health check: UNHEALTHY", s->devname); } s->t_last.tv_sec = 1; } else { cl_log(LOG_WARNING, "Ignoring SIG_PCMK_UNHEALTHY from unknown source"); } } else if (sig == SIG_LIVENESS) { s = lookup_servant_by_pid(sinfo.si_pid); if (s) { s->first_start = 0; clock_gettime(CLOCK_MONOTONIC, &s->t_last); } } else if (sig == SIG_TEST) { } else if (sig == SIGUSR1) { if (exiting) continue; servants_start(); } if (exiting) { if (check_all_dead()) { if (pidfile) { sbd_unlock_pidfile(pidfile); } exit(0); } else continue; } good_servants = 0; for (s = servants_leader; s; s = s->next) { int age = t_now.tv_sec - s->t_last.tv_sec; if (!s->t_last.tv_sec) continue; if (age < (int)(timeout_io+timeout_loop)) { if (sbd_is_disk(s)) { 
good_servants++; } if (s->outdated) { cl_log(LOG_NOTICE, "Servant %s is healthy (age: %d)", s->devname, age); } s->outdated = 0; } else if (!s->outdated) { if (!s->restart_blocked) { cl_log(LOG_WARNING, "Servant %s is outdated (age: %d)", s->devname, age); } s->outdated = 1; } } if(disk_count == 0) { /* NO disks, everything is up to the cluster */ if(cluster_alive(true)) { /* We LIVE! */ if(cluster_appeared == false) { cl_log(LOG_INFO, "Active cluster detected"); } tickle = 1; can_detach = 1; cluster_appeared = 1; } else if(cluster_alive(false)) { if(!decoupled) { /* On the way up, detach and arm the watchdog */ cl_log(LOG_INFO, "Partial cluster detected, detaching"); } can_detach = 1; tickle = !cluster_appeared; } else if(!decoupled) { /* Stay alive until the cluster comes up */ tickle = !cluster_appeared; } } else if(disk_priority == 1 || servant_count == disk_count) { if (quorum_read(good_servants)) { /* There are disks and we're connected to the majority of them */ tickle = 1; can_detach = 1; pcmk_override = 0; } else if (servant_count > disk_count && cluster_alive(true)) { tickle = 1; if(!pcmk_override) { cl_log(LOG_WARNING, "Majority of devices lost - surviving on pacemaker"); pcmk_override = 1; /* Only log this message once */ } } } else if(cluster_alive(true) && quorum_read(good_servants)) { /* Both disk and cluster servants are healthy */ tickle = 1; can_detach = 1; cluster_appeared = 1; } else if(quorum_read(good_servants)) { /* The cluster takes priority but only once * connected for the first time. * * Until then, we tickle based on disk quorum. 
*/ can_detach = 1; tickle = !cluster_appeared; } /* cl_log(LOG_DEBUG, "Tickle: q=%d, g=%d, p=%d, s=%d", */ /* quorum_read(good_servants), good_servants, tickle, disk_count); */ if(tickle) { watchdog_tickle(); clock_gettime(CLOCK_MONOTONIC, &t_last_tickle); } if (!decoupled && can_detach) { /* We only do this at the point either the disk or * cluster servants become healthy */ cl_log(LOG_DEBUG, "Decoupling"); if (inquisitor_decouple() < 0) { servants_kill(); exiting = 1; continue; } else { decoupled = 1; } } /* Note that this can actually be negative, since we set * last_tickle after we set now. */ latency = t_now.tv_sec - t_last_tickle.tv_sec; if (timeout_watchdog && (latency > (int)timeout_watchdog)) { if (!decoupled) { /* We're still being watched by our * parent. We don't fence, but exit. */ cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up."); servants_kill(); exiting = 1; continue; } if (debug_mode < 2) { /* At level 2 or above, we do nothing, but expect * things to eventually return to * normal. 
*/ do_timeout_action(); } else { cl_log(LOG_ERR, "SBD: DEBUG MODE: Would have fenced due to timeout!"); } } if (timeout_watchdog_warn && (latency > (int)timeout_watchdog_warn)) { cl_log(LOG_WARNING, "Latency: No liveness for %d s exceeds threshold of %d s (healthy servants: %d)", (int)latency, (int)timeout_watchdog_warn, good_servants); if (debug_mode && watchdog_use) { /* In debug mode, trigger a reset before the watchdog can panic the machine */ do_timeout_action(); } } for (s = servants_leader; s; s = s->next) { int age = t_now.tv_sec - s->t_started.tv_sec; if (age > servant_restart_interval) { s->restarts = 0; s->restart_blocked = 0; } if (servant_restart_count && (s->restarts >= servant_restart_count) && !s->restart_blocked) { if (servant_restart_count > 1) { cl_log(LOG_WARNING, "Max retry count (%d) reached: not restarting servant for %s", (int)servant_restart_count, s->devname); } s->restart_blocked = 1; } if (!s->restart_blocked) { servant_start(s); } } } /* not reached */ exit(0); } int inquisitor(void) { int sig, pid, inquisitor_pid; int status; sigset_t procmask; siginfo_t sinfo; /* Where's the best place for sysrq init ?*/ sysrq_init(); sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigaddset(&procmask, SIG_LIVENESS); sigprocmask(SIG_BLOCK, &procmask, NULL); inquisitor_pid = make_daemon(); if (inquisitor_pid == 0) { inquisitor_child(); } /* We're the parent. Wait for a happy signal from our child * before we proceed - we either get "SIG_LIVENESS" when the * inquisitor has completed the first successful round, or * ECHLD when it exits with an error. */ while (1) { sig = sigwaitinfo(&procmask, &sinfo); if (sig == SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } /* We got here because the inquisitor * did not succeed. */ return -1; } } else if (sig == SIG_LIVENESS) { /* Inquisitor started up properly. 
*/ return 0; } else { fprintf(stderr, "Nobody expected the spanish inquisition!\n"); continue; } } /* not reached */ return -1; } int parse_device_line(const char *line) { size_t lpc = 0; size_t last = 0; size_t max = 0; int found = 0; bool skip_space = true; int space_run = 0; if (!line) { return 0; } max = strlen(line); cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", (int) max, line); for (lpc = 0; lpc <= max; lpc++) { if (isspace(line[lpc])) { if (skip_space) { last = lpc + 1; } else { space_run++; } continue; } skip_space = false; if (line[lpc] == ';' || line[lpc] == 0) { int rc = 0; char *entry = calloc(1, 1 + lpc - last); if (entry) { rc = sscanf(line + last, "%[^;]", entry); } else { fprintf(stderr, "Heap allocation failed parsing device-line.\n"); exit(1); } if (rc != 1) { cl_log(LOG_WARNING, "Could not parse: '%s'", line + last); } else { entry[strlen(entry)-space_run] = '\0'; cl_log(LOG_DEBUG, "Adding '%s'", entry); recruit_servant(entry, 0); found++; } free(entry); skip_space = true; last = lpc + 1; } space_run = 0; } return found; } #define SBD_SOURCE_FILES "sbd-cluster.c,sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,setproctitle.c" static void sbd_log_filter_ctl(const char *files, uint8_t priority) { if (files == NULL) { files = SBD_SOURCE_FILES; } qb_log_filter_ctl(QB_LOG_SYSLOG, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority); qb_log_filter_ctl(QB_LOG_STDERR, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority); } int arg_enabled(int arg_count) { return arg_count % 2; } int main(int argc, char **argv, char **envp) { int exit_status = 0; int c; int W_count = 0; int c_count = 0; int P_count = 0; int qb_facility; const char *value = NULL; bool delay_start = false; long delay = 0; char *timeout_action = NULL; if ((cmdname = strrchr(argv[0], '/')) == NULL) { cmdname = argv[0]; } else { ++cmdname; } watchdogdev = strdup("/dev/watchdog"); watchdogdev_is_default = true; qb_facility = qb_log_facility2int("daemon"); qb_log_init(cmdname, 
qb_facility, LOG_WARNING); sbd_set_format_string(QB_LOG_SYSLOG, "sbd"); qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE); qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE); sbd_log_filter_ctl(NULL, LOG_NOTICE); sbd_get_uname(); - value = getenv("SBD_DEVICE"); - if(value) { -#if SUPPORT_SHARED_DISK - int devices = parse_device_line(value); - if(devices < 1) { - fprintf(stderr, "Invalid device line: %s\n", value); - exit_status = -2; - goto out; - } -#else - fprintf(stderr, "Shared disk functionality not supported\n"); - exit_status = -2; - goto out; -#endif - } - value = getenv("SBD_PACEMAKER"); if(value) { check_pcmk = crm_is_true(value); check_cluster = crm_is_true(value); } cl_log(LOG_INFO, "Enable pacemaker checks: %d (%s)", (int)check_pcmk, value?value:"default"); value = getenv("SBD_STARTMODE"); if(value == NULL) { } else if(strcmp(value, "clean") == 0) { start_mode = 1; } else if(strcmp(value, "always") == 0) { start_mode = 0; } cl_log(LOG_INFO, "Start mode set to: %d (%s)", (int)start_mode, value?value:"default"); value = getenv("SBD_WATCHDOG_DEV"); if(value) { free(watchdogdev); watchdogdev = strdup(value); watchdogdev_is_default = false; } /* SBD_WATCHDOG has been dropped from sbd.sysconfig example. * This is for backward compatibility. */ value = getenv("SBD_WATCHDOG"); if(value) { watchdog_use = crm_is_true(value); } value = getenv("SBD_WATCHDOG_TIMEOUT"); if(value) { timeout_watchdog = crm_get_msec(value) / 1000; if(timeout_watchdog > 5) { timeout_watchdog_warn = (int)timeout_watchdog / 5 * 3; } } value = getenv("SBD_PIDFILE"); if(value) { pidfile = strdup(value); cl_log(LOG_INFO, "pidfile set to %s", pidfile); } value = getenv("SBD_DELAY_START"); if(value) { delay_start = crm_is_true(value); if (!delay_start) { delay = crm_get_msec(value) / 1000; if (delay > 0) { delay_start = true; } } } cl_log(LOG_DEBUG, "Delay start: %s%s%s", delay_start? "yes (" : "no", delay_start? (delay > 0 ? value: "msgwait") : "", delay_start? 
")" : ""); value = getenv("SBD_TIMEOUT_ACTION"); if(value) { timeout_action = strdup(value); } value = getenv("SBD_MOVE_TO_ROOT_CGROUP"); if(value) { move_to_root_cgroup = crm_is_true(value); if (move_to_root_cgroup) { enforce_moving_to_root_cgroup = true; } else { if (strcmp(value, "auto") == 0) { move_to_root_cgroup = true; } } } while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) { switch (c) { case 'D': break; case 'Z': debug_mode++; cl_log(LOG_INFO, "Debug mode now at level %d", (int)debug_mode); break; case 'R': skip_rt = 1; cl_log(LOG_INFO, "Realtime mode deactivated."); break; case 'S': start_mode = atoi(optarg); cl_log(LOG_INFO, "Start mode set to: %d", (int)start_mode); break; case 's': timeout_startup = atoi(optarg); cl_log(LOG_INFO, "Start timeout set to: %d", (int)timeout_startup); break; case 'v': debug++; if(debug == 1) { sbd_log_filter_ctl(NULL, LOG_INFO); cl_log(LOG_INFO, "Verbose mode enabled."); } else if(debug == 2) { sbd_log_filter_ctl(NULL, LOG_DEBUG); cl_log(LOG_INFO, "Debug mode enabled."); } else if(debug == 3) { /* Go nuts, turn on pacemaker's logging too */ sbd_log_filter_ctl("*", LOG_DEBUG); cl_log(LOG_INFO, "Debug library mode enabled."); } break; case 'T': watchdog_set_timeout = 0; cl_log(LOG_INFO, "Setting watchdog timeout disabled; using defaults."); break; case 'W': W_count++; break; case 'w': cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev); free(watchdogdev); watchdogdev = strdup(optarg); watchdogdev_is_default = false; break; case 'd': #if SUPPORT_SHARED_DISK recruit_servant(optarg, 0); #else fprintf(stderr, "Shared disk functionality not supported\n"); exit_status = -2; goto out; #endif break; case 'c': c_count++; break; case 'P': P_count++; break; case 'z': disk_priority = 0; break; case 'n': local_uname = strdup(optarg); cl_log(LOG_INFO, "Overriding local hostname to %s", local_uname); break; case 'p': pidfile = strdup(optarg); cl_log(LOG_INFO, "pidfile set to %s", pidfile); break; 
case 'C': timeout_watchdog_crashdump = atoi(optarg); cl_log(LOG_INFO, "Setting crashdump watchdog timeout to %d", (int)timeout_watchdog_crashdump); break; case '1': timeout_watchdog = atoi(optarg); if(timeout_watchdog > 5) { timeout_watchdog_warn = (int)timeout_watchdog / 5 * 3; } break; case '2': timeout_allocate = atoi(optarg); break; case '3': timeout_loop = atoi(optarg); break; case '4': timeout_msgwait = atoi(optarg); break; case '5': timeout_watchdog_warn = atoi(optarg); cl_log(LOG_INFO, "Setting latency warning to %d", (int)timeout_watchdog_warn); break; case 't': servant_restart_interval = atoi(optarg); cl_log(LOG_INFO, "Setting servant restart interval to %d", (int)servant_restart_interval); break; case 'I': timeout_io = atoi(optarg); cl_log(LOG_INFO, "Setting IO timeout to %d", (int)timeout_io); break; case 'F': servant_restart_count = atoi(optarg); cl_log(LOG_INFO, "Servant restart count set to %d", (int)servant_restart_count); break; case 'r': if (timeout_action) { free(timeout_action); } timeout_action = strdup(optarg); break; case 'h': usage(); goto out; break; default: exit_status = -2; goto out; break; } } + if (disk_count == 0) { + /* if we already have disks from commandline + then it is probably undesirable to add those + from environment (general rule cmdline has precedence) + */ + value = getenv("SBD_DEVICE"); + if ((value) && strlen(value)) { +#if SUPPORT_SHARED_DISK + int devices = parse_device_line(value); + if(devices < 1) { + fprintf(stderr, "Invalid device line: %s\n", value); + exit_status = -2; + goto out; + } +#else + fprintf(stderr, "Shared disk functionality not supported\n"); + exit_status = -2; + goto out; +#endif + } + } + if (watchdogdev == NULL || strcmp(watchdogdev, "/dev/null") == 0) { watchdog_use = 0; } else if (W_count > 0) { watchdog_use = arg_enabled(W_count); } if (watchdog_use) { cl_log(LOG_INFO, "Watchdog enabled."); } else { cl_log(LOG_INFO, "Watchdog disabled."); } if (c_count > 0) { check_cluster = 
arg_enabled(c_count); } if (P_count > 0) { check_pcmk = arg_enabled(P_count); } if ((disk_count > 0) && (strlen(local_uname) > SECTOR_NAME_MAX)) { fprintf(stderr, "Node name mustn't be longer than %d chars.\n", SECTOR_NAME_MAX); fprintf(stderr, "If uname is longer define a name to be used by sbd.\n"); exit_status = -1; goto out; } if (disk_count > 3) { fprintf(stderr, "You can specify up to 3 devices via the -d option.\n"); exit_status = -1; goto out; } /* There must at least be one command following the options: */ if ((argc - optind) < 1) { fprintf(stderr, "Not enough arguments.\n"); exit_status = -2; goto out; } if (init_set_proc_title(argc, argv, envp) < 0) { fprintf(stderr, "Allocation of proc title failed.\n"); exit_status = -1; goto out; } if (timeout_action) { char *p[2]; int i; char c; int nrflags = sscanf(timeout_action, "%m[a-z],%m[a-z]%c", &p[0], &p[1], &c); bool parse_error = (nrflags < 1) || (nrflags > 2); for (i = 0; (i < nrflags) && (i < 2); i++) { if (!strcmp(p[i], "reboot")) { timeout_sysrq_char = 'b'; } else if (!strcmp(p[i], "crashdump")) { timeout_sysrq_char = 'c'; } else if (!strcmp(p[i], "off")) { timeout_sysrq_char = 'o'; } else if (!strcmp(p[i], "flush")) { do_flush = true; } else if (!strcmp(p[i], "noflush")) { do_flush = false; } else { parse_error = true; } free(p[i]); } if (parse_error) { fprintf(stderr, "Failed to parse timeout-action \"%s\".\n", timeout_action); exit_status = -1; goto out; } } #if SUPPORT_SHARED_DISK if (strcmp(argv[optind], "create") == 0) { exit_status = init_devices(servants_leader); } else if (strcmp(argv[optind], "dump") == 0) { exit_status = dump_headers(servants_leader); } else if (strcmp(argv[optind], "allocate") == 0) { exit_status = allocate_slots(argv[optind + 1], servants_leader); } else if (strcmp(argv[optind], "list") == 0) { exit_status = list_slots(servants_leader); } else if (strcmp(argv[optind], "message") == 0) { exit_status = messenger(argv[optind + 1], argv[optind + 2], servants_leader); } else if 
(strcmp(argv[optind], "ping") == 0) { exit_status = ping_via_slots(argv[optind + 1], servants_leader); } else if (strcmp(argv[optind], "watch") == 0) { if(disk_count > 0) { /* If no devices are specified, its not an error to be unable to find one */ open_any_device(servants_leader); } if (delay_start) { if (delay <= 0) { delay = get_first_msgwait(servants_leader); } sleep((unsigned long) delay); } } else { exit_status = -2; } #endif if (strcmp(argv[optind], "query-watchdog") == 0) { exit_status = watchdog_info(); } else if (strcmp(argv[optind], "test-watchdog") == 0) { exit_status = watchdog_test(); } else if (strcmp(argv[optind], "watch") == 0) { /* sleep $(sbd $SBD_DEVICE_ARGS dump | grep -m 1 msgwait | awk '{print $4}') 2>/dev/null */ /* We only want this to have an effect during watch right now; * pinging and fencing would be too confused */ cl_log(LOG_INFO, "Turning on pacemaker checks: %d", check_pcmk); if (check_pcmk) { recruit_servant("pcmk", 0); #if SUPPORT_PLUGIN check_cluster = 1; #endif } cl_log(LOG_INFO, "Turning on cluster checks: %d", check_cluster); if (check_cluster) { recruit_servant("cluster", 0); } cl_log(LOG_NOTICE, "%s flush + write \'%c\' to sysrq in case of timeout", do_flush?"Do":"Skip", timeout_sysrq_char); exit_status = inquisitor(); } out: if (timeout_action) { free(timeout_action); } if (exit_status < 0) { if (exit_status == -2) { usage(); } else { fprintf(stderr, "sbd failed; please check the logs.\n"); } return (1); } return (0); } diff --git a/tests/Makefile.am b/tests/Makefile.am new file mode 100644 index 0000000..0b9a406 --- /dev/null +++ b/tests/Makefile.am @@ -0,0 +1,4 @@ +lib_LTLIBRARIES = libsbdtestbed.la +libsbdtestbed_la_SOURCES = sbd-testbed.c +libsbdtestbed_la_LDFLAGS = +libsbdtestbed_la_LIBADD = @LIBADD_DL@ diff --git a/configure.ac b/tests/configure.ac similarity index 56% copy from configure.ac copy to tests/configure.ac index 401cb93..91c56cd 100644 --- a/configure.ac +++ b/tests/configure.ac @@ -1,253 +1,183 @@ dnl 
dnl autoconf for Agents dnl dnl License: GNU General Public License (GPL) dnl =============================================== -dnl Bootstrap +dnl Bootstrap dnl =============================================== AC_PREREQ(2.63) dnl Suggested structure: dnl information on the package dnl checks for programs dnl checks for libraries dnl checks for header files dnl checks for types dnl checks for structures dnl checks for compiler characteristics dnl checks for library functions dnl checks for system services -AC_INIT([sbd], +AC_INIT([sbd], [1.4.0], [lmb@suse.com]) +m4_include([../tests-opt.m4]) AC_CANONICAL_HOST AC_CONFIG_AUX_DIR(.) AC_CONFIG_HEADERS(config.h) m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([no])]) -AM_INIT_AUTOMAKE +AM_INIT_AUTOMAKE(1.11.1 foreign TESTS_OPTION) +LT_INIT([dlopen],[disable-static]) AM_PROG_CC_C_O PKG_CHECK_MODULES(glib, [glib-2.0]) -dnl PKG_CHECK_MODULES(libcoroipcc, [libcoroipcc]) -PKG_CHECK_MODULES(cmap, [libcmap], HAVE_cmap=1, HAVE_cmap=0) -PKG_CHECK_MODULES(votequorum, [libvotequorum], HAVE_votequorum=1, HAVE_votequorum=0) - -dnl pacemaker > 1.1.8 -PKG_CHECK_MODULES(pacemaker, [pacemaker, pacemaker-cib], HAVE_pacemaker=1, HAVE_pacemaker=0) - -dnl pacemaker <= 1.1.8 -PKG_CHECK_MODULES(pcmk, [pcmk, pcmk-cib], HAVE_pcmk=1, HAVE_pcmk=0) -PKG_CHECK_MODULES(libqb, [libqb]) - -CPPFLAGS="$CPPFLAGS -Werror" -if test $HAVE_pacemaker = 0 -a $HAVE_pcmk = 0; then - AC_MSG_ERROR(No package 'pacemaker' found) -elif test $HAVE_pacemaker = 1; then - CPPFLAGS="$CPPFLAGS $glib_CFLAGS $pacemaker_CFLAGS" - if test $HAVE_cmap = 0; then - AC_MSG_NOTICE(No library 'cmap' found) - else - CPPFLAGS="$CPPFLAGS $cmap_CFLAGS" - LIBS="$LIBS $cmap_LIBS" - fi - if test $HAVE_votequorum = 0; then - AC_MSG_NOTICE(No library 'votequorum' found) - else - CPPFLAGS="$CPPFLAGS $votequorum_CFLAGS" - LIBS="$LIBS $votequorum_LIBS" - fi -fi - -PKG_CHECK_MODULES(libxml, [libxml-2.0]) -CPPFLAGS="$CPPFLAGS $libxml_CFLAGS $libqb_CFLAGS $pacemaker_CFLAGS $pcmk_CFLAGS" -LIBS="$LIBS 
$libxml_LIBS $libqb_LIBS $pacemaker_LIBS $pcmk_LIBS" +CPPFLAGS="$CPPFLAGS -Werror $glib_CFLAGS" +LIBS="$LIBS $glib_LIBS" dnl checks for libraries -AC_CHECK_LIB(aio, io_setup, , missing="yes") -AC_CHECK_LIB(qb, qb_ipcs_connection_auth_set, , missing="yes") -AC_CHECK_LIB(cib, cib_new, , missing="yes") -AC_CHECK_LIB(crmcommon, set_crm_log_level, , missing="yes") -AC_CHECK_LIB(pe_status, pe_find_node, , missing="yes") -AC_CHECK_LIB(pe_rules, test_rule, , missing="yes") -AC_CHECK_LIB(crmcluster, crm_peer_init, , missing="yes") -AC_CHECK_LIB(uuid, uuid_unparse, , missing="yes") -AC_CHECK_LIB(cmap, cmap_initialize, , HAVE_cmap=0) -AC_CHECK_LIB(votequorum, votequorum_getinfo, , HAVE_votequorum=0) - -dnl pacemaker >= 1.1.8 -AC_CHECK_HEADERS(crm/cluster.h) -AC_CHECK_LIB(crmcommon, pcmk_strerror, , missing="yes") -AC_CHECK_LIB(cib, cib_apply_patch_event, , missing="yes") - -dnl pacemaker-2.0 removed support for corosync 1 cluster layer -AC_CHECK_DECLS([pcmk_cluster_classic_ais, pcmk_cluster_cman],,, - [#include ]) - -dnl check for new pe-API -AC_CHECK_FUNCS(pe_new_working_set) - -if test "$missing" = "yes"; then - AC_MSG_ERROR([Missing required libraries or functions.]) -fi - -AC_PATH_PROGS(POD2MAN, pod2man, pod2man) +AC_CHECK_LIB(c, dlopen) dnl if dlopen is in libc... +AC_CHECK_LIB(dl, dlopen) dnl -ldl (for Linux) -AC_ARG_ENABLE([shared-disk], -[ --enable-shared-disk Turn on functionality that requires shared disk - [default=yes]]) - -DISK=0 -if test "x${enable_shared_disk}" != xno ; then - DISK=1 -fi - -AC_DEFINE_UNQUOTED(SUPPORT_SHARED_DISK, $DISK, Turn on functionality that requires shared disk) -AM_CONDITIONAL(SUPPORT_SHARED_DISK, test "$DISK" = "1") - -if - test -e /proc/$$ -then - echo "/proc/{pid} is supported" - AC_DEFINE_UNQUOTED(HAVE_PROC_PID, 1, Define to 1 if /proc/{pid} is supported.) 
-fi - -AC_DEFINE_UNQUOTED(CHECK_TWO_NODE, $HAVE_cmap, Turn on checking for 2-node cluster) -AM_CONDITIONAL(CHECK_TWO_NODE, test "$HAVE_cmap" = "1") - -AC_DEFINE_UNQUOTED(CHECK_VOTEQUORUM_HANDLE, $HAVE_votequorum, Turn on periodic checking of votequorum-handle) -AM_CONDITIONAL(CHECK_VOTEQUORUM_HANDLE, test "$HAVE_votequorum" = "1") CONFIGDIR="" AC_ARG_WITH(configdir, [ --with-configdir=DIR Directory for SBD configuration file [${CONFIGDIR}]], [ CONFIGDIR="$withval" ] ) +# +# Where is dlopen? +# +if test "$ac_cv_lib_c_dlopen" = yes; then + LIBADD_DL="" +elif test "$ac_cv_lib_dl_dlopen" = yes; then + LIBADD_DL=-ldl +else + LIBADD_DL=${lt_cv_dlopen_libs} +fi + + dnl ********************************************************************** dnl Check for various argv[] replacing functions on various OSs dnl dnl Borrowed from Proftpd dnl Proftpd is Licenced under the terms of the GNU General Public Licence dnl and is available from http://www.proftpd.org/ dnl AC_CHECK_FUNCS(setproctitle) AC_CHECK_HEADERS(libutil.h) AC_CHECK_LIB(util, setproctitle, [AC_DEFINE(HAVE_SETPROCTITLE,1,[ ]) ac_cv_func_setproctitle="yes" ; LIBS="$LIBS -lutil"]) if test "$ac_cv_func_setproctitle" = "yes"; then pf_argv_set="PF_ARGV_NONE" fi if test "$pf_argv_set" = ""; then AC_CHECK_HEADERS(sys/pstat.h) if test "$ac_cv_header_pstat_h" = "yes"; then AC_CHECK_FUNCS(pstat) if test "$ac_cv_func_pstat" = "yes"; then pf_argv_set="PF_ARGV_PSTAT" else pf_argv_set="PF_ARGV_WRITEABLE" fi fi if test "$pf_argv_set" = ""; then AC_EGREP_HEADER([#define.*PS_STRINGS.*],sys/exec.h, have_psstrings="yes",have_psstrings="no") if test "$have_psstrings" = "yes"; then pf_argv_set="PF_ARGV_PSSTRINGS" fi fi if test "$pf_argv_set" = ""; then AC_CACHE_CHECK(whether __progname and __progname_full are available, pf_cv_var_progname, AC_TRY_LINK([extern char *__progname, *__progname_full;], [__progname = "foo"; __progname_full = "foo bar";], pf_cv_var_progname="yes", pf_cv_var_progname="no")) if test "$pf_cv_var_progname" = "yes"; 
then AC_DEFINE(HAVE___PROGNAME,1,[ ]) fi AC_CACHE_CHECK(which argv replacement method to use, pf_cv_argv_type, AC_EGREP_CPP(yes,[ #if defined(__GNU_HURD__) yes #endif ],pf_cv_argv_type="new", pf_cv_argv_type="writeable")) if test "$pf_cv_argv_type" = "new"; then pf_argv_set="PF_ARGV_NEW" fi if test "$pf_argv_set" = ""; then pf_argv_set="PF_ARGV_WRITEABLE" fi fi fi AC_DEFINE_UNQUOTED(PF_ARGV_TYPE, $pf_argv_set, mechanism to pretty-print ps output: setproctitle-equivalent) dnl End of tests borrowed from Proftpd AC_MSG_NOTICE(Sanitizing prefix: ${prefix}) case $prefix in NONE) prefix=/usr dnl Fix default variables - "prefix" variable if not specified if test "$localstatedir" = "\${prefix}/var"; then localstatedir="/var" fi if test "$sysconfdir" = "\${prefix}/etc"; then sysconfdir="/etc" fi ;; esac AC_MSG_NOTICE(Sanitizing exec_prefix: ${exec_prefix}) case $exec_prefix in dnl For consistency with Heartbeat, map NONE->$prefix NONE) exec_prefix=$prefix;; prefix) exec_prefix=$prefix;; esac dnl Expand autoconf variables so that we dont end up with '${prefix}' dnl in #defines and python scripts dnl NOTE: Autoconf deliberately leaves them unexpanded to allow dnl make exec_prefix=/foo install dnl No longer being able to do this seems like no great loss to me... 
eval prefix="`eval echo ${prefix}`" eval exec_prefix="`eval echo ${exec_prefix}`" eval bindir="`eval echo ${bindir}`" eval sbindir="`eval echo ${sbindir}`" eval libexecdir="`eval echo ${libexecdir}`" eval datadir="`eval echo ${datadir}`" eval sysconfdir="`eval echo ${sysconfdir}`" eval sharedstatedir="`eval echo ${sharedstatedir}`" eval localstatedir="`eval echo ${localstatedir}`" eval libdir="`eval echo ${libdir}`" eval includedir="`eval echo ${includedir}`" eval oldincludedir="`eval echo ${oldincludedir}`" eval infodir="`eval echo ${infodir}`" eval mandir="`eval echo ${mandir}`" -if test x"${CONFIGDIR}" = x""; then - CONFIGDIR="${sysconfdir}/sysconfig" -fi -AC_SUBST(CONFIGDIR) +AC_SUBST(LIBADD_DL) dnl extra flags for dynamic linking libraries dnl The Makefiles and shell scripts we output -AC_CONFIG_FILES([Makefile src/Makefile agent/Makefile man/Makefile agent/sbd src/sbd.service src/sbd_remote.service src/sbd.sh]) +AC_CONFIG_FILES([Makefile]) -dnl Now process the entire list of files added by previous +dnl Now process the entire list of files added by previous dnl calls to AC_CONFIG_FILES() AC_OUTPUT() diff --git a/tests/regressions.sh b/tests/regressions.sh index c8733e2..b06166d 100755 --- a/tests/regressions.sh +++ b/tests/regressions.sh @@ -1,182 +1,332 @@ #!/bin/bash # # Copyright (C) 2013 Lars Marowsky-Bree # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public # License as published by the Free Software Foundation; either # version 2 of the License, or (at your option) any later version. # # This software is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. 
# # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # # TODO: # - More tests # - Handle optional, long-running tests better # - Support for explicitly running a single test # - Verify output from commands # - Normalize uuids and device names so they are diffable # - Log to file, instead of syslog is needed # - How to test watch mode? # - Can the unit/service file be tested? or at least the wrapper? : ${SBD_BINARY:="/usr/sbin/sbd"} +: ${SBD_PRELOAD="libsbdtestbed.so"} +: ${SBD_USE_DM:="yes"} sbd() { - ${SBD_BINARY} $* + LD_PRELOAD=${SBD_PRELOAD} SBD_WATCHDOG_TIMEOUT=5 SBD_DEVICE="${SBD_DEVICE}" SBD_PRELOAD_LOG=${SBD_PRELOAD_LOG} SBD_WATCHDOG_DEV=/dev/watchdog setsid ${SBD_BINARY} -p ${SBD_PIDFILE} $* +} + +sbd_wipe_disk() { + dd if=/dev/zero of=$1 count=2048 2>/dev/null } sbd_setup() { trap sbd_teardown EXIT for N in $(seq 3) ; do F[$N]=$(mktemp /tmp/sbd.device.$N.XXXXXX) - R[$N]=$(echo ${F[$N]}|cut -f4 -d.) - dd if=/dev/zero of=${F[$N]} count=2048 - L[$N]=$(losetup -f) - losetup ${L[$N]} ${F[$N]} - D[$N]="/dev/mapper/sbd_${N}_${R[$N]}" - dmsetup create sbd_${N}_${R[$N]} --table "0 2048 linear ${L[$N]} 0" - dmsetup mknodes sbd_${N}_${R[$N]} + sbd_wipe_disk ${F[$N]} + if [[ "${SBD_USE_DM}" == "yes" ]]; then + R[$N]=$(echo ${F[$N]}|cut -f4 -d.) 
+ L[$N]=$(losetup -f) + losetup ${L[$N]} ${F[$N]} + D[$N]="/dev/mapper/sbd_${N}_${R[$N]}" + dmsetup create sbd_${N}_${R[$N]} --table "0 2048 linear ${L[$N]} 0" + dmsetup mknodes sbd_${N}_${R[$N]} + else + D[$N]=${F[$N]} + fi done + if [[ "${SBD_USE_DM}" != "yes" ]]; then + SBD_DEVICE="${F[1]};${F[2]};${F[3]}" + fi + SBD_PIDFILE=$(mktemp /tmp/sbd.pidfile.XXXXXX) + SBD_PRELOAD_LOG=$(mktemp /tmp/sbd.logfile.XXXXXX) } sbd_teardown() { for N in $(seq 3) ; do - dmsetup remove sbd_${N}_${R[$N]} - losetup -d ${L[$N]} + if [[ "${SBD_USE_DM}" == "yes" ]]; then + dmsetup remove sbd_${N}_${R[$N]} + losetup -d ${L[$N]} + fi rm -f ${F[$N]} + sbd_daemon_cleanup + rm -f ${SBD_PIDFILE} + rm -f ${SBD_PRELOAD_LOG} done } sbd_dev_fail() { - dmsetup wipe_table sbd_${1}_${R[$1]} + if [[ "${SBD_USE_DM}" == "yes" ]]; then + dmsetup wipe_table sbd_${1}_${R[$1]} + else + D[$1]=/tmp/fail123456789 + fi } sbd_dev_resume() { - dmsetup suspend sbd_${1}_${R[$1]} - dmsetup load sbd_${1}_${R[$1]} --table "0 2048 linear ${L[$1]} 0" - dmsetup resume sbd_${1}_${R[$1]} + if [[ "${SBD_USE_DM}" == "yes" ]]; then + dmsetup suspend sbd_${1}_${R[$1]} + dmsetup load sbd_${1}_${R[$1]} --table "0 2048 linear ${L[$1]} 0" + dmsetup resume sbd_${1}_${R[$1]} + else + D[$1]=${F[$1]} + fi +} + +sbd_daemon_cleanup() { + echo > ${SBD_PRELOAD_LOG} + pkill -TERM --pidfile ${SBD_PIDFILE} 2>/dev/null + sleep 5 + pkill -KILL --pidfile ${SBD_PIDFILE} 2>/dev/null + pkill -KILL --parent $(cat ${SBD_PIDFILE} 2>/dev/null) 2>/dev/null + echo > ${SBD_PIDFILE} } _ok() { echo -- $@ $@ rc=$? if [ $rc -ne 0 ]; then echo "$@ failed with $rc" exit $rc fi } _no() { echo -- $@ $@ rc=$? if [ $rc -eq 0 ]; then echo "$@ did NOT fail ($rc)" exit $rc fi return 0 } +_in_log() { + grep "$@" ${SBD_PRELOAD_LOG} >/dev/null + if [ $? 
-ne 0 ]; then + echo "didn't find '$@' in log:" + cat ${SBD_PRELOAD_LOG} + sbd_daemon_cleanup + exit 1 + fi +} + test_1() { echo "Creating three devices" _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} create _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} dump } test_2() { echo "Basic functionality" for S in `seq 2` ; do _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} allocate "test-$S" done _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -n test-1 message test-2 reset _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} list } test_3() { echo "Start mode (expected not to start, because reset was written in test_2)" _no sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -n test-2 -Z -Z -Z -S 1 watch } test_4() { echo "Deliver message with 1 failure" sbd_dev_fail 1 _no sbd -d ${D[1]} -n test-1 message test-2 exit _no sbd -d ${D[1]} -d ${D[2]} -n test-1 message test-2 exit _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -n test-1 message test-2 exit sbd_dev_resume 1 } test_5() { echo "Deliver message with 2 failures" sbd_dev_fail 1 sbd_dev_fail 2 _no sbd -d ${D[1]} -d ${D[2]} -n test-1 message test-2 exit _no sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -n test-1 message test-2 exit sbd_dev_resume 1 sbd_dev_resume 2 } test_6() { echo "Deliver message with 3 failures" sbd_dev_fail 1 sbd_dev_fail 2 sbd_dev_fail 3 _no sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -n test-1 message test-2 exit sbd_dev_resume 1 sbd_dev_resume 2 sbd_dev_resume 3 } test_101() { echo "Creating one device" _ok sbd -d ${D[1]} create } test_102() { echo "Creating two devices" _ok sbd -d ${D[1]} -d ${D[2]} create } test_7() { echo "Allocate all slots plus 1" _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -2 0 create for S in `seq 255` ; do _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} allocate "test-$S" done _no sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} allocate "test-256" } test_8() { echo "Non-existent device path" _no sbd -d /dev/kfdifdifdfdlfd -create 2>/dev/null } test_9() { echo "Basic sbd invocation" _no sbd _ok sbd -h } +test_watchdog() { + echo "Basic watchdog test" + echo > 
${SBD_PRELOAD_LOG} + sbd test-watchdog < /dev/null + _in_log "watchdog fired" +} + +test_stall_inquisitor() { + echo "Stall inquisitor test" + sbd_daemon_cleanup + sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -n test-1 watch + sleep 10 + _ok kill -0 $(cat ${SBD_PIDFILE}) + kill -STOP $(cat ${SBD_PIDFILE}) + sleep 10 + kill -CONT $(cat ${SBD_PIDFILE}) 2>/dev/null + _in_log "watchdog fired" +} + +test_wipe_slots1() { + echo "Wipe slots test (with watchdog)" + sbd_daemon_cleanup + sbd -d ${D[1]} -n test-1 watch + sleep 2 + sbd_wipe_disk ${D[1]} + sleep 15 + _in_log "watchdog fired" +} + +test_wipe_slots2() { + echo "Wipe slots test (without watchdog)" + sbd_daemon_cleanup + sbd -d ${D[1]} create + sbd -d ${D[1]} -w /dev/null -n test-1 watch + sleep 2 + sbd_wipe_disk ${D[1]} + sleep 15 + _in_log "sysrq-trigger ('b')" + _in_log "reboot (reboot)" +} + +test_message1() { + echo "Message test (reset)" + sbd_daemon_cleanup + sbd -d ${D[1]} create + sbd -d ${D[1]} -w /dev/null -n test-1 watch + sleep 2 + sbd -d ${D[1]} message test-1 reset + sleep 2 + _in_log "sysrq-trigger ('b')" + _in_log "reboot (reboot)" +} + +test_message2() { + echo "Message test (off)" + sbd_daemon_cleanup + sbd -d ${D[1]} create + sbd -d ${D[1]} -w /dev/null -n test-1 watch + sleep 2 + sbd -d ${D[1]} message test-1 off + sleep 2 + _in_log "sysrq-trigger ('o')" + _in_log "reboot (poweroff)" +} + +test_message3() { + echo "Message test (crashdump)" + sbd_daemon_cleanup + sbd -d ${D[1]} create + sbd -d ${D[1]} -w /dev/null -n test-1 watch + sleep 2 + sbd -d ${D[1]} message test-1 crashdump + sleep 2 + _in_log "sysrq-trigger ('c')" +} + +test_timeout_action1() { + echo "Timeout action test (off)" + sbd_daemon_cleanup + sbd -d ${D[1]} create + SBD_TIMEOUT_ACTION=off sbd -d ${D[1]} -w /dev/null -n test-1 watch + sleep 2 + sbd_wipe_disk ${D[1]} + sleep 10 + _in_log "sysrq-trigger ('o')" + _in_log "reboot (poweroff)" +} + +test_timeout_action2() { + echo "Timeout action test (crashdump)" + sbd_daemon_cleanup + sbd 
-d ${D[1]} create + SBD_TIMEOUT_ACTION=crashdump sbd -d ${D[1]} -w /dev/null -n test-1 watch + sleep 2 + sbd_wipe_disk ${D[1]} + sleep 10 + _in_log "sysrq-trigger ('c')" +} + sbd_setup -for T in $(seq 9); do +if [[ "${SBD_PRELOAD}" != "" ]]; then + SBD_DAEMON_TESTS="watchdog stall_inquisitor wipe_slots1 wipe_slots2 message1 message2 message3 timeout_action1 timeout_action2" +fi + +for T in $(seq 9) ${SBD_DAEMON_TESTS}; do if ! test_$T ; then echo "FAILURE: Test $T" break fi echo "SUCCESS: Test $T" done echo "SUCCESS: All tests completed" diff --git a/tests/sbd-testbed.c b/tests/sbd-testbed.c new file mode 100644 index 0000000..858b1be --- /dev/null +++ b/tests/sbd-testbed.c @@ -0,0 +1,729 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if __GLIBC_PREREQ(2,36) +#include +#else +#include + +typedef gboolean (*GUnixFDSourceFunc) (gint fd, + GIOCondition condition, + gpointer user_data); + +static gboolean +GIOFunc2GUnixFDSourceFunc(GIOChannel *source, + GIOCondition condition, + gpointer data) +{ + return ((GUnixFDSourceFunc) data) ( + g_io_channel_unix_get_fd(source), + condition, NULL); +} + +static guint +g_unix_fd_add(gint fd, + GIOCondition condition, + GUnixFDSourceFunc function, + gpointer user_data) +{ + GIOChannel *chan = g_io_channel_unix_new (fd); + + if (chan == NULL) { + return 0; + } else { + return g_io_add_watch(chan, + condition, + GIOFunc2GUnixFDSourceFunc, + (gpointer) function); + } +} +#endif + +typedef int (*orig_open_f_type)(const char *pathname, int flags, ...); +typedef int (*orig_ioctl_f_type)(int fd, unsigned long int request, ...); +typedef ssize_t (*orig_write_f_type)(int fd, const void *buf, size_t count); +typedef int (*orig_close_f_type)(int fd); +typedef FILE *(*orig_fopen_f_type)(const char *pathname, const char *mode); +typedef int (*orig_fclose_f_type)(FILE *fp); +typedef int 
(*orig_io_setup_f_type)(int nr_events, io_context_t *ctx_idp); +typedef int (*orig_io_submit_f_type)(io_context_t ctx_id, long nr, struct iocb *ios[]); +typedef int (*orig_io_getevents_f_type)(io_context_t ctx_id, long min_nr, long nr, + struct io_event *events, struct timespec *timeout); +typedef int (*orig_io_cancel_f_type)(io_context_t ctx_id, struct iocb *iocb, + struct io_event *result); + +static int is_init = 0; +static FILE *log_fp = NULL; + +static char *sbd_device[3] = {NULL, NULL, NULL}; +static int sbd_device_fd[3] = {-1, -1, -1}; + +static FILE *sysrq_fp = NULL; +static FILE *sysrq_trigger_fp = NULL; + +static char *watchdog_device = NULL; +static int watchdog_device_fd = -1; +static int watchdog_timeout = -1; +static pid_t watchdog_pid = -1; +static int watchdog_pipe[2] = {-1, -1}; +static guint watchdog_source_id = 0; +static int watchdog_timer_id = 0; + +static orig_open_f_type orig_open = NULL; +static orig_ioctl_f_type orig_ioctl = NULL; +static orig_write_f_type orig_write = NULL; +static orig_close_f_type orig_close = NULL; +static orig_fopen_f_type orig_fopen = NULL; +static orig_fclose_f_type orig_fclose = NULL; +static orig_io_setup_f_type orig_io_setup = NULL; +static orig_io_submit_f_type orig_io_submit = NULL; +static orig_io_getevents_f_type orig_io_getevents = NULL; +static orig_io_cancel_f_type orig_io_cancel = NULL; + +/* fprintf is inlined as __fprintf_chk or + * we have vfprintf. + * For fscanf we have vfscanf. + * For reboot we anyway don't want that to be + * called in any case. 
+ */ + +static struct iocb *pending_iocb = NULL; +struct io_context { int context_num; }; +static struct io_context our_io_context = {.context_num = 1}; +static int translate_aio = 0; + +static GMainLoop *mainloop = NULL; + +#if 0 +static void +watchdog_shutdown(int nsig) +{ + if (watchdog_timer_id > 0) { + fprintf(log_fp, "exiting with watchdog-timer armed\n"); + } +} +#endif + +static void* +dlsym_fatal(void *handle, const char *symbol) +{ + void *rv = dlsym(handle, symbol); + + if (!rv) { + fprintf(stderr, "Failed looking up symbol %s\n", symbol); + exit(1); + } + return rv; +} + +static void +init (void) +{ + void *handle; + + if (!is_init) { + const char *value; + int i; + char *token, *str, *str_orig; + + is_init = 1; + + orig_open = (orig_open_f_type)dlsym_fatal(RTLD_NEXT,"open"); + orig_ioctl = (orig_ioctl_f_type)dlsym_fatal(RTLD_NEXT,"ioctl"); + orig_close = (orig_close_f_type)dlsym_fatal(RTLD_NEXT,"close"); + orig_write = (orig_write_f_type)dlsym_fatal(RTLD_NEXT,"write"); + orig_fopen = (orig_fopen_f_type)dlsym_fatal(RTLD_NEXT,"fopen"); + orig_fclose = (orig_fclose_f_type)dlsym_fatal(RTLD_NEXT,"fclose"); + + handle = dlopen("libaio.so.1", RTLD_NOW); + if (!handle) { + fprintf(stderr, "Failed opening libaio.so.1\n"); + exit(1); + } + orig_io_setup = (orig_io_setup_f_type)dlsym_fatal(handle,"io_setup"); + orig_io_submit = (orig_io_submit_f_type)dlsym_fatal(handle,"io_submit"); + orig_io_getevents = (orig_io_getevents_f_type)dlsym_fatal(handle,"io_getevents"); + orig_io_cancel = (orig_io_cancel_f_type)dlsym_fatal(handle,"io_cancel"); + dlclose(handle); + + value = getenv("SBD_PRELOAD_LOG"); + if (value) { + log_fp = fopen(value, "a"); + } else { + int fd = dup(fileno(stderr)); + if (fd >= 0) { + log_fp = fdopen(fd, "w"); + } + } + if (log_fp == NULL) { + fprintf(stderr, "couldn't open log-file\n"); + } + + value = getenv("SBD_WATCHDOG_DEV"); + if (value) { + watchdog_device = strdup(value); + } + + value = getenv("SBD_DEVICE"); + if ((value) && (str = 
str_orig = strdup(value))) { + for (i = 0; i < 3; i++, str = NULL) { + token = strtok(str, ";"); + if (token == NULL) { + break; + } + sbd_device[i] = strdup(token); + } + free(str_orig); + } + + value = getenv("SBD_TRANSLATE_AIO"); + if ((value) && !strcmp(value, "yes")) { + translate_aio = 1; + } + } +} + +// ***** end - handling of watchdog & block-devices **** + +static gboolean +watchdog_timeout_notify(gpointer data) +{ + fprintf(log_fp, "watchdog fired after %ds - killing process group\n", + watchdog_timeout); + fclose(log_fp); + log_fp = NULL; + killpg(0, SIGKILL); + exit(1); +} + +static gboolean +watchdog_dispatch_callback (gint fd, + GIOCondition condition, + gpointer user_data) +{ + char buf[256]; + int i = 0; + + if (condition & G_IO_HUP) { + return FALSE; + } + if (watchdog_timer_id > 0) { + g_source_remove(watchdog_timer_id); + } + watchdog_timer_id = 0; + for (i = 0; i < sizeof(buf)-1; i++) { + ssize_t len; + + do { + len = read(watchdog_pipe[0], &buf[i], 1); + } while ((len == -1) && (errno == EINTR)); + if (len <= 0) { + if (len == -1) { + fprintf(log_fp, "Couldn't read from watchdog-pipe\n"); + } + buf[i] = '\0'; + break; + } + if (buf[i] == '\n') { + buf[i] = '\0'; + break; + } + } + buf[sizeof(buf)-1] = '\0'; + if (sscanf(buf, "trigger %ds", &watchdog_timeout) == 1) { + watchdog_timer_id = g_timeout_add(watchdog_timeout * 1000, watchdog_timeout_notify, NULL); + } else if (strcmp(buf, "disarm") == 0) { + // timer is stopped already + } else { + fprintf(log_fp, "unknown watchdog command\n"); + } + return TRUE; +} + +static void +watchdog_arm (void) { + char buf[256]; + + if ((watchdog_timeout > 0) && (watchdog_pipe[1] >= 0)) { + sprintf(buf, "trigger %ds\n", watchdog_timeout); + if (write(watchdog_pipe[1], buf, strlen(buf)) != strlen(buf)) { + fprintf(log_fp, "Failed tickling watchdog via pipe\n"); + } + } +} + +static void +watchdog_disarm (void) { + char buf[256]; + + watchdog_timeout = -1; + if (watchdog_pipe[1] >= 0) { + sprintf(buf, 
"disarm\n"); + if (write(watchdog_pipe[1], buf, strlen(buf)) != strlen(buf)) { + fprintf(log_fp, "Failed disarming watchdog via pipe\n"); + } + } +} + +int +open(const char *pathname, int flags, ...) +{ + int i, fd; + int devnum = -1; + int is_wd_dev = 0; + va_list ap; + + init(); + + for (i=0; i < 3; i++) { + if (sbd_device[i]) { + if (strcmp(sbd_device[i], pathname) == 0) { + devnum = i; + flags &= ~O_DIRECT; + break; + } + } + } + if (watchdog_device) { + if (strcmp(watchdog_device, pathname) == 0) { + is_wd_dev = 1; + if (watchdog_pipe[1] == -1) { + if (pipe(watchdog_pipe) == -1) { + fprintf(log_fp, "Creating pipe for watchdog failed\n"); + } else { + int i; + + watchdog_pid = fork(); + switch (watchdog_pid) { + case -1: + fprintf(log_fp, "Forking watchdog-child failed\n"); + break; + case 0: + free(watchdog_device); + watchdog_device = NULL; + for (i = 0; i < 3; i++) { + free(sbd_device[i]); + sbd_device[i] = NULL; + } + close(watchdog_pipe[1]); + if (fcntl(watchdog_pipe[0], F_SETFL, O_NONBLOCK) == -1) { + // don't block on read for timer to be handled + fprintf(log_fp, + "Failed setting watchdog-pipe-read to non-blocking"); + } + mainloop = g_main_loop_new(NULL, FALSE); + // mainloop_add_signal(SIGTERM, watchdog_shutdown); + // mainloop_add_signal(SIGINT, watchdog_shutdown); + watchdog_source_id = g_unix_fd_add(watchdog_pipe[0], + G_IO_IN, + watchdog_dispatch_callback, + NULL); + if (watchdog_source_id == 0) { + fprintf(log_fp, "Failed creating source for watchdog-pipe\n"); + exit(1); + } + g_main_loop_run(mainloop); + g_main_loop_unref(mainloop); + exit(0); + default: + close(watchdog_pipe[0]); + if (fcntl(watchdog_pipe[1], F_SETFL, O_NONBLOCK) == -1) { + fprintf(log_fp, + "Failed setting watchdog-pipe-write to non-blocking"); + } + } + } + } + pathname = "/dev/null"; + } + } + + va_start (ap, flags); + fd = (flags & (O_CREAT +#ifdef O_TMPFILE + | O_TMPFILE +#endif + ))? 
+ orig_open(pathname, flags, va_arg(ap, mode_t)): + orig_open(pathname, flags); + va_end (ap); + + if (devnum >= 0) { + sbd_device_fd[devnum] = fd; + } else if (is_wd_dev) { + watchdog_device_fd = fd; + } + + return fd; +} + +ssize_t +write(int fd, const void *buf, size_t count) +{ + init(); + + if ((fd == watchdog_device_fd) && (count >= 1)) { + if (*(const char *)buf == 'V') { + watchdog_disarm(); + } else { + watchdog_arm(); + } + } + + return orig_write(fd, buf, count); +} + +int +ioctl(int fd, unsigned long int request, ...) +{ + int rv = -1; + va_list ap; + int i; + + init(); + + va_start(ap, request); + switch (request) { + case BLKSSZGET: + for (i=0; i < 3; i++) { + if (sbd_device_fd[i] == fd) { + rv = 0; + *(va_arg(ap, int *)) = 512; + break; + } + if (i == 2) { + rv = orig_ioctl(fd, request, va_arg(ap, int *)); + } + } + break; + case WDIOC_SETTIMEOUT: + if (fd == watchdog_device_fd) { + watchdog_timeout = *va_arg(ap, int *); + + watchdog_arm(); + rv = 0; + break; + } + rv = orig_ioctl(fd, request, va_arg(ap, int *)); + break; + case WDIOC_SETOPTIONS: + if (fd == watchdog_device_fd) { + int flags = *va_arg(ap, int *); + + if (flags & WDIOS_DISABLECARD) { + watchdog_disarm(); + } + rv = 0; + break; + } + rv = orig_ioctl(fd, request, va_arg(ap, int *)); + break; + case WDIOC_GETSUPPORT: + rv = orig_ioctl(fd, request, va_arg(ap, struct watchdog_info *)); + break; + default: + fprintf(log_fp, "ioctl using unknown request = 0x%08lx", request); + rv = orig_ioctl(fd, request, va_arg(ap, void *)); + } + va_end(ap); + + return rv; +} + +int +close(int fd) +{ + int i; + + init(); + + if (fd == watchdog_device_fd) { + watchdog_device_fd = -1; + } else { + for (i = 0; i < 3; i++) { + if (sbd_device_fd[i] == fd) { + sbd_device_fd[i] = -1; + break; + } + } + } + return orig_close(fd); +} + +// ***** end - handling of watchdog & block-devices **** + +// ***** handling of sysrq, sysrq-trigger & reboot **** + +FILE * +fopen(const char *pathname, const char *mode) +{ + int 
is_sysrq = 0; + int is_sysrq_trigger = 0; + FILE *fp; + + init(); + + if ((strcmp("/proc/sys/kernel/sysrq", pathname) == 0) && + strcmp("w", mode)) { + pathname = "/dev/null"; + is_sysrq = 1; + } else if (strcmp("/proc/sysrq-trigger", pathname) == 0) { + pathname = "/dev/null"; + is_sysrq_trigger = 1; + } + fp = orig_fopen(pathname, mode); + if (is_sysrq) { + sysrq_fp = fp; + } else if (is_sysrq_trigger) { + sysrq_trigger_fp = fp; + } + return fp; +} + +int +fclose(FILE *fp) +{ + init(); + + if (fp == sysrq_fp) { + sysrq_fp = NULL; + } else if (fp == sysrq_trigger_fp) { + sysrq_trigger_fp = NULL; + } + return orig_fclose(fp); +} + +#if defined(__USE_FORTIFY_LEVEL) && (__USE_FORTIFY_LEVEL > 1) +int +__fprintf_chk(FILE *stream, int flag, const char *format, ...) +#else +int +fprintf(FILE *stream, const char *format, ...) +#endif +{ + va_list ap; + int rv; + + init(); + va_start (ap, format); + if (stream == sysrq_trigger_fp) { + char buf[256]; + + rv = vsnprintf(buf, sizeof(buf), format, ap); + if (rv >= 1) { + fprintf(log_fp, "sysrq-trigger ('%c') - %s\n", buf[0], + (buf[0] == 'c')?"killing process group":"don't kill but wait for reboot-call"); + if (buf[0] == 'c') { + fclose(log_fp); + log_fp = NULL; + killpg(0, SIGKILL); + exit(1); + } + } + } else { + rv = vfprintf(stream, format, ap); + } + va_end (ap); + return rv; +} + +int +fscanf(FILE *stream, const char *format, ...) 
+{ + va_list ap; + int rv; + + init(); + va_start (ap, format); + rv = vfscanf(stream, format, ap); + va_end (ap); + return rv; +} + +int +reboot (int __howto) +{ + fprintf(log_fp, "reboot (%s) - exiting inquisitor process\n", + (__howto == RB_POWER_OFF)?"poweroff":"reboot"); + fclose(log_fp); + log_fp = NULL; + killpg(0, SIGKILL); + exit(1); +} + +// ***** end - handling of sysrq, sysrq-trigger & reboot **** + +// ***** aio translate **** + +#if 0 +struct iocb { + void *data; + unsigned key; + short aio_lio_opcode; + short aio_reqprio; + int aio_fildes; +}; + +static inline void io_prep_pread(struct iocb *iocb, int fd, void *buf, size_t count, long long offset) +{ + memset(iocb, 0, sizeof(*iocb)); + iocb->aio_fildes = fd; + iocb->aio_lio_opcode = IO_CMD_PREAD; + iocb->aio_reqprio = 0; + iocb->u.c.buf = buf; + iocb->u.c.nbytes = count; + iocb->u.c.offset = offset; +} + +static inline void io_prep_pwrite(struct iocb *iocb, int fd, void *buf, size_t count, long long offset) +{ + memset(iocb, 0, sizeof(*iocb)); + iocb->aio_fildes = fd; + iocb->aio_lio_opcode = IO_CMD_PWRITE; + iocb->aio_reqprio = 0; + iocb->u.c.buf = buf; + iocb->u.c.nbytes = count; + iocb->u.c.offset = offset; +} +#endif + +int io_setup(int nr_events, io_context_t *ctx_idp) +{ + init(); + + if (!translate_aio) { + return orig_io_setup(nr_events, ctx_idp); + } + + if (nr_events == 0) { + return EINVAL; + } + if (nr_events > 1) { + return EAGAIN; + } + if (ctx_idp == NULL) { + return EFAULT; + } + *ctx_idp = &our_io_context; + return 0; +} + + +int io_submit(io_context_t ctx_id, long nr, struct iocb *ios[]) +{ + init(); + + if (!translate_aio) { + return orig_io_submit(ctx_id, nr, ios); + } + + if ((pending_iocb != NULL) || + (nr > 1)) { + return EAGAIN; + } + if ((nr == 1) && ((ios == NULL) || (ios[0] == NULL))) { + return EFAULT; + } + if ((ctx_id != &our_io_context) || + (nr < 0) || + ((nr == 1) && + (ios[0]->aio_lio_opcode != IO_CMD_PREAD) && + (ios[0]->aio_lio_opcode != IO_CMD_PWRITE))) { + return 
EINVAL; + } + if ((fcntl(ios[0]->aio_fildes, F_GETFD) == -1) && (errno == EBADF)) { + return EBADF; + } + if (nr == 1) { + pending_iocb = ios[0]; + } + return nr; +} + +int io_getevents(io_context_t ctx_id, long min_nr, long nr, + struct io_event *events, struct timespec *timeout) +{ + init(); + + if (!translate_aio) { + return orig_io_getevents(ctx_id, min_nr, nr, events, timeout); + } + + if ((ctx_id != &our_io_context) || + (min_nr != 1) || + (nr != 1)) { + return EINVAL; + } + if (pending_iocb == NULL) { + return 0; + } + + switch (pending_iocb->aio_lio_opcode) { + case IO_CMD_PWRITE: + events->res = pwrite(pending_iocb->aio_fildes, + pending_iocb->u.c.buf, + pending_iocb->u.c.nbytes, + pending_iocb->u.c.offset); + break; + case IO_CMD_PREAD: + events->res = pread(pending_iocb->aio_fildes, + pending_iocb->u.c.buf, + pending_iocb->u.c.nbytes, + pending_iocb->u.c.offset); + break; + default: + events->res = 0; + } + + events->data = pending_iocb->data; + events->obj = pending_iocb; + + events->res2 = 0; + pending_iocb = NULL; + return 1; +} + +int io_cancel(io_context_t ctx_id, struct iocb *iocb, + struct io_event *result) +{ + init(); + + if (!translate_aio) { + return orig_io_cancel(ctx_id, iocb, result); + } + + if (ctx_id != &our_io_context) { + return EINVAL; + } + if ((iocb == NULL) || (result == NULL)) { + return EFAULT; + } + if (pending_iocb != iocb) { + return EAGAIN; + } + result->data = iocb->data; + result->obj = iocb; + result->res = 0; + result->res2 = 0; + pending_iocb = NULL; + return 0; +} + +// ***** end - aio translate ****