diff --git a/configure.ac b/configure.ac index 23547cf..11d12f0 100644 --- a/configure.ac +++ b/configure.ac @@ -1,302 +1,306 @@ dnl dnl autoconf for Agents dnl dnl License: GNU General Public License (GPL) dnl =============================================== dnl Bootstrap dnl =============================================== AC_PREREQ(2.63) dnl Suggested structure: dnl information on the package dnl checks for programs dnl checks for libraries dnl checks for header files dnl checks for types dnl checks for structures dnl checks for compiler characteristics dnl checks for library functions dnl checks for system services AC_INIT([sbd], [1.4.1], [lmb@suse.com]) m4_include([tests-opt.m4]) AC_CANONICAL_HOST AC_CONFIG_AUX_DIR(.) AC_CONFIG_HEADERS(config.h) m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([no])]) AM_INIT_AUTOMAKE(1.11.1 foreign TESTS_OPTION) AM_PROG_CC_C_O PKG_CHECK_MODULES(glib, [glib-2.0]) PKG_CHECK_MODULES(libxml, [libxml-2.0]) PKG_CHECK_MODULES(cmap, [libcmap], HAVE_cmap=1, HAVE_cmap=0) PKG_CHECK_MODULES(votequorum, [libvotequorum], HAVE_votequorum=1, HAVE_votequorum=0) dnl pacemaker > 1.1.8 PKG_CHECK_MODULES(pacemaker, [pacemaker, pacemaker-cib], HAVE_pacemaker=1, HAVE_pacemaker=0) dnl pacemaker <= 1.1.8 PKG_CHECK_MODULES(pcmk, [pcmk, pcmk-cib], HAVE_pcmk=1, HAVE_pcmk=0) PKG_CHECK_MODULES(libqb, [libqb]) CPPFLAGS="$CPPFLAGS -Werror $glib_CFLAGS $libxml_CFLAGS" LIBS="$LIBS $glib_LIBS $libxml_LIBS" if test $HAVE_pacemaker = 0 -a $HAVE_pcmk = 0; then AC_MSG_ERROR(No package 'pacemaker' found) elif test $HAVE_pacemaker = 1; then CPPFLAGS="$CPPFLAGS $glib_CFLAGS $pacemaker_CFLAGS" if test $HAVE_cmap = 0; then AC_MSG_NOTICE(No library 'cmap' found) else CPPFLAGS="$CPPFLAGS $cmap_CFLAGS" LIBS="$LIBS $cmap_LIBS" fi if test $HAVE_votequorum = 0; then AC_MSG_NOTICE(No library 'votequorum' found) else CPPFLAGS="$CPPFLAGS $votequorum_CFLAGS" LIBS="$LIBS $votequorum_LIBS" fi fi CPPFLAGS="$CPPFLAGS $libqb_CFLAGS $pacemaker_CFLAGS $pcmk_CFLAGS" LIBS="$LIBS $libqb_LIBS $pacemaker_LIBS $pcmk_LIBS" dnl checks for libraries AC_CHECK_LIB(c, dlopen) dnl if dlopen is in libc... AC_CHECK_LIB(dl, dlopen) dnl -ldl (for Linux) AC_CHECK_LIB(aio, io_setup, , missing="yes") AC_CHECK_LIB(qb, qb_ipcs_connection_auth_set, , missing="yes") AC_CHECK_LIB(cib, cib_new, , missing="yes") AC_CHECK_LIB(crmcommon, set_crm_log_level, , missing="yes") AC_CHECK_LIB(pe_status, pe_find_node, , missing="yes") AC_CHECK_LIB(pe_rules, test_rule, , missing="yes") AC_CHECK_LIB(crmcluster, crm_peer_init, , missing="yes") AC_CHECK_LIB(uuid, uuid_unparse, , missing="yes") AC_CHECK_LIB(cmap, cmap_initialize, , HAVE_cmap=0) AC_CHECK_LIB(votequorum, votequorum_getinfo, , HAVE_votequorum=0) +AC_CHECK_LIB(crmcommon, pcmk_pacemakerd_api_ping, HAVE_pacemakerd_api=1, HAVE_pacemakerd_api=0) dnl pacemaker >= 1.1.8 AC_CHECK_HEADERS(crm/cluster.h) AC_CHECK_LIB(crmcommon, pcmk_strerror, , missing="yes") AC_CHECK_LIB(cib, cib_apply_patch_event, , missing="yes") dnl pacemaker-2.0 removed support for corosync 1 cluster layer AC_CHECK_DECLS([pcmk_cluster_classic_ais, pcmk_cluster_cman],,, [#include ]) dnl check for additional no-quorum-policies dnl AC_TEST_NO_QUORUM_POLICY(POLICY) AC_DEFUN([AC_TEST_NO_QUORUM_POLICY],[ AC_MSG_CHECKING([whether enum pe_quorum_policy defines value $1]) AC_LANG_PUSH([C]) AC_COMPILE_IFELSE([AC_LANG_PROGRAM( [#include ], [enum pe_quorum_policy policy = $1; return policy;])], AC_DEFINE_UNQUOTED(m4_toupper(HAVE_ENUM_$1), 1, [Does pe_types.h have $1 value in enum pe_quorum_policy?]) AC_MSG_RESULT([yes]), AC_MSG_RESULT([no])) AC_LANG_POP([C]) ]) AC_TEST_NO_QUORUM_POLICY(no_quorum_demote) dnl check for new pe-API AC_CHECK_FUNCS(pe_new_working_set) dnl check if votequorum comes with default for qdevice-sync_timeout AC_CHECK_DECLS([VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT], HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT=1, HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT=0, [#include ]) if test "$missing" = "yes"; then AC_MSG_ERROR([Missing required libraries or functions.]) fi AC_PATH_PROGS(POD2MAN, pod2man, pod2man) AC_ARG_ENABLE([shared-disk], [ --enable-shared-disk Turn on functionality that requires shared disk [default=yes]]) DISK=0 if test "x${enable_shared_disk}" != xno ; then DISK=1 fi AC_DEFINE_UNQUOTED(SUPPORT_SHARED_DISK, $DISK, Turn on functionality that requires shared disk) AM_CONDITIONAL(SUPPORT_SHARED_DISK, test "$DISK" = "1") if test -e /proc/$$ then echo "/proc/{pid} is supported" AC_DEFINE_UNQUOTED(HAVE_PROC_PID, 1, Define to 1 if /proc/{pid} is supported.) fi AC_DEFINE_UNQUOTED(CHECK_TWO_NODE, $HAVE_cmap, Turn on checking for 2-node cluster) AM_CONDITIONAL(CHECK_TWO_NODE, test "$HAVE_cmap" = "1") AC_DEFINE_UNQUOTED(CHECK_VOTEQUORUM_HANDLE, $HAVE_votequorum, Turn on periodic checking of votequorum-handle) AM_CONDITIONAL(CHECK_VOTEQUORUM_HANDLE, test "$HAVE_votequorum" = "1") AC_DEFINE_UNQUOTED(CHECK_QDEVICE_SYNC_TIMEOUT, ($HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT && $HAVE_cmap), Turn on checking if watchdog-timeout and qdevice-sync_timeout are matching) AM_CONDITIONAL(CHECK_QDEVICE_SYNC_TIMEOUT, test "$HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT" = "1" && test "$HAVE_cmap" = "1") +AC_DEFINE_UNQUOTED(USE_PACEMAKERD_API, $HAVE_pacemakerd_api, Turn on synchronization between sbd & pacemakerd) +AM_CONDITIONAL(USE_PACEMAKERD_API, test "$HAVE_pacemakerd_api" = "1") + CONFIGDIR="" AC_ARG_WITH(configdir, [ --with-configdir=DIR Directory for SBD configuration file [${CONFIGDIR}]], [ CONFIGDIR="$withval" ] ) # # Where is dlopen? # if test "$ac_cv_lib_c_dlopen" = yes; then LIBADD_DL="" elif test "$ac_cv_lib_dl_dlopen" = yes; then LIBADD_DL=-ldl else LIBADD_DL=${lt_cv_dlopen_libs} fi dnl ********************************************************************** dnl Check for various argv[] replacing functions on various OSs dnl dnl Borrowed from Proftpd dnl Proftpd is Licenced under the terms of the GNU General Public Licence dnl and is available from http://www.proftpd.org/ dnl AC_CHECK_FUNCS(setproctitle) AC_CHECK_HEADERS(libutil.h) AC_CHECK_LIB(util, setproctitle, [AC_DEFINE(HAVE_SETPROCTITLE,1,[ ]) ac_cv_func_setproctitle="yes" ; LIBS="$LIBS -lutil"]) if test "$ac_cv_func_setproctitle" = "yes"; then pf_argv_set="PF_ARGV_NONE" fi if test "$pf_argv_set" = ""; then AC_CHECK_HEADERS(sys/pstat.h) if test "$ac_cv_header_pstat_h" = "yes"; then AC_CHECK_FUNCS(pstat) if test "$ac_cv_func_pstat" = "yes"; then pf_argv_set="PF_ARGV_PSTAT" else pf_argv_set="PF_ARGV_WRITEABLE" fi fi if test "$pf_argv_set" = ""; then AC_EGREP_HEADER([#define.*PS_STRINGS.*],sys/exec.h, have_psstrings="yes",have_psstrings="no") if test "$have_psstrings" = "yes"; then pf_argv_set="PF_ARGV_PSSTRINGS" fi fi if test "$pf_argv_set" = ""; then AC_CACHE_CHECK(whether __progname and __progname_full are available, pf_cv_var_progname, AC_TRY_LINK([extern char *__progname, *__progname_full;], [__progname = "foo"; __progname_full = "foo bar";], pf_cv_var_progname="yes", pf_cv_var_progname="no")) if test "$pf_cv_var_progname" = "yes"; then AC_DEFINE(HAVE___PROGNAME,1,[ ]) fi AC_CACHE_CHECK(which argv replacement method to use, pf_cv_argv_type, AC_EGREP_CPP(yes,[ #if defined(__GNU_HURD__) yes #endif ],pf_cv_argv_type="new", pf_cv_argv_type="writeable")) if test "$pf_cv_argv_type" = "new"; then pf_argv_set="PF_ARGV_NEW" fi if test "$pf_argv_set" = ""; then pf_argv_set="PF_ARGV_WRITEABLE" fi fi fi AC_DEFINE_UNQUOTED(PF_ARGV_TYPE, $pf_argv_set, mechanism to pretty-print ps output: setproctitle-equivalent) dnl End of tests borrowed from Proftpd AC_MSG_NOTICE(Sanitizing prefix: ${prefix}) case $prefix in NONE) prefix=/usr dnl Fix default variables - "prefix" variable if not specified if test "$localstatedir" = "\${prefix}/var"; then localstatedir="/var" fi if test "$sysconfdir" = "\${prefix}/etc"; then sysconfdir="/etc" fi ;; esac AC_MSG_NOTICE(Sanitizing exec_prefix: ${exec_prefix}) case $exec_prefix in dnl For consistency with Heartbeat, map NONE->$prefix NONE) exec_prefix=$prefix;; prefix) exec_prefix=$prefix;; esac dnl Expand autoconf variables so that we dont end up with '${prefix}' dnl in #defines and python scripts dnl NOTE: Autoconf deliberately leaves them unexpanded to allow dnl make exec_prefix=/foo install dnl No longer being able to do this seems like no great loss to me... eval prefix="`eval echo ${prefix}`" eval exec_prefix="`eval echo ${exec_prefix}`" eval bindir="`eval echo ${bindir}`" eval sbindir="`eval echo ${sbindir}`" eval libexecdir="`eval echo ${libexecdir}`" eval datadir="`eval echo ${datadir}`" eval sysconfdir="`eval echo ${sysconfdir}`" eval sharedstatedir="`eval echo ${sharedstatedir}`" eval localstatedir="`eval echo ${localstatedir}`" eval libdir="`eval echo ${libdir}`" eval includedir="`eval echo ${includedir}`" eval oldincludedir="`eval echo ${oldincludedir}`" eval infodir="`eval echo ${infodir}`" eval mandir="`eval echo ${mandir}`" AC_SUBST(LIBADD_DL) dnl extra flags for dynamic linking libraries if test x"${CONFIGDIR}" = x""; then CONFIGDIR="${sysconfdir}/sysconfig" fi AC_SUBST(CONFIGDIR) dnl The Makefiles and shell scripts we output AC_CONFIG_FILES([Makefile src/Makefile agent/Makefile man/Makefile agent/sbd src/sbd.service src/sbd_remote.service src/sbd.sh]) AC_CONFIG_SUBDIRS([tests]) dnl Now process the entire list of files added by previous dnl calls to AC_CONFIG_FILES() AC_OUTPUT() diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c index 52ede8a..962725e 100644 --- a/src/sbd-inquisitor.c +++ b/src/sbd-inquisitor.c @@ -1,1278 +1,1298 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include #include "sbd.h" #define LOCKSTRLEN 11 static struct servants_list_item *servants_leader = NULL; int disk_priority = 1; int check_pcmk = 1; int check_cluster = 1; int disk_count = 0; int servant_count = 0; int servant_restart_interval = 5; int servant_restart_count = 1; int start_mode = 0; char* pidfile = NULL; bool do_flush = true; char timeout_sysrq_char = 'b'; bool move_to_root_cgroup = true; bool enforce_moving_to_root_cgroup = false; +bool sync_resource_startup = false; int parse_device_line(const char *line); void recruit_servant(const char *devname, pid_t pid) { struct servants_list_item *s = servants_leader; struct servants_list_item *newbie; if (lookup_servant_by_dev(devname)) { cl_log(LOG_DEBUG, "Servant %s already exists", devname); return; } newbie = malloc(sizeof(*newbie)); if (newbie) { memset(newbie, 0, sizeof(*newbie)); newbie->devname = strdup(devname); newbie->pid = pid; newbie->first_start = 1; } if (!newbie || !newbie->devname) { fprintf(stderr, "heap allocation failed in recruit_servant.\n"); exit(1); } /* some sanity-check on our newbie */ if (sbd_is_disk(newbie)) { cl_log(LOG_INFO, "Monitoring %s", devname); disk_count++; } else if (sbd_is_pcmk(newbie) || sbd_is_cluster(newbie)) { /* alive just after pcmk and cluster servants have shown up */ newbie->outdated = 1; } else { /* toss our newbie */ cl_log(LOG_ERR, "Refusing to recruit unrecognized servant %s", devname); free((void *) newbie->devname); free(newbie); return; } if (!s) { servants_leader = newbie; } else { while (s->next) s = s->next; s->next = newbie; } servant_count++; } int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp) { pid_t pid = 0; int rc = 0; pid = fork(); if (pid == 0) { /* child */ maximize_priority(); sbd_set_format_string(QB_LOG_SYSLOG, devname); rc = (*functionp)(devname, mode, argp); if (rc == -1) exit(1); else exit(0); } else if (pid != -1) { /* parent */ return pid; } else { cl_log(LOG_ERR,"Failed to fork servant"); exit(1); } } struct servants_list_item *lookup_servant_by_dev(const char *devname) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { if (strcasecmp(s->devname, devname) == 0) break; } return s; } struct servants_list_item *lookup_servant_by_pid(pid_t pid) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { if (s->pid == pid) break; } return s; } int check_all_dead(void) { struct servants_list_item *s; int r = 0; union sigval svalue; for (s = servants_leader; s; s = s->next) { if (s->pid != 0) { r = sigqueue(s->pid, 0, svalue); if (r == -1 && errno == ESRCH) continue; return 0; } } return 1; } void servant_start(struct servants_list_item *s) { int r = 0; union sigval svalue; if (s->pid != 0) { r = sigqueue(s->pid, 0, svalue); if ((r != -1 || errno != ESRCH)) return; } s->restarts++; if (sbd_is_disk(s)) { #if SUPPORT_SHARED_DISK DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname); s->pid = assign_servant(s->devname, servant_md, start_mode, s); #else cl_log(LOG_ERR, "Shared disk functionality not supported"); return; #endif } else if(sbd_is_pcmk(s)) { DBGLOG(LOG_INFO, "Starting Pacemaker servant"); s->pid = assign_servant(s->devname, servant_pcmk, start_mode, NULL); } else if(sbd_is_cluster(s)) { DBGLOG(LOG_INFO, "Starting Cluster servant"); s->pid = assign_servant(s->devname, servant_cluster, start_mode, NULL); } else { cl_log(LOG_ERR, "Unrecognized servant: %s", s->devname); } clock_gettime(CLOCK_MONOTONIC, &s->t_started); return; } void servants_start(void) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { s->restarts = 0; servant_start(s); } } void servants_kill(void) { struct servants_list_item *s; union sigval svalue; for (s = servants_leader; s; s = s->next) { if (s->pid != 0) sigqueue(s->pid, SIGKILL, svalue); } } static inline void cleanup_servant_by_pid(pid_t pid) { struct servants_list_item* s; s = lookup_servant_by_pid(pid); if (s) { cl_log(LOG_WARNING, "Servant for %s (pid: %i) has terminated", s->devname, s->pid); s->pid = 0; } else { /* This most likely is a stray signal from somewhere, or * a SIGCHLD for a process that has previously * explicitly disconnected. */ DBGLOG(LOG_INFO, "cleanup_servant: Nothing known about pid %i", pid); } } int inquisitor_decouple(void) { pid_t ppid = getppid(); union sigval signal_value; /* During start-up, we only arm the watchdog once we've got * quorum at least once. */ if (watchdog_use) { if (watchdog_init() < 0) { return -1; } } if (ppid > 1) { sigqueue(ppid, SIG_LIVENESS, signal_value); } return 0; } static int sbd_lock_running(long pid) { int rc = 0; long mypid; int running = 0; char proc_path[PATH_MAX], exe_path[PATH_MAX], myexe_path[PATH_MAX]; /* check if pid is running */ if (kill(pid, 0) < 0 && errno == ESRCH) { goto bail; } #ifndef HAVE_PROC_PID return 1; #endif /* check to make sure pid hasn't been reused by another process */ snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", pid); rc = readlink(proc_path, exe_path, PATH_MAX-1); if(rc < 0) { cl_perror("Could not read from %s", proc_path); goto bail; } exe_path[rc] = 0; mypid = (unsigned long) getpid(); snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", mypid); rc = readlink(proc_path, myexe_path, PATH_MAX-1); if(rc < 0) { cl_perror("Could not read from %s", proc_path); goto bail; } myexe_path[rc] = 0; if(strcmp(exe_path, myexe_path) == 0) { running = 1; } bail: return running; } static int sbd_lock_pidfile(const char *filename) { char lf_name[256], tf_name[256], buf[LOCKSTRLEN+1]; int fd; long pid, mypid; int rc; struct stat sbuf; if (filename == NULL) { errno = EFAULT; return -1; } mypid = (unsigned long) getpid(); snprintf(lf_name, sizeof(lf_name), "%s",filename); snprintf(tf_name, sizeof(tf_name), "%s.%lu", filename, mypid); if ((fd = open(lf_name, O_RDONLY)) >= 0) { if (fstat(fd, &sbuf) >= 0 && sbuf.st_size < LOCKSTRLEN) { sleep(1); /* if someone was about to create one, * give'm a sec to do so * Though if they follow our protocol, * this won't happen. They should really * put the pid in, then link, not the * other way around. */ } if (read(fd, buf, sizeof(buf)) < 1) { /* lockfile empty -> rm it and go on */; } else { if (sscanf(buf, "%ld", &pid) < 1) { /* lockfile screwed up -> rm it and go on */ } else { if (pid > 1 && (getpid() != pid) && sbd_lock_running(pid)) { /* is locked by existing process * -> give up */ close(fd); return -1; } else { /* stale lockfile -> rm it and go on */ } } } unlink(lf_name); close(fd); } if ((fd = open(tf_name, O_CREAT | O_WRONLY | O_EXCL, 0644)) < 0) { /* Hmmh, why did we fail? Anyway, nothing we can do about it */ return -3; } /* Slight overkill with the %*d format ;-) */ snprintf(buf, sizeof(buf), "%*lu\n", LOCKSTRLEN-1, mypid); if (write(fd, buf, LOCKSTRLEN) != LOCKSTRLEN) { /* Again, nothing we can do about this */ rc = -3; close(fd); goto out; } close(fd); switch (link(tf_name, lf_name)) { case 0: if (stat(tf_name, &sbuf) < 0) { /* something weird happened */ rc = -3; break; } if (sbuf.st_nlink < 2) { /* somehow, it didn't get through - NFS trouble? */ rc = -2; break; } rc = 0; break; case EEXIST: rc = -1; break; default: rc = -3; } out: unlink(tf_name); return rc; } /* * Unlock a file (remove its lockfile) * do we need to check, if its (still) ours? No, IMHO, if someone else * locked our line, it's his fault -tho * returns 0 on success * <0 if some failure occured */ static int sbd_unlock_pidfile(const char *filename) { char lf_name[256]; if (filename == NULL) { errno = EFAULT; return -1; } snprintf(lf_name, sizeof(lf_name), "%s", filename); return unlink(lf_name); } int cluster_alive(bool all) { int alive = 1; struct servants_list_item* s; if(servant_count == disk_count) { return 0; } for (s = servants_leader; s; s = s->next) { if (sbd_is_cluster(s) || sbd_is_pcmk(s)) { if(s->outdated) { alive = 0; } else if(all == false) { return 1; } } } return alive; } int quorum_read(int good_servants) { if (disk_count > 2) return (good_servants > disk_count/2); else return (good_servants > 0); } void inquisitor_child(void) { int sig, pid; sigset_t procmask; siginfo_t sinfo; int status; struct timespec timeout; int exiting = 0; int decoupled = 0; int cluster_appeared = 0; int pcmk_override = 0; time_t latency; struct timespec t_last_tickle, t_now; struct servants_list_item* s; if (debug_mode) { cl_log(LOG_ERR, "DEBUG MODE %d IS ACTIVE - DO NOT RUN IN PRODUCTION!", debug_mode); } set_proc_title("sbd: inquisitor"); if (pidfile) { if (sbd_lock_pidfile(pidfile) < 0) { exit(1); } } sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigaddset(&procmask, SIGTERM); sigaddset(&procmask, SIG_LIVENESS); sigaddset(&procmask, SIG_EXITREQ); sigaddset(&procmask, SIG_TEST); sigaddset(&procmask, SIG_PCMK_UNHEALTHY); sigaddset(&procmask, SIG_RESTART); sigaddset(&procmask, SIGUSR1); sigaddset(&procmask, SIGUSR2); sigprocmask(SIG_BLOCK, &procmask, NULL); servants_start(); timeout.tv_sec = timeout_loop; timeout.tv_nsec = 0; clock_gettime(CLOCK_MONOTONIC, &t_last_tickle); while (1) { bool tickle = 0; bool can_detach = 0; int good_servants = 0; sig = sigtimedwait(&procmask, &sinfo, &timeout); clock_gettime(CLOCK_MONOTONIC, &t_now); if (sig == SIG_EXITREQ || sig == SIGTERM) { servants_kill(); watchdog_close(true); exiting = 1; } else if (sig == SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } else { s = lookup_servant_by_pid(pid); if (sbd_is_disk(s)) { if (WIFEXITED(status)) { switch(WEXITSTATUS(status)) { case EXIT_MD_SERVANT_IO_FAIL: DBGLOG(LOG_INFO, "Servant for %s requests to be disowned", s->devname); break; case EXIT_MD_SERVANT_REQUEST_RESET: cl_log(LOG_WARNING, "%s requested a reset", s->devname); do_reset(); break; case EXIT_MD_SERVANT_REQUEST_SHUTOFF: cl_log(LOG_WARNING, "%s requested a shutoff", s->devname); do_off(); break; case EXIT_MD_SERVANT_REQUEST_CRASHDUMP: cl_log(LOG_WARNING, "%s requested a crashdump", s->devname); do_crashdump(); break; default: break; } } } else if (sbd_is_pcmk(s)) { if (WIFEXITED(status)) { switch(WEXITSTATUS(status)) { case EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN: DBGLOG(LOG_INFO, "PCMK-Servant has exited gracefully"); /* revert to state prior to pacemaker-detection */ s->restarts = 0; s->restart_blocked = 0; cluster_appeared = 0; s->outdated = 1; s->t_last.tv_sec = 0; break; default: break; } } } cleanup_servant_by_pid(pid); } } } else if (sig == SIG_PCMK_UNHEALTHY) { s = lookup_servant_by_pid(sinfo.si_pid); if (sbd_is_cluster(s) || sbd_is_pcmk(s)) { if (s->outdated == 0) { cl_log(LOG_WARNING, "%s health check: UNHEALTHY", s->devname); } s->t_last.tv_sec = 1; } else { cl_log(LOG_WARNING, "Ignoring SIG_PCMK_UNHEALTHY from unknown source"); } } else if (sig == SIG_LIVENESS) { s = lookup_servant_by_pid(sinfo.si_pid); if (s) { s->first_start = 0; clock_gettime(CLOCK_MONOTONIC, &s->t_last); } } else if (sig == SIG_TEST) { } else if (sig == SIGUSR1) { if (exiting) continue; servants_start(); } if (exiting) { if (check_all_dead()) { if (pidfile) { sbd_unlock_pidfile(pidfile); } exit(0); } else continue; } good_servants = 0; for (s = servants_leader; s; s = s->next) { int age = t_now.tv_sec - s->t_last.tv_sec; if (!s->t_last.tv_sec) continue; if (age < (int)(timeout_io+timeout_loop)) { if (sbd_is_disk(s)) { good_servants++; } if (s->outdated) { cl_log(LOG_NOTICE, "Servant %s is healthy (age: %d)", s->devname, age); } s->outdated = 0; } else if (!s->outdated) { if (!s->restart_blocked) { cl_log(LOG_WARNING, "Servant %s is outdated (age: %d)", s->devname, age); } s->outdated = 1; } } if(disk_count == 0) { /* NO disks, everything is up to the cluster */ if(cluster_alive(true)) { /* We LIVE! */ if(cluster_appeared == false) { cl_log(LOG_INFO, "Active cluster detected"); } tickle = 1; can_detach = 1; cluster_appeared = 1; } else if(cluster_alive(false)) { if(!decoupled) { /* On the way up, detach and arm the watchdog */ cl_log(LOG_INFO, "Partial cluster detected, detaching"); } can_detach = 1; tickle = !cluster_appeared; } else if(!decoupled) { /* Stay alive until the cluster comes up */ tickle = !cluster_appeared; } } else if(disk_priority == 1 || servant_count == disk_count) { if (quorum_read(good_servants)) { /* There are disks and we're connected to the majority of them */ tickle = 1; can_detach = 1; pcmk_override = 0; } else if (servant_count > disk_count && cluster_alive(true)) { tickle = 1; if(!pcmk_override) { cl_log(LOG_WARNING, "Majority of devices lost - surviving on pacemaker"); pcmk_override = 1; /* Only log this message once */ } } } else if(cluster_alive(true) && quorum_read(good_servants)) { /* Both disk and cluster servants are healthy */ tickle = 1; can_detach = 1; cluster_appeared = 1; } else if(quorum_read(good_servants)) { /* The cluster takes priority but only once * connected for the first time. * * Until then, we tickle based on disk quorum. */ can_detach = 1; tickle = !cluster_appeared; } /* cl_log(LOG_DEBUG, "Tickle: q=%d, g=%d, p=%d, s=%d", */ /* quorum_read(good_servants), good_servants, tickle, disk_count); */ if(tickle) { watchdog_tickle(); clock_gettime(CLOCK_MONOTONIC, &t_last_tickle); } if (!decoupled && can_detach) { /* We only do this at the point either the disk or * cluster servants become healthy */ cl_log(LOG_DEBUG, "Decoupling"); if (inquisitor_decouple() < 0) { servants_kill(); exiting = 1; continue; } else { decoupled = 1; } } /* Note that this can actually be negative, since we set * last_tickle after we set now. */ latency = t_now.tv_sec - t_last_tickle.tv_sec; if (timeout_watchdog && (latency > (int)timeout_watchdog)) { if (!decoupled) { /* We're still being watched by our * parent. We don't fence, but exit. */ cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up."); servants_kill(); exiting = 1; continue; } if (debug_mode < 2) { /* At level 2 or above, we do nothing, but expect * things to eventually return to * normal. */ do_timeout_action(); } else { cl_log(LOG_ERR, "SBD: DEBUG MODE: Would have fenced due to timeout!"); } } if (timeout_watchdog_warn && (latency > (int)timeout_watchdog_warn)) { cl_log(LOG_WARNING, "Latency: No liveness for %d s exceeds threshold of %d s (healthy servants: %d)", (int)latency, (int)timeout_watchdog_warn, good_servants); if (debug_mode && watchdog_use) { /* In debug mode, trigger a reset before the watchdog can panic the machine */ do_timeout_action(); } } for (s = servants_leader; s; s = s->next) { int age = t_now.tv_sec - s->t_started.tv_sec; if (age > servant_restart_interval) { s->restarts = 0; s->restart_blocked = 0; } if (servant_restart_count && (s->restarts >= servant_restart_count) && !s->restart_blocked) { if (servant_restart_count > 1) { cl_log(LOG_WARNING, "Max retry count (%d) reached: not restarting servant for %s", (int)servant_restart_count, s->devname); } s->restart_blocked = 1; } if (!s->restart_blocked) { servant_start(s); } } } /* not reached */ exit(0); } int inquisitor(void) { int sig, pid, inquisitor_pid; int status; sigset_t procmask; siginfo_t sinfo; /* Where's the best place for sysrq init ?*/ sysrq_init(); sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigaddset(&procmask, SIG_LIVENESS); sigprocmask(SIG_BLOCK, &procmask, NULL); inquisitor_pid = make_daemon(); if (inquisitor_pid == 0) { inquisitor_child(); } /* We're the parent. Wait for a happy signal from our child * before we proceed - we either get "SIG_LIVENESS" when the * inquisitor has completed the first successful round, or * ECHLD when it exits with an error. */ while (1) { sig = sigwaitinfo(&procmask, &sinfo); if (sig == SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } /* We got here because the inquisitor * did not succeed. */ return -1; } } else if (sig == SIG_LIVENESS) { /* Inquisitor started up properly. */ return 0; } else { fprintf(stderr, "Nobody expected the spanish inquisition!\n"); continue; } } /* not reached */ return -1; } int parse_device_line(const char *line) { size_t lpc = 0; size_t last = 0; size_t max = 0; int found = 0; bool skip_space = true; int space_run = 0; if (!line) { return 0; } max = strlen(line); cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", (int) max, line); for (lpc = 0; lpc <= max; lpc++) { if (isspace(line[lpc])) { if (skip_space) { last = lpc + 1; } else { space_run++; } continue; } skip_space = false; if (line[lpc] == ';' || line[lpc] == 0) { int rc = 0; char *entry = calloc(1, 1 + lpc - last); if (entry) { rc = sscanf(line + last, "%[^;]", entry); } else { fprintf(stderr, "Heap allocation failed parsing device-line.\n"); exit(1); } if (rc != 1) { cl_log(LOG_WARNING, "Could not parse: '%s'", line + last); } else { entry[strlen(entry)-space_run] = '\0'; cl_log(LOG_DEBUG, "Adding '%s'", entry); recruit_servant(entry, 0); found++; } free(entry); skip_space = true; last = lpc + 1; } space_run = 0; } return found; } #define SBD_SOURCE_FILES "sbd-cluster.c,sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,setproctitle.c" static void sbd_log_filter_ctl(const char *files, uint8_t priority) { if (files == NULL) { files = SBD_SOURCE_FILES; } qb_log_filter_ctl(QB_LOG_SYSLOG, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority); qb_log_filter_ctl(QB_LOG_STDERR, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority); } int arg_enabled(int arg_count) { return arg_count % 2; } int main(int argc, char **argv, char **envp) { int exit_status = 0; int c; int W_count = 0; int c_count = 0; int P_count = 0; int qb_facility; const char *value = NULL; bool delay_start = false; long delay = 0; char *timeout_action = NULL; if ((cmdname = strrchr(argv[0], '/')) == NULL) { cmdname = argv[0]; } else { ++cmdname; } watchdogdev = strdup("/dev/watchdog"); watchdogdev_is_default = true; qb_facility = qb_log_facility2int("daemon"); qb_log_init(cmdname, qb_facility, LOG_WARNING); sbd_set_format_string(QB_LOG_SYSLOG, "sbd"); qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE); qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE); sbd_log_filter_ctl(NULL, LOG_NOTICE); sbd_get_uname(); value = getenv("SBD_PACEMAKER"); if(value) { check_pcmk = crm_is_true(value); check_cluster = crm_is_true(value); } cl_log(LOG_INFO, "Enable pacemaker checks: %d (%s)", (int)check_pcmk, value?value:"default"); value = getenv("SBD_STARTMODE"); if(value == NULL) { } else if(strcmp(value, "clean") == 0) { start_mode = 1; } else if(strcmp(value, "always") == 0) { start_mode = 0; } cl_log(LOG_INFO, "Start mode set to: %d (%s)", (int)start_mode, value?value:"default"); value = getenv("SBD_WATCHDOG_DEV"); if(value) { free(watchdogdev); watchdogdev = strdup(value); watchdogdev_is_default = false; } /* SBD_WATCHDOG has been dropped from sbd.sysconfig example. * This is for backward compatibility. */ value = getenv("SBD_WATCHDOG"); if(value) { watchdog_use = crm_is_true(value); } value = getenv("SBD_WATCHDOG_TIMEOUT"); if(value) { timeout_watchdog = crm_get_msec(value) / 1000; if(timeout_watchdog > 5) { timeout_watchdog_warn = (int)timeout_watchdog / 5 * 3; } } value = getenv("SBD_PIDFILE"); if(value) { pidfile = strdup(value); cl_log(LOG_INFO, "pidfile set to %s", pidfile); } value = getenv("SBD_DELAY_START"); if(value) { delay_start = crm_is_true(value); if (!delay_start) { delay = crm_get_msec(value) / 1000; if (delay > 0) { delay_start = true; } } } cl_log(LOG_DEBUG, "Delay start: %s%s%s", delay_start? "yes (" : "no", delay_start? (delay > 0 ? value: "msgwait") : "", delay_start? ")" : ""); value = getenv("SBD_TIMEOUT_ACTION"); if(value) { timeout_action = strdup(value); } value = getenv("SBD_MOVE_TO_ROOT_CGROUP"); if(value) { move_to_root_cgroup = crm_is_true(value); if (move_to_root_cgroup) { enforce_moving_to_root_cgroup = true; } else { if (strcmp(value, "auto") == 0) { move_to_root_cgroup = true; } } } + value = getenv("SBD_SYNC_RESOURCE_STARTUP"); + if(value) { + sync_resource_startup = crm_is_true(value); + } +#if !USE_PACEMAKERD_API + if (sync_resource_startup) { + fprintf(stderr, "Failed to sync resource-startup as " + "SBD was built against pacemaker not supporting pacemakerd-API.\n"); + exit_status = -1; + goto out; + } +#else + if (!sync_resource_startup) { + cl_log(LOG_WARNING, "SBD built against pacemaker supporting " + "pacemakerd-API. Should think about enabling " + "SBD_SYNC_RESOURCE_STARTUP."); + } +#endif + while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) { switch (c) { case 'D': break; case 'Z': debug_mode++; cl_log(LOG_INFO, "Debug mode now at level %d", (int)debug_mode); break; case 'R': skip_rt = 1; cl_log(LOG_INFO, "Realtime mode deactivated."); break; case 'S': start_mode = atoi(optarg); cl_log(LOG_INFO, "Start mode set to: %d", (int)start_mode); break; case 's': timeout_startup = atoi(optarg); cl_log(LOG_INFO, "Start timeout set to: %d", (int)timeout_startup); break; case 'v': debug++; if(debug == 1) { sbd_log_filter_ctl(NULL, LOG_INFO); cl_log(LOG_INFO, "Verbose mode enabled."); } else if(debug == 2) { sbd_log_filter_ctl(NULL, LOG_DEBUG); cl_log(LOG_INFO, "Debug mode enabled."); } else if(debug == 3) { /* Go nuts, turn on pacemaker's logging too */ sbd_log_filter_ctl("*", LOG_DEBUG); cl_log(LOG_INFO, "Debug library mode enabled."); } break; case 'T': watchdog_set_timeout = 0; cl_log(LOG_INFO, "Setting watchdog timeout disabled; using defaults."); break; case 'W': W_count++; break; case 'w': cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev); free(watchdogdev); watchdogdev = strdup(optarg); watchdogdev_is_default = false; break; case 'd': #if SUPPORT_SHARED_DISK recruit_servant(optarg, 0); #else fprintf(stderr, "Shared disk functionality not supported\n"); exit_status = -2; goto out; #endif break; case 'c': c_count++; break; case 'P': P_count++; break; case 'z': disk_priority = 0; break; case 'n': local_uname = strdup(optarg); cl_log(LOG_INFO, "Overriding local hostname to %s", local_uname); break; case 'p': pidfile = strdup(optarg); cl_log(LOG_INFO, "pidfile set to %s", pidfile); break; case 'C': timeout_watchdog_crashdump = atoi(optarg); cl_log(LOG_INFO, "Setting crashdump watchdog timeout to %d", (int)timeout_watchdog_crashdump); break; case '1': timeout_watchdog = atoi(optarg); if(timeout_watchdog > 5) { timeout_watchdog_warn = (int)timeout_watchdog / 5 * 3; } break; case '2': timeout_allocate = atoi(optarg); break; case '3': timeout_loop = atoi(optarg); break; case '4': timeout_msgwait = atoi(optarg); break; case '5': timeout_watchdog_warn = atoi(optarg); cl_log(LOG_INFO, "Setting latency warning to %d", (int)timeout_watchdog_warn); break; case 't': servant_restart_interval = atoi(optarg); cl_log(LOG_INFO, "Setting servant restart interval to %d", (int)servant_restart_interval); break; case 'I': timeout_io = atoi(optarg); cl_log(LOG_INFO, "Setting IO timeout to %d", (int)timeout_io); break; case 'F': servant_restart_count = atoi(optarg); cl_log(LOG_INFO, "Servant restart count set to %d", (int)servant_restart_count); break; case 'r': if (timeout_action) { free(timeout_action); } timeout_action = strdup(optarg); break; case 'h': usage(); goto out; break; default: exit_status = -2; goto out; break; } } if (disk_count == 0) { /* if we already have disks from commandline then it is probably undesirable to add those from environment (general rule cmdline has precedence) */ value = getenv("SBD_DEVICE"); if ((value) && strlen(value)) { #if SUPPORT_SHARED_DISK int devices = parse_device_line(value); if(devices < 1) { fprintf(stderr, "Invalid device line: %s\n", value); exit_status = -2; goto out; } #else fprintf(stderr, "Shared disk functionality not supported\n"); exit_status = -2; goto out; #endif } } if (watchdogdev == NULL || strcmp(watchdogdev, "/dev/null") == 0) { watchdog_use = 0; } else if (W_count > 0) { watchdog_use = arg_enabled(W_count); } if (watchdog_use) { cl_log(LOG_INFO, "Watchdog enabled."); } else { cl_log(LOG_INFO, "Watchdog disabled."); } if (c_count > 0) { check_cluster = arg_enabled(c_count); } if (P_count > 0) { check_pcmk = arg_enabled(P_count); } if ((disk_count > 0) && (strlen(local_uname) > SECTOR_NAME_MAX)) { fprintf(stderr, "Node name mustn't be longer than %d chars.\n", SECTOR_NAME_MAX); fprintf(stderr, "If uname is longer define a name to be used by sbd.\n"); exit_status = -1; goto out; } if (disk_count > 3) { fprintf(stderr, "You can specify up to 3 devices via the -d option.\n"); exit_status = -1; goto out; } /* There must at least be one command following the options: */ if ((argc - optind) < 1) { fprintf(stderr, "Not enough arguments.\n"); exit_status = -2; goto out; } if (init_set_proc_title(argc, argv, envp) < 0) { fprintf(stderr, "Allocation of proc title failed.\n"); exit_status = -1; goto out; } if (timeout_action) { char *p[2]; int i; char c; int nrflags = sscanf(timeout_action, "%m[a-z],%m[a-z]%c", &p[0], &p[1], &c); bool parse_error = (nrflags < 1) || (nrflags > 2); for (i = 0; (i < nrflags) && (i < 2); i++) { if (!strcmp(p[i], "reboot")) { timeout_sysrq_char = 'b'; } else if (!strcmp(p[i], "crashdump")) { timeout_sysrq_char = 'c'; } else if (!strcmp(p[i], "off")) { timeout_sysrq_char = 'o'; } else if (!strcmp(p[i], "flush")) { do_flush = true; } else if (!strcmp(p[i], "noflush")) { do_flush = false; } else { parse_error = true; } free(p[i]); } if (parse_error) { fprintf(stderr, "Failed to parse timeout-action \"%s\".\n", timeout_action); exit_status = -1; goto out; } } #if SUPPORT_SHARED_DISK if (strcmp(argv[optind], "create") == 0) { exit_status = init_devices(servants_leader); } else if (strcmp(argv[optind], "dump") == 0) { exit_status = dump_headers(servants_leader); } else if (strcmp(argv[optind], "allocate") == 0) { exit_status = allocate_slots(argv[optind + 1], servants_leader); } else if (strcmp(argv[optind], "list") == 0) { exit_status = list_slots(servants_leader); } else if (strcmp(argv[optind], "message") == 0) { exit_status = messenger(argv[optind + 1], argv[optind + 2], servants_leader); } else if (strcmp(argv[optind], "ping") == 0) { exit_status = ping_via_slots(argv[optind + 1], servants_leader); } else if (strcmp(argv[optind], "watch") == 0) { if(disk_count > 0) { /* If no devices are specified, its not an error to be unable to find one */ open_any_device(servants_leader); } if (delay_start) { if (delay <= 0) { delay = get_first_msgwait(servants_leader); } sleep((unsigned long) delay); } } else { exit_status = -2; } #endif if (strcmp(argv[optind], "query-watchdog") == 0) { exit_status = watchdog_info(); } else if (strcmp(argv[optind], "test-watchdog") == 0) { exit_status = watchdog_test(); } else if (strcmp(argv[optind], "watch") == 0) { /* sleep $(sbd $SBD_DEVICE_ARGS dump | grep -m 1 msgwait | awk '{print $4}') 2>/dev/null */ /* We only want this to have an effect during watch right now; * pinging and fencing would be too confused */ cl_log(LOG_INFO, "Turning on pacemaker checks: %d", check_pcmk); if (check_pcmk) { recruit_servant("pcmk", 0); #if SUPPORT_PLUGIN check_cluster = 1; #endif } cl_log(LOG_INFO, "Turning on cluster checks: %d", check_cluster); if (check_cluster) { recruit_servant("cluster", 0); } cl_log(LOG_NOTICE, "%s flush + write \'%c\' to sysrq in case of timeout", do_flush?"Do":"Skip", timeout_sysrq_char); exit_status = inquisitor(); } out: if (timeout_action) { free(timeout_action); } if (exit_status < 0) { if (exit_status == -2) { usage(); } else { fprintf(stderr, "sbd failed; please check the logs.\n"); } return (1); } return (0); } diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c index 6e53557..aa1fb57 100644 --- a/src/sbd-pacemaker.c +++ b/src/sbd-pacemaker.c @@ -1,587 +1,699 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * Based on crm_mon.c, which was: * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ /* TODO list: * * - Trying to shutdown a node if no devices are up will fail, since SBD * currently uses a message via the disk to achieve this. * * - Shutting down cluster nodes while the majority of devices is down * will eventually take the cluster below the quorum threshold, at which * time the remaining cluster nodes will all immediately suicide. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sbd.h" #ifndef HAVE_PE_NEW_WORKING_SET #define pe_reset_working_set(data_set) cleanup_calculations(data_set) static pe_working_set_t * pe_new_working_set() { pe_working_set_t *data_set = calloc(1, sizeof(pe_working_set_t)); if (data_set != NULL) { set_working_set_defaults(data_set); } return data_set; } static void pe_free_working_set(pe_working_set_t *data_set) { if (data_set != NULL) { pe_reset_working_set(data_set); free(data_set); } } #endif +static void clean_up(int rc); + +#if USE_PACEMAKERD_API +#include + +static pcmk_ipc_api_t *pacemakerd_api = NULL; +static time_t last_ok = (time_t) 0; + +static void +pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api, + enum pcmk_ipc_event event_type, crm_exit_t status, + void *event_data, void *user_data) +{ + pcmk_pacemakerd_api_reply_t *reply = event_data; + + switch (event_type) { + case pcmk_ipc_event_disconnect: + /* Unexpected */ + cl_log(LOG_ERR, "Lost connection to pacemakerd\n"); + return; + + case pcmk_ipc_event_reply: + break; + + default: + return; + } + + if (status != CRM_EX_OK) { + cl_log(LOG_ERR, "Bad reply from pacemakerd: %s", + crm_exit_str(status)); + return; + } + + if (reply->reply_type != pcmk_pacemakerd_reply_ping) { + cl_log(LOG_ERR, "Unknown reply type %d from pacemakerd\n", + reply->reply_type); + } else { + if ((reply->data.ping.last_good != (time_t) 0) && + (reply->data.ping.status == pcmk_rc_ok)) { + switch (reply->data.ping.state) { + case pcmk_pacemakerd_state_running: + case pcmk_pacemakerd_state_shutting_down: + last_ok = reply->data.ping.last_good; + break; + case pcmk_pacemakerd_state_shutdown_complete: + clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN); + break; + default: + break; + } + } + } +} +#endif + extern int disk_count; static void clean_up(int rc); static void crm_diff_update(const char *event, xmlNode * msg); static int cib_connect(gboolean full); static void compute_status(pe_working_set_t * data_set); static gboolean mon_refresh_state(gpointer user_data); static GMainLoop *mainloop = NULL; static guint timer_id_reconnect = 0; static guint timer_id_notify = 0; static int reconnect_msec = 1000; static int cib_connected = 0; static cib_t *cib = NULL; static xmlNode *current_cib = NULL; static pe_working_set_t *data_set = NULL; static long last_refresh = 0; static int pcmk_clean_shutdown = 0; static int pcmk_shutdown = 0; static gboolean mon_timer_reconnect(gpointer data) { int rc = 0; if (timer_id_reconnect > 0) { g_source_remove(timer_id_reconnect); } rc = cib_connect(TRUE); if (rc != 0) { cl_log(LOG_WARNING, "CIB reconnect failed: %d", rc); timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL); } else { cl_log(LOG_INFO, "CIB reconnect successful"); } return FALSE; } static void mon_cib_connection_destroy(gpointer user_data) { if (cib) { cib->cmds->signoff(cib); /* retrigger as last one might have been skipped */ mon_refresh_state(NULL); - if (pcmk_clean_shutdown) { + + + if ((pcmk_clean_shutdown) && (!sync_resource_startup)) { /* assume a graceful pacemaker-shutdown */ clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN); } + /* getting here we aren't sure about the pacemaker-state so try to use the timeout to reconnect and get everything sorted out again */ pcmk_shutdown = 0; set_servant_health(pcmk_health_transient, LOG_WARNING, "Disconnected from CIB"); timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL); } cib_connected = 0; /* no sense in looking into outdated cib, trying to apply patch, ... */ if (current_cib) { free_xml(current_cib); current_cib = NULL; } return; } static void mon_retrieve_current_cib() { xmlNode *xml_cib = NULL; int options = cib_scope_local | cib_sync_call; int rc = pcmk_ok; free_xml(current_cib); current_cib = NULL; rc = cib->cmds->query(cib, NULL, &xml_cib, options); if (rc != pcmk_ok) { crm_err("Couldn't retrieve the CIB: %s (%d)", pcmk_strerror(rc), rc); free_xml(xml_cib); return; } else if (xml_cib == NULL) { crm_err("Couldn't retrieve the CIB: empty result"); return; } if (safe_str_eq(crm_element_name(xml_cib), XML_TAG_CIB)) { current_cib = xml_cib; } else { free_xml(xml_cib); } return; } static gboolean mon_timer_notify(gpointer data) { static int counter = 0; int counter_max = timeout_watchdog / timeout_loop / 2; if (timer_id_notify > 0) { g_source_remove(timer_id_notify); } +#if USE_PACEMAKERD_API + { + time_t now = time(NULL); + + if ((last_ok <= now) && (now - last_ok < timeout_watchdog)) { +#endif + if (cib_connected) { if (counter == counter_max) { mon_retrieve_current_cib(); mon_refresh_state(NULL); counter = 0; } else { cib->cmds->noop(cib, 0); notify_parent(); counter++; } } + +#if USE_PACEMAKERD_API + } + } + if (pcmk_connect_ipc(pacemakerd_api, + pcmk_ipc_dispatch_main) == pcmk_rc_ok) { + pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name); + } +#endif + timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL); return FALSE; } /* * Mainloop signal handler. */ static void mon_shutdown(int nsig) { clean_up(0); } static int cib_connect(gboolean full) { int rc = 0; CRM_CHECK(cib != NULL, return -EINVAL); cib_connected = 0; crm_xml_init(); if (cib->state != cib_connected_query && cib->state != cib_connected_command) { rc = cib->cmds->signon(cib, crm_system_name, cib_query); if (rc != 0) { return rc; } mon_retrieve_current_cib(); mon_refresh_state(NULL); if (full) { if (rc == 0) { rc = cib->cmds->set_connection_dnotify(cib, mon_cib_connection_destroy); if (rc == -EPROTONOSUPPORT) { /* Notification setup failed, won't be able to reconnect after failure */ rc = 0; } } if (rc == 0) { cib->cmds->del_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); rc = cib->cmds->add_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); } if (rc != 0) { /* Notification setup failed, could not monitor CIB actions */ clean_up(-rc); } } } if (!rc) { cib_connected = 1; } return rc; } static void compute_status(pe_working_set_t * data_set) { static int updates = 0; static int ever_had_quorum = FALSE; node_t *node = NULL; updates++; if (data_set->dc_node == NULL) { set_servant_health(pcmk_health_transient, LOG_INFO, "We don't have a DC right now."); notify_parent(); return; } node = pe_find_node(data_set->nodes, local_uname); if ((node == NULL) || (node->details == NULL)) { set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: %s is UNKNOWN", local_uname); notify_parent(); return; } if (node->details->online == FALSE) { set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: OFFLINE"); } else if (node->details->unclean) { set_servant_health(pcmk_health_unclean, LOG_WARNING, "Node state: UNCLEAN"); } else if (node->details->pending) { set_servant_health(pcmk_health_pending, LOG_WARNING, "Node state: pending"); } else if (data_set->flags & pe_flag_have_quorum) { set_servant_health(pcmk_health_online, LOG_INFO, "Node state: online"); ever_had_quorum = TRUE; } else if(disk_count > 0) { set_servant_health(pcmk_health_noquorum, LOG_WARNING, "Quorum lost"); } else if(ever_had_quorum == FALSE) { set_servant_health(pcmk_health_online, LOG_INFO, "We do not have quorum yet"); } else { /* We lost quorum, and there are no disks present * Setting healthy > 2 here will result in us self-fencing */ switch (data_set->no_quorum_policy) { case no_quorum_freeze: set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Freeze resources"); break; #if HAVE_ENUM_NO_QUORUM_DEMOTE case no_quorum_demote: set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Demote promotable resources and stop others"); break; #endif case no_quorum_stop: set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Stop ALL resources"); break; case no_quorum_ignore: set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Ignore"); break; default: /* immediate reboot is the most excessive action we take use for no_quorum_suicide and everything we don't know yet */ set_servant_health(pcmk_health_unclean, LOG_INFO, "Quorum lost: Self-fence"); break; } } /* If we are in shutdown-state once this will go on till the end. * If we've on top reached a state of 0 locally running resources * we can assume a clean shutdown. * Tricky are the situations where the node is in maintenance-mode * or resources are unmanaged. So if the node is in maintenance or * all left-over running resources are unmanaged we assume intention. */ if (node->details->shutdown) { pcmk_shutdown = 1; } if (pcmk_shutdown) { pcmk_clean_shutdown = 1; if (!(node->details->maintenance)) { GListPtr iter; for (iter = node->details->running_rsc; iter != NULL; iter = iter->next) { resource_t *rsc = (resource_t *) iter->data; if (is_set(rsc->flags, pe_rsc_managed)) { pcmk_clean_shutdown = 0; crm_debug("not clean as %s managed and still running", rsc->id); break; } } if (pcmk_clean_shutdown) { crm_debug("pcmk_clean_shutdown because " "all managed resources down"); } } else { crm_debug("pcmk_clean_shutdown because node is in maintenance"); } } notify_parent(); return; } static crm_trigger_t *refresh_trigger = NULL; static gboolean mon_trigger_refresh(gpointer user_data) { mainloop_set_trigger(refresh_trigger); mon_refresh_state(NULL); return FALSE; } #define XPATH_SHUTDOWN "//" XML_CIB_TAG_STATE "[@uname='%s']/" \ XML_TAG_TRANSIENT_NODEATTRS "/" XML_TAG_ATTR_SETS "/" \ XML_CIB_TAG_NVPAIR "[@name='" XML_CIB_ATTR_SHUTDOWN "']" static gboolean shutdown_attr_in_cib(void) { xmlNode *match = NULL; char *xpath_string; xpath_string = crm_strdup_printf(XPATH_SHUTDOWN, local_uname); if (xpath_string) { match = get_xpath_object(xpath_string, current_cib, LOG_TRACE); free(xpath_string); } return (match != NULL); } static void crm_diff_update(const char *event, xmlNode * msg) { int rc = -1; const char *op = NULL; long now = time(NULL); static int updates = 0; static mainloop_timer_t *refresh_timer = NULL; if(refresh_timer == NULL) { refresh_timer = mainloop_timer_add("refresh", reconnect_msec, FALSE, mon_trigger_refresh, NULL); refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_state, refresh_timer); } if (current_cib != NULL) { xmlNode *cib_last = current_cib; current_cib = NULL; rc = cib_apply_patch_event(msg, cib_last, ¤t_cib, LOG_DEBUG); free_xml(cib_last); switch(rc) { case -pcmk_err_diff_resync: case -pcmk_err_diff_failed: crm_warn("[%s] %s Patch aborted: %s (%d)", event, op, pcmk_strerror(rc), rc); break; case pcmk_ok: updates++; break; default: crm_notice("[%s] %s ABORTED: %s (%d)", event, op, pcmk_strerror(rc), rc); break; } } if (current_cib == NULL) { mon_retrieve_current_cib(); } /* Refresh * - immediately if the last update was more than 1s ago * - every 10 updates * - at most 1s after the last update * - shutdown attribute for our node set for the first time */ if ((!pcmk_shutdown && shutdown_attr_in_cib()) || (updates > 10 || (now - last_refresh) > (reconnect_msec / 1000))) { mon_refresh_state(refresh_timer); updates = 0; } else { mainloop_set_trigger(refresh_trigger); mainloop_timer_start(refresh_timer); } } static gboolean mon_refresh_state(gpointer user_data) { xmlNode *cib_copy = NULL; if(current_cib == NULL) { return FALSE; } if(user_data) { mainloop_timer_t *timer = user_data; mainloop_timer_stop(timer); } cib_copy = copy_xml(current_cib); if (cli_config_update(&cib_copy, NULL, FALSE) == FALSE) { cl_log(LOG_WARNING, "cli_config_update() failed - forcing reconnect to CIB"); if (cib) { cib->cmds->signoff(cib); } } else { last_refresh = time(NULL); data_set->input = cib_copy; data_set->flags |= pe_flag_have_stonith_resource; cluster_status(data_set); compute_status(data_set); pe_reset_working_set(data_set); } return FALSE; } static void clean_up(int rc) { if (timer_id_reconnect > 0) { g_source_remove(timer_id_reconnect); timer_id_reconnect = 0; } if (timer_id_notify > 0) { g_source_remove(timer_id_notify); timer_id_notify = 0; } if (data_set != NULL) { pe_free_working_set(data_set); data_set = NULL; } if (cib != NULL) { cib->cmds->signoff(cib); cib_delete(cib); cib = NULL; } +#if USE_PACEMAKERD_API + if (pacemakerd_api != NULL) { + pcmk_ipc_api_t *capi = pacemakerd_api; + pacemakerd_api = NULL; // Ensure we can't free this twice + pcmk_free_ipc_api(capi); + } +#endif + if (rc >= 0) { exit(rc); } return; } int servant_pcmk(const char *diskname, int mode, const void* argp) { - int exit_code = 0; + int exit_code = 0; - crm_system_name = strdup("sbd:pcmk"); - cl_log(LOG_NOTICE, "Monitoring Pacemaker health"); - set_proc_title("sbd: watcher: Pacemaker"); + crm_system_name = strdup("sbd:pcmk"); + cl_log(LOG_NOTICE, "Monitoring Pacemaker health"); + set_proc_title("sbd: watcher: Pacemaker"); setenv("PCMK_watchdog", "true", 1); if(debug == 0) { /* We don't want any noisy crm messages */ set_crm_log_level(LOG_CRIT); } - if (data_set == NULL) { - data_set = pe_new_working_set(); - } - if (data_set == NULL) { - return -1; - } + if (data_set == NULL) { + data_set = pe_new_working_set(); + } + if (data_set == NULL) { + return -1; + } + +#if USE_PACEMAKERD_API + { + int rc; + + rc = pcmk_new_ipc_api(&pacemakerd_api, pcmk_ipc_pacemakerd); + if (pacemakerd_api == NULL) { + cl_log(LOG_ERR, "Could not connect to pacemakerd: %s\n", + pcmk_rc_str(rc)); + return -1; + } + pcmk_register_ipc_callback(pacemakerd_api, pacemakerd_event_cb, NULL); + do { + rc = pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_main); + if (rc != pcmk_rc_ok) { + cl_log(LOG_DEBUG, "Could not connect to pacemakerd: %s\n", + pcmk_rc_str(rc)); + sleep(reconnect_msec / 1000); + } + } while (rc != pcmk_rc_ok); + /* send a ping to pacemakerd to wake it up */ + pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name); + /* cib should come up now as well so it's time + * to have the inquisitor have a closer look + */ + notify_parent(); + } +#endif if (current_cib == NULL) { cib = cib_new(); do { exit_code = cib_connect(TRUE); if (exit_code != 0) { sleep(reconnect_msec / 1000); } } while (exit_code == -ENOTCONN); if (exit_code != 0) { clean_up(-exit_code); } } mainloop = g_main_loop_new(NULL, FALSE); mainloop_add_signal(SIGTERM, mon_shutdown); mainloop_add_signal(SIGINT, mon_shutdown); timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL); g_main_loop_run(mainloop); g_main_loop_unref(mainloop); clean_up(0); return 0; /* never reached */ } diff --git a/src/sbd.h b/src/sbd.h index 382e553..3b6647c 100644 --- a/src/sbd.h +++ b/src/sbd.h @@ -1,211 +1,212 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* signals reserved for multi-disk sbd */ #define SIG_LIVENESS (SIGRTMIN + 1) /* report liveness of the disk */ #define SIG_EXITREQ (SIGRTMIN + 2) /* exit request to inquisitor */ #define SIG_TEST (SIGRTMIN + 3) /* trigger self test */ #define SIG_RESTART (SIGRTMIN + 4) /* trigger restart of all failed disk */ #define SIG_PCMK_UNHEALTHY (SIGRTMIN + 5) /* FIXME: should add dynamic check of SIG_XX >= SIGRTMAX */ /* exit status for disk-servant */ #define EXIT_MD_SERVANT_IO_FAIL 20 #define EXIT_MD_SERVANT_REQUEST_RESET 21 #define EXIT_MD_SERVANT_REQUEST_SHUTOFF 22 #define EXIT_MD_SERVANT_REQUEST_CRASHDUMP 23 /* exit status for pcmk-servant */ #define EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN 30 #define HOG_CHAR 0xff #define SECTOR_NAME_MAX 63 /* Sector data types */ struct sector_header_s { char magic[8]; unsigned char version; unsigned char slots; /* Caveat: stored in network byte-order */ uint32_t sector_size; uint32_t timeout_watchdog; uint32_t timeout_allocate; uint32_t timeout_loop; uint32_t timeout_msgwait; /* Minor version for extensions to the core data set: * compatible and optional values. */ unsigned char minor_version; uuid_t uuid; /* 16 bytes */ }; struct sector_mbox_s { signed char cmd; char from[SECTOR_NAME_MAX+1]; }; struct sector_node_s { /* slots will be created with in_use == 0 */ char in_use; char name[SECTOR_NAME_MAX+1]; }; struct servants_list_item { const char* devname; pid_t pid; int restarts; int restart_blocked; int outdated; int first_start; struct timespec t_last, t_started; struct servants_list_item *next; }; struct sbd_context { int devfd; io_context_t ioctx; struct iocb io; }; enum pcmk_health { pcmk_health_unknown, pcmk_health_pending, pcmk_health_transient, pcmk_health_unclean, pcmk_health_shutdown, pcmk_health_online, pcmk_health_noquorum, }; void usage(void); int watchdog_init_interval(void); int watchdog_tickle(void); int watchdog_init(void); void sysrq_init(void); void watchdog_close(bool disarm); int watchdog_info(void); int watchdog_test(void); void sysrq_trigger(char t); void do_crashdump(void); void do_reset(void); void do_off(void); void do_timeout_action(void); pid_t make_daemon(void); void maximize_priority(void); void sbd_get_uname(void); void sbd_set_format_string(int method, const char *daemon); void notify_parent(void); /* Tunable defaults: */ extern unsigned long timeout_watchdog; extern unsigned long timeout_watchdog_warn; extern unsigned long timeout_watchdog_crashdump; extern int timeout_allocate; extern int timeout_loop; extern int timeout_msgwait; extern int timeout_io; extern int timeout_startup; extern int watchdog_use; extern int watchdog_set_timeout; extern int skip_rt; extern int debug; extern int debug_mode; extern char *watchdogdev; extern bool watchdogdev_is_default; extern char* local_uname; extern bool do_flush; extern char timeout_sysrq_char; extern bool move_to_root_cgroup; extern bool enforce_moving_to_root_cgroup; +extern bool sync_resource_startup; /* Global, non-tunable variables: */ extern int sector_size; extern int watchdogfd; extern const char* cmdname; typedef int (*functionp_t)(const char* devname, int mode, const void* argp); int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp); #if SUPPORT_SHARED_DISK void open_any_device(struct servants_list_item *servants); int init_devices(struct servants_list_item *servants); int allocate_slots(const char *name, struct servants_list_item *servants); int list_slots(struct servants_list_item *servants); int ping_via_slots(const char *name, struct servants_list_item *servants); int dump_headers(struct servants_list_item *servants); unsigned long get_first_msgwait(struct servants_list_item *servants); int messenger(const char *name, const char *msg, struct servants_list_item *servants); int servant_md(const char *diskname, int mode, const void* argp); #endif int servant_pcmk(const char *diskname, int mode, const void* argp); int servant_cluster(const char *diskname, int mode, const void* argp); struct servants_list_item *lookup_servant_by_dev(const char *devname); struct servants_list_item *lookup_servant_by_pid(pid_t pid); int init_set_proc_title(int argc, char *argv[], char *envp[]); void set_proc_title(const char *fmt,...); #define cl_log(level, fmt, args...) qb_log_from_external_source( __func__, __FILE__, fmt, level, __LINE__, 0, ##args) # define cl_perror(fmt, args...) do { \ const char *err = strerror(errno); \ cl_log(LOG_ERR, fmt ": %s (%d)", ##args, err, errno); \ } while(0) #define DBGLOG(lvl, fmt, args...) do { \ if (debug > 0) cl_log(lvl, fmt, ##args); \ } while(0) extern int servant_health; void set_servant_health(enum pcmk_health state, int level, char const *format, ...) __attribute__ ((__format__ (__printf__, 3, 4))); bool sbd_is_disk(struct servants_list_item *servant); bool sbd_is_pcmk(struct servants_list_item *servant); bool sbd_is_cluster(struct servants_list_item *servant); diff --git a/src/sbd.sysconfig b/src/sbd.sysconfig index 33b50d0..b32e826 100644 --- a/src/sbd.sysconfig +++ b/src/sbd.sysconfig @@ -1,114 +1,128 @@ ## Type: string ## Default: "" # # SBD_DEVICE specifies the devices to use for exchanging sbd messages # and to monitor. If specifying more than one path, use ";" as # separator. # #SBD_DEVICE="" ## Type: yesno ## Default: yes # # Whether to enable the pacemaker integration. # SBD_PACEMAKER=yes ## Type: always / clean ## Default: always # # Specify the start mode for sbd. Setting this to "clean" will only # allow sbd to start if it was not previously fenced. See the -S option # in the man page. # SBD_STARTMODE=always ## Type: yesno / integer ## Default: no # # Whether to delay after starting sbd on boot for "msgwait" seconds. # This may be necessary if your cluster nodes reboot so fast that the # other nodes are still waiting in the fence acknowledgement phase. # This is an occasional issue with virtual machines. # # This can also be enabled by being set to a specific delay value, in # seconds. Sometimes a longer delay than the default, "msgwait", is # needed, for example in the cases where it's considered to be safer to # wait longer than: # corosync token timeout + consensus timeout + pcmk_delay_max + msgwait # # Be aware that the special value "1" means "yes" rather than "1s". # # Consider that you might have to adapt the startup-timeout accordingly # if the default isn't sufficient. (TimeoutStartSec for systemd) # # This option may be ignored at a later point, once pacemaker handles # this case better. # SBD_DELAY_START=no ## Type: string ## Default: /dev/watchdog # # Watchdog device to use. If set to /dev/null, no watchdog device will # be used. # SBD_WATCHDOG_DEV=/dev/watchdog ## Type: integer ## Default: 5 # # How long, in seconds, the watchdog will wait before panicking the # node if no-one tickles it. # # This depends mostly on your storage latency; the majority of devices # must be successfully read within this time, or else the node will # self-fence. # # If your sbd device(s) reside on a multipath setup or iSCSI, this # should be the time required to detect a path failure. # # Be aware that watchdog timeout set in the on-disk metadata takes # precedence. # SBD_WATCHDOG_TIMEOUT=5 ## Type: string ## Default: "flush,reboot" # # Actions to be executed when the watchers don't timely report to the sbd # master process or one of the watchers detects that the master process # has died. # # Set timeout-action to comma-separated combination of # noflush|flush plus reboot|crashdump|off. # If just one of both is given the other stays at the default. # # This doesn't affect actions like off, crashdump, reboot explicitly # triggered via message slots. # And it does as well not configure the action a watchdog would # trigger should it run off (there is no generic interface). # SBD_TIMEOUT_ACTION=flush,reboot ## Type: yesno / auto ## Default: auto # # If CPUAccounting is enabled default is not to assign any RT-budget # to the system.slice which prevents sbd from running RR-scheduled. # # One way to escape that issue is to move sbd-processes from the # slice they were originally started to root-slice. # Of course starting sbd in a certain slice might be intentional. # Thus in auto-mode sbd will check if the slice has RT-budget assigned. # If that is the case sbd will stay in that slice while it will # be moved to root-slice otherwise. # SBD_MOVE_TO_ROOT_CGROUP=auto +## Type: yesno +## Default: no +# +# If resource startup syncing is enabled then pacemakerd is +# gonna wait to be pinged via IPC before it starts resources. +# On shutdown pacemakerd is going to wait in a state where it +# has cleanly shutdown resources till sbd fetches that state. +# +# Default is 'no' to prevent pacemaker from waiting for a +# ping that will never come when working together with an sbd +# version that doesn't support the feature. +# +SBD_SYNC_RESOURCE_STARTUP=no + ## Type: string ## Default: "" # # Additional options for starting sbd # SBD_OPTS=