diff --git a/src/Makefile.am b/src/Makefile.am index cd65533..045ed0f 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,14 +1,14 @@ AM_CFLAGS = -D_GNU_SOURCE -DCHECK_AIS AM_CPPFLAGS = -I$(includedir)/pacemaker \ $(glib_CFLAGS) sbin_PROGRAMS = sbd -sbd_SOURCES = sbd-common.c sbd-inquisitor.c sbd-pacemaker.c setproctitle.c +sbd_SOURCES = sbd-common.c sbd-inquisitor.c sbd-pacemaker.c sbd-cluster.c setproctitle.c if SUPPORT_SHARED_DISK sbd_SOURCES += sbd-md.c endif sbd_LDFLAGS = $(glib_LIBS) $(libcoroipcc_LIBS) diff --git a/src/sbd-cluster.c b/src/sbd-cluster.c new file mode 100644 index 0000000..dbc2bf8 --- /dev/null +++ b/src/sbd-cluster.c @@ -0,0 +1,219 @@ +/* + * Copyright (C) 2013 Lars Marowsky-Bree + * + * Based on crm_mon.c, which was: + * Copyright (C) 2004 Andrew Beekhof + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* TODO list: + * + * - Trying to shutdown a node if no devices are up will fail, since SBD + * currently uses a message via the disk to achieve this. + * + * - Shutting down cluster nodes while the majority of devices is down + * will eventually take the cluster below the quorum threshold, at which + * time the remaining cluster nodes will all immediately suicide. 
+ *
+ */
+
+#include "sbd.h"
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef SUPPORT_PLUGIN
+# include
+static guint timer_id_ais = 0;
+static struct timespec t_last_quorum;
+static int check_ais = 0;
+#endif
+
+extern int servant_count;
+enum pcmk_health cluster_healthy;
+static int last_state = 0;
+static int reconnect_msec = 1000;
+static GMainLoop *mainloop = NULL;
+
+/*
+ * Record the given health state in cluster_healthy and forward it to
+ * notify_parent().
+ */
+static void
+set_cluster_health(enum pcmk_health healthy)
+{
+	cluster_healthy = healthy;
+	notify_parent(cluster_healthy);
+}
+
+static enum cluster_type_e cluster_stack = pcmk_cluster_unknown;
+
+/*
+ * Evaluate the cluster state and report it through set_cluster_health().
+ *
+ * When built with SUPPORT_PLUGIN and check_ais is set, the age of the last
+ * quorum message (t_last_quorum) is compared against
+ * timeout_io + timeout_loop and the outcome is logged via LOGONCE.
+ *
+ * NOTE(review): nothing visible here assigns `healthy` after its
+ * initialisation, so pcmk_health_unknown is what gets reported unless
+ * LOGONCE modifies it - confirm against the macro definition.
+ */
+void
+update_status(void)
+{
+	enum pcmk_health healthy = pcmk_health_unknown;
+#ifdef SUPPORT_PLUGIN
+	if (check_ais) {
+		struct timespec t_now;
+		int quorum_age;
+
+		clock_gettime(CLOCK_MONOTONIC, &t_now);
+
+		/* Must be computed only after t_now has been filled in;
+		 * reading it earlier uses an indeterminate value. */
+		quorum_age = t_now.tv_sec - t_last_quorum.tv_sec;
+
+		if (quorum_age > (int)(timeout_io+timeout_loop)) {
+			/* tv_sec == 0 means no quorum message has been
+			 * recorded yet, so there is nothing to call outdated. */
+			if (t_last_quorum.tv_sec != 0)
+				LOGONCE(pcmk_health_transient, LOG_WARNING, "AIS: Quorum outdated");
+
+		} else if (crm_have_quorum) {
+			LOGONCE(pcmk_health_online, LOG_INFO, "AIS: We have quorum");
+
+		} else {
+			LOGONCE(pcmk_health_unclean, LOG_WARNING, "AIS: We do NOT have quorum");
+		}
+	}
+#endif
+	set_cluster_health(healthy);
+}
+
+#ifdef SUPPORT_PLUGIN
+/*
+ * GLib timeout callback: removes the source stored in timer_id_ais and sends
+ * a crm_class_quorum message via send_cluster_text().
+ *
+ * Returns FALSE so GLib does not reschedule this source; a new timer is
+ * armed by plugin_membership_dispatch() when the response arrives.
+ */
+static gboolean
+plugin_timer(gpointer data)
+{
+	if (timer_id_ais > 0) {
+		g_source_remove(timer_id_ais);
+	}
+
+	send_cluster_text(crm_class_quorum, NULL, TRUE, NULL, crm_msg_ais);
+
+	/* The timer is set in the response processing */
+	return FALSE;
+}
+
+/*
+ * Installed as crm_cluster.destroy by servant_cluster(): logs the lost AIS
+ * connection and terminates the process with exit status 1.
+ */
+static void
+plugin_membership_destroy(gpointer user_data)
+{
+	cl_log(LOG_ERR, "AIS connection terminated - corosync down?");
+
+#if SUPPORT_PLUGIN
+	ais_fd_sync = -1;
+#endif
+
+	/* TODO: Is recovery even worth it here? After all, this means
+	 * that corosync died ... */
+	exit(1);
+}
+
+/*
+ * Installed as the CPG deliver callback by servant_cluster().
+ *
+ * Decodes the message with pcmk_message_common_cs(); the decoded payload is
+ * freed immediately because only the message class (`kind`) is used.  For
+ * crm_class_quorum messages the receive time is stored in t_last_quorum,
+ * update_status() is run and the next plugin_timer() is scheduled after
+ * timeout_loop seconds.  All other messages are ignored.
+ */
+static void
+plugin_membership_dispatch(cpg_handle_t handle,
+			   const struct cpg_name *groupName,
+			   uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
+{
+	uint32_t kind = 0;
+	const char *from = NULL;
+	char *data = pcmk_message_common_cs(handle, nodeid, pid, msg, &kind, &from);
+
+	if (!data) {
+		return;
+	}
+	free(data);
+	data = NULL;
+
+	if (kind != crm_class_quorum) {
+		return;
+	}
+
+	DBGLOG(LOG_INFO, "AIS quorum state: %d", (int)crm_have_quorum);
+	clock_gettime(CLOCK_MONOTONIC, &t_last_quorum);
+	update_status();
+
+	timer_id_ais = g_timeout_add(timeout_loop * 1000, plugin_timer, NULL);
+	return;
+}
+#endif
+
+
+/* Placeholder: currently performs no work; rc is unused. */
+static void
+clean_up(int rc)
+{
+	return;
+}
+
+/*
+ * Callback registered for SIGTERM and SIGINT in servant_cluster(); it only
+ * calls clean_up(0).
+ */
+static void
+cluster_shutdown(int nsig)
+{
+	clean_up(0);
+}
+
+/*
+ * Entry point of the cluster servant.
+ *
+ * Detects the cluster stack.  When built with SUPPORT_PLUGIN and the stack is
+ * pcmk_cluster_classic_ais, it enables the AIS quorum check, retries
+ * crm_cluster_connect() until it succeeds and arms the first plugin_timer().
+ * It then registers the SIGTERM/SIGINT callbacks and runs the GLib main loop.
+ *
+ * diskname, mode and argp are not used by the visible body.
+ * Returns 0 after the main loop has ended.
+ */
+int
+servant_cluster(const char *diskname, int mode, const void* argp)
+{
+	/* Zero-initialised: only three members are assigned below, and only
+	 * conditionally, before the struct is handed to
+	 * crm_cluster_connect(). */
+	crm_cluster_t crm_cluster = { 0 };
+
+	cluster_stack = get_cluster_type();
+
+#ifdef SUPPORT_PLUGIN
+
+	if (cluster_stack != pcmk_cluster_classic_ais) {
+		check_ais = 0;
+	} else {
+		check_ais = 1;
+		cl_log(LOG_INFO, "Legacy plug-in detected, AIS quorum check enabled");
+		if(is_openais_cluster()) {
+			crm_cluster.destroy = plugin_membership_destroy;
+			crm_cluster.cpg.cpg_deliver_fn = plugin_membership_dispatch;
+			/* crm_cluster.cpg.cpg_confchg_fn = pcmk_cpg_membership; TODO? */
+			crm_cluster.cpg.cpg_confchg_fn = NULL;
+		}
+
+		while (!crm_cluster_connect(&crm_cluster)) {
+			cl_log(LOG_INFO, "Waiting to sign in with cluster ...");
+			sleep(reconnect_msec / 1000);
+		}
+	}
+
+	if (check_ais) {
+		timer_id_ais = g_timeout_add(timeout_loop * 1000, plugin_timer, NULL);
+	}
+#endif
+	mainloop = g_main_new(FALSE);
+
+	mainloop_add_signal(SIGTERM, cluster_shutdown);
+	mainloop_add_signal(SIGINT, cluster_shutdown);
+
+	g_main_run(mainloop);
+	g_main_destroy(mainloop);
+
+	clean_up(0);
+	return 0;                   /* never reached */
+}
+
+
diff --git a/src/sbd-common.c b/src/sbd-common.c index 1dc3916..5df3c62 100644 --- a/src/sbd-common.c +++ b/src/sbd-common.c @@ -1,631 +1,673 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "sbd.h" #include #include #include #ifdef _POSIX_MEMLOCK # include #endif /* Tunable defaults: */ #if defined(__s390__) || defined(__s390x__) unsigned long timeout_watchdog = 15; int timeout_msgwait = 30; #else unsigned long timeout_watchdog = 5; int timeout_msgwait = 10; #endif unsigned long timeout_watchdog_warn = 3; int timeout_allocate = 2; int timeout_loop = 1; int timeout_io = 3; int timeout_startup = 120; int watchdog_use = 1; int watchdog_set_timeout = 1; unsigned long timeout_watchdog_crashdump = 240; int skip_rt = 0; int debug = 0; int debug_mode = 0; char *watchdogdev = NULL; char * local_uname; /* Global, non-tunable variables: */ int sector_size = 0; int watchdogfd = -1; /*const char *devname;*/ const char *cmdname; void usage(void) { fprintf(stderr, "Shared storage fencing tool.\n" "Syntax:\n" " %s \n" "Options:\n" "-d Block device to use (mandatory; can be specified up to 3 times)\n" "-h Display this help.\n" "-n Set local node name; defaults to uname -n (optional)\n" "\n" "-R Do NOT enable realtime priority (debugging only)\n" "-W Use watchdog (recommended) (watch only)\n" "-w Specify watchdog device (optional) (watch only)\n" "-T Do NOT initialize the watchdog timeout (watch only)\n" "-S <0|1> Set start mode if the node was previously fenced (watch only)\n" "-p Write pidfile to the specified path (watch only)\n" "-v Enable some verbose debug logging (optional)\n" "\n" "-1 Set watchdog timeout to N seconds (optional, create only)\n" "-2 Set slot allocation timeout to N seconds (optional, create only)\n" "-3 Set daemon loop timeout to N seconds (optional, create only)\n" "-4 Set msgwait timeout to N seconds (optional, create only)\n" "-5 Warn if loop latency exceeds threshold (optional, watch only)\n" " (default is 3, set to 0 to 
disable)\n" "-C Watchdog timeout to set before crashdumping (def: 240s, optional)\n" "-I Async IO read timeout (defaults to 3 * loop timeout, optional)\n" "-s Timeout to wait for devices to become available (def: 120s)\n" "-t Dampening delay before faulty servants are restarted (optional)\n" " (default is 5, set to 0 to disable)\n" "-F # of failures before a servant is considered faulty (optional)\n" " (default is 1, set to 0 to disable)\n" "-P Check Pacemaker quorum and node health (optional, watch only)\n" "-Z Enable trace mode. WARNING: UNSAFE FOR PRODUCTION!\n" "Commands:\n" #if SUPPORT_SHARED_DISK "create initialize N slots on - OVERWRITES DEVICE!\n" "list List all allocated slots on device, and messages.\n" "dump Dump meta-data header from device.\n" "allocate \n" " Allocate a slot for node (optional)\n" "message (test|reset|off|clear|exit)\n" " Writes the specified message to node's slot.\n" #endif "watch Loop forever, monitoring own slot\n" , cmdname); } int watchdog_init_interval(void) { int timeout = timeout_watchdog; if (watchdogfd < 0) { return 0; } if (watchdog_set_timeout == 0) { cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!"); return 0; } if (ioctl(watchdogfd, WDIOC_SETTIMEOUT, &timeout) < 0) { cl_perror( "WDIOC_SETTIMEOUT" ": Failed to set watchdog timer to %u seconds.", timeout); cl_log(LOG_CRIT, "Please validate your watchdog configuration!"); cl_log(LOG_CRIT, "Choose a different watchdog driver or specify -T to skip this if you are completely sure."); return -1; } else { cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", timeout); } return 0; } int watchdog_tickle(void) { if (watchdogfd >= 0) { if (write(watchdogfd, "", 1) != 1) { cl_perror("Watchdog write failure: %s!", watchdogdev); return -1; } } return 0; } int watchdog_init(void) { if (watchdogfd < 0 && watchdogdev != NULL) { watchdogfd = open(watchdogdev, O_WRONLY); if (watchdogfd >= 0) { cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev); if 
((watchdog_init_interval() < 0) || (watchdog_tickle() < 0)) { return -1; } }else{ cl_perror("Cannot open watchdog device '%s'", watchdogdev); return -1; } } return 0; } void watchdog_close(bool disarm) { if (watchdogfd < 0) { return; } if (disarm) { int r; int flags = WDIOS_DISABLECARD;; /* Explicitly disarm it */ r = ioctl(watchdogfd, WDIOC_SETOPTIONS, &flags); if (r < 0) { cl_perror("Failed to disable hardware watchdog %s", watchdogdev); } /* To be sure, use magic close logic, too */ for (;;) { if (write(watchdogfd, "V", 1) > 0) { break; } cl_perror("Cannot disable watchdog device %s", watchdogdev); } } if (close(watchdogfd) < 0) { cl_perror("Watchdog close(%d) failed", watchdogfd); } watchdogfd = -1; } /* This duplicates some code from linux/ioprio.h since these are not included * even in linux-kernel-headers. Sucks. See also * /usr/src/linux/Documentation/block/ioprio.txt and ioprio_set(2) */ extern int sys_ioprio_set(int, int, int); int ioprio_set(int which, int who, int ioprio); inline int ioprio_set(int which, int who, int ioprio) { return syscall(__NR_ioprio_set, which, who, ioprio); } enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE, }; enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, }; #define IOPRIO_BITS (16) #define IOPRIO_CLASS_SHIFT (13) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) #define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) #define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) #define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) static unsigned char sbd_stack_hogger(unsigned char * inbuf, int kbytes) { unsigned char buf[1024]; if(kbytes <= 0) { return HOG_CHAR; } if (inbuf == NULL) { memset(buf, HOG_CHAR, sizeof(buf)); } else { memcpy(buf, inbuf, sizeof(buf)); } if (kbytes > 0) { return sbd_stack_hogger(buf, kbytes-1); } else { return buf[sizeof(buf)-1]; } } static void sbd_malloc_hogger(int kbytes) { int j; void**chunks; int chunksize = 
1024; if(kbytes <= 0) { return; } /* * We could call mallopt(M_MMAP_MAX, 0) to disable it completely, * but we've already called mlockall() * * We could also call mallopt(M_TRIM_THRESHOLD, -1) to prevent malloc * from giving memory back to the system, but we've already called * mlockall(MCL_FUTURE), so there's no need. */ chunks = malloc(kbytes * sizeof(void *)); if (chunks == NULL) { cl_log(LOG_WARNING, "Could not preallocate chunk array"); return; } for (j=0; j < kbytes; ++j) { chunks[j] = malloc(chunksize); if (chunks[j] == NULL) { cl_log(LOG_WARNING, "Could not preallocate block %d", j); } else { memset(chunks[j], 0, chunksize); } } for (j=0; j < kbytes; ++j) { free(chunks[j]); } free(chunks); } static void sbd_memlock(int stackgrowK, int heapgrowK) { #ifdef _POSIX_MEMLOCK /* * We could call setrlimit(RLIMIT_MEMLOCK,...) with a large * number, but the mcp runs as root and mlock(2) says: * * Since Linux 2.6.9, no limits are placed on the amount of memory * that a privileged process may lock, and this limit instead * governs the amount of memory that an unprivileged process may * lock. 
*/ if (mlockall(MCL_CURRENT|MCL_FUTURE) >= 0) { cl_log(LOG_INFO, "Locked ourselves in memory"); /* Now allocate some extra pages (MCL_FUTURE will ensure they stay around) */ sbd_malloc_hogger(heapgrowK); sbd_stack_hogger(NULL, stackgrowK); } else { cl_perror("Unable to lock ourselves into memory"); } #else cl_log(LOG_ERR, "Unable to lock ourselves into memory"); #endif } void sbd_make_realtime(int priority, int stackgrowK, int heapgrowK) { if(priority < 0) { return; } #ifdef SCHED_RR { int pcurrent = 0; int pmin = sched_get_priority_min(SCHED_RR); int pmax = sched_get_priority_max(SCHED_RR); if (priority == 0) { priority = pmax; } else if (priority < pmin) { priority = pmin; } else if (priority > pmax) { priority = pmax; } pcurrent = sched_getscheduler(0); if (pcurrent < 0) { cl_perror("Unable to get scheduler priority"); } else if(pcurrent < priority) { struct sched_param sp; memset(&sp, 0, sizeof(sp)); sp.sched_priority = priority; if (sched_setscheduler(0, SCHED_RR, &sp) < 0) { cl_perror("Unable to set scheduler priority to %d", priority); } else { cl_log(LOG_INFO, "Scheduler priority is now %d", priority); } } } #else cl_log(LOG_ERR, "System does not support updating the scheduler priority"); #endif sbd_memlock(heapgrowK, stackgrowK); } void maximize_priority(void) { if (skip_rt) { cl_log(LOG_INFO, "Not elevating to realtime (-R specified)."); return; } sbd_make_realtime(0, 256, 256); if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 1)) != 0) { cl_perror("ioprio_set() call failed."); } } void sysrq_init(void) { FILE* procf; int c; procf = fopen("/proc/sys/kernel/sysrq", "r"); if (!procf) { cl_perror("cannot open /proc/sys/kernel/sysrq for read."); return; } if (fscanf(procf, "%d", &c) != 1) { cl_perror("Parsing sysrq failed"); c = 0; } fclose(procf); if (c == 1) return; /* 8 for debugging dumps of processes, 128 for reboot/poweroff */ c |= 136; procf = fopen("/proc/sys/kernel/sysrq", "w"); if (!procf) { cl_perror("cannot open 
/proc/sys/kernel/sysrq for writing"); return; } fprintf(procf, "%d", c); fclose(procf); return; } void sysrq_trigger(char t) { FILE *procf; procf = fopen("/proc/sysrq-trigger", "a"); if (!procf) { cl_perror("Opening sysrq-trigger failed."); return; } cl_log(LOG_INFO, "sysrq-trigger: %c\n", t); fprintf(procf, "%c\n", t); fclose(procf); return; } static void do_exit(char kind) { /* TODO: Turn debug_mode into a bit field? Delay + kdump for example */ const char *reason = NULL; if (kind == 'c') { cl_log(LOG_NOTICE, "Initiating kdump"); } else if (debug_mode == 1) { cl_log(LOG_WARNING, "Initiating kdump instead of panicing the node (debug mode)"); kind = 'c'; } if (debug_mode == 2) { cl_log(LOG_WARNING, "Shutting down SBD instead of panicing the node (debug mode)"); watchdog_close(true); exit(0); } if (debug_mode == 3) { /* Give the system some time to flush logs to disk before rebooting. */ cl_log(LOG_WARNING, "Delaying node panic by 10s (debug mode)"); watchdog_close(true); sync(); sleep(10); } switch(kind) { case 'b': reason = "reboot"; break; case 'c': reason = "crashdump"; break; case 'o': reason = "off"; break; default: reason = "unknown"; break; } cl_log(LOG_EMERG, "Rebooting system: %s", reason); sync(); if(kind == 'c') { watchdog_close(true); sysrq_trigger(kind); } else { watchdog_close(false); sysrq_trigger(kind); if(reboot(RB_AUTOBOOT) < 0) { cl_perror("Reboot failed"); } } exit(1); } void do_crashdump(void) { do_exit('c'); } void do_reset(void) { do_exit('b'); } void do_off(void) { do_exit('o'); } /* * Change directory to the directory our core file needs to go in * Call after you establish the userid you're running under. 
*/ int sbd_cdtocoredir(void) { int rc; static const char *dir = NULL; if (dir == NULL) { dir = CRM_CORE_DIR; } if ((rc=chdir(dir)) < 0) { int errsave = errno; cl_perror("Cannot chdir to [%s]", dir); errno = errsave; } return rc; } pid_t make_daemon(void) { pid_t pid; const char * devnull = "/dev/null"; pid = fork(); if (pid < 0) { cl_log(LOG_ERR, "%s: could not start daemon\n", cmdname); cl_perror("fork"); exit(1); }else if (pid > 0) { return pid; } qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE); /* This is the child; ensure privileges have not been lost. */ maximize_priority(); sysrq_init(); umask(022); close(0); (void)open(devnull, O_RDONLY); close(1); (void)open(devnull, O_WRONLY); close(2); (void)open(devnull, O_WRONLY); sbd_cdtocoredir(); return 0; } void sbd_get_uname(void) { struct utsname uname_buf; int i; if (uname(&uname_buf) < 0) { cl_perror("uname() failed?"); exit(1); } local_uname = strdup(uname_buf.nodename); for (i = 0; i < strlen(local_uname); i++) local_uname[i] = tolower(local_uname[i]); } #define FMT_MAX 256 void sbd_set_format_string(int method, const char *daemon) { int offset = 0; char fmt[FMT_MAX]; struct utsname res; switch(method) { case QB_LOG_STDERR: break; case QB_LOG_SYSLOG: if(daemon && strcmp(daemon, "sbd") != 0) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%10s: ", daemon); } break; default: /* When logging to a file */ if (uname(&res) == 0) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%t [%d] %s %10s: ", getpid(), res.nodename, daemon); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%t [%d] %10s: ", getpid(), daemon); } } if (debug && method >= QB_LOG_STDERR) { offset += snprintf(fmt + offset, FMT_MAX - offset, "(%%-12f:%%5l %%g) %%-7p: %%n: "); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%g %%-7p: %%n: "); } if (method == QB_LOG_SYSLOG) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%b"); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "\t%%b"); } 
if(offset > 0) { qb_log_format_set(method, fmt); } } + +void +notify_parent(enum pcmk_health healthy) +{ + pid_t ppid; + union sigval signal_value; + + memset(&signal_value, 0, sizeof(signal_value)); + ppid = getppid(); + + if (ppid == 1) { + /* Our parent died unexpectedly. Triggering + * self-fence. */ + cl_log(LOG_WARNING, "Our parent is dead."); + do_reset(); + } + + switch (healthy) { + case pcmk_health_pending: + case pcmk_health_shutdown: + case pcmk_health_transient: + DBGLOG(LOG_INFO, "Not notifying parent: state transient (%d)", healthy); + break; + + case pcmk_health_unknown: + case pcmk_health_unclean: + case pcmk_health_noquorum: + DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY (%d)", healthy); + sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); + break; + + case pcmk_health_online: + DBGLOG(LOG_INFO, "Notifying parent: healthy"); + sigqueue(ppid, SIG_LIVENESS, signal_value); + break; + + default: + DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY %d", healthy); + sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); + break; + } +} diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c index 963e5de..dbae2fa 100644 --- a/src/sbd-inquisitor.c +++ b/src/sbd-inquisitor.c @@ -1,1006 +1,1033 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "sbd.h" #include #define LOCKSTRLEN 11 static struct servants_list_item *servants_leader = NULL; int check_pcmk = 0; +int check_cluster = 0; int servant_count = 0; int servant_restart_interval = 5; int servant_restart_count = 1; int start_mode = 0; char* pidfile = NULL; int parse_device_line(const char *line); +static bool +sbd_is_disk(struct servants_list_item *servant) +{ + if (servant == NULL) { + return true; + + } else if (strcmp(servant->devname, "pcmk") == 0) { + return false; + + } else if (strcmp(servant->devname, "cluster") == 0) { + return false; + } + return true; +} + void recruit_servant(const char *devname, pid_t pid) { struct servants_list_item *s = servants_leader; struct servants_list_item *newbie; newbie = malloc(sizeof(*newbie)); if (!newbie) { fprintf(stderr, "malloc failed in recruit_servant.\n"); exit(1); } memset(newbie, 0, sizeof(*newbie)); newbie->devname = strdup(devname); newbie->pid = pid; newbie->first_start = 1; if (!s) { servants_leader = newbie; } else { while (s->next) s = s->next; s->next = newbie; } servant_count++; } int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp) { pid_t pid = 0; int rc = 0; pid = fork(); if (pid == 0) { /* child */ maximize_priority(); sbd_set_format_string(QB_LOG_SYSLOG, devname); rc = (*functionp)(devname, mode, argp); if (rc == -1) exit(1); else exit(0); } else if (pid != -1) { /* parent */ return pid; } else { cl_log(LOG_ERR,"Failed to fork servant"); exit(1); } } struct servants_list_item *lookup_servant_by_dev(const char *devname) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { if (strncasecmp(s->devname, devname, strlen(s->devname))) break; } return s; } struct servants_list_item *lookup_servant_by_pid(pid_t pid) { 
struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { if (s->pid == pid) break; } return s; } int check_all_dead(void) { struct servants_list_item *s; int r = 0; union sigval svalue; for (s = servants_leader; s; s = s->next) { if (s->pid != 0) { r = sigqueue(s->pid, 0, svalue); if (r == -1 && errno == ESRCH) continue; return 0; } } return 1; } void servant_start(struct servants_list_item *s) { int r = 0; union sigval svalue; if (s->pid != 0) { r = sigqueue(s->pid, 0, svalue); if ((r != -1 || errno != ESRCH)) return; } s->restarts++; - if (strcmp("pcmk",s->devname) == 0) { - DBGLOG(LOG_INFO, "Starting Pacemaker servant"); - s->pid = assign_servant(s->devname, servant_pcmk, start_mode, NULL); - } else { + if (sbd_is_disk(s)) { #if SUPPORT_SHARED_DISK DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname); s->pid = assign_servant(s->devname, servant, start_mode, s); #else cl_log(LOG_ERR, "Shared disk functionality not supported"); return; #endif - } + } else if(strcmp("pcmk", s->devname) == 0) { + DBGLOG(LOG_INFO, "Starting Pacemaker servant"); + s->pid = assign_servant(s->devname, servant_pcmk, start_mode, NULL); + + } else if(strcmp("cluster", s->devname) == 0) { + DBGLOG(LOG_INFO, "Starting Cluster servant"); + /* s->pid = assign_servant(s->devname, servant_cluster, start_mode, NULL); */ + + } else { + cl_log(LOG_ERR, "Unrecognized servant: %s", s->devname); + } clock_gettime(CLOCK_MONOTONIC, &s->t_started); return; } void servants_start(void) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { s->restarts = 0; servant_start(s); } } void servants_kill(void) { struct servants_list_item *s; union sigval svalue; for (s = servants_leader; s; s = s->next) { if (s->pid != 0) sigqueue(s->pid, SIGKILL, svalue); } } inline void cleanup_servant_by_pid(pid_t pid) { struct servants_list_item* s; s = lookup_servant_by_pid(pid); if (s) { cl_log(LOG_WARNING, "Servant for %s (pid: %i) has terminated", s->devname, s->pid); s->pid 
= 0; } else { /* This most likely is a stray signal from somewhere, or * a SIGCHLD for a process that has previously * explicitly disconnected. */ DBGLOG(LOG_INFO, "cleanup_servant: Nothing known about pid %i", pid); } } int inquisitor_decouple(void) { pid_t ppid = getppid(); union sigval signal_value; /* During start-up, we only arm the watchdog once we've got * quorum at least once. */ if (watchdog_use) { if (watchdog_init() < 0) { return -1; } } if (ppid > 1) { sigqueue(ppid, SIG_LIVENESS, signal_value); } return 0; } static int sbd_lock_running(long pid) { int rc = 0; long mypid; int running = 0; char proc_path[PATH_MAX], exe_path[PATH_MAX], myexe_path[PATH_MAX]; /* check if pid is running */ if (kill(pid, 0) < 0 && errno == ESRCH) { goto bail; } #ifndef HAVE_PROC_PID return 1; #endif /* check to make sure pid hasn't been reused by another process */ snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", pid); rc = readlink(proc_path, exe_path, PATH_MAX-1); if(rc < 0) { cl_perror("Could not read from %s", proc_path); goto bail; } exe_path[rc] = 0; mypid = (unsigned long) getpid(); snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", mypid); rc = readlink(proc_path, myexe_path, PATH_MAX-1); if(rc < 0) { cl_perror("Could not read from %s", proc_path); goto bail; } myexe_path[rc] = 0; if(strcmp(exe_path, myexe_path) == 0) { running = 1; } bail: return running; } static int sbd_lock_pidfile(const char *filename) { char lf_name[256], tf_name[256], buf[LOCKSTRLEN+1]; int fd; long pid, mypid; int rc; struct stat sbuf; if (filename == NULL) { errno = EFAULT; return -1; } mypid = (unsigned long) getpid(); snprintf(lf_name, sizeof(lf_name), "%s",filename); snprintf(tf_name, sizeof(tf_name), "%s.%lu", filename, mypid); if ((fd = open(lf_name, O_RDONLY)) >= 0) { if (fstat(fd, &sbuf) >= 0 && sbuf.st_size < LOCKSTRLEN) { sleep(1); /* if someone was about to create one, * give'm a sec to do so * Though if they follow our protocol, * this won't happen. 
They should really * put the pid in, then link, not the * other way around. */ } if (read(fd, buf, sizeof(buf)) < 1) { /* lockfile empty -> rm it and go on */; } else { if (sscanf(buf, "%ld", &pid) < 1) { /* lockfile screwed up -> rm it and go on */ } else { if (pid > 1 && (getpid() != pid) && sbd_lock_running(pid)) { /* is locked by existing process * -> give up */ close(fd); return -1; } else { /* stale lockfile -> rm it and go on */ } } } unlink(lf_name); close(fd); } if ((fd = open(tf_name, O_CREAT | O_WRONLY | O_EXCL, 0644)) < 0) { /* Hmmh, why did we fail? Anyway, nothing we can do about it */ return -3; } /* Slight overkill with the %*d format ;-) */ snprintf(buf, sizeof(buf), "%*lu\n", LOCKSTRLEN-1, mypid); if (write(fd, buf, LOCKSTRLEN) != LOCKSTRLEN) { /* Again, nothing we can do about this */ rc = -3; close(fd); goto out; } close(fd); switch (link(tf_name, lf_name)) { case 0: if (stat(tf_name, &sbuf) < 0) { /* something weird happened */ rc = -3; break; } if (sbuf.st_nlink < 2) { /* somehow, it didn't get through - NFS trouble? */ rc = -2; break; } rc = 0; break; case EEXIST: rc = -1; break; default: rc = -3; } out: unlink(tf_name); return rc; } /* * Unlock a file (remove its lockfile) * do we need to check, if its (still) ours? 
No, IMHO, if someone else * locked our line, it's his fault -tho * returns 0 on success * <0 if some failure occured */ static int sbd_unlock_pidfile(const char *filename) { char lf_name[256]; if (filename == NULL) { errno = EFAULT; return -1; } snprintf(lf_name, sizeof(lf_name), "%s", filename); return unlink(lf_name); } int quorum_read(int good_servants) { if (servant_count > 2) return (good_servants > servant_count/2); else return (good_servants > 0); } void inquisitor_child(void) { int sig, pid; sigset_t procmask; siginfo_t sinfo; int status; struct timespec timeout; int exiting = 0; int decoupled = 0; int pcmk_healthy = 0; int pcmk_override = 0; time_t latency; struct timespec t_last_tickle, t_now; struct servants_list_item* s; if (debug_mode) { cl_log(LOG_ERR, "DEBUG MODE %d IS ACTIVE - DO NOT RUN IN PRODUCTION!", debug_mode); } set_proc_title("sbd: inquisitor"); if (pidfile) { if (sbd_lock_pidfile(pidfile) < 0) { exit(1); } } sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigaddset(&procmask, SIGTERM); sigaddset(&procmask, SIG_LIVENESS); sigaddset(&procmask, SIG_EXITREQ); sigaddset(&procmask, SIG_TEST); sigaddset(&procmask, SIG_IO_FAIL); sigaddset(&procmask, SIG_PCMK_UNHEALTHY); sigaddset(&procmask, SIG_RESTART); sigaddset(&procmask, SIGUSR1); sigaddset(&procmask, SIGUSR2); sigprocmask(SIG_BLOCK, &procmask, NULL); servants_start(); timeout.tv_sec = timeout_loop; timeout.tv_nsec = 0; clock_gettime(CLOCK_MONOTONIC, &t_last_tickle); while (1) { int good_servants = 0; sig = sigtimedwait(&procmask, &sinfo, &timeout); clock_gettime(CLOCK_MONOTONIC, &t_now); if (sig == SIG_EXITREQ || sig == SIGTERM) { servants_kill(); watchdog_close(true); exiting = 1; } else if (sig == SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } else { cleanup_servant_by_pid(pid); } } } else if (sig == SIG_PCMK_UNHEALTHY) { s = lookup_servant_by_pid(sinfo.si_pid); - if (s && strcmp(s->devname, "pcmk") == 0) { + if (sbd_is_disk(s) 
== false) { if (pcmk_healthy != 0) { cl_log(LOG_WARNING, "Pacemaker health check: UNHEALTHY"); } pcmk_healthy = 0; clock_gettime(CLOCK_MONOTONIC, &s->t_last); } else { cl_log(LOG_WARNING, "Ignoring SIG_PCMK_UNHEALTHY from unknown source"); } } else if (sig == SIG_IO_FAIL) { s = lookup_servant_by_pid(sinfo.si_pid); if (s) { DBGLOG(LOG_INFO, "Servant for %s requests to be disowned", s->devname); cleanup_servant_by_pid(sinfo.si_pid); } } else if (sig == SIG_LIVENESS) { s = lookup_servant_by_pid(sinfo.si_pid); if (s) { - if (strcmp(s->devname, "pcmk") == 0) { + if (sbd_is_disk(s) == false) { if (pcmk_healthy != 1) { cl_log(LOG_INFO, "Pacemaker health check: OK"); } pcmk_healthy = 1; }; s->first_start = 0; clock_gettime(CLOCK_MONOTONIC, &s->t_last); } } else if (sig == SIG_TEST) { } else if (sig == SIGUSR1) { if (exiting) continue; servants_start(); } if (exiting) { if (check_all_dead()) { if (pidfile) { sbd_unlock_pidfile(pidfile); } exit(0); } else continue; } good_servants = 0; for (s = servants_leader; s; s = s->next) { int age = t_now.tv_sec - s->t_last.tv_sec; if (!s->t_last.tv_sec) continue; if (age < (int)(timeout_io+timeout_loop)) { - if (strcmp(s->devname, "pcmk") != 0) { + if (sbd_is_disk(s)) { good_servants++; } s->outdated = 0; } else if (!s->outdated) { - if (strcmp(s->devname, "pcmk") == 0) { + if (sbd_is_disk(s) == false) { /* If the state is outdated, we * override the last reported * state */ pcmk_healthy = 0; cl_log(LOG_WARNING, "Pacemaker state outdated (age: %d)", age); } else if (!s->restart_blocked) { cl_log(LOG_WARNING, "Servant for %s outdated (age: %d)", s->devname, age); } s->outdated = 1; } } if(!decoupled && check_pcmk && servant_count == 0) { pcmk_healthy = TRUE; } if (quorum_read(good_servants) || (check_pcmk && pcmk_healthy) || (check_pcmk == FALSE && servant_count == 0)) { if (!decoupled) { cl_log(LOG_DEBUG, "Decoupling"); if (inquisitor_decouple() < 0) { servants_kill(); exiting = 1; continue; } else { decoupled = 1; } } if 
(servant_count == 0) { /* cl_log(LOG_DEBUG, "Stand-alone mode"); */ } else if (!quorum_read(good_servants)) { cl_log(LOG_DEBUG, "Not enough good servants: %d", good_servants); if (!pcmk_override) { cl_log(LOG_WARNING, "Majority of devices lost - surviving on pacemaker"); pcmk_override = 1; /* Just to ensure the message is only logged once */ } } else { pcmk_override = 0; } watchdog_tickle(); clock_gettime(CLOCK_MONOTONIC, &t_last_tickle); /* cl_log(LOG_DEBUG, "Tickle: q=%d, g=%d, p=%d, s=%d", */ /* quorum_read(good_servants), good_servants, pcmk_healthy, servant_count); */ } /* Note that this can actually be negative, since we set * last_tickle after we set now. */ latency = t_now.tv_sec - t_last_tickle.tv_sec; if (timeout_watchdog && (latency > (int)timeout_watchdog)) { if (!decoupled) { /* We're still being watched by our * parent. We don't fence, but exit. */ cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up."); servants_kill(); exiting = 1; continue; } if (debug_mode < 2) { /* At level 2 or above, we do nothing, but expect * things to eventually return to * normal. 
*/ do_reset(); } else { cl_log(LOG_ERR, "SBD: DEBUG MODE: Would have fenced due to timeout!"); } } if (timeout_watchdog_warn && (latency > (int)timeout_watchdog_warn)) { cl_log(LOG_WARNING, "Latency: No liveness for %d s exceeds threshold of %d s (healthy servants: %d)", (int)latency, (int)timeout_watchdog_warn, good_servants); if (debug_mode && watchdog_use) { /* In debug mode, trigger a reset before the watchdog can panic the machine */ do_reset(); } } for (s = servants_leader; s; s = s->next) { int age = t_now.tv_sec - s->t_started.tv_sec; if (age > servant_restart_interval) { s->restarts = 0; s->restart_blocked = 0; } if (servant_restart_count && (s->restarts >= servant_restart_count) && !s->restart_blocked) { if (servant_restart_count > 1) { cl_log(LOG_WARNING, "Max retry count (%d) reached: not restarting servant for %s", (int)servant_restart_count, s->devname); } s->restart_blocked = 1; } if (!s->restart_blocked) { servant_start(s); } } } /* not reached */ exit(0); } int inquisitor(void) { int sig, pid, inquisitor_pid; int status; sigset_t procmask; siginfo_t sinfo; /* Where's the best place for sysrq init ?*/ sysrq_init(); sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigaddset(&procmask, SIG_LIVENESS); sigprocmask(SIG_BLOCK, &procmask, NULL); inquisitor_pid = make_daemon(); if (inquisitor_pid == 0) { inquisitor_child(); } /* We're the parent. Wait for a happy signal from our child * before we proceed - we either get "SIG_LIVENESS" when the * inquisitor has completed the first successful round, or * ECHLD when it exits with an error. */ while (1) { sig = sigwaitinfo(&procmask, &sinfo); if (sig == SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } /* We got here because the inquisitor * did not succeed. */ return -1; } } else if (sig == SIG_LIVENESS) { /* Inquisitor started up properly. 
*/ return 0; } else { fprintf(stderr, "Nobody expected the spanish inquisition!\n"); continue; } } /* not reached */ return -1; } /* Split a device list into entries and recruit one servant per entry. Entries are delimited by ';' or by whitespace; empty entries are skipped. Returns the number of entries recruited (0 for a NULL or empty line). */ int parse_device_line(const char *line) { int lpc = 0; int last = 0; int max = 0; int found = 0; if(line) { max = strlen(line); } if (max <= 0) { return found; } cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", max, line); /* Skip initial whitespace (cast: <ctype.h> functions are undefined for negative char values) */ for (lpc = 0; lpc <= max && isspace((unsigned char)line[lpc]); lpc++) { last = lpc + 1; } /* Now the actual content */ for (lpc = 0; lpc <= max; lpc++) { int a_space = isspace((unsigned char)line[lpc]); if (a_space && lpc < max && isspace((unsigned char)line[lpc + 1])) { /* fast-forward to the end of the spaces */ } else if (a_space || line[lpc] == ';' || line[lpc] == 0) { int rc = 1; int len = lpc - last; char *entry = NULL; /* The fast-forward above leaves the earlier spaces of a run inside [last, lpc); trim them off the token. */ while (len > 0 && isspace((unsigned char)line[last + len - 1])) { len--; } if (len > 0) { /* Copy exactly the token. The previous sscanf("%[^;]") stopped only at ';', so it read past a whitespace delimiter and overflowed the buffer sized for this token. */ entry = calloc(1, (size_t)len + 1); if (entry != NULL) { memcpy(entry, line + last, (size_t)len); } else { rc = 0; } } if (len <= 0) { /* Skip */ } else if (rc != 1) { cl_log(LOG_WARNING, "Could not parse (%d %d): %s", last, lpc, line + last); } else { cl_log(LOG_DEBUG, "Adding '%s'", entry); recruit_servant(entry, 0); found++; } free(entry); last = lpc + 1; } } return found; } int main(int argc, char **argv, char **envp) { int exit_status = 0; int c; int w = 0; int qb_facility; const char *value = NULL; int start_delay = 0; if ((cmdname = strrchr(argv[0], '/')) == NULL) { cmdname = argv[0]; } else { ++cmdname; } watchdogdev = strdup("/dev/watchdog"); qb_facility = qb_log_facility2int("daemon"); qb_log_init(cmdname, qb_facility, LOG_WARNING); sbd_set_format_string(QB_LOG_SYSLOG, "sbd"); qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE); qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE); sbd_get_uname(); value = getenv("SBD_DEVICE"); if(value) { #if SUPPORT_SHARED_DISK int devices = parse_device_line(value); if(devices < 1) { fprintf(stderr, "Invalid device line: %s\n", value); exit_status = -2; goto out; } #else fprintf(stderr, "Shared disk functionality not supported\n"); exit_status = -2; goto out; #endif } value = 
getenv("SBD_PACEMAKER"); if(value) { check_pcmk = crm_is_true(value); } cl_log(LOG_INFO, "Enable pacemaker checks: %d (%s)", (int)check_pcmk, value?value:"default"); value = getenv("SBD_STARTMODE"); if(value == NULL) { } else if(strcmp(value, "clean") == 0) { start_mode = 1; } else if(strcmp(value, "always") == 0) { start_mode = 0; } cl_log(LOG_INFO, "Start mode set to: %d (%s)", (int)start_mode, value?value:"default"); value = getenv("SBD_WATCHDOG_DEV"); if(value) { free(watchdogdev); watchdogdev = strdup(value); } value = getenv("SBD_WATCHDOG_TIMEOUT"); if(value) { timeout_watchdog = crm_get_msec(value) / 1000; if(timeout_watchdog > 5) { timeout_watchdog_warn = (int)timeout_watchdog / 5 * 3; } } value = getenv("SBD_PIDFILE"); if(value) { pidfile = strdup(value); cl_log(LOG_INFO, "pidfile set to %s", pidfile); } value = getenv("SBD_DELAY_START"); if(value) { start_delay = crm_is_true(value); } cl_log(LOG_DEBUG, "Start delay: %d (%s)", (int)start_delay, value?value:"default"); /* The option string must list every letter handled in the switch: without 'c' here getopt() reports -c as unknown and the 'c' case is never reached. */ while ((c = getopt(argc, argv, "C:DPRTWZchvw:d:n:p:1:2:3:4:5:t:I:F:S:s:")) != -1) { switch (c) { case 'D': break; case 'Z': debug_mode++; cl_log(LOG_INFO, "Debug mode now at level %d", (int)debug_mode); break; case 'R': skip_rt = 1; cl_log(LOG_INFO, "Realtime mode deactivated."); break; case 'S': start_mode = atoi(optarg); cl_log(LOG_INFO, "Start mode set to: %d", (int)start_mode); break; case 's': timeout_startup = atoi(optarg); cl_log(LOG_INFO, "Start timeout set to: %d", (int)timeout_startup); break; case 'v': debug++; if(debug == 1) { /* The per-file filters list every sbd source file, including sbd-cluster.c, so that -v also covers the cluster servant. */ qb_log_filter_ctl(QB_LOG_SYSLOG, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, "sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,sbd-cluster.c", LOG_DEBUG); qb_log_filter_ctl(QB_LOG_STDERR, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, "sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,sbd-cluster.c", LOG_DEBUG); cl_log(LOG_INFO, "Verbose mode enabled."); } else if(debug == 2) { /* Go nuts, turn on pacemaker's logging too */ qb_log_filter_ctl(QB_LOG_SYSLOG, QB_LOG_FILTER_ADD, 
QB_LOG_FILTER_FILE, "*", LOG_DEBUG); qb_log_filter_ctl(QB_LOG_STDERR, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, "*", LOG_DEBUG); cl_log(LOG_INFO, "Verbose library mode enabled."); } break; case 'T': watchdog_set_timeout = 0; cl_log(LOG_INFO, "Setting watchdog timeout disabled; using defaults."); break; case 'W': w++; break; case 'w': free(watchdogdev); watchdogdev = strdup(optarg); /* Log after the assignment so the message names the device that was selected, not the one it replaces. */ cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev); break; case 'd': #if SUPPORT_SHARED_DISK recruit_servant(optarg, 0); #else fprintf(stderr, "Shared disk functionality not supported\n"); exit_status = -2; goto out; #endif break; + case 'c': + check_cluster = 1; + break; case 'P': check_pcmk = 1; break; case 'n': local_uname = strdup(optarg); cl_log(LOG_INFO, "Overriding local hostname to %s", local_uname); break; case 'p': pidfile = strdup(optarg); cl_log(LOG_INFO, "pidfile set to %s", pidfile); break; case 'C': timeout_watchdog_crashdump = atoi(optarg); cl_log(LOG_INFO, "Setting crashdump watchdog timeout to %d", (int)timeout_watchdog_crashdump); break; case '1': timeout_watchdog = atoi(optarg); if(timeout_watchdog > 5) { timeout_watchdog_warn = (int)timeout_watchdog / 5 * 3; } break; case '2': timeout_allocate = atoi(optarg); break; case '3': timeout_loop = atoi(optarg); break; case '4': timeout_msgwait = atoi(optarg); break; case '5': timeout_watchdog_warn = atoi(optarg); cl_log(LOG_INFO, "Setting latency warning to %d", (int)timeout_watchdog_warn); break; case 't': servant_restart_interval = atoi(optarg); cl_log(LOG_INFO, "Setting servant restart interval to %d", (int)servant_restart_interval); break; case 'I': timeout_io = atoi(optarg); cl_log(LOG_INFO, "Setting IO timeout to %d", (int)timeout_io); break; case 'F': servant_restart_count = atoi(optarg); cl_log(LOG_INFO, "Servant restart count set to %d", (int)servant_restart_count); break; case 'h': usage(); return (0); default: exit_status = -2; goto out; break; } } if (w > 0) { watchdog_use = w % 2; } else if(watchdogdev 
== NULL || strcmp(watchdogdev, "/dev/null") == 0) { watchdog_use = 0; } if (watchdog_use) { cl_log(LOG_INFO, "Watchdog enabled."); } else { cl_log(LOG_INFO, "Watchdog disabled."); } if (servant_count > 3) { fprintf(stderr, "You can specify up to 3 devices via the -d option.\n"); exit_status = -1; goto out; } /* There must at least be one command following the options: */ if ((argc - optind) < 1) { fprintf(stderr, "Not enough arguments.\n"); exit_status = -2; goto out; } if (init_set_proc_title(argc, argv, envp) < 0) { fprintf(stderr, "Allocation of proc title failed.\n"); exit_status = -1; goto out; } #if SUPPORT_SHARED_DISK if (strcmp(argv[optind], "create") == 0) { exit_status = init_devices(servants_leader); - } else if (strcmp(argv[optind], "dump") == 0) { + + } else if (strcmp(argv[optind], "dump") == 0) { exit_status = dump_headers(servants_leader); - } else if (strcmp(argv[optind], "allocate") == 0) { + + } else if (strcmp(argv[optind], "allocate") == 0) { exit_status = allocate_slots(argv[optind + 1], servants_leader); - } else if (strcmp(argv[optind], "list") == 0) { + + } else if (strcmp(argv[optind], "list") == 0) { exit_status = list_slots(servants_leader); - } else if (strcmp(argv[optind], "message") == 0) { + + } else if (strcmp(argv[optind], "message") == 0) { exit_status = messenger(argv[optind + 1], argv[optind + 2], servants_leader); - } else if (strcmp(argv[optind], "ping") == 0) { + + } else if (strcmp(argv[optind], "ping") == 0) { exit_status = ping_via_slots(argv[optind + 1], servants_leader); - } else if (strcmp(argv[optind], "watch") == 0) { + + } else if (strcmp(argv[optind], "watch") == 0) { if(servant_count > 0) { /* If no devices are specified, its not an error to be unable to find one */ open_any_device(servants_leader); } - /* We only want this to have an effect during watch right now; - * pinging and fencing would be too confused */ - cl_log(LOG_INFO, "Turning on pacemaker checks: %d", check_pcmk); - if (check_pcmk) { - 
recruit_servant("pcmk", 0); - servant_count--; - } - if(start_delay) { unsigned long delay = get_first_msgwait(servants_leader); sleep(delay); } - exit_status = inquisitor(); - } else { exit_status = -2; } -#else +#endif + if (strcmp(argv[optind], "watch") == 0) { - /* sleep $(sbd -d "$SBD_DEVICE" dump | grep -m 1 msgwait | awk '{print $4}') 2>/dev/null */ + /* sleep $(sbd -d "$SBD_DEVICE" dump | grep -m 1 msgwait | awk '{print $4}') 2>/dev/null */ /* We only want this to have an effect during watch right now; * pinging and fencing would be too confused */ cl_log(LOG_INFO, "Turning on pacemaker checks: %d", check_pcmk); if (check_pcmk) { recruit_servant("pcmk", 0); servant_count--; } + cl_log(LOG_INFO, "Turning on cluster checks: %d", check_cluster); + if (check_cluster) { + recruit_servant("cluster", 0); + servant_count--; + } + exit_status = inquisitor(); - } else { - exit_status = -2; } -#endif -out: + + out: if (exit_status < 0) { if (exit_status == -2) { usage(); } else { fprintf(stderr, "sbd failed; please check the logs.\n"); } return (1); } return (0); } diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c index 772b265..0f41d27 100644 --- a/src/sbd-pacemaker.c +++ b/src/sbd-pacemaker.c @@ -1,620 +1,442 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * Based on crm_mon.c, which was: * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* TODO list: * * - Trying to shutdown a node if no devices are up will fail, since SBD * currently uses a message via the disk to achieve this. * * - Shutting down cluster nodes while the majority of devices is down * will eventually take the cluster below the quorum threshold, at which * time the remaining cluster nodes will all immediately suicide. * */ #include "sbd.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#ifdef SUPPORT_PLUGIN -# include -#endif #include #include - -enum pcmk_health -{ - pcmk_health_unknown, - pcmk_health_pending, - pcmk_health_transient, - pcmk_health_unclean, - pcmk_health_shutdown, - pcmk_health_online, - pcmk_health_noquorum, -}; - extern int servant_count; static void clean_up(int rc); static void crm_diff_update(const char *event, xmlNode * msg); static int cib_connect(gboolean full); static void set_pcmk_health(enum pcmk_health healthy); -static void notify_parent(void); static void compute_status(pe_working_set_t * data_set); static gboolean mon_refresh_state(gpointer user_data); static GMainLoop *mainloop = NULL; static guint timer_id_reconnect = 0; static guint timer_id_notify = 0; static int reconnect_msec = 1000; static enum pcmk_health pcmk_healthy = 0; +static int last_state = 0; static int cib_connected = 0; -#ifdef SUPPORT_PLUGIN -static guint timer_id_ais = 0; -static enum cluster_type_e cluster_stack = pcmk_cluster_unknown; -static struct timespec t_last_quorum; -static int check_ais = 0; -#endif - - - -#define LOGONCE(state, lvl, fmt, args...) 
do { \ - if (last_state != state) { \ - cl_log(lvl, fmt, ##args); \ - last_state = state; \ - } \ - healthy = state; \ - } while(0) - static cib_t *cib = NULL; static xmlNode *current_cib = NULL; static long last_refresh = 0; - - static gboolean mon_timer_reconnect(gpointer data) { int rc = 0; if (timer_id_reconnect > 0) { g_source_remove(timer_id_reconnect); } rc = cib_connect(TRUE); if (rc != 0) { cl_log(LOG_WARNING, "CIB reconnect failed: %d", rc); timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL); } else { cl_log(LOG_INFO, "CIB reconnect successful"); } return FALSE; } static void mon_cib_connection_destroy(gpointer user_data) { if (cib) { cl_log(LOG_WARNING, "Disconnected from CIB"); cib->cmds->signoff(cib); set_pcmk_health(pcmk_health_transient); timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL); } cib_connected = 0; return; } static gboolean mon_timer_notify(gpointer data) { static int counter = 0; int counter_max = timeout_watchdog / timeout_loop; if (timer_id_notify > 0) { g_source_remove(timer_id_notify); } if (cib_connected) { if (counter == counter_max) { free_xml(current_cib); current_cib = get_cib_copy(cib); mon_refresh_state(NULL); counter = 0; } else { cib->cmds->noop(cib, 0); - notify_parent(); + notify_parent(pcmk_healthy); counter++; } } timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL); return FALSE; } /* * Mainloop signal handler. 
*/ static void mon_shutdown(int nsig) { clean_up(0); } static int cib_connect(gboolean full) { int rc = 0; CRM_CHECK(cib != NULL, return -EINVAL); cib_connected = 0; crm_xml_init(); if (cib->state != cib_connected_query && cib->state != cib_connected_command) { rc = cib->cmds->signon(cib, crm_system_name, cib_query); if (rc != 0) { return rc; } current_cib = get_cib_copy(cib); mon_refresh_state(NULL); if (full) { if (rc == 0) { rc = cib->cmds->set_connection_dnotify(cib, mon_cib_connection_destroy); if (rc == -EPROTONOSUPPORT) { /* Notification setup failed, won't be able to reconnect after failure */ rc = 0; } } if (rc == 0) { cib->cmds->del_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); rc = cib->cmds->add_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); } if (rc != 0) { /* Notification setup failed, could not monitor CIB actions */ clean_up(-rc); } } } if (!rc) { cib_connected = 1; } return rc; } -#ifdef SUPPORT_PLUGIN -static gboolean -mon_timer_ais(gpointer data) -{ - if (timer_id_ais > 0) { - g_source_remove(timer_id_ais); - } - - send_cluster_text(crm_class_quorum, NULL, TRUE, NULL, crm_msg_ais); - - /* The timer is set in the response processing */ - return FALSE; -} - -static void -ais_membership_destroy(gpointer user_data) -{ - cl_log(LOG_ERR, "AIS connection terminated - corosync down?"); -#if SUPPORT_PLUGIN - ais_fd_sync = -1; -#endif - /* TODO: Is recovery even worth it here? After all, this means - * that corosync died ... 
*/ - exit(1); -} - -static void -ais_membership_dispatch(cpg_handle_t handle, - const struct cpg_name *groupName, - uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) -{ - uint32_t kind = 0; - const char *from = NULL; - char *data = pcmk_message_common_cs(handle, nodeid, pid, msg, &kind, &from); - - if (!data) { - return; - } - free(data); - data = NULL; - - if (kind != crm_class_quorum) { - return; - } - - DBGLOG(LOG_INFO, "AIS quorum state: %d", (int)crm_have_quorum); - clock_gettime(CLOCK_MONOTONIC, &t_last_quorum); - - timer_id_ais = g_timeout_add(timeout_loop * 1000, mon_timer_ais, NULL); - - return; -} -#endif static void compute_status(pe_working_set_t * data_set) { static int updates = 0; - static int last_state = 0; static int ever_had_quorum = FALSE; int healthy = 0; - struct timespec t_now; node_t *node = pe_find_node(data_set->nodes, local_uname); updates++; - clock_gettime(CLOCK_MONOTONIC, &t_now); if (data_set->dc_node == NULL) { LOGONCE(pcmk_health_transient, LOG_INFO, "We don't have a DC right now."); goto out; } if (node == NULL) { - LOGONCE(pcmk_health_unknown, LOG_WARNING, "Node state: UNKNOWN"); + LOGONCE(pcmk_health_unknown, LOG_WARNING, "Node state: %s is UNKNOWN", local_uname); } else if (node->details->online == FALSE) { LOGONCE(pcmk_health_unknown, LOG_WARNING, "Node state: OFFLINE"); } else if (node->details->unclean) { LOGONCE(pcmk_health_unclean, LOG_WARNING, "Node state: UNCLEAN"); } else if (node->details->pending) { LOGONCE(pcmk_health_pending, LOG_WARNING, "Node state: pending"); #if 0 } else if (node->details->shutdown) { LOGONCE(pcmk_health_shutdown, LOG_WARNING, "Node state: shutting down"); #endif } else { if (data_set->flags & pe_flag_have_quorum) { LOGONCE(pcmk_health_online, LOG_INFO, "Node state: online"); ever_had_quorum = TRUE; } else if(servant_count > 0) { LOGONCE(pcmk_health_noquorum, LOG_WARNING, "Quorum lost"); goto out; } else if(ever_had_quorum == FALSE) { LOGONCE(pcmk_health_online, LOG_INFO, "We do not have 
quorum yet"); } else { /* We lost quorum, and there are no disks present * Setting healthy > 2 here will result in us self-fencing */ switch (data_set->no_quorum_policy) { case no_quorum_freeze: LOGONCE(pcmk_health_transient, LOG_INFO, "Quorum lost: Freeze resources"); break; case no_quorum_stop: LOGONCE(pcmk_health_transient, LOG_INFO, "Quorum lost: Stop ALL resources"); break; case no_quorum_ignore: LOGONCE(pcmk_health_transient, LOG_INFO, "Quorum lost: Ignore"); break; case no_quorum_suicide: LOGONCE(pcmk_health_unclean, LOG_INFO, "Quorum lost: Self-fence"); break; } } } -#ifdef SUPPORT_PLUGIN - if (check_ais) { - int quorum_age = t_now.tv_sec - t_last_quorum.tv_sec; - - if (quorum_age > (int)(timeout_io+timeout_loop)) { - if (t_last_quorum.tv_sec != 0) - LOGONCE(pcmk_health_transient, LOG_WARNING, "AIS: Quorum outdated"); - - } else if (crm_have_quorum) { - LOGONCE(pcmk_health_online, LOG_INFO, "AIS: We have quorum"); - - } else { - LOGONCE(pcmk_health_unclean, LOG_WARNING, "AIS: We do NOT have quorum"); - } - } -#endif - out: set_pcmk_health(healthy); return; } static void set_pcmk_health(enum pcmk_health healthy) { pcmk_healthy = healthy; - notify_parent(); -} - - -static void -notify_parent(void) -{ - pid_t ppid; - union sigval signal_value; - - memset(&signal_value, 0, sizeof(signal_value)); - ppid = getppid(); - - if (ppid == 1) { - /* Our parent died unexpectedly. Triggering - * self-fence. 
*/ - cl_log(LOG_WARNING, "Our parent is dead."); - do_reset(); - } - - switch (pcmk_healthy) { - case pcmk_health_pending: - case pcmk_health_shutdown: - case pcmk_health_transient: - DBGLOG(LOG_INFO, "Not notifying parent: state transient (%d)", pcmk_healthy); - break; - - case pcmk_health_unknown: - case pcmk_health_unclean: - case pcmk_health_noquorum: - DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY (%d)", pcmk_healthy); - sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); - break; - - case pcmk_health_online: - DBGLOG(LOG_INFO, "Notifying parent: healthy"); - sigqueue(ppid, SIG_LIVENESS, signal_value); - break; - - default: - DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY %d", pcmk_healthy); - sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); - break; - } + notify_parent(pcmk_healthy); } static crm_trigger_t *refresh_trigger = NULL; static gboolean mon_trigger_refresh(gpointer user_data) { mainloop_set_trigger(refresh_trigger); mon_refresh_state(NULL); return FALSE; } /* CIB diff notification callback (registered for T_CIB_DIFF_NOTIFY in cib_connect()). Applies the patch in msg to the cached current_cib, falls back to get_cib_copy() when no cached copy is left, then either refreshes the state now or arms the refresh trigger and timer. */ static void crm_diff_update(const char *event, xmlNode * msg) { int rc = -1; const char *op = NULL; long now = time(NULL); static int updates = 0; static mainloop_timer_t *refresh_timer = NULL; if(refresh_timer == NULL) { refresh_timer = mainloop_timer_add("refresh", 2000, FALSE, mon_trigger_refresh, NULL); refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_state, refresh_timer); } if (current_cib != NULL) { xmlNode *cib_last = current_cib; current_cib = NULL; /* The patched CIB is returned through the address of current_cib. */ rc = cib_apply_patch_event(msg, cib_last, &current_cib, LOG_DEBUG); free_xml(cib_last); switch(rc) { case -pcmk_err_diff_resync: case -pcmk_err_diff_failed: crm_warn("[%s] %s Patch aborted: %s (%d)", event, op, pcmk_strerror(rc), rc); break; case pcmk_ok: updates++; break; default: crm_notice("[%s] %s ABORTED: %s (%d)", event, op, pcmk_strerror(rc), rc); break; } } if (current_cib == NULL) { current_cib = get_cib_copy(cib); } /* Refresh * - immediately if the last update was more than 5s ago * - every 10 updates * - at most 2s 
after the last update */ if (updates > 10 || (now - last_refresh) > (reconnect_msec / 1000)) { mon_refresh_state(refresh_timer); updates = 0; } else { mainloop_set_trigger(refresh_trigger); mainloop_timer_start(refresh_timer); } } static gboolean mon_refresh_state(gpointer user_data) { xmlNode *cib_copy = NULL; pe_working_set_t data_set; if(current_cib == NULL) { return FALSE; } if(user_data) { mainloop_timer_t *timer = user_data; mainloop_timer_stop(timer); } cib_copy = copy_xml(current_cib); if (cli_config_update(&cib_copy, NULL, FALSE) == FALSE) { cl_log(LOG_WARNING, "cli_config_update() failed - forcing reconnect to CIB"); if (cib) { cib->cmds->signoff(cib); } } else { last_refresh = time(NULL); set_working_set_defaults(&data_set); data_set.input = cib_copy; data_set.flags |= pe_flag_have_stonith_resource; cluster_status(&data_set); compute_status(&data_set); cleanup_calculations(&data_set); } return FALSE; } static void clean_up(int rc) { if (cib != NULL) { cib->cmds->signoff(cib); cib_delete(cib); cib = NULL; } if (rc >= 0) { exit(rc); } return; } int servant_pcmk(const char *diskname, int mode, const void* argp) { int exit_code = 0; - crm_cluster_t crm_cluster; cl_log(LOG_INFO, "Monitoring Pacemaker health"); set_proc_title("sbd: watcher: Pacemaker"); setenv("PCMK_watchdog", "true", 1); if(debug == 0) { /* We don't want any noisy crm messages */ set_crm_log_level(LOG_CRIT); } -#ifdef SUPPORT_PLUGIN - cluster_stack = get_cluster_type(); - - if (cluster_stack != pcmk_cluster_classic_ais) { - check_ais = 0; - } else { - check_ais = 1; - cl_log(LOG_INFO, "Legacy plug-in detected, AIS quorum check enabled"); - if(is_openais_cluster()) { - crm_cluster.destroy = ais_membership_destroy; - crm_cluster.cpg.cpg_deliver_fn = ais_membership_dispatch; - /* crm_cluster.cpg.cpg_confchg_fn = pcmk_cpg_membership; TODO? 
*/ - crm_cluster.cpg.cpg_confchg_fn = NULL; - } - - while (!crm_cluster_connect(&crm_cluster)) { - cl_log(LOG_INFO, "Waiting to sign in with cluster ..."); - sleep(reconnect_msec / 1000); - } - } -#endif - if (current_cib == NULL) { cib = cib_new(); do { exit_code = cib_connect(TRUE); if (exit_code != 0) { sleep(reconnect_msec / 1000); } } while (exit_code == -ENOTCONN); if (exit_code != 0) { clean_up(-exit_code); } } mainloop = g_main_new(FALSE); mainloop_add_signal(SIGTERM, mon_shutdown); mainloop_add_signal(SIGINT, mon_shutdown); timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL); -#ifdef SUPPORT_PLUGIN - if (check_ais) { - timer_id_ais = g_timeout_add(timeout_loop * 1000, mon_timer_ais, NULL); - } -#endif g_main_run(mainloop); g_main_destroy(mainloop); clean_up(0); return 0; /* never reached */ } diff --git a/src/sbd.h b/src/sbd.h index 5a42252..78425ae 100644 --- a/src/sbd.h +++ b/src/sbd.h @@ -1,174 +1,195 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* signals reserved for multi-disk sbd */ #define SIG_LIVENESS (SIGRTMIN + 1) /* report liveness of the disk */ #define SIG_EXITREQ (SIGRTMIN + 2) /* exit request to inquisitor */ #define SIG_TEST (SIGRTMIN + 3) /* trigger self test */ #define SIG_RESTART (SIGRTMIN + 4) /* trigger restart of all failed disk */ #define SIG_IO_FAIL (SIGRTMIN + 5) /* the IO child requests to be considered failed */ #define SIG_PCMK_UNHEALTHY (SIGRTMIN + 6) /* FIXME: should add dynamic check of SIG_XX >= SIGRTMAX */ #define HOG_CHAR 0xff /* Sector data types */ struct sector_header_s { char magic[8]; unsigned char version; unsigned char slots; /* Caveat: stored in network byte-order */ uint32_t sector_size; uint32_t timeout_watchdog; uint32_t timeout_allocate; uint32_t timeout_loop; uint32_t timeout_msgwait; /* Minor version for extensions to the core data set: * compatible and optional values. 
*/ unsigned char minor_version; uuid_t uuid; /* 16 bytes */ }; struct sector_mbox_s { signed char cmd; char from[64]; }; struct sector_node_s { /* slots will be created with in_use == 0 */ char in_use; char name[64]; }; struct servants_list_item { const char* devname; pid_t pid; int restarts; int restart_blocked; int outdated; int first_start; struct timespec t_last, t_started; struct servants_list_item *next; }; struct sbd_context { int devfd; io_context_t ioctx; struct iocb io; }; +enum pcmk_health +{ + pcmk_health_unknown, + pcmk_health_pending, + pcmk_health_transient, + pcmk_health_unclean, + pcmk_health_shutdown, + pcmk_health_online, + pcmk_health_noquorum, +}; + void usage(void); int watchdog_init_interval(void); int watchdog_tickle(void); int watchdog_init(void); void sysrq_init(void); void watchdog_close(bool disarm); void sysrq_trigger(char t); void do_crashdump(void); void do_reset(void); void do_off(void); pid_t make_daemon(void); void maximize_priority(void); void sbd_get_uname(void); void sbd_set_format_string(int method, const char *daemon); +void notify_parent(enum pcmk_health healthy); /* Tunable defaults: */ extern unsigned long timeout_watchdog; extern unsigned long timeout_watchdog_warn; extern unsigned long timeout_watchdog_crashdump; extern int timeout_allocate; extern int timeout_loop; extern int timeout_msgwait; extern int timeout_io; extern int timeout_startup; extern int watchdog_use; extern int watchdog_set_timeout; extern int skip_rt; extern int debug; extern int debug_mode; extern char *watchdogdev; extern char* local_uname; /* Global, non-tunable variables: */ extern int sector_size; extern int watchdogfd; extern const char* cmdname; typedef int (*functionp_t)(const char* devname, int mode, const void* argp); int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp); #if SUPPORT_SHARED_DISK void open_any_device(struct servants_list_item *servants); int init_devices(struct servants_list_item *servants); 
int allocate_slots(const char *name, struct servants_list_item *servants); int list_slots(struct servants_list_item *servants); int ping_via_slots(const char *name, struct servants_list_item *servants); int dump_headers(struct servants_list_item *servants); unsigned long get_first_msgwait(struct servants_list_item *servants); int messenger(const char *name, const char *msg, struct servants_list_item *servants); int servant(const char *diskname, int mode, const void* argp); #endif int servant_pcmk(const char *diskname, int mode, const void* argp); +int servant_cluster(const char *diskname, int mode, const void* argp); struct servants_list_item *lookup_servant_by_dev(const char *devname); struct servants_list_item *lookup_servant_by_pid(pid_t pid); int init_set_proc_title(int argc, char *argv[], char *envp[]); void set_proc_title(const char *fmt,...); #define cl_log(level, fmt, args...) qb_log_from_external_source( __func__, __FILE__, fmt, level, __LINE__, 0, ##args) # define cl_perror(fmt, args...) do { \ const char *err = strerror(errno); \ cl_log(LOG_ERR, fmt ": %s (%d)", ##args, err, errno); \ } while(0) #define DBGLOG(lvl, fmt, args...) do { \ if (debug > 0) cl_log(lvl, fmt, ##args); \ } while(0) + +/* Log a health-state message only when the state differs from the last one logged, and always record the state. Expands in place: the calling scope must provide a `last_state` variable and a `healthy` variable. The state argument is parenthesized so any expression can be passed. */ +#define LOGONCE(state, lvl, fmt, args...) do { \ + if (last_state != (state)) { \ + cl_log(lvl, fmt, ##args); \ + last_state = (state); \ + } \ + healthy = (state); \ + } while(0)