diff --git a/src/sbd-cluster.c b/src/sbd-cluster.c index eb212e4..d2b8b0a 100644 --- a/src/sbd-cluster.c +++ b/src/sbd-cluster.c @@ -1,220 +1,170 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * Based on crm_mon.c, which was: * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -/* TODO list: - * - * - Trying to shutdown a node if no devices are up will fail, since SBD - * currently uses a message via the disk to achieve this. - * - * - Shutting down cluster nodes while the majority of devices is down - * will eventually take the cluster below the quorum threshold, at which - * time the remaining cluster nodes will all immediately suicide. - * - */ - #include "sbd.h" -#include - -#include - -#include -#include -#include -#include - -#include #include -#include -#include -#include #include - #include -#include -#include -#include -#include -#include + #include +#include //undef SUPPORT_PLUGIN //define SUPPORT_PLUGIN 1 -#if SUPPORT_PLUGIN -static int last_state = 0; -static guint timer_id_ais = 0; -static struct timespec t_last_quorum; -#endif - extern int servant_count; -enum pcmk_health cluster_healthy; static int reconnect_msec = 1000; static GMainLoop *mainloop = NULL; +static guint notify_timer = 0; +#if SUPPORT_PLUGIN static void -set_cluster_health(enum pcmk_health healthy) +sbd_plugin_membership_dispatch(cpg_handle_t handle, + const struct cpg_name *groupName, + uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) { - cluster_healthy = healthy; - notify_parent(cluster_healthy); + if(msg_len > 0) { + set_servant_health(pcmk_health_online, LOG_INFO, + "Connected to %s", name_for_cluster_type(get_cluster_type())); + } else { + set_servant_health(pcmk_health_unclean, LOG_INFO, + "Broken %s message", name_for_cluster_type(get_cluster_type())); + } + notify_parent(); + return; } +#endif +#if SUPPORT_COROSYNC void -update_status(void) +sbd_cpg_membership_dispatch(cpg_handle_t handle, + const struct cpg_name *groupName, + const struct cpg_address *member_list, size_t member_list_entries, + const struct cpg_address *left_list, size_t left_list_entries, + const struct cpg_address *joined_list, size_t joined_list_entries) { - enum pcmk_health healthy = pcmk_health_unknown; - -#if SUPPORT_PLUGIN - { - struct timespec t_now; - int quorum_age = t_now.tv_sec - t_last_quorum.tv_sec; - - clock_gettime(CLOCK_MONOTONIC, &t_now); - - if (quorum_age > (int)(timeout_io+timeout_loop)) { - if (t_last_quorum.tv_sec != 0) - LOGONCE(pcmk_health_transient, LOG_WARNING, "AIS: Quorum outdated"); - - } else if (crm_have_quorum) { - LOGONCE(pcmk_health_online, LOG_INFO, "AIS: We have quorum"); - - } else { - LOGONCE(pcmk_health_unclean, LOG_WARNING, "AIS: We do NOT have quorum"); - } + if(member_list_entries > 0) { + set_servant_health(pcmk_health_online, LOG_INFO, + "Connected to %s", name_for_cluster_type(get_cluster_type())); + } else { + set_servant_health(pcmk_health_unclean, LOG_INFO, + "Empty %s membership", name_for_cluster_type(get_cluster_type())); } -#endif - - set_cluster_health(healthy); + notify_parent(); } +#endif -#if SUPPORT_PLUGIN static gboolean -plugin_timer(gpointer data) -{ - if (timer_id_ais > 0) { - g_source_remove(timer_id_ais); - } - - send_cluster_text(crm_class_quorum, NULL, TRUE, NULL, crm_msg_ais); - - /* The timer is set in the response processing */ - return FALSE; -} - -static void -plugin_membership_dispatch(cpg_handle_t handle, - const struct cpg_name *groupName, - uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) +notify_timer_cb(gpointer data) { - uint32_t kind = 0; - const char *from = NULL; - char *data = pcmk_message_common_cs(handle, nodeid, pid, msg, &kind, &from); + enum cluster_type_e stack = get_cluster_type(); + switch (stack) { + case pcmk_cluster_classic_ais: + send_cluster_text(crm_class_quorum, NULL, TRUE, NULL, crm_msg_ais); + break; - if (!data) { - return; - } - free(data); - data = NULL; + case pcmk_cluster_corosync: + case pcmk_cluster_cman: + /* TODO - Make a CPG call and only call notify_parent() when we get a reply */ + notify_parent(); + break; - if (kind != crm_class_quorum) { - return; + default: + break; } - - DBGLOG(LOG_INFO, "AIS quorum state: %d", (int)crm_have_quorum); - clock_gettime(CLOCK_MONOTONIC, &t_last_quorum); - update_status(); - - timer_id_ais = g_timeout_add(timeout_loop * 1000, plugin_timer, NULL); - return; + return TRUE; } -#endif static void sbd_membership_destroy(gpointer user_data) { cl_log(LOG_ERR, "Connection to %s terminated", name_for_cluster_type(get_cluster_type())); exit(1); } static void clean_up(int rc) { return; } static void cluster_shutdown(int nsig) { clean_up(0); } int servant_cluster(const char *diskname, int mode, const void* argp) { crm_cluster_t cluster; enum cluster_type_e cluster_stack = get_cluster_type(); - if (is_openais_cluster()) { -#if SUPPORT_COROSYNC - cluster.destroy = sbd_membership_destroy; - cluster.cpg.cpg_confchg_fn = pcmk_cpg_membership; -# if SUPPORT_PLUGIN - cluster.cpg.cpg_deliver_fn = plugin_membership_dispatch; -# endif -#endif - } - - while (!crm_cluster_connect(&cluster)) { - cl_log(LOG_INFO, "Waiting to sign in with cluster ..."); - sleep(reconnect_msec / 1000); - } - - /* stonith_our_uname = cluster.uname; */ - /* stonith_our_uuid = cluster.uuid; */ - switch (cluster_stack) { + #if SUPPORT_PLUGIN case pcmk_cluster_classic_ais: - timer_id_ais = g_timeout_add(timeout_loop * 1000, plugin_timer, NULL); + cluster.destroy = sbd_membership_destroy; + cluster.cpg.cpg_deliver_fn = sbd_plugin_membership_dispatch; break; #endif + +#if SUPPORT_COROSYNC case pcmk_cluster_corosync: - break; case pcmk_cluster_cman: + cluster.destroy = sbd_membership_destroy; + cluster.cpg.cpg_confchg_fn = sbd_cpg_membership_dispatch; break; + break; +#endif default: cl_log(LOG_ERR, "Unsupported cluster type: %s", name_for_cluster_type(cluster_stack)); exit(1); break; } + + while (!crm_cluster_connect(&cluster)) { + cl_log(LOG_INFO, "Waiting to sign in with cluster ..."); + sleep(reconnect_msec / 1000); + } + + /* stonith_our_uname = cluster.uname; */ + /* stonith_our_uuid = cluster.uuid; */ + + set_servant_health(pcmk_health_transient, LOG_INFO, + "Empty %s membership", name_for_cluster_type(get_cluster_type())); + mainloop = g_main_new(FALSE); + notify_timer = g_timeout_add(timeout_loop * 1000, notify_timer_cb, NULL); mainloop_add_signal(SIGTERM, cluster_shutdown); mainloop_add_signal(SIGINT, cluster_shutdown); + g_main_run(mainloop); g_main_destroy(mainloop); clean_up(0); return 0; /* never reached */ } diff --git a/src/sbd-common.c b/src/sbd-common.c index 5df3c62..1ebbf87 100644 --- a/src/sbd-common.c +++ b/src/sbd-common.c @@ -1,673 +1,696 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "sbd.h" #include #include #include #ifdef _POSIX_MEMLOCK # include #endif /* Tunable defaults: */ #if defined(__s390__) || defined(__s390x__) unsigned long timeout_watchdog = 15; int timeout_msgwait = 30; #else unsigned long timeout_watchdog = 5; int timeout_msgwait = 10; #endif unsigned long timeout_watchdog_warn = 3; int timeout_allocate = 2; int timeout_loop = 1; int timeout_io = 3; int timeout_startup = 120; int watchdog_use = 1; int watchdog_set_timeout = 1; unsigned long timeout_watchdog_crashdump = 240; int skip_rt = 0; int debug = 0; int debug_mode = 0; char *watchdogdev = NULL; char * local_uname; /* Global, non-tunable variables: */ int sector_size = 0; int watchdogfd = -1; +int servant_health = 0; /*const char *devname;*/ const char *cmdname; void usage(void) { fprintf(stderr, "Shared storage fencing tool.\n" "Syntax:\n" " %s \n" "Options:\n" "-d Block device to use (mandatory; can be specified up to 3 times)\n" "-h Display this help.\n" "-n Set local node name; defaults to uname -n (optional)\n" "\n" "-R Do NOT enable realtime priority (debugging only)\n" "-W Use watchdog (recommended) (watch only)\n" "-w Specify watchdog device (optional) (watch only)\n" "-T Do NOT initialize the watchdog timeout (watch only)\n" "-S <0|1> Set start mode if the node was previously fenced (watch only)\n" "-p Write pidfile to the specified path (watch only)\n" "-v Enable some verbose debug logging (optional)\n" "\n" "-1 Set watchdog timeout to N seconds (optional, create only)\n" "-2 Set slot allocation timeout to N seconds (optional, create only)\n" "-3 Set daemon loop timeout to N seconds (optional, create only)\n" "-4 Set msgwait timeout to N seconds (optional, create only)\n" "-5 Warn if loop latency exceeds threshold (optional, watch only)\n" " (default is 3, set to 0 to disable)\n" "-C Watchdog timeout to set before crashdumping (def: 240s, optional)\n" "-I Async IO read timeout (defaults to 3 * loop timeout, optional)\n" "-s Timeout to wait for devices to become available (def: 120s)\n" "-t Dampening delay before faulty servants are restarted (optional)\n" " (default is 5, set to 0 to disable)\n" "-F # of failures before a servant is considered faulty (optional)\n" " (default is 1, set to 0 to disable)\n" "-P Check Pacemaker quorum and node health (optional, watch only)\n" "-Z Enable trace mode. WARNING: UNSAFE FOR PRODUCTION!\n" "Commands:\n" #if SUPPORT_SHARED_DISK "create initialize N slots on - OVERWRITES DEVICE!\n" "list List all allocated slots on device, and messages.\n" "dump Dump meta-data header from device.\n" "allocate \n" " Allocate a slot for node (optional)\n" "message (test|reset|off|clear|exit)\n" " Writes the specified message to node's slot.\n" #endif "watch Loop forever, monitoring own slot\n" , cmdname); } int watchdog_init_interval(void) { int timeout = timeout_watchdog; if (watchdogfd < 0) { return 0; } if (watchdog_set_timeout == 0) { cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!"); return 0; } if (ioctl(watchdogfd, WDIOC_SETTIMEOUT, &timeout) < 0) { cl_perror( "WDIOC_SETTIMEOUT" ": Failed to set watchdog timer to %u seconds.", timeout); cl_log(LOG_CRIT, "Please validate your watchdog configuration!"); cl_log(LOG_CRIT, "Choose a different watchdog driver or specify -T to skip this if you are completely sure."); return -1; } else { cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", timeout); } return 0; } int watchdog_tickle(void) { if (watchdogfd >= 0) { if (write(watchdogfd, "", 1) != 1) { cl_perror("Watchdog write failure: %s!", watchdogdev); return -1; } } return 0; } int watchdog_init(void) { if (watchdogfd < 0 && watchdogdev != NULL) { watchdogfd = open(watchdogdev, O_WRONLY); if (watchdogfd >= 0) { cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev); if ((watchdog_init_interval() < 0) || (watchdog_tickle() < 0)) { return -1; } }else{ cl_perror("Cannot open watchdog device '%s'", watchdogdev); return -1; } } return 0; } void watchdog_close(bool disarm) { if (watchdogfd < 0) { return; } if (disarm) { int r; int flags = WDIOS_DISABLECARD;; /* Explicitly disarm it */ r = ioctl(watchdogfd, WDIOC_SETOPTIONS, &flags); if (r < 0) { cl_perror("Failed to disable hardware watchdog %s", watchdogdev); } /* To be sure, use magic close logic, too */ for (;;) { if (write(watchdogfd, "V", 1) > 0) { break; } cl_perror("Cannot disable watchdog device %s", watchdogdev); } } if (close(watchdogfd) < 0) { cl_perror("Watchdog close(%d) failed", watchdogfd); } watchdogfd = -1; } /* This duplicates some code from linux/ioprio.h since these are not included * even in linux-kernel-headers. Sucks. See also * /usr/src/linux/Documentation/block/ioprio.txt and ioprio_set(2) */ extern int sys_ioprio_set(int, int, int); int ioprio_set(int which, int who, int ioprio); inline int ioprio_set(int which, int who, int ioprio) { return syscall(__NR_ioprio_set, which, who, ioprio); } enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE, }; enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, }; #define IOPRIO_BITS (16) #define IOPRIO_CLASS_SHIFT (13) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) #define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) #define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) #define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) static unsigned char sbd_stack_hogger(unsigned char * inbuf, int kbytes) { unsigned char buf[1024]; if(kbytes <= 0) { return HOG_CHAR; } if (inbuf == NULL) { memset(buf, HOG_CHAR, sizeof(buf)); } else { memcpy(buf, inbuf, sizeof(buf)); } if (kbytes > 0) { return sbd_stack_hogger(buf, kbytes-1); } else { return buf[sizeof(buf)-1]; } } static void sbd_malloc_hogger(int kbytes) { int j; void**chunks; int chunksize = 1024; if(kbytes <= 0) { return; } /* * We could call mallopt(M_MMAP_MAX, 0) to disable it completely, * but we've already called mlockall() * * We could also call mallopt(M_TRIM_THRESHOLD, -1) to prevent malloc * from giving memory back to the system, but we've already called * mlockall(MCL_FUTURE), so there's no need. */ chunks = malloc(kbytes * sizeof(void *)); if (chunks == NULL) { cl_log(LOG_WARNING, "Could not preallocate chunk array"); return; } for (j=0; j < kbytes; ++j) { chunks[j] = malloc(chunksize); if (chunks[j] == NULL) { cl_log(LOG_WARNING, "Could not preallocate block %d", j); } else { memset(chunks[j], 0, chunksize); } } for (j=0; j < kbytes; ++j) { free(chunks[j]); } free(chunks); } static void sbd_memlock(int stackgrowK, int heapgrowK) { #ifdef _POSIX_MEMLOCK /* * We could call setrlimit(RLIMIT_MEMLOCK,...) with a large * number, but the mcp runs as root and mlock(2) says: * * Since Linux 2.6.9, no limits are placed on the amount of memory * that a privileged process may lock, and this limit instead * governs the amount of memory that an unprivileged process may * lock. */ if (mlockall(MCL_CURRENT|MCL_FUTURE) >= 0) { cl_log(LOG_INFO, "Locked ourselves in memory"); /* Now allocate some extra pages (MCL_FUTURE will ensure they stay around) */ sbd_malloc_hogger(heapgrowK); sbd_stack_hogger(NULL, stackgrowK); } else { cl_perror("Unable to lock ourselves into memory"); } #else cl_log(LOG_ERR, "Unable to lock ourselves into memory"); #endif } void sbd_make_realtime(int priority, int stackgrowK, int heapgrowK) { if(priority < 0) { return; } #ifdef SCHED_RR { int pcurrent = 0; int pmin = sched_get_priority_min(SCHED_RR); int pmax = sched_get_priority_max(SCHED_RR); if (priority == 0) { priority = pmax; } else if (priority < pmin) { priority = pmin; } else if (priority > pmax) { priority = pmax; } pcurrent = sched_getscheduler(0); if (pcurrent < 0) { cl_perror("Unable to get scheduler priority"); } else if(pcurrent < priority) { struct sched_param sp; memset(&sp, 0, sizeof(sp)); sp.sched_priority = priority; if (sched_setscheduler(0, SCHED_RR, &sp) < 0) { cl_perror("Unable to set scheduler priority to %d", priority); } else { cl_log(LOG_INFO, "Scheduler priority is now %d", priority); } } } #else cl_log(LOG_ERR, "System does not support updating the scheduler priority"); #endif sbd_memlock(heapgrowK, stackgrowK); } void maximize_priority(void) { if (skip_rt) { cl_log(LOG_INFO, "Not elevating to realtime (-R specified)."); return; } sbd_make_realtime(0, 256, 256); if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 1)) != 0) { cl_perror("ioprio_set() call failed."); } } void sysrq_init(void) { FILE* procf; int c; procf = fopen("/proc/sys/kernel/sysrq", "r"); if (!procf) { cl_perror("cannot open /proc/sys/kernel/sysrq for read."); return; } if (fscanf(procf, "%d", &c) != 1) { cl_perror("Parsing sysrq failed"); c = 0; } fclose(procf); if (c == 1) return; /* 8 for debugging dumps of processes, 128 for reboot/poweroff */ c |= 136; procf = fopen("/proc/sys/kernel/sysrq", "w"); if (!procf) { cl_perror("cannot open /proc/sys/kernel/sysrq for writing"); return; } fprintf(procf, "%d", c); fclose(procf); return; } void sysrq_trigger(char t) { FILE *procf; procf = fopen("/proc/sysrq-trigger", "a"); if (!procf) { cl_perror("Opening sysrq-trigger failed."); return; } cl_log(LOG_INFO, "sysrq-trigger: %c\n", t); fprintf(procf, "%c\n", t); fclose(procf); return; } static void do_exit(char kind) { /* TODO: Turn debug_mode into a bit field? Delay + kdump for example */ const char *reason = NULL; if (kind == 'c') { cl_log(LOG_NOTICE, "Initiating kdump"); } else if (debug_mode == 1) { cl_log(LOG_WARNING, "Initiating kdump instead of panicing the node (debug mode)"); kind = 'c'; } if (debug_mode == 2) { cl_log(LOG_WARNING, "Shutting down SBD instead of panicing the node (debug mode)"); watchdog_close(true); exit(0); } if (debug_mode == 3) { /* Give the system some time to flush logs to disk before rebooting. */ cl_log(LOG_WARNING, "Delaying node panic by 10s (debug mode)"); watchdog_close(true); sync(); sleep(10); } switch(kind) { case 'b': reason = "reboot"; break; case 'c': reason = "crashdump"; break; case 'o': reason = "off"; break; default: reason = "unknown"; break; } cl_log(LOG_EMERG, "Rebooting system: %s", reason); sync(); if(kind == 'c') { watchdog_close(true); sysrq_trigger(kind); } else { watchdog_close(false); sysrq_trigger(kind); if(reboot(RB_AUTOBOOT) < 0) { cl_perror("Reboot failed"); } } exit(1); } void do_crashdump(void) { do_exit('c'); } void do_reset(void) { do_exit('b'); } void do_off(void) { do_exit('o'); } /* * Change directory to the directory our core file needs to go in * Call after you establish the userid you're running under. */ int sbd_cdtocoredir(void) { int rc; static const char *dir = NULL; if (dir == NULL) { dir = CRM_CORE_DIR; } if ((rc=chdir(dir)) < 0) { int errsave = errno; cl_perror("Cannot chdir to [%s]", dir); errno = errsave; } return rc; } pid_t make_daemon(void) { pid_t pid; const char * devnull = "/dev/null"; pid = fork(); if (pid < 0) { cl_log(LOG_ERR, "%s: could not start daemon\n", cmdname); cl_perror("fork"); exit(1); }else if (pid > 0) { return pid; } qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE); /* This is the child; ensure privileges have not been lost. */ maximize_priority(); sysrq_init(); umask(022); close(0); (void)open(devnull, O_RDONLY); close(1); (void)open(devnull, O_WRONLY); close(2); (void)open(devnull, O_WRONLY); sbd_cdtocoredir(); return 0; } void sbd_get_uname(void) { struct utsname uname_buf; int i; if (uname(&uname_buf) < 0) { cl_perror("uname() failed?"); exit(1); } local_uname = strdup(uname_buf.nodename); for (i = 0; i < strlen(local_uname); i++) local_uname[i] = tolower(local_uname[i]); } #define FMT_MAX 256 void sbd_set_format_string(int method, const char *daemon) { int offset = 0; char fmt[FMT_MAX]; struct utsname res; switch(method) { case QB_LOG_STDERR: break; case QB_LOG_SYSLOG: if(daemon && strcmp(daemon, "sbd") != 0) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%10s: ", daemon); } break; default: /* When logging to a file */ if (uname(&res) == 0) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%t [%d] %s %10s: ", getpid(), res.nodename, daemon); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%t [%d] %10s: ", getpid(), daemon); } } if (debug && method >= QB_LOG_STDERR) { offset += snprintf(fmt + offset, FMT_MAX - offset, "(%%-12f:%%5l %%g) %%-7p: %%n: "); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%g %%-7p: %%n: "); } if (method == QB_LOG_SYSLOG) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%b"); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "\t%%b"); } if(offset > 0) { qb_log_format_set(method, fmt); } } void -notify_parent(enum pcmk_health healthy) +notify_parent(void) { pid_t ppid; union sigval signal_value; memset(&signal_value, 0, sizeof(signal_value)); ppid = getppid(); if (ppid == 1) { /* Our parent died unexpectedly. Triggering * self-fence. */ cl_log(LOG_WARNING, "Our parent is dead."); do_reset(); } - switch (healthy) { + switch (servant_health) { case pcmk_health_pending: case pcmk_health_shutdown: case pcmk_health_transient: - DBGLOG(LOG_INFO, "Not notifying parent: state transient (%d)", healthy); + DBGLOG(LOG_INFO, "Not notifying parent: state transient (%d)", servant_health); break; case pcmk_health_unknown: case pcmk_health_unclean: case pcmk_health_noquorum: - DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY (%d)", healthy); + DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY (%d)", servant_health); sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); break; case pcmk_health_online: - DBGLOG(LOG_INFO, "Notifying parent: healthy"); + DBGLOG(LOG_INFO, "Notifying parent: servant_health"); sigqueue(ppid, SIG_LIVENESS, signal_value); break; default: - DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY %d", healthy); + DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY %d", servant_health); sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); break; } } + +void +set_servant_health(enum pcmk_health state, int level, char const *format, ...) +{ + if (servant_health != state) { + va_list ap; + int len = 0; + char *string = NULL; + + servant_health = state; + + va_start(ap, format); + len = vasprintf (&string, format, ap); + + if(len > 0) { + cl_log(level, string); + } + + va_end(ap); + free(string); + } +} diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c index 0f41d27..1aa1c52 100644 --- a/src/sbd-pacemaker.c +++ b/src/sbd-pacemaker.c @@ -1,442 +1,430 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * Based on crm_mon.c, which was: * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* TODO list: * * - Trying to shutdown a node if no devices are up will fail, since SBD * currently uses a message via the disk to achieve this. * * - Shutting down cluster nodes while the majority of devices is down * will eventually take the cluster below the quorum threshold, at which * time the remaining cluster nodes will all immediately suicide. * */ #include "sbd.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int servant_count; static void clean_up(int rc); static void crm_diff_update(const char *event, xmlNode * msg); static int cib_connect(gboolean full); -static void set_pcmk_health(enum pcmk_health healthy); static void compute_status(pe_working_set_t * data_set); static gboolean mon_refresh_state(gpointer user_data); static GMainLoop *mainloop = NULL; static guint timer_id_reconnect = 0; static guint timer_id_notify = 0; static int reconnect_msec = 1000; -static enum pcmk_health pcmk_healthy = 0; -static int last_state = 0; static int cib_connected = 0; static cib_t *cib = NULL; static xmlNode *current_cib = NULL; static long last_refresh = 0; static gboolean mon_timer_reconnect(gpointer data) { int rc = 0; if (timer_id_reconnect > 0) { g_source_remove(timer_id_reconnect); } rc = cib_connect(TRUE); if (rc != 0) { cl_log(LOG_WARNING, "CIB reconnect failed: %d", rc); timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL); } else { cl_log(LOG_INFO, "CIB reconnect successful"); } return FALSE; } static void mon_cib_connection_destroy(gpointer user_data) { if (cib) { - cl_log(LOG_WARNING, "Disconnected from CIB"); cib->cmds->signoff(cib); - set_pcmk_health(pcmk_health_transient); + set_servant_health(pcmk_health_transient, LOG_WARNING, "Disconnected from CIB"); timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL); } cib_connected = 0; return; } static gboolean mon_timer_notify(gpointer data) { static int counter = 0; int counter_max = timeout_watchdog / timeout_loop; if (timer_id_notify > 0) { g_source_remove(timer_id_notify); } if (cib_connected) { if (counter == counter_max) { free_xml(current_cib); current_cib = get_cib_copy(cib); mon_refresh_state(NULL); counter = 0; } else { cib->cmds->noop(cib, 0); - notify_parent(pcmk_healthy); + notify_parent(); counter++; } } timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL); return FALSE; } /* * Mainloop signal handler. */ static void mon_shutdown(int nsig) { clean_up(0); } static int cib_connect(gboolean full) { int rc = 0; CRM_CHECK(cib != NULL, return -EINVAL); cib_connected = 0; crm_xml_init(); if (cib->state != cib_connected_query && cib->state != cib_connected_command) { rc = cib->cmds->signon(cib, crm_system_name, cib_query); if (rc != 0) { return rc; } current_cib = get_cib_copy(cib); mon_refresh_state(NULL); if (full) { if (rc == 0) { rc = cib->cmds->set_connection_dnotify(cib, mon_cib_connection_destroy); if (rc == -EPROTONOSUPPORT) { /* Notification setup failed, won't be able to reconnect after failure */ rc = 0; } } if (rc == 0) { cib->cmds->del_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); rc = cib->cmds->add_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); } if (rc != 0) { /* Notification setup failed, could not monitor CIB actions */ clean_up(-rc); } } } if (!rc) { cib_connected = 1; } return rc; } static void compute_status(pe_working_set_t * data_set) { static int updates = 0; static int ever_had_quorum = FALSE; - int healthy = 0; node_t *node = pe_find_node(data_set->nodes, local_uname); updates++; if (data_set->dc_node == NULL) { - LOGONCE(pcmk_health_transient, LOG_INFO, "We don't have a DC right now."); + set_servant_health(pcmk_health_transient, LOG_INFO, "We don't have a DC right now."); goto out; } if (node == NULL) { - LOGONCE(pcmk_health_unknown, LOG_WARNING, "Node state: %s is UNKNOWN", local_uname); + set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: %s is UNKNOWN", local_uname); } else if (node->details->online == FALSE) { - LOGONCE(pcmk_health_unknown, LOG_WARNING, "Node state: OFFLINE"); + set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: OFFLINE"); } else if (node->details->unclean) { - LOGONCE(pcmk_health_unclean, LOG_WARNING, "Node state: UNCLEAN"); + set_servant_health(pcmk_health_unclean, LOG_WARNING, "Node state: UNCLEAN"); } else if (node->details->pending) { - LOGONCE(pcmk_health_pending, LOG_WARNING, "Node state: pending"); + set_servant_health(pcmk_health_pending, LOG_WARNING, "Node state: pending"); #if 0 } else if (node->details->shutdown) { - LOGONCE(pcmk_health_shutdown, LOG_WARNING, "Node state: shutting down"); + set_servant_health(pcmk_health_shutdown, LOG_WARNING, "Node state: shutting down"); #endif } else { if (data_set->flags & pe_flag_have_quorum) { - LOGONCE(pcmk_health_online, LOG_INFO, "Node state: online"); + set_servant_health(pcmk_health_online, LOG_INFO, "Node state: online"); ever_had_quorum = TRUE; } else if(servant_count > 0) { - LOGONCE(pcmk_health_noquorum, LOG_WARNING, "Quorum lost"); + set_servant_health(pcmk_health_noquorum, LOG_WARNING, "Quorum lost"); goto out; } else if(ever_had_quorum == FALSE) { - LOGONCE(pcmk_health_online, LOG_INFO, "We do not have quorum yet"); + set_servant_health(pcmk_health_online, LOG_INFO, "We do not have quorum yet"); } else { /* We lost quorum, and there are no disks present * Setting healthy > 2 here will result in us self-fencing */ switch (data_set->no_quorum_policy) { case no_quorum_freeze: - LOGONCE(pcmk_health_transient, LOG_INFO, "Quorum lost: Freeze resources"); + set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Freeze resources"); break; case no_quorum_stop: - LOGONCE(pcmk_health_transient, LOG_INFO, "Quorum lost: Stop ALL resources"); + set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Stop ALL resources"); break; case no_quorum_ignore: - LOGONCE(pcmk_health_transient, LOG_INFO, "Quorum lost: Ignore"); + set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Ignore"); break; case no_quorum_suicide: - LOGONCE(pcmk_health_unclean, LOG_INFO, "Quorum lost: Self-fence"); + set_servant_health(pcmk_health_unclean, LOG_INFO, "Quorum lost: Self-fence"); break; } } } out: - set_pcmk_health(healthy); + notify_parent(); return; } -static void -set_pcmk_health(enum pcmk_health healthy) -{ - pcmk_healthy = healthy; - notify_parent(pcmk_healthy); -} - static crm_trigger_t *refresh_trigger = NULL; static gboolean mon_trigger_refresh(gpointer user_data) { mainloop_set_trigger(refresh_trigger); mon_refresh_state(NULL); return FALSE; } static void crm_diff_update(const char *event, xmlNode * msg) { int rc = -1; const char *op = NULL; long now = time(NULL); static int updates = 0; static mainloop_timer_t *refresh_timer = NULL; if(refresh_timer == NULL) { refresh_timer = mainloop_timer_add("refresh", 2000, FALSE, mon_trigger_refresh, NULL); refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_state, refresh_timer); } if (current_cib != NULL) { xmlNode *cib_last = current_cib; current_cib = NULL; rc = cib_apply_patch_event(msg, cib_last, ¤t_cib, LOG_DEBUG); free_xml(cib_last); switch(rc) { case -pcmk_err_diff_resync: case -pcmk_err_diff_failed: crm_warn("[%s] %s Patch aborted: %s (%d)", event, op, pcmk_strerror(rc), rc); break; case pcmk_ok: updates++; break; default: crm_notice("[%s] %s ABORTED: %s (%d)", event, op, pcmk_strerror(rc), rc); break; } } if (current_cib == NULL) { current_cib = get_cib_copy(cib); } /* Refresh * - immediately if the last update was more than 5s ago * - every 10 updates * - at most 2s after the last update */ if (updates > 10 || (now - last_refresh) > (reconnect_msec / 1000)) { mon_refresh_state(refresh_timer); updates = 0; } else { mainloop_set_trigger(refresh_trigger); mainloop_timer_start(refresh_timer); } } static gboolean mon_refresh_state(gpointer user_data) { xmlNode *cib_copy = NULL; pe_working_set_t data_set; if(current_cib == NULL) { return FALSE; } if(user_data) { mainloop_timer_t *timer = user_data; mainloop_timer_stop(timer); } cib_copy = copy_xml(current_cib); if (cli_config_update(&cib_copy, NULL, FALSE) == FALSE) { cl_log(LOG_WARNING, "cli_config_update() failed - forcing reconnect to CIB"); if (cib) { cib->cmds->signoff(cib); } } else { last_refresh = time(NULL); set_working_set_defaults(&data_set); data_set.input = cib_copy; data_set.flags |= pe_flag_have_stonith_resource; cluster_status(&data_set); compute_status(&data_set); cleanup_calculations(&data_set); } return FALSE; } static void clean_up(int rc) { if (cib != NULL) { cib->cmds->signoff(cib); cib_delete(cib); cib = NULL; } if (rc >= 0) { exit(rc); } return; } int servant_pcmk(const char *diskname, int mode, const void* argp) { int exit_code = 0; cl_log(LOG_INFO, "Monitoring Pacemaker health"); set_proc_title("sbd: watcher: Pacemaker"); setenv("PCMK_watchdog", "true", 1); if(debug == 0) { /* We don't want any noisy crm messages */ set_crm_log_level(LOG_CRIT); } if (current_cib == NULL) { cib = cib_new(); do { exit_code = cib_connect(TRUE); if (exit_code != 0) { sleep(reconnect_msec / 1000); } } while (exit_code == -ENOTCONN); if (exit_code != 0) { clean_up(-exit_code); } } mainloop = g_main_new(FALSE); mainloop_add_signal(SIGTERM, mon_shutdown); mainloop_add_signal(SIGINT, mon_shutdown); timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL); g_main_run(mainloop); g_main_destroy(mainloop); clean_up(0); return 0; /* never reached */ } diff --git a/src/sbd.h b/src/sbd.h index 78425ae..554d756 100644 --- a/src/sbd.h +++ b/src/sbd.h @@ -1,195 +1,190 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* signals reserved for multi-disk sbd */ #define SIG_LIVENESS (SIGRTMIN + 1) /* report liveness of the disk */ #define SIG_EXITREQ (SIGRTMIN + 2) /* exit request to inquisitor */ #define SIG_TEST (SIGRTMIN + 3) /* trigger self test */ #define SIG_RESTART (SIGRTMIN + 4) /* trigger restart of all failed disk */ #define SIG_IO_FAIL (SIGRTMIN + 5) /* the IO child requests to be considered failed */ #define SIG_PCMK_UNHEALTHY (SIGRTMIN + 6) /* FIXME: should add dynamic check of SIG_XX >= SIGRTMAX */ #define HOG_CHAR 0xff /* Sector data types */ struct sector_header_s { char magic[8]; unsigned char version; unsigned char slots; /* Caveat: stored in network byte-order */ uint32_t sector_size; uint32_t timeout_watchdog; uint32_t timeout_allocate; uint32_t timeout_loop; uint32_t timeout_msgwait; /* Minor version for extensions to the core data set: * compatible and optional values. */ unsigned char minor_version; uuid_t uuid; /* 16 bytes */ }; struct sector_mbox_s { signed char cmd; char from[64]; }; struct sector_node_s { /* slots will be created with in_use == 0 */ char in_use; char name[64]; }; struct servants_list_item { const char* devname; pid_t pid; int restarts; int restart_blocked; int outdated; int first_start; struct timespec t_last, t_started; struct servants_list_item *next; }; struct sbd_context { int devfd; io_context_t ioctx; struct iocb io; }; enum pcmk_health { pcmk_health_unknown, pcmk_health_pending, pcmk_health_transient, pcmk_health_unclean, pcmk_health_shutdown, pcmk_health_online, pcmk_health_noquorum, }; void usage(void); int watchdog_init_interval(void); int watchdog_tickle(void); int watchdog_init(void); void sysrq_init(void); void watchdog_close(bool disarm); void sysrq_trigger(char t); void do_crashdump(void); void do_reset(void); void do_off(void); pid_t make_daemon(void); void maximize_priority(void); void sbd_get_uname(void); void sbd_set_format_string(int method, const char *daemon); -void notify_parent(enum pcmk_health healthy); +void notify_parent(void); /* Tunable defaults: */ extern unsigned long timeout_watchdog; extern unsigned long timeout_watchdog_warn; extern unsigned long timeout_watchdog_crashdump; extern int timeout_allocate; extern int timeout_loop; extern int timeout_msgwait; extern int timeout_io; extern int timeout_startup; extern int watchdog_use; extern int watchdog_set_timeout; extern int skip_rt; extern int debug; extern int debug_mode; extern char *watchdogdev; extern char* local_uname; /* Global, non-tunable variables: */ extern int sector_size; extern int watchdogfd; extern const char* cmdname; typedef int (*functionp_t)(const char* devname, int mode, const void* argp); int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp); #if SUPPORT_SHARED_DISK void open_any_device(struct servants_list_item *servants); int init_devices(struct servants_list_item *servants); int allocate_slots(const char *name, struct servants_list_item *servants); int list_slots(struct servants_list_item *servants); int ping_via_slots(const char *name, struct servants_list_item *servants); int dump_headers(struct servants_list_item *servants); unsigned long get_first_msgwait(struct servants_list_item *servants); int messenger(const char *name, const char *msg, struct servants_list_item *servants); int servant(const char *diskname, int mode, const void* argp); #endif int servant_pcmk(const char *diskname, int mode, const void* argp); int servant_cluster(const char *diskname, int mode, const void* argp); struct servants_list_item *lookup_servant_by_dev(const char *devname); struct servants_list_item *lookup_servant_by_pid(pid_t pid); int init_set_proc_title(int argc, char *argv[], char *envp[]); void set_proc_title(const char *fmt,...); #define cl_log(level, fmt, args...) qb_log_from_external_source( __func__, __FILE__, fmt, level, __LINE__, 0, ##args) # define cl_perror(fmt, args...) do { \ const char *err = strerror(errno); \ cl_log(LOG_ERR, fmt ": %s (%d)", ##args, err, errno); \ } while(0) #define DBGLOG(lvl, fmt, args...) do { \ if (debug > 0) cl_log(lvl, fmt, ##args); \ } while(0) -#define LOGONCE(state, lvl, fmt, args...) do { \ - if (last_state != state) { \ - cl_log(lvl, fmt, ##args); \ - last_state = state; \ - } \ - healthy = state; \ - } while(0) +extern int servant_health; +void set_servant_health(enum pcmk_health state, int level, char const *format, ...) __attribute__ ((__format__ (__printf__, 3, 4)));