diff --git a/src/Makefile.am b/src/Makefile.am index db10c71..69535cf 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,13 +1,13 @@ AM_CFLAGS = -D_GNU_SOURCE -DCHECK_AIS -DSBINDIR=\"$(sbindir)\" AM_CPPFLAGS = -I$(includedir)/pacemaker \ -I$(includedir)/heartbeat \ $(glib_CFLAGS) sbin_PROGRAMS = sbd -sbd_SOURCES = sbd-common.c sbd-inquisitor.c sbd-pacemaker.c sbd-cluster.c setproctitle.c sbd.h sbd.sysconfig +sbd_SOURCES = sbd-common.c sbd-watchdog.c sbd-inquisitor.c sbd-pacemaker.c sbd-cluster.c setproctitle.c sbd.h sbd.sysconfig if SUPPORT_SHARED_DISK sbd_SOURCES += sbd-md.c endif diff --git a/src/sbd-cluster.c b/src/sbd-cluster.c index b6c5512..48c6071 100644 --- a/src/sbd-cluster.c +++ b/src/sbd-cluster.c @@ -1,770 +1,770 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * Based on crm_mon.c, which was: * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
*/ #include #include #include #include #include #include #include #include #include #include #include #if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT #include #endif #include "sbd.h" //undef SUPPORT_PLUGIN //define SUPPORT_PLUGIN 1 /* binary for pacemaker-remote has changed with pacemaker 2 */ #ifdef CRM_SCORE_INFINITY #define PACEMAKER_REMOTE_BINARY "pacemaker-remoted" #else #define PACEMAKER_REMOTE_BINARY "pacemaker_remoted" #endif static bool remote_node = false; static pid_t remoted_pid = 0; static int reconnect_msec = 1000; static GMainLoop *mainloop = NULL; static guint notify_timer = 0; static crm_cluster_t cluster; static gboolean sbd_remote_check(gpointer user_data); static long unsigned int find_pacemaker_remote(void); static void sbd_membership_destroy(gpointer user_data); #if SUPPORT_PLUGIN static void sbd_plugin_membership_dispatch(cpg_handle_t handle, const struct cpg_name *groupName, uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) { if(msg_len > 0) { set_servant_health(pcmk_health_online, LOG_INFO, "Connected to %s", name_for_cluster_type(get_cluster_type())); } else { set_servant_health(pcmk_health_unclean, LOG_WARNING, "Broken %s message", name_for_cluster_type(get_cluster_type())); } notify_parent(); return; } #endif #if SUPPORT_COROSYNC #if CHECK_VOTEQUORUM_HANDLE #include static votequorum_handle_t votequorum_handle = 0; #endif #if CHECK_TWO_NODE static bool two_node = false; #endif static bool ever_seen_both = false; static int cpg_membership_entries = -1; #if CHECK_QDEVICE_SYNC_TIMEOUT #include static bool using_qdevice = false; static uint32_t qdevice_sync_timeout = /* in seconds */ VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000; #endif #if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT #include static cmap_handle_t cmap_handle = 0; static cmap_track_handle_t track_handle = 0; static GSource *cmap_source = NULL; #endif void sbd_cpg_membership_health_update() { if(cpg_membership_entries > 0) { #if CHECK_TWO_NODE bool 
quorum_is_suspect_two_node = (two_node && ever_seen_both && cpg_membership_entries == 1); #endif #if CHECK_QDEVICE_SYNC_TIMEOUT bool quorum_is_suspect_qdevice_timing = using_qdevice && (qdevice_sync_timeout > timeout_watchdog); #endif do { #if CHECK_TWO_NODE if (quorum_is_suspect_two_node) { /* Alternative would be asking votequorum for number of votes. * Using pacemaker's cpg as source for number of active nodes * avoids binding to an additional library, is definitely * less code to write and we wouldn't have to combine data * from 3 sources (cmap, cpg & votequorum) in a potentially * racy environment. */ set_servant_health(pcmk_health_noquorum, LOG_WARNING, "Connected to %s but requires both nodes present", name_for_cluster_type(get_cluster_type()) ); break; } #endif #if CHECK_QDEVICE_SYNC_TIMEOUT if (quorum_is_suspect_qdevice_timing) { /* We can't really trust quorum info as qdevice-sync_timeout * makes reaction of quorum too sluggish for our * watchdog-timeout. */ set_servant_health(pcmk_health_noquorum, LOG_WARNING, "Connected to %s but quorum using qdevice is distrusted " "for SBD as qdevice-sync_timeout (%ds) > watchdog-timeout " - "(%lus).", + "(%us).", name_for_cluster_type(get_cluster_type()), qdevice_sync_timeout, timeout_watchdog ); break; } #endif set_servant_health(pcmk_health_online, LOG_INFO, - "Connected to %s (%u members)%s", + "Connected to %s (%d members)%s", name_for_cluster_type(get_cluster_type()), cpg_membership_entries, #if CHECK_QDEVICE_SYNC_TIMEOUT using_qdevice?" 
using qdevice for quorum":"" #else "" #endif ); } while (false); if (cpg_membership_entries > 1) { ever_seen_both = true; } } else { set_servant_health(pcmk_health_unclean, LOG_WARNING, "Empty %s membership", name_for_cluster_type(get_cluster_type())); } } void sbd_cpg_membership_dispatch(cpg_handle_t handle, const struct cpg_name *groupName, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries) { cpg_membership_entries = member_list_entries; sbd_cpg_membership_health_update(); notify_parent(); } #if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT static void sbd_cmap_notify_fn( cmap_handle_t cmap_handle, cmap_track_handle_t cmap_track_handle, int32_t event, const char *key_name, struct cmap_notify_value new_val, struct cmap_notify_value old_val, void *user_data) { switch (event) { case CMAP_TRACK_ADD: case CMAP_TRACK_MODIFY: switch (new_val.type) { case CMAP_VALUETYPE_UINT8: #if CHECK_TWO_NODE if (!strcmp(key_name, "quorum.two_node")) { two_node = *((uint8_t *) new_val.data); } else { return; } break; #else return; #endif case CMAP_VALUETYPE_STRING: #if CHECK_QDEVICE_SYNC_TIMEOUT if (!strcmp(key_name, "quorum.device.model")) { using_qdevice = ((new_val.data) && strlen((char *) new_val.data)); } else { return; } break; #else return; #endif case CMAP_VALUETYPE_UINT32: #if CHECK_QDEVICE_SYNC_TIMEOUT if (!strcmp(key_name, "quorum.device.sync_timeout")) { if (new_val.data) { qdevice_sync_timeout = *((uint32_t *) new_val.data) / 1000; } else { qdevice_sync_timeout = VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000; } } else { return; } break; #else return; #endif default: return; } break; case CMAP_TRACK_DELETE: switch (new_val.type) { case CMAP_VALUETYPE_UINT8: #if CHECK_TWO_NODE if (!strcmp(key_name, "quorum.two_node")) { two_node = false; } else { return; } break; #else return; #endif case CMAP_VALUETYPE_STRING: #if 
CHECK_QDEVICE_SYNC_TIMEOUT if (!strcmp(key_name, "quorum.device.model")) { using_qdevice = false; } else { return; } break; #else return; #endif case CMAP_VALUETYPE_UINT32: #if CHECK_QDEVICE_SYNC_TIMEOUT if (!strcmp(key_name, "quorum.device.sync_timeout")) { qdevice_sync_timeout = VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000; } else { return; } break; #else return; #endif default: return; } break; default: return; } sbd_cpg_membership_health_update(); notify_parent(); } static gboolean cmap_dispatch_callback (gpointer user_data) { cmap_dispatch(cmap_handle, CS_DISPATCH_ALL); return TRUE; } static void cmap_destroy(void) { if (cmap_source) { g_source_destroy(cmap_source); cmap_source = NULL; } if (track_handle) { cmap_track_delete(cmap_handle, track_handle); track_handle = 0; } if (cmap_handle) { cmap_finalize(cmap_handle); cmap_handle = 0; } } static gboolean verify_against_cmap_config(void) { #if CHECK_TWO_NODE uint8_t two_node_u8 = 0; #endif #if CHECK_QDEVICE_SYNC_TIMEOUT char *qdevice_model = NULL; #endif int cmap_fd; if (!track_handle) { if (cmap_initialize(&cmap_handle) != CS_OK) { cl_log(LOG_WARNING, "Cannot initialize CMAP service\n"); goto out; } #if CHECK_TWO_NODE if (cmap_track_add(cmap_handle, "quorum.two_node", CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD, sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) { cl_log(LOG_WARNING, "Failed adding CMAP tracker for 2Node-mode\n"); goto out; } #endif #if CHECK_QDEVICE_SYNC_TIMEOUT if (cmap_track_add(cmap_handle, "quorum.device.model", CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD, sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) { cl_log(LOG_WARNING, "Failed adding CMAP tracker for qdevice-model\n"); goto out; } if (cmap_track_add(cmap_handle, "quorum.device.sync_timeout", CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD, sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) { cl_log(LOG_WARNING, "Failed adding CMAP tracker for qdevice-sync_timeout\n"); goto out; } #endif /* add the tracker to 
mainloop */ if (cmap_fd_get(cmap_handle, &cmap_fd) != CS_OK) { cl_log(LOG_WARNING, "Failed to get a file handle for cmap\n"); goto out; } if (!(cmap_source = g_unix_fd_source_new (cmap_fd, G_IO_IN))) { cl_log(LOG_WARNING, "Couldn't create source for cmap\n"); goto out; } g_source_set_callback(cmap_source, cmap_dispatch_callback, NULL, NULL); g_source_attach(cmap_source, NULL); } #if CHECK_TWO_NODE if (cmap_get_uint8(cmap_handle, "quorum.two_node", &two_node_u8) == CS_OK) { cl_log(two_node_u8? LOG_NOTICE : LOG_INFO, "Corosync is%s in 2Node-mode", two_node_u8?"":" not"); two_node = two_node_u8; } else { cl_log(LOG_INFO, "quorum.two_node not present in cmap\n"); } #endif #if CHECK_QDEVICE_SYNC_TIMEOUT if (cmap_get_string(cmap_handle, "quorum.device.model", &qdevice_model) == CS_OK) { using_qdevice = qdevice_model && strlen(qdevice_model); cl_log(using_qdevice? LOG_NOTICE : LOG_INFO, "Corosync is%s using qdevice", using_qdevice?"":" not"); } else { cl_log(LOG_INFO, "quorum.device.model not present in cmap\n"); } if (cmap_get_uint32(cmap_handle, "quorum.device.sync_timeout", &qdevice_sync_timeout) == CS_OK) { qdevice_sync_timeout /= 1000; cl_log(LOG_INFO, "Corosync is using qdevice-sync_timeout=%ds", qdevice_sync_timeout); } else { cl_log(LOG_INFO, "quorum.device.sync_timeout not present in cmap\n"); } #endif return TRUE; out: cmap_destroy(); return FALSE; } #endif #endif static gboolean notify_timer_cb(gpointer data) { cl_log(LOG_DEBUG, "Refreshing %sstate", remote_node?"remote ":""); if(remote_node) { sbd_remote_check(NULL); return TRUE; } switch (get_cluster_type()) { #if HAVE_DECL_PCMK_CLUSTER_CLASSIC_AIS case pcmk_cluster_classic_ais: send_cluster_text(crm_class_quorum, NULL, TRUE, NULL, crm_msg_ais); break; #endif case pcmk_cluster_corosync: do { #if SUPPORT_COROSYNC && CHECK_VOTEQUORUM_HANDLE struct votequorum_info info; if (votequorum_getinfo(votequorum_handle, 0, &info) != CS_OK) { votequorum_finalize(votequorum_handle); if 
(votequorum_initialize(&votequorum_handle, NULL) != CS_OK) { votequorum_handle = 0; break; } if (votequorum_getinfo(votequorum_handle, 0, &info) != CS_OK) { break; } } #endif notify_parent(); } while (0); break; #if HAVE_DECL_PCMK_CLUSTER_CMAN case pcmk_cluster_cman: notify_parent(); break; #endif default: break; } return TRUE; } static void sbd_membership_connect(void) { bool connected = false; cl_log(LOG_INFO, "Attempting cluster connection"); cluster.destroy = sbd_membership_destroy; #if SUPPORT_PLUGIN cluster.cpg.cpg_deliver_fn = sbd_plugin_membership_dispatch; #endif #if SUPPORT_COROSYNC cluster.cpg.cpg_confchg_fn = sbd_cpg_membership_dispatch; #endif while(connected == false) { enum cluster_type_e stack = get_cluster_type(); if(get_cluster_type() == pcmk_cluster_unknown) { crm_debug("Attempting pacemaker remote connection"); /* Nothing is up, go looking for the pacemaker remote process */ if(find_pacemaker_remote() > 0) { connected = true; } } else { cl_log(LOG_INFO, "Attempting connection to %s", name_for_cluster_type(stack)); #if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT) if (verify_against_cmap_config()) { #endif if(crm_cluster_connect(&cluster)) { connected = true; } #if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT) } #endif } if(connected == false) { cl_log(LOG_INFO, "Failed, retrying in %ds", reconnect_msec / 1000); sleep(reconnect_msec / 1000); } } set_servant_health(pcmk_health_transient, LOG_INFO, "Connected, waiting for initial membership"); notify_parent(); notify_timer_cb(NULL); } static void sbd_membership_destroy(gpointer user_data) { cl_log(LOG_WARNING, "Lost connection to %s", name_for_cluster_type(get_cluster_type())); if (get_cluster_type() != pcmk_cluster_unknown) { #if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT) cmap_destroy(); #endif } set_servant_health(pcmk_health_unclean, LOG_ERR, "Cluster connection terminated"); notify_parent(); /* Attempt to reconnect, the watchdog 
will take the node down if the problem isn't transient */ sbd_membership_connect(); } /* * \internal * \brief Get process ID and name associated with a /proc directory entry * * \param[in] entry Directory entry (must be result of readdir() on /proc) * \param[out] name If not NULL, a char[16] to hold the process name * \param[out] pid If not NULL, will be set to process ID of entry * * \return 0 on success, -1 if entry is not for a process or info not found * * \note This should be called only on Linux systems, as not all systems that * support /proc store process names and IDs in the same way. * Copied from the Pacemaker implementation. */ int sbd_procfs_process_info(struct dirent *entry, char *name, int *pid) { int fd, local_pid; FILE *file; struct stat statbuf; char procpath[128] = { 0 }; /* We're only interested in entries whose name is a PID, * so skip anything non-numeric or that is too long. * * 114 = 128 - strlen("/proc/") - strlen("/status") - 1 */ local_pid = atoi(entry->d_name); if ((local_pid <= 0) || (strlen(entry->d_name) > 114)) { return -1; } if (pid) { *pid = local_pid; } /* Get this entry's file information */ strcpy(procpath, "/proc/"); strcat(procpath, entry->d_name); fd = open(procpath, O_RDONLY); if (fd < 0 ) { return -1; } if (fstat(fd, &statbuf) < 0) { close(fd); return -1; } close(fd); /* We're only interested in subdirectories */ if (!S_ISDIR(statbuf.st_mode)) { return -1; } /* Read the first entry ("Name:") from the process's status file. * We could handle the valgrind case if we parsed the cmdline file * instead, but that's more of a pain than it's worth. 
*/ if (name != NULL) { strcat(procpath, "/status"); file = fopen(procpath, "r"); if (!file) { return -1; } if (fscanf(file, "Name:\t%15[a-zA-Z0-9 _-]", name) != 1) { fclose(file); return -1; } fclose(file); } return 0; } static gboolean sbd_remote_check(gpointer user_data) { static int have_proc_pid = 0; int running = 0; cl_log(LOG_DEBUG, "Checking pacemaker remote connection: %d/%d", have_proc_pid, remoted_pid); if(have_proc_pid == 0) { char proc_path[PATH_MAX], exe_path[PATH_MAX]; /* check to make sure pid hasn't been reused by another process */ snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", (long unsigned int)getpid()); have_proc_pid = 1; if(readlink(proc_path, exe_path, PATH_MAX - 1) < 0) { have_proc_pid = -1; } } if (remoted_pid <= 0) { set_servant_health(pcmk_health_transient, LOG_WARNING, "No Pacemaker Remote connection"); goto notify; } else if (kill(remoted_pid, 0) < 0 && errno == ESRCH) { /* Not running */ } else if(have_proc_pid == -1) { running = 1; cl_log(LOG_DEBUG, "Poccess %ld is active", (long)remoted_pid); } else { int rc = 0; char proc_path[PATH_MAX], exe_path[PATH_MAX]; /* check to make sure pid hasn't been reused by another process */ snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", (long unsigned int)remoted_pid); rc = readlink(proc_path, exe_path, PATH_MAX - 1); if (rc < 0) { crm_perror(LOG_ERR, "Could not read from %s", proc_path); goto done; } exe_path[rc] = 0; if (strcmp(exe_path, SBINDIR "/" PACEMAKER_REMOTE_BINARY) == 0) { cl_log(LOG_DEBUG, "Process %s (%ld) is active", exe_path, (long)remoted_pid); running = 1; } } done: if(running) { set_servant_health(pcmk_health_online, LOG_INFO, "Connected to Pacemaker Remote %lu", (long unsigned int)remoted_pid); } else { set_servant_health(pcmk_health_unclean, LOG_WARNING, "Connection to Pacemaker Remote %lu lost", (long unsigned int)remoted_pid); } notify: notify_parent(); if(running == 0) { sbd_membership_connect(); } return true; } static long unsigned int 
find_pacemaker_remote(void) { DIR *dp; char entry_name[16]; struct dirent *entry; dp = opendir("/proc"); if (!dp) { /* no proc directory to search through */ cl_log(LOG_NOTICE, "Can not read /proc directory to track existing components"); return FALSE; } while ((entry = readdir(dp)) != NULL) { int pid; if (sbd_procfs_process_info(entry, entry_name, &pid) < 0) { continue; } /* entry_name is truncated to 16 characters including the nul terminator */ - cl_log(LOG_DEBUG, "Found %s at %u", entry_name, pid); + cl_log(LOG_DEBUG, "Found %s at %d", entry_name, pid); if (strncmp(entry_name, PACEMAKER_REMOTE_BINARY, 15) == 0) { - cl_log(LOG_NOTICE, "Found Pacemaker Remote at PID %u", pid); + cl_log(LOG_NOTICE, "Found Pacemaker Remote at PID %d", pid); remoted_pid = pid; remote_node = true; break; } } closedir(dp); return remoted_pid; } static void clean_up(int rc) { #if CHECK_VOTEQUORUM_HANDLE votequorum_finalize(votequorum_handle); votequorum_handle = 0; /* there isn't really an invalid handle value * just to be back where we started */ #endif return; } static void cluster_shutdown(int nsig) { clean_up(0); } int servant_cluster(const char *diskname, int mode, const void* argp) { enum cluster_type_e cluster_stack = get_cluster_type(); crm_system_name = strdup("sbd:cluster"); cl_log(LOG_NOTICE, "Monitoring %s cluster health", name_for_cluster_type(cluster_stack)); set_proc_title("sbd: watcher: Cluster"); sbd_membership_connect(); /* stonith_our_uname = cluster.uname; */ /* stonith_our_uuid = cluster.uuid; */ mainloop = g_main_loop_new(NULL, FALSE); notify_timer = g_timeout_add(timeout_loop * 1000, notify_timer_cb, NULL); mainloop_add_signal(SIGTERM, cluster_shutdown); mainloop_add_signal(SIGINT, cluster_shutdown); g_main_loop_run(mainloop); g_main_loop_unref(mainloop); clean_up(0); return 0; /* never reached */ } diff --git a/src/sbd-common.c b/src/sbd-common.c index 3abf75f..7ebf4a7 100644 --- a/src/sbd-common.c +++ b/src/sbd-common.c @@ -1,1355 +1,832 @@ /* * Copyright (C) 
2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include "sbd.h" #include #include -#ifdef __GLIBC__ -#include -#endif #include #include #include -#include #include #include #include #ifdef _POSIX_MEMLOCK # include #endif /* Tunable defaults: */ -unsigned long timeout_watchdog = SBD_WATCHDOG_TIMEOUT_DEFAULT; -int timeout_msgwait = 2 * SBD_WATCHDOG_TIMEOUT_DEFAULT; -unsigned long timeout_watchdog_warn = calculate_timeout_watchdog_warn(SBD_WATCHDOG_TIMEOUT_DEFAULT); -bool do_calculate_timeout_watchdog_warn = true; -int timeout_allocate = 2; -int timeout_loop = 1; -int timeout_io = 3; -int timeout_startup = 120; - -int watchdog_use = 1; -int watchdog_set_timeout = 1; -unsigned long timeout_watchdog_crashdump = 0; -int skip_rt = 0; -int debug = 0; -int debug_mode = 0; -char *watchdogdev = NULL; -bool watchdogdev_is_default = false; -char * local_uname; +int timeout_watchdog = SBD_WATCHDOG_TIMEOUT_DEFAULT; +int timeout_msgwait = 2 * SBD_WATCHDOG_TIMEOUT_DEFAULT; + +int timeout_allocate = 2; +int timeout_loop = 1; +int timeout_io = 3; +int timeout_startup = 120; + +int watchdog_use = 1; +int watchdog_set_timeout = 1; +int timeout_watchdog_crashdump = 0; +int skip_rt = 0; +int debug = 0; +int debug_mode = 0; /* Global, non-tunable variables: */ -int sector_size = 0; -int watchdogfd = -1; -int 
servant_health = 0; +int sector_size = 0; +int servant_health = 0; -/*const char *devname;*/ -const char *cmdname; +const char *cmdname; +char *local_uname; void usage(void) { fprintf(stderr, "Shared storage fencing tool.\n" "Syntax:\n" " %s \n" "Options:\n" "-d Block device to use (mandatory; can be specified up to 3 times)\n" "-h Display this help.\n" "-n Set local node name; defaults to uname -n (optional)\n" "\n" "-R Do NOT enable realtime priority (debugging only)\n" "-W Use watchdog (recommended) (watch only)\n" "-w Specify watchdog device (optional) (watch only)\n" "-T Do NOT initialize the watchdog timeout (watch only)\n" "-S <0|1> Set start mode if the node was previously fenced (watch only)\n" "-p Write pidfile to the specified path (watch only)\n" "-v|-vv|-vvv Enable verbose|debug|debug-library logging (optional)\n" "\n" "-1 Set watchdog timeout to N seconds (optional, create only)\n" "-2 Set slot allocation timeout to N seconds (optional, create only)\n" "-3 Set daemon loop timeout to N seconds (optional, create only)\n" "-4 Set msgwait timeout to N seconds (optional, create only)\n" "-5 Warn if loop latency exceeds threshold (optional, watch only)\n" " (default is 3, set to 0 to disable)\n" "-C Watchdog timeout to set before crashdumping\n" " (def: 0s = disable gracefully, optional)\n" "-I Async IO read timeout (defaults to 3 * loop timeout, optional)\n" "-s Timeout to wait for devices to become available (def: 120s)\n" "-t Dampening delay before faulty servants are restarted (optional)\n" " (default is 5, set to 0 to disable)\n" "-F # of failures before a servant is considered faulty (optional)\n" " (default is 1, set to 0 to disable)\n" "-P Check Pacemaker quorum and node health (optional, watch only)\n" "-Z Enable trace mode. 
WARNING: UNSAFE FOR PRODUCTION!\n" "-r Set timeout-action to comma-separated combination of\n" " noflush|flush plus reboot|crashdump|off (default is flush,reboot)\n" "Commands:\n" #if SUPPORT_SHARED_DISK "create initialize N slots on - OVERWRITES DEVICE!\n" "list List all allocated slots on device, and messages.\n" "dump Dump meta-data header from device.\n" "allocate \n" " Allocate a slot for node (optional)\n" "message (test|reset|off|crashdump|clear|exit)\n" " Writes the specified message to node's slot.\n" #endif "watch Loop forever, monitoring own slot\n" "query-watchdog Check for available watchdog-devices and print some info\n" "test-watchdog Test the watchdog-device selected.\n" " Attention: This will arm the watchdog and have your system reset\n" " in case your watchdog is working properly!\n" , cmdname); } -#define MAX_WATCHDOGS 64 -#define SYS_CLASS_WATCHDOG "/sys/class/watchdog" -#define SYS_CHAR_DEV_DIR "/sys/dev/char" -#define WATCHDOG_NODEDIR "/dev/" - -static bool -is_watchdog(dev_t device) -{ - static int num_watchdog_devs = 0; - static dev_t watchdog_devs[MAX_WATCHDOGS]; - struct dirent *entry; - int i; - - /* populate on first call */ - if (num_watchdog_devs == 0) { - DIR *dp; - - watchdog_devs[0] = makedev(10,130); - num_watchdog_devs = 1; - - /* get additional devices from /sys/class/watchdog */ - dp = opendir(SYS_CLASS_WATCHDOG); - if (dp) { - while ((entry = readdir(dp))) { - if (entry->d_type == DT_LNK) { - FILE *file; - char entry_name[NAME_MAX+sizeof(SYS_CLASS_WATCHDOG)+5]; - - snprintf(entry_name, sizeof(entry_name), - SYS_CLASS_WATCHDOG "/%s/dev", entry->d_name); - file = fopen(entry_name, "r"); - if (file) { - int major, minor; - - if (fscanf(file, "%d:%d", &major, &minor) == 2) { - watchdog_devs[num_watchdog_devs++] = makedev(major, minor); - } - fclose(file); - if (num_watchdog_devs == MAX_WATCHDOGS) { - break; - } - } - } - } - closedir(dp); - } - } - - for (i=0; i < num_watchdog_devs; i++) { - if (device == watchdog_devs[i]) { - 
return true; - } - } - return false; -} - -static int -watchdog_init_interval_fd(int wdfd, int timeout) -{ - if (ioctl(wdfd, WDIOC_SETTIMEOUT, &timeout) < 0) { - cl_perror( "WDIOC_SETTIMEOUT" - ": Failed to set watchdog timer to %u seconds.", - timeout); - cl_log(LOG_CRIT, "Please validate your watchdog configuration!"); - cl_log(LOG_CRIT, "Choose a different watchdog driver or specify -T to skip this if you are completely sure."); - return -1; - } - return 0; -} - -int -watchdog_init_interval(void) -{ - if (watchdogfd < 0) { - return 0; - } - - if (watchdog_set_timeout == 0) { - cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!"); - return 0; - } - - if (watchdog_init_interval_fd(watchdogfd, timeout_watchdog) < 0) { - return -1; - } - cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", (int) timeout_watchdog); - return 0; -} - -static int -watchdog_tickle_fd(int wdfd, char *wddev) -{ - if (write(wdfd, "", 1) != 1) { - cl_perror("Watchdog write failure: %s!", wddev); - return -1; - } - return 0; -} - -int -watchdog_tickle(void) -{ - if (watchdogfd >= 0) { - return watchdog_tickle_fd(watchdogfd, watchdogdev); - } - return 0; -} - -static int -watchdog_init_fd(char *wddev, int timeout) -{ - int wdfd; - - wdfd = open(wddev, O_WRONLY); - if (wdfd >= 0) { - if (((timeout >= 0) && (watchdog_init_interval_fd(wdfd, timeout) < 0)) || - (watchdog_tickle_fd(wdfd, wddev) < 0)) { - close(wdfd); - return -1; - } - } else { - struct stat statbuf; - - if(!stat(wddev, &statbuf) && S_ISCHR(statbuf.st_mode) && - is_watchdog(statbuf.st_rdev)) { - cl_perror("Cannot open watchdog device '%s'", wddev); - } else { - cl_perror("Seems as if '%s' isn't a valid watchdog-device", wddev); - } - return -1; - } - return wdfd; -} - -int -watchdog_init(void) -{ - if (watchdogfd < 0 && watchdogdev != NULL) { - int timeout = timeout_watchdog; - - if (watchdog_set_timeout == 0) { - cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!"); - timeout = -1; - 
} - watchdogfd = watchdog_init_fd(watchdogdev, timeout); - if (watchdogfd >= 0) { - cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev); - if (watchdog_set_timeout) { - cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", (int) timeout_watchdog); - } - } else { - return -1; - } - } - return 0; -} - -static void -watchdog_close_fd(int wdfd, char *wddev, bool disarm) -{ - if (disarm) { - int r; - int flags = WDIOS_DISABLECARD;; - - /* Explicitly disarm it */ - r = ioctl(wdfd, WDIOC_SETOPTIONS, &flags); - if (r < 0) { - cl_perror("Failed to disable hardware watchdog %s", wddev); - } - - /* To be sure, use magic close logic, too */ - for (;;) { - if (write(wdfd, "V", 1) > 0) { - break; - } - cl_perror("Cannot disable watchdog device %s", wddev); - } - } - - if (close(wdfd) < 0) { - cl_perror("Watchdog close(%d) failed", wdfd); - } -} - -void -watchdog_close(bool disarm) -{ - if (watchdogfd < 0) { - return; - } - - watchdog_close_fd(watchdogfd, watchdogdev, disarm); - watchdogfd = -1; -} - -struct watchdog_list_item { - dev_t dev; - char *dev_node; - char *dev_ident; - char *dev_driver; - pid_t busy_pid; - char *busy_name; - struct watchdog_list_item *next; -}; - -struct link_list_item { - char *dev_node; - char *link_name; - struct link_list_item *next; -}; - -static struct watchdog_list_item *watchdog_list = NULL; -static int watchdog_list_items = 0; - -static void -watchdog_populate_list(void) -{ - struct dirent *entry; - char entry_name[sizeof(WATCHDOG_NODEDIR)+NAME_MAX]; - DIR *dp; - char buf[NAME_MAX+sizeof(WATCHDOG_NODEDIR)] = ""; - struct link_list_item *link_list = NULL; - - if (watchdog_list != NULL) { - return; - } - - /* search for watchdog nodes in /dev */ - dp = opendir(WATCHDOG_NODEDIR); - if (dp) { - /* first go for links and memorize them */ - while ((entry = readdir(dp))) { - if (entry->d_type == DT_LNK) { - int len; - - snprintf(entry_name, sizeof(entry_name), - WATCHDOG_NODEDIR "%s", entry->d_name); - - /* realpath(entry_name, buf) 
unfortunately does a stat on - * target so we can't really use it to check if links stay - * within /dev without triggering e.g. AVC-logs (with - * SELinux policy that just allows stat within /dev). - * Without canonicalization that doesn't actually touch the - * filesystem easily available introduce some limitations - * for simplicity: - * - just simple path without '..' - * - just one level of symlinks (avoid e.g. loop-checking) - */ - len = readlink(entry_name, buf, sizeof(buf) - 1); - if ((len < 1) || - (len > sizeof(buf) - sizeof(WATCHDOG_NODEDIR) -1 - 1)) { - continue; - } - buf[len] = '\0'; - if (buf[0] != '/') { - memmove(&buf[sizeof(WATCHDOG_NODEDIR)-1], buf, len+1); - memcpy(buf, WATCHDOG_NODEDIR, sizeof(WATCHDOG_NODEDIR)-1); - len += sizeof(WATCHDOG_NODEDIR)-1; - } - if (strstr(buf, "/../") || - strncmp(WATCHDOG_NODEDIR, buf, sizeof(WATCHDOG_NODEDIR)-1)) { - continue; - } else { - /* just memorize to avoid statting the target - SELinux */ - struct link_list_item *lli = - calloc(1, sizeof(struct link_list_item)); - - if (lli == NULL) { - break; - } - lli->dev_node = strdup(buf); - lli->link_name = strdup(entry_name); - if ((lli->dev_node == NULL) || (lli->link_name == NULL)) { - free(lli->dev_node); - free(lli->link_name); - free(lli); - break; - } - lli->next = link_list; - link_list = lli; - } - } - } - - rewinddir(dp); - - while ((entry = readdir(dp))) { - if (entry->d_type == DT_CHR) { - struct stat statbuf; - - snprintf(entry_name, sizeof(entry_name), - WATCHDOG_NODEDIR "%s", entry->d_name); - if(!stat(entry_name, &statbuf) && S_ISCHR(statbuf.st_mode) && - is_watchdog(statbuf.st_rdev)) { - - int wdfd; - struct watchdog_list_item *wdg = - calloc(1, sizeof(struct watchdog_list_item)); - int len; - struct link_list_item *tmp_list = NULL; - - if (wdg == NULL) { - break; - } - - wdg->dev = statbuf.st_rdev; - wdg->dev_node = strdup(entry_name); - if (wdg->dev_node == NULL) { - free(wdg); - break; - } - wdg->next = watchdog_list; - watchdog_list = wdg; - 
watchdog_list_items++; - - wdfd = watchdog_init_fd(entry_name, -1); - if (wdfd >= 0) { - struct watchdog_info ident; - - ident.identity[0] = '\0'; - ioctl(wdfd, WDIOC_GETSUPPORT, &ident); - watchdog_close_fd(wdfd, entry_name, true); - if (ident.identity[0]) { - wdg->dev_ident = strdup((char *) ident.identity); - } - } - - snprintf(entry_name, sizeof(entry_name), - SYS_CHAR_DEV_DIR "/%d:%d/device/driver", - major(wdg->dev), minor(wdg->dev)); - len = readlink(entry_name, buf, sizeof(buf) - 1); - if (len > 0) { - buf[len] = '\0'; - wdg->dev_driver = strdup(basename(buf)); - } else if ((wdg->dev_ident) && - (strcmp(wdg->dev_ident, - "Software Watchdog") == 0)) { - wdg->dev_driver = strdup("softdog"); - } - - /* create dupes if we have memorized links - * to this node - */ - for (tmp_list = link_list; tmp_list; - tmp_list = tmp_list->next) { - if (!strcmp(tmp_list->dev_node, - wdg->dev_node)) { - struct watchdog_list_item *dupe_wdg = - calloc(1, sizeof(struct watchdog_list_item)); - - if (dupe_wdg == NULL) { - break; - } - /* as long as we never purge watchdog_list - * there is no need to dupe strings - */ - *dupe_wdg = *wdg; - dupe_wdg->dev_node = strdup(tmp_list->link_name); - if (dupe_wdg->dev_node == NULL) { - free(dupe_wdg); - break; - } - dupe_wdg->next = watchdog_list; - watchdog_list = dupe_wdg; - watchdog_list_items++; - } - /* for performance reasons we could remove - * the link_list entry - */ - } - } - } - } - - closedir(dp); - } - - /* cleanup link list */ - while (link_list) { - struct link_list_item *tmp_list = link_list; - - link_list = link_list->next; - free(tmp_list->dev_node); - free(tmp_list->link_name); - free(tmp_list); - } -} - -static void -watchdog_checkbusy() -{ - DIR *dproc; - struct dirent *entry; - - dproc = opendir("/proc"); - if (!dproc) { - /* no proc directory to search through */ - return; - } - - while ((entry = readdir(dproc)) != NULL) { - pid_t local_pid; - char *leftover; - DIR *dpid; - char procpath[NAME_MAX+10] = { 0 }; - - if 
(entry->d_name[0] == '.') { - continue; - } - - local_pid = strtol(entry->d_name, &leftover, 10); - if (leftover[0] != '\0') - continue; - - snprintf(procpath, sizeof(procpath), "/proc/%s/fd", entry->d_name); - dpid = opendir(procpath); - if (!dpid) { - /* silently continue - might be just a race */ - continue; - } - while ((entry = readdir(dpid)) != NULL) { - struct watchdog_list_item *wdg; - char entry_name[sizeof(procpath)+NAME_MAX+1] = { 0 }; - char buf[NAME_MAX+1] = { 0 }; - int len; - - if (entry->d_type != DT_LNK) { - continue; - } - snprintf(entry_name, sizeof(entry_name), - "%s/%s", procpath, entry->d_name); - len = readlink(entry_name, buf, sizeof(buf) - 1); - if (len < 1) { - continue; - } - buf[len] = '\0'; - for (wdg = watchdog_list; wdg != NULL; wdg = wdg->next) { - if (!strcmp(buf, wdg->dev_node)) { - char name[16]; - FILE *file; - - wdg->busy_pid = local_pid; - snprintf(procpath, sizeof(procpath), "/proc/%d/status", local_pid); - file = fopen(procpath, "r"); - if (file) { - if (fscanf(file, "Name:\t%15[a-zA-Z0-9 _-]", name) == 1) { - wdg->busy_name = strdup(name); - } - fclose(file); - } - } - } - } - closedir(dpid); - } - - closedir(dproc); - - return; -} - -int watchdog_info(void) -{ - struct watchdog_list_item *wdg; - int wdg_cnt = 0; - - watchdog_populate_list(); - watchdog_checkbusy(); - printf("\nDiscovered %d watchdog devices:\n", watchdog_list_items); - for (wdg = watchdog_list; wdg != NULL; wdg = wdg->next) { - wdg_cnt++; - if (wdg->busy_pid) { - printf("\n[%d] %s\nIdentity: Busy: PID %d (%s)\nDriver: %s\n", - wdg_cnt, wdg->dev_node, - wdg->busy_pid, - wdg->busy_name?wdg->busy_name:"", - wdg->dev_driver?wdg->dev_driver:""); - } else { - printf("\n[%d] %s\nIdentity: %s\nDriver: %s\n", - wdg_cnt, wdg->dev_node, - wdg->dev_ident?wdg->dev_ident: - "Error: device hogged via alias major/minor?", - wdg->dev_driver?wdg->dev_driver:""); - } - if ((wdg->dev_driver) && (strcmp(wdg->dev_driver, "softdog") == 0)) { - printf("CAUTION: Not recommended for 
use with sbd.\n"); - } - } - - return 0; -} - -int watchdog_test(void) -{ - int i; - - if ((watchdog_set_timeout == 0) || !watchdog_use) { - printf("\nWatchdog is disabled - aborting test!!!\n"); - return 0; - } - if (watchdogdev_is_default) { - watchdog_populate_list(); - if (watchdog_list_items > 1) { - printf("\nError: Multiple watchdog devices discovered.\n" - " Use -w or SBD_WATCHDOG_DEV to specify\n" - " which device to reset the system with\n"); - watchdog_info(); - return -1; - } - } - if ((isatty(fileno(stdin)))) { - char buffer[16]; - printf("\nWARNING: This operation is expected to force-reboot this system\n" - " without following any shutdown procedures.\n\n" - "Proceed? [NO/Proceed] "); - - if ((fgets(buffer, 16, stdin) == NULL) || - strcmp(buffer, "Proceed\n")) { - printf("\nAborting watchdog test!!!\n"); - return 0; - } - printf("\n"); - } - printf("Initializing %s with a reset countdown of %d seconds ...\n", - watchdogdev, (int) timeout_watchdog); - if ((watchdog_init() < 0) || (watchdog_init_interval() < 0)) { - printf("Failed to initialize watchdog!!!\n"); - watchdog_info(); - return -1; - } - printf("\n"); - printf("NOTICE: The watchdog device is expected to reset the system\n" - " in %d seconds. If system remains active beyond that time,\n" - " watchdog may not be functional.\n\n", (int) timeout_watchdog); - for (i=timeout_watchdog; i>1; i--) { - printf("Reset countdown ... %d seconds\n", i); - sleep(1); - } - for (i=2; i>0; i--) { - printf("System expected to reset any moment ...\n"); - sleep(1); - } - for (i=5; i>0; i--) { - printf("System should have reset ...\n"); - sleep(1); - } - printf("Error: The watchdog device has failed to reboot the system,\n" - " and it may not be suitable for usage with sbd.\n"); - - /* test should trigger a reboot thus returning is actually bad */ - return -1; -} - /* This duplicates some code from linux/ioprio.h since these are not included * even in linux-kernel-headers. Sucks. 
See also * /usr/src/linux/Documentation/block/ioprio.txt and ioprio_set(2) */ extern int sys_ioprio_set(int, int, int); int ioprio_set(int which, int who, int ioprio); inline int ioprio_set(int which, int who, int ioprio) { return syscall(__NR_ioprio_set, which, who, ioprio); } enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE, }; enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, }; #define IOPRIO_BITS (16) #define IOPRIO_CLASS_SHIFT (13) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) #define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) #define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) #define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) static void sbd_stack_hogger(unsigned char * inbuf, int kbytes) { unsigned char buf[1024]; if(kbytes <= 0) { return; } if (inbuf == NULL) { memset(buf, HOG_CHAR, sizeof(buf)); } else { memcpy(buf, inbuf, sizeof(buf)); } if (kbytes > 0) { sbd_stack_hogger(buf, kbytes-1); } return; } static void sbd_malloc_hogger(int kbytes) { int j; void**chunks; int chunksize = 1024; if(kbytes <= 0) { return; } /* * We could call mallopt(M_MMAP_MAX, 0) to disable it completely, * but we've already called mlockall() * * We could also call mallopt(M_TRIM_THRESHOLD, -1) to prevent malloc * from giving memory back to the system, but we've already called * mlockall(MCL_FUTURE), so there's no need. */ chunks = malloc(kbytes * sizeof(void *)); if (chunks == NULL) { cl_log(LOG_WARNING, "Could not preallocate chunk array"); return; } for (j=0; j < kbytes; ++j) { chunks[j] = malloc(chunksize); if (chunks[j] == NULL) { cl_log(LOG_WARNING, "Could not preallocate block %d", j); } else { memset(chunks[j], 0, chunksize); } } for (j=0; j < kbytes; ++j) { free(chunks[j]); } free(chunks); } static void sbd_memlock(int stackgrowK, int heapgrowK) { #ifdef _POSIX_MEMLOCK /* * We could call setrlimit(RLIMIT_MEMLOCK,...) 
with a large * number, but the mcp runs as root and mlock(2) says: * * Since Linux 2.6.9, no limits are placed on the amount of memory * that a privileged process may lock, and this limit instead * governs the amount of memory that an unprivileged process may * lock. */ if (mlockall(MCL_CURRENT|MCL_FUTURE) >= 0) { cl_log(LOG_INFO, "Locked ourselves in memory"); /* Now allocate some extra pages (MCL_FUTURE will ensure they stay around) */ sbd_malloc_hogger(heapgrowK); sbd_stack_hogger(NULL, stackgrowK); } else { cl_perror("Unable to lock ourselves into memory"); } #else cl_log(LOG_ERR, "Unable to lock ourselves into memory"); #endif } static int get_realtime_budget(void) { FILE *f; char fname[PATH_MAX]; int res = -1, lnum = 0, num; char *cgroup = NULL, *namespecs = NULL; snprintf(fname, PATH_MAX, "/proc/%jd/cgroup", (intmax_t)getpid()); f = fopen(fname, "rt"); if (f == NULL) { cl_log(LOG_WARNING, "Can't open cgroup file for pid=%jd", (intmax_t)getpid()); goto exit_res; } while( (num = fscanf(f, "%d:%m[^:]:%m[^\n]\n", &lnum, &namespecs, &cgroup)) !=EOF ) { if (namespecs && strstr(namespecs, "cpuacct")) { free(namespecs); break; } if (cgroup) { free(cgroup); cgroup = NULL; } if (namespecs) { free(namespecs); namespecs = NULL; } /* not to get stuck if format changes */ if ((num < 3) && ((fscanf(f, "%*[^\n]") == EOF) || (fscanf(f, "\n") == EOF))) { break; } } fclose(f); if (cgroup == NULL) { cl_log(LOG_WARNING, "Failed getting cgroup for pid=%jd", (intmax_t)getpid()); goto exit_res; } snprintf(fname, PATH_MAX, "/sys/fs/cgroup/cpu%s/cpu.rt_runtime_us", cgroup); f = fopen(fname, "rt"); if (f == NULL) { cl_log(LOG_WARNING, "cpu.rt_runtime_us existed for root-slice but " "doesn't for '%s'", cgroup); goto exit_res; } if (fscanf(f, "%d", &res) != 1) { cl_log(LOG_WARNING, "failed reading rt-budget from %s", fname); } else { cl_log(LOG_INFO, "slice='%s' has rt-budget=%d", cgroup, res); } fclose(f); exit_res: if (cgroup) { free(cgroup); } return res; } /* stolen from corosync */ 
static int sbd_move_to_root_cgroup(bool enforce_root_cgroup) { FILE *f; int res = -1; /* * /sys/fs/cgroup is hardcoded, because most of Linux distributions are now * using systemd and systemd uses hardcoded path of cgroup mount point. * * This feature is expected to be removed as soon as systemd gets support * for managing RT configuration. */ f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt"); if (f == NULL) { cl_log(LOG_DEBUG, "cpu.rt_runtime_us doesn't exist -> " "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED"); res = 0; goto exit_res; } fclose(f); if ((!enforce_root_cgroup) && (get_realtime_budget() > 0)) { cl_log(LOG_DEBUG, "looks as if we have rt-budget in the slice we are " "-> skip moving to root-slice"); res = 0; goto exit_res; } f = fopen("/sys/fs/cgroup/cpu/tasks", "w"); if (f == NULL) { cl_log(LOG_WARNING, "Can't open cgroups tasks file for writing"); goto exit_res; } if (fprintf(f, "%jd\n", (intmax_t)getpid()) <= 0) { cl_log(LOG_WARNING, "Can't write sbd pid into cgroups tasks file"); goto close_and_exit_res; } close_and_exit_res: if (fclose(f) != 0) { cl_log(LOG_WARNING, "Can't close cgroups tasks file"); goto exit_res; } exit_res: return (res); } void sbd_make_realtime(int priority, int stackgrowK, int heapgrowK) { if(priority < 0) { return; } do { #ifdef SCHED_RR if (move_to_root_cgroup) { sbd_move_to_root_cgroup(enforce_moving_to_root_cgroup); } { int pmin = sched_get_priority_min(SCHED_RR); int pmax = sched_get_priority_max(SCHED_RR); struct sched_param sp; int pcurrent; if (priority == 0) { priority = pmax; } else if (priority < pmin) { priority = pmin; } else if (priority > pmax) { priority = pmax; } if (sched_getparam(0, &sp) < 0) { cl_perror("Unable to get scheduler priority"); } else if ((pcurrent = sched_getscheduler(0)) < 0) { cl_perror("Unable to get scheduler policy"); } else if ((pcurrent == SCHED_RR) && (sp.sched_priority >= priority)) { cl_log(LOG_INFO, "Stay with priority (%d) for policy SCHED_RR", 
sp.sched_priority); break; } else { memset(&sp, 0, sizeof(sp)); sp.sched_priority = priority; if (sched_setscheduler(0, SCHED_RR, &sp) < 0) { cl_perror( "Unable to set scheduler policy to SCHED_RR priority %d", priority); } else { cl_log(LOG_INFO, "Scheduler policy is now SCHED_RR priority %d", priority); break; } } } #else cl_log(LOG_ERR, "System does not support updating the scheduler policy"); #endif #ifdef PRIO_PGRP if (setpriority(PRIO_PGRP, 0, INT_MIN) < 0) { cl_perror("Unable to raise the scheduler priority"); } else { cl_log(LOG_INFO, "Scheduler priority raised to the maximum"); } #else cl_perror("System does not support setting the scheduler priority"); #endif } while (0); sbd_memlock(heapgrowK, stackgrowK); } void maximize_priority(void) { if (skip_rt) { cl_log(LOG_INFO, "Not elevating to realtime (-R specified)."); return; } sbd_make_realtime(0, 256, 256); if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 1)) != 0) { cl_perror("ioprio_set() call failed."); } } void sysrq_init(void) { FILE* procf; int c; procf = fopen("/proc/sys/kernel/sysrq", "r"); if (!procf) { cl_perror("cannot open /proc/sys/kernel/sysrq for read."); return; } if (fscanf(procf, "%d", &c) != 1) { cl_perror("Parsing sysrq failed"); c = 0; } fclose(procf); if (c == 1) return; /* 8 for debugging dumps of processes, 128 for reboot/poweroff */ c |= 136; procf = fopen("/proc/sys/kernel/sysrq", "w"); if (!procf) { cl_perror("cannot open /proc/sys/kernel/sysrq for writing"); return; } fprintf(procf, "%d", c); fclose(procf); return; } void sysrq_trigger(char t) { FILE *procf; procf = fopen("/proc/sysrq-trigger", "a"); if (!procf) { cl_perror("Opening sysrq-trigger failed."); return; } cl_log(LOG_INFO, "sysrq-trigger: %c\n", t); fprintf(procf, "%c\n", t); fclose(procf); return; } static void do_exit(char kind, bool do_flush) { /* TODO: Turn debug_mode into a bit field? 
Delay + kdump for example */ const char *reason = NULL; if (kind == 'c') { cl_log(LOG_NOTICE, "Initiating kdump"); } else if (debug_mode == 1) { cl_log(LOG_WARNING, "Initiating kdump instead of panicking the node (debug mode)"); kind = 'c'; } if (debug_mode == 2) { cl_log(LOG_WARNING, "Shutting down SBD instead of panicking the node (debug mode)"); watchdog_close(true); exit(0); } if (debug_mode == 3) { /* Give the system some time to flush logs to disk before rebooting. */ cl_log(LOG_WARNING, "Delaying node panic by 10s (debug mode)"); watchdog_close(true); sync(); sleep(10); } switch(kind) { case 'b': reason = "reboot"; break; case 'c': reason = "crashdump"; break; case 'o': reason = "off"; break; default: reason = "unknown"; break; } cl_log(LOG_EMERG, "Rebooting system: %s", reason); if (do_flush) { sync(); } if (kind == 'c') { if (timeout_watchdog_crashdump) { if (timeout_watchdog != timeout_watchdog_crashdump) { timeout_watchdog = timeout_watchdog_crashdump; watchdog_init_interval(); } watchdog_close(false); } else { watchdog_close(true); } sysrq_trigger(kind); } else { watchdog_close(false); sysrq_trigger(kind); if (reboot((kind == 'o')?RB_POWER_OFF:RB_AUTOBOOT) < 0) { cl_perror("%s failed", (kind == 'o')?"Poweroff":"Reboot"); } } exit(1); } void do_crashdump(void) { do_exit('c', true); } void do_reset(void) { do_exit('b', true); } void do_off(void) { do_exit('o', true); } void do_timeout_action(void) { do_exit(timeout_sysrq_char, do_flush); } /* * Change directory to the directory our core file needs to go in * Call after you establish the userid you're running under. 
*/ int sbd_cdtocoredir(void) { int rc; static const char *dir = NULL; if (dir == NULL) { dir = CRM_CORE_DIR; } if ((rc=chdir(dir)) < 0) { int errsave = errno; cl_perror("Cannot chdir to [%s]", dir); errno = errsave; } return rc; } pid_t make_daemon(void) { pid_t pid; const char * devnull = "/dev/null"; pid = fork(); if (pid < 0) { cl_log(LOG_ERR, "%s: could not start daemon\n", cmdname); cl_perror("fork"); exit(1); }else if (pid > 0) { return pid; } qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE); /* This is the child; ensure privileges have not been lost. */ maximize_priority(); sysrq_init(); umask(022); close(0); (void)open(devnull, O_RDONLY); close(1); (void)open(devnull, O_WRONLY); close(2); (void)open(devnull, O_WRONLY); sbd_cdtocoredir(); return 0; } void sbd_get_uname(void) { struct utsname uname_buf; int i; if (uname(&uname_buf) < 0) { cl_perror("uname() failed?"); exit(1); } local_uname = strdup(uname_buf.nodename); for (i = 0; i < strlen(local_uname); i++) local_uname[i] = tolower(local_uname[i]); } #define FMT_MAX 256 void sbd_set_format_string(int method, const char *daemon) { int offset = 0; char fmt[FMT_MAX]; struct utsname res; switch(method) { case QB_LOG_STDERR: break; case QB_LOG_SYSLOG: if(daemon && strcmp(daemon, "sbd") != 0) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%10s: ", daemon); } break; default: /* When logging to a file */ if (uname(&res) == 0) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%t [%d] %s %10s: ", getpid(), res.nodename, daemon); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%t [%d] %10s: ", getpid(), daemon); } } if (debug && method >= QB_LOG_STDERR) { offset += snprintf(fmt + offset, FMT_MAX - offset, "(%%-12f:%%5l %%g) %%-7p: %%n: "); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%g %%-7p: %%n: "); } if (method == QB_LOG_SYSLOG) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%b"); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "\t%%b"); } 
if(offset > 0) { qb_log_format_set(method, fmt); } } +int sigqueue_zero(pid_t pid, int sig) +{ +union sigval signal_value; + + memset(&signal_value, 0, sizeof(signal_value)); + + return sigqueue(pid, sig, signal_value); +} + void notify_parent(void) { pid_t ppid; - union sigval signal_value; - memset(&signal_value, 0, sizeof(signal_value)); ppid = getppid(); if (ppid == 1) { /* Our parent died unexpectedly. Triggering * self-fence. */ cl_log(LOG_WARNING, "Our parent is dead."); do_timeout_action(); } switch (servant_health) { case pcmk_health_pending: case pcmk_health_shutdown: case pcmk_health_transient: DBGLOG(LOG_DEBUG, "Not notifying parent: state transient (%d)", servant_health); break; case pcmk_health_unknown: case pcmk_health_unclean: case pcmk_health_noquorum: DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY (%d)", servant_health); - sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); + sigqueue_zero(ppid, SIG_PCMK_UNHEALTHY); break; case pcmk_health_online: DBGLOG(LOG_DEBUG, "Notifying parent: healthy"); - sigqueue(ppid, SIG_LIVENESS, signal_value); + sigqueue_zero(ppid, SIG_LIVENESS); break; default: DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY %d", servant_health); - sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); + sigqueue_zero(ppid, SIG_PCMK_UNHEALTHY); break; } } void set_servant_health(enum pcmk_health state, int level, char const *format, ...) 
{ if (servant_health != state) { va_list ap; int len = 0; char *string = NULL; servant_health = state; va_start(ap, format); len = vasprintf (&string, format, ap); if(len > 0) { cl_log(level, "%s", string); } va_end(ap); free(string); } } bool sbd_is_disk(struct servants_list_item *servant) { if ((servant != NULL) && (servant->devname != NULL) && (servant->devname[0] == '/')) { return true; } return false; } bool sbd_is_cluster(struct servants_list_item *servant) { if ((servant != NULL) && (servant->devname != NULL) && (strcmp("cluster", servant->devname) == 0)) { return true; } return false; } bool sbd_is_pcmk(struct servants_list_item *servant) { if ((servant != NULL) && (servant->devname != NULL) && (strcmp("pcmk", servant->devname) == 0)) { return true; } return false; } + +#define MAX_LEGITIMATE_AGE 3600 /* 1h should be plenty */ + +int +seconds_diff_time_t(time_t a, time_t b) +{ + long long diff; + + diff = a - b; + + if ((diff > -MAX_LEGITIMATE_AGE) && (diff < MAX_LEGITIMATE_AGE)) { + return (int) diff; + } + + DBGLOG(LOG_WARNING, "Detected unreasonable age (%lld)", diff); + return MAX_LEGITIMATE_AGE; /* something is fishy - provoke timeout */ +} + +int +seconds_diff_timespec(struct timespec *a, struct timespec *b) +{ + struct timeval diff; + struct timeval a_tv; + struct timeval b_tv; + + TIMESPEC_TO_TIMEVAL(&a_tv, a); + TIMESPEC_TO_TIMEVAL(&b_tv, b); + + timersub(&a_tv, &b_tv, &diff); + + return seconds_diff_time_t(diff.tv_sec, 0); +} diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c index d917cd1..56fd8a8 100644 --- a/src/sbd-inquisitor.c +++ b/src/sbd-inquisitor.c @@ -1,1408 +1,1405 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. 
* * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include #include "sbd.h" #define LOCKSTRLEN 11 static struct servants_list_item *servants_leader = NULL; int disk_priority = 1; int check_pcmk = 1; int check_cluster = 1; int has_check_pcmk_env = false; int disk_count = 0; int servant_count = 0; int servant_restart_interval = 5; int servant_restart_count = 1; int start_mode = 0; char* pidfile = NULL; bool do_flush = true; char timeout_sysrq_char = 'b'; bool move_to_root_cgroup = true; bool enforce_moving_to_root_cgroup = false; bool sync_resource_startup = false; int parse_device_line(const char *line); static int sanitize_numeric_option_value(const char *value) { char *end = NULL; long int result = -1; if (value == NULL) { return -1; } errno = 0; result = strtol(value, &end, 10); if (result <= INT_MIN || result >= INT_MAX || errno != 0) { result = -1; } else if (*end != '\0') { result = -1; } return (int)result; } static const char * sanitize_option_value(const char *value) { size_t max = 0; size_t lpc = 0; if (value == NULL) { return NULL; } max = strlen(value); for (lpc = 0; lpc < max; lpc++) { if (!isspace(value[lpc])) { break; } } return (strlen(value + lpc) > 0 ? 
(value + lpc) : NULL); } static const char * get_env_option(const char *option) { const char *value = getenv(option); return sanitize_option_value(value); } static int recruit_servant(const char *devname, pid_t pid) { struct servants_list_item *s = servants_leader; struct servants_list_item *newbie; if (lookup_servant_by_dev(devname)) { cl_log(LOG_DEBUG, "Servant %s already exists", devname); return 0; } newbie = malloc(sizeof(*newbie)); if (newbie) { memset(newbie, 0, sizeof(*newbie)); newbie->devname = strdup(devname); newbie->pid = pid; newbie->first_start = 1; } if (!newbie || !newbie->devname) { fprintf(stderr, "heap allocation failed in recruit_servant.\n"); exit(1); } /* some sanity-check on our newbie */ if (sbd_is_disk(newbie)) { cl_log(LOG_INFO, "Monitoring %s", devname); disk_count++; } else if (sbd_is_pcmk(newbie) || sbd_is_cluster(newbie)) { /* alive just after pcmk and cluster servants have shown up */ newbie->outdated = 1; } else { /* toss our newbie */ cl_log(LOG_ERR, "Refusing to recruit unrecognized servant %s", devname); free((void *) newbie->devname); free(newbie); return -1; } if (!s) { servants_leader = newbie; } else { while (s->next) s = s->next; s->next = newbie; } servant_count++; return 0; } int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp) { pid_t pid = 0; int rc = 0; pid = fork(); if (pid == 0) { /* child */ maximize_priority(); sbd_set_format_string(QB_LOG_SYSLOG, devname); rc = (*functionp)(devname, mode, argp); if (rc == -1) exit(1); else exit(0); } else if (pid != -1) { /* parent */ return pid; } else { cl_log(LOG_ERR,"Failed to fork servant"); exit(1); } } struct servants_list_item *lookup_servant_by_dev(const char *devname) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { if (strcasecmp(s->devname, devname) == 0) break; } return s; } struct servants_list_item *lookup_servant_by_pid(pid_t pid) { struct servants_list_item *s; for (s = servants_leader; s; s = 
s->next) { if (s->pid == pid) break; } return s; } int check_all_dead(void) { struct servants_list_item *s; int r = 0; - union sigval svalue; for (s = servants_leader; s; s = s->next) { if (s->pid != 0) { - r = sigqueue(s->pid, 0, svalue); + r = sigqueue_zero(s->pid, 0); if (r == -1 && errno == ESRCH) continue; return 0; } } return 1; } void servant_start(struct servants_list_item *s) { int r = 0; - union sigval svalue; if (s->pid != 0) { - r = sigqueue(s->pid, 0, svalue); + r = sigqueue_zero(s->pid, 0); if ((r != -1 || errno != ESRCH)) return; } s->restarts++; if (sbd_is_disk(s)) { #if SUPPORT_SHARED_DISK DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname); s->pid = assign_servant(s->devname, servant_md, start_mode, s); #else cl_log(LOG_ERR, "Shared disk functionality not supported"); return; #endif } else if(sbd_is_pcmk(s)) { DBGLOG(LOG_INFO, "Starting Pacemaker servant"); s->pid = assign_servant(s->devname, servant_pcmk, start_mode, NULL); } else if(sbd_is_cluster(s)) { DBGLOG(LOG_INFO, "Starting Cluster servant"); s->pid = assign_servant(s->devname, servant_cluster, start_mode, NULL); } else { cl_log(LOG_ERR, "Unrecognized servant: %s", s->devname); } clock_gettime(CLOCK_MONOTONIC, &s->t_started); return; } void servants_start(void) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { s->restarts = 0; servant_start(s); } } void servants_kill(void) { struct servants_list_item *s; - union sigval svalue; for (s = servants_leader; s; s = s->next) { - if (s->pid != 0) - sigqueue(s->pid, SIGKILL, svalue); + if (s->pid != 0) { + sigqueue_zero(s->pid, SIGKILL); + } } } static inline void cleanup_servant_by_pid(pid_t pid) { struct servants_list_item* s; s = lookup_servant_by_pid(pid); if (s) { cl_log(LOG_WARNING, "Servant for %s (pid: %i) has terminated", s->devname, s->pid); s->pid = 0; } else { /* This most likely is a stray signal from somewhere, or * a SIGCHLD for a process that has previously * explicitly disconnected. 
*/ DBGLOG(LOG_INFO, "cleanup_servant: Nothing known about pid %i", pid); } } int inquisitor_decouple(void) { pid_t ppid = getppid(); - union sigval signal_value; /* During start-up, we only arm the watchdog once we've got * quorum at least once. */ if (watchdog_use) { if (watchdog_init() < 0) { return -1; } } if (ppid > 1) { - sigqueue(ppid, SIG_LIVENESS, signal_value); + sigqueue_zero(ppid, SIG_LIVENESS); } return 0; } static int sbd_lock_running(long pid) { int rc = 0; long mypid; int running = 0; char proc_path[PATH_MAX], exe_path[PATH_MAX], myexe_path[PATH_MAX]; /* check if pid is running */ if (kill(pid, 0) < 0 && errno == ESRCH) { goto bail; } #ifndef HAVE_PROC_PID return 1; #endif /* check to make sure pid hasn't been reused by another process */ snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", pid); rc = readlink(proc_path, exe_path, PATH_MAX-1); if(rc < 0) { cl_perror("Could not read from %s", proc_path); goto bail; } exe_path[rc] = 0; mypid = (unsigned long) getpid(); snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", mypid); rc = readlink(proc_path, myexe_path, PATH_MAX-1); if(rc < 0) { cl_perror("Could not read from %s", proc_path); goto bail; } myexe_path[rc] = 0; if(strcmp(exe_path, myexe_path) == 0) { running = 1; } bail: return running; } static int sbd_lock_pidfile(const char *filename) { char lf_name[256], tf_name[256], buf[LOCKSTRLEN+1]; int fd; long pid, mypid; int rc; struct stat sbuf; if (filename == NULL) { errno = EFAULT; return -1; } mypid = (unsigned long) getpid(); snprintf(lf_name, sizeof(lf_name), "%s",filename); snprintf(tf_name, sizeof(tf_name), "%s.%lu", filename, mypid); if ((fd = open(lf_name, O_RDONLY)) >= 0) { if (fstat(fd, &sbuf) >= 0 && sbuf.st_size < LOCKSTRLEN) { sleep(1); /* if someone was about to create one, * give'm a sec to do so * Though if they follow our protocol, * this won't happen. They should really * put the pid in, then link, not the * other way around. 
*/ } if (read(fd, buf, sizeof(buf)) < 1) { /* lockfile empty -> rm it and go on */; } else { if (sscanf(buf, "%ld", &pid) < 1) { /* lockfile screwed up -> rm it and go on */ } else { if (pid > 1 && (getpid() != pid) && sbd_lock_running(pid)) { /* is locked by existing process * -> give up */ close(fd); return -1; } else { /* stale lockfile -> rm it and go on */ } } } unlink(lf_name); close(fd); } if ((fd = open(tf_name, O_CREAT | O_WRONLY | O_EXCL, 0644)) < 0) { /* Hmmh, why did we fail? Anyway, nothing we can do about it */ return -3; } /* Slight overkill with the %*d format ;-) */ snprintf(buf, sizeof(buf), "%*lu\n", LOCKSTRLEN-1, mypid); if (write(fd, buf, LOCKSTRLEN) != LOCKSTRLEN) { /* Again, nothing we can do about this */ rc = -3; close(fd); goto out; } close(fd); switch (link(tf_name, lf_name)) { case 0: if (stat(tf_name, &sbuf) < 0) { /* something weird happened */ rc = -3; break; } if (sbuf.st_nlink < 2) { /* somehow, it didn't get through - NFS trouble? */ rc = -2; break; } rc = 0; break; case EEXIST: rc = -1; break; default: rc = -3; } out: unlink(tf_name); return rc; } /* * Unlock a file (remove its lockfile) * do we need to check, if its (still) ours? 
No, IMHO, if someone else * locked our line, it's his fault -tho * returns 0 on success * <0 if some failure occured */ static int sbd_unlock_pidfile(const char *filename) { char lf_name[256]; if (filename == NULL) { errno = EFAULT; return -1; } snprintf(lf_name, sizeof(lf_name), "%s", filename); return unlink(lf_name); } int cluster_alive(bool all) { int alive = 1; struct servants_list_item* s; if(servant_count == disk_count) { return 0; } for (s = servants_leader; s; s = s->next) { if (sbd_is_cluster(s) || sbd_is_pcmk(s)) { if(s->outdated) { alive = 0; } else if(all == false) { return 1; } } } return alive; } int quorum_read(int good_servants) { if (disk_count > 2) return (good_servants > disk_count/2); else return (good_servants > 0); } void inquisitor_child(void) { int sig, pid; sigset_t procmask; siginfo_t sinfo; int status; struct timespec timeout; int exiting = 0; int decoupled = 0; int cluster_appeared = 0; int pcmk_override = 0; - time_t latency; + int latency; struct timespec t_last_tickle, t_now; struct servants_list_item* s; if (debug_mode) { cl_log(LOG_ERR, "DEBUG MODE %d IS ACTIVE - DO NOT RUN IN PRODUCTION!", debug_mode); } set_proc_title("sbd: inquisitor"); if (pidfile) { if (sbd_lock_pidfile(pidfile) < 0) { exit(1); } } sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigaddset(&procmask, SIGTERM); sigaddset(&procmask, SIG_LIVENESS); sigaddset(&procmask, SIG_EXITREQ); sigaddset(&procmask, SIG_TEST); sigaddset(&procmask, SIG_PCMK_UNHEALTHY); sigaddset(&procmask, SIG_RESTART); sigaddset(&procmask, SIGUSR1); sigaddset(&procmask, SIGUSR2); sigprocmask(SIG_BLOCK, &procmask, NULL); servants_start(); timeout.tv_sec = timeout_loop; timeout.tv_nsec = 0; clock_gettime(CLOCK_MONOTONIC, &t_last_tickle); while (1) { bool tickle = 0; bool can_detach = 0; int good_servants = 0; sig = sigtimedwait(&procmask, &sinfo, &timeout); clock_gettime(CLOCK_MONOTONIC, &t_now); if (sig == SIG_EXITREQ || sig == SIGTERM) { servants_kill(); watchdog_close(true); exiting = 
1; } else if (sig == SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } else { s = lookup_servant_by_pid(pid); if (sbd_is_disk(s)) { if (WIFEXITED(status)) { switch(WEXITSTATUS(status)) { case EXIT_MD_SERVANT_IO_FAIL: DBGLOG(LOG_INFO, "Servant for %s requests to be disowned", s->devname); break; case EXIT_MD_SERVANT_REQUEST_RESET: cl_log(LOG_WARNING, "%s requested a reset", s->devname); do_reset(); break; case EXIT_MD_SERVANT_REQUEST_SHUTOFF: cl_log(LOG_WARNING, "%s requested a shutoff", s->devname); do_off(); break; case EXIT_MD_SERVANT_REQUEST_CRASHDUMP: cl_log(LOG_WARNING, "%s requested a crashdump", s->devname); do_crashdump(); break; default: break; } } } else if (sbd_is_pcmk(s)) { if (WIFEXITED(status)) { switch(WEXITSTATUS(status)) { case EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN: DBGLOG(LOG_INFO, "PCMK-Servant has exited gracefully"); /* revert to state prior to pacemaker-detection */ s->restarts = 0; s->restart_blocked = 0; cluster_appeared = 0; s->outdated = 1; s->t_last.tv_sec = 0; break; default: break; } } } cleanup_servant_by_pid(pid); } } } else if (sig == SIG_PCMK_UNHEALTHY) { s = lookup_servant_by_pid(sinfo.si_pid); if (sbd_is_cluster(s) || sbd_is_pcmk(s)) { if (s->outdated == 0) { cl_log(LOG_WARNING, "%s health check: UNHEALTHY", s->devname); } s->t_last.tv_sec = 1; } else { cl_log(LOG_WARNING, "Ignoring SIG_PCMK_UNHEALTHY from unknown source"); } } else if (sig == SIG_LIVENESS) { s = lookup_servant_by_pid(sinfo.si_pid); if (s) { s->first_start = 0; clock_gettime(CLOCK_MONOTONIC, &s->t_last); } } else if (sig == SIG_TEST) { } else if (sig == SIGUSR1) { if (exiting) continue; servants_start(); } if (exiting) { if (check_all_dead()) { if (pidfile) { sbd_unlock_pidfile(pidfile); } exit(0); } else continue; } good_servants = 0; for (s = servants_leader; s; s = s->next) { - int age = t_now.tv_sec - s->t_last.tv_sec; + int age = seconds_diff_timespec(&t_now, &(s->t_last)); if (!s->t_last.tv_sec) continue; - 
if (age < (int)(timeout_io+timeout_loop)) { + if (age < timeout_io+timeout_loop) { if (sbd_is_disk(s)) { good_servants++; } if (s->outdated) { cl_log(LOG_NOTICE, "Servant %s is healthy (age: %d)", s->devname, age); } s->outdated = 0; } else if (!s->outdated) { if (!s->restart_blocked) { cl_log(LOG_WARNING, "Servant %s is outdated (age: %d)", s->devname, age); } s->outdated = 1; } } if(disk_count == 0) { /* NO disks, everything is up to the cluster */ if(cluster_alive(true)) { /* We LIVE! */ if(cluster_appeared == false) { cl_log(LOG_INFO, "Active cluster detected"); } tickle = 1; can_detach = 1; cluster_appeared = 1; } else if(cluster_alive(false)) { if(!decoupled) { /* On the way up, detach and arm the watchdog */ cl_log(LOG_INFO, "Partial cluster detected, detaching"); } can_detach = 1; tickle = !cluster_appeared; } else if(!decoupled) { /* Stay alive until the cluster comes up */ tickle = !cluster_appeared; } } else if(disk_priority == 1 || servant_count == disk_count) { if (quorum_read(good_servants)) { /* There are disks and we're connected to the majority of them */ tickle = 1; can_detach = 1; pcmk_override = 0; } else if (servant_count > disk_count && cluster_alive(true)) { tickle = 1; if(!pcmk_override) { cl_log(LOG_WARNING, "Majority of devices lost - surviving on pacemaker"); pcmk_override = 1; /* Only log this message once */ } } } else if(cluster_alive(true) && quorum_read(good_servants)) { /* Both disk and cluster servants are healthy */ tickle = 1; can_detach = 1; cluster_appeared = 1; } else if(quorum_read(good_servants)) { /* The cluster takes priority but only once * connected for the first time. * * Until then, we tickle based on disk quorum. 
*/ can_detach = 1; tickle = !cluster_appeared; } /* cl_log(LOG_DEBUG, "Tickle: q=%d, g=%d, p=%d, s=%d", */ /* quorum_read(good_servants), good_servants, tickle, disk_count); */ if(tickle) { watchdog_tickle(); clock_gettime(CLOCK_MONOTONIC, &t_last_tickle); } if (!decoupled && can_detach) { /* We only do this at the point either the disk or * cluster servants become healthy */ cl_log(LOG_DEBUG, "Decoupling"); if (inquisitor_decouple() < 0) { servants_kill(); exiting = 1; continue; } else { decoupled = 1; } } /* Note that this can actually be negative, since we set * last_tickle after we set now. */ - latency = t_now.tv_sec - t_last_tickle.tv_sec; - if (timeout_watchdog && (latency > (int)timeout_watchdog)) { + latency = seconds_diff_timespec(&t_now, &t_last_tickle); + if (timeout_watchdog && (latency > timeout_watchdog)) { if (!decoupled) { /* We're still being watched by our * parent. We don't fence, but exit. */ cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up."); servants_kill(); exiting = 1; continue; } if (debug_mode < 2) { /* At level 2 or above, we do nothing, but expect * things to eventually return to * normal. 
*/ do_timeout_action(); } else { cl_log(LOG_ERR, "SBD: DEBUG MODE: Would have fenced due to timeout!"); } } - if (timeout_watchdog_warn && (latency > (int)timeout_watchdog_warn)) { + if (timeout_watchdog_warn && (latency > timeout_watchdog_warn)) { cl_log(LOG_WARNING, "Latency: No liveness for %ds exceeds watchdog warning timeout of %ds (healthy servants: %d)", - (int)latency, (int)timeout_watchdog_warn, good_servants); + latency, timeout_watchdog_warn, good_servants); if (debug_mode && watchdog_use) { /* In debug mode, trigger a reset before the watchdog can panic the machine */ do_timeout_action(); } } for (s = servants_leader; s; s = s->next) { - int age = t_now.tv_sec - s->t_started.tv_sec; + int age = seconds_diff_timespec(&t_now, &(s->t_started)); if (age > servant_restart_interval) { s->restarts = 0; s->restart_blocked = 0; } if (servant_restart_count && (s->restarts >= servant_restart_count) && !s->restart_blocked) { if (servant_restart_count > 1) { cl_log(LOG_WARNING, "Max retry count (%d) reached: not restarting servant for %s", (int)servant_restart_count, s->devname); } s->restart_blocked = 1; } if (!s->restart_blocked) { servant_start(s); } } } /* not reached */ exit(0); } int inquisitor(void) { int sig, pid, inquisitor_pid; int status; sigset_t procmask; siginfo_t sinfo; /* Where's the best place for sysrq init ?*/ sysrq_init(); sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigaddset(&procmask, SIG_LIVENESS); sigprocmask(SIG_BLOCK, &procmask, NULL); inquisitor_pid = make_daemon(); if (inquisitor_pid == 0) { inquisitor_child(); } /* We're the parent. Wait for a happy signal from our child * before we proceed - we either get "SIG_LIVENESS" when the * inquisitor has completed the first successful round, or * ECHLD when it exits with an error. 
*/ while (1) { sig = sigwaitinfo(&procmask, &sinfo); if (sig == SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } /* We got here because the inquisitor * did not succeed. */ return -1; } } else if (sig == SIG_LIVENESS) { /* Inquisitor started up properly. */ return 0; } else { fprintf(stderr, "Nobody expected the spanish inquisition!\n"); continue; } } /* not reached */ return -1; } int parse_device_line(const char *line) { size_t lpc = 0; size_t last = 0; size_t max = 0; int found = 0; bool skip_space = true; int space_run = 0; if (!line) { return 0; } max = strlen(line); cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", (int) max, line); for (lpc = 0; lpc <= max; lpc++) { if (isspace(line[lpc])) { if (skip_space) { last = lpc + 1; } else { space_run++; } continue; } skip_space = false; if (line[lpc] == ';' || line[lpc] == 0) { int rc = 0; char *entry = calloc(1, 1 + lpc - last); if (entry) { rc = sscanf(line + last, "%[^;]", entry); } else { fprintf(stderr, "Heap allocation failed parsing device-line.\n"); exit(1); } if (rc != 1) { cl_log(LOG_WARNING, "Could not parse: '%s'", line + last); } else { entry[strlen(entry)-space_run] = '\0'; cl_log(LOG_DEBUG, "Adding '%s'", entry); if (recruit_servant(entry, 0) != 0) { free(entry); // sbd should refuse to start if any of the configured device names is invalid. 
return -1; } found++; } free(entry); skip_space = true; last = lpc + 1; } space_run = 0; } return found; } -#define SBD_SOURCE_FILES "sbd-cluster.c,sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,setproctitle.c" +#define SBD_SOURCE_FILES "sbd-cluster.c,sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,sbd-watchdog.c,setproctitle.c" static void sbd_log_filter_ctl(const char *files, uint8_t priority) { if (files == NULL) { files = SBD_SOURCE_FILES; } qb_log_filter_ctl(QB_LOG_SYSLOG, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority); qb_log_filter_ctl(QB_LOG_STDERR, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority); } int arg_enabled(int arg_count) { return arg_count % 2; } int main(int argc, char **argv, char **envp) { int exit_status = 0; int c; int W_count = 0; int c_count = 0; int P_count = 0; int qb_facility; const char *value = NULL; bool delay_start = false; long delay = 0; char *timeout_action = NULL; if ((cmdname = strrchr(argv[0], '/')) == NULL) { cmdname = argv[0]; } else { ++cmdname; } watchdogdev = strdup("/dev/watchdog"); watchdogdev_is_default = true; qb_facility = qb_log_facility2int("daemon"); qb_log_init(cmdname, qb_facility, LOG_WARNING); sbd_set_format_string(QB_LOG_SYSLOG, "sbd"); qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE); qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE); sbd_log_filter_ctl(NULL, LOG_NOTICE); sbd_get_uname(); value = get_env_option("SBD_PACEMAKER"); if(value) { check_pcmk = crm_is_true(value); check_cluster = crm_is_true(value); has_check_pcmk_env = true; } cl_log(LOG_INFO, "SBD_PACEMAKER set to: %d (%s)", (int)check_pcmk, value?value:"default"); value = get_env_option("SBD_STARTMODE"); if(value == NULL) { } else if(strcmp(value, "clean") == 0) { start_mode = 1; } else if(strcmp(value, "always") == 0) { start_mode = 0; } cl_log(LOG_INFO, "Start mode set to: %d (%s)", (int)start_mode, value?value:"default"); value = get_env_option("SBD_WATCHDOG_DEV"); if(value) { free(watchdogdev); 
watchdogdev = strdup(value); watchdogdev_is_default = false; } /* SBD_WATCHDOG has been dropped from sbd.sysconfig example. * This is for backward compatibility. */ value = get_env_option("SBD_WATCHDOG"); if(value) { watchdog_use = crm_is_true(value); } value = get_env_option("SBD_WATCHDOG_TIMEOUT"); if(value) { timeout_watchdog = crm_get_msec(value) / 1000; } value = get_env_option("SBD_PIDFILE"); if(value) { pidfile = strdup(value); cl_log(LOG_INFO, "pidfile set to %s", pidfile); } value = get_env_option("SBD_DELAY_START"); if(value) { delay_start = crm_is_true(value); if (!delay_start) { delay = crm_get_msec(value) / 1000; if (delay > 0) { delay_start = true; } } } value = get_env_option("SBD_TIMEOUT_ACTION"); if(value) { timeout_action = strdup(value); } value = get_env_option("SBD_MOVE_TO_ROOT_CGROUP"); if(value) { move_to_root_cgroup = crm_is_true(value); if (move_to_root_cgroup) { enforce_moving_to_root_cgroup = true; } else { if (strcmp(value, "auto") == 0) { move_to_root_cgroup = true; } } } while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) { int sanitized_num_optarg = 0; /* Call it before checking optarg for NULL to make coverity happy */ const char *sanitized_optarg = sanitize_option_value(optarg); if (optarg && ((sanitized_optarg == NULL) || (strchr("SsC12345tIF", c) && (sanitized_num_optarg = sanitize_numeric_option_value(sanitized_optarg)) < 0))) { fprintf(stderr, "Invalid value \"%s\" for option -%c\n", optarg, c); exit_status = -2; goto out; } switch (c) { case 'D': break; case 'Z': debug_mode++; cl_log(LOG_INFO, "Debug mode now at level %d", (int)debug_mode); break; case 'R': skip_rt = 1; cl_log(LOG_INFO, "Realtime mode deactivated."); break; case 'S': start_mode = sanitized_num_optarg; cl_log(LOG_INFO, "Start mode set to: %d", (int)start_mode); break; case 's': timeout_startup = sanitized_num_optarg; cl_log(LOG_INFO, "Start timeout set to: %d", (int)timeout_startup); break; case 'v': debug++; if(debug == 1) { 
sbd_log_filter_ctl(NULL, LOG_INFO); cl_log(LOG_INFO, "Verbose mode enabled."); } else if(debug == 2) { sbd_log_filter_ctl(NULL, LOG_DEBUG); cl_log(LOG_INFO, "Debug mode enabled."); } else if(debug == 3) { /* Go nuts, turn on pacemaker's logging too */ sbd_log_filter_ctl("*", LOG_DEBUG); cl_log(LOG_INFO, "Debug library mode enabled."); } break; case 'T': watchdog_set_timeout = 0; cl_log(LOG_INFO, "Setting watchdog timeout disabled; using defaults."); break; case 'W': W_count++; break; case 'w': free(watchdogdev); watchdogdev = strdup(sanitized_optarg); watchdogdev_is_default = false; cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev); break; case 'd': #if SUPPORT_SHARED_DISK if (recruit_servant(sanitized_optarg, 0) != 0) { fprintf(stderr, "Invalid device: %s\n", optarg); exit_status = -1; goto out; } #else fprintf(stderr, "Shared disk functionality not supported\n"); exit_status = -2; goto out; #endif break; case 'c': c_count++; break; case 'P': P_count++; break; case 'z': disk_priority = 0; break; case 'n': local_uname = strdup(sanitized_optarg); cl_log(LOG_INFO, "Overriding local hostname to %s", local_uname); break; case 'p': pidfile = strdup(sanitized_optarg); cl_log(LOG_INFO, "pidfile set to %s", pidfile); break; case 'C': timeout_watchdog_crashdump = sanitized_num_optarg; cl_log(LOG_INFO, "Setting crashdump watchdog timeout to %d", - (int)timeout_watchdog_crashdump); + timeout_watchdog_crashdump); break; case '1': timeout_watchdog = sanitized_num_optarg; break; case '2': timeout_allocate = sanitized_num_optarg; break; case '3': timeout_loop = sanitized_num_optarg; break; case '4': timeout_msgwait = sanitized_num_optarg; break; case '5': timeout_watchdog_warn = sanitized_num_optarg; do_calculate_timeout_watchdog_warn = false; cl_log(LOG_INFO, "Setting latency warning to %d", - (int)timeout_watchdog_warn); + timeout_watchdog_warn); break; case 't': servant_restart_interval = sanitized_num_optarg; cl_log(LOG_INFO, "Setting servant restart interval to 
%d", (int)servant_restart_interval); break; case 'I': timeout_io = sanitized_num_optarg; cl_log(LOG_INFO, "Setting IO timeout to %d", (int)timeout_io); break; case 'F': servant_restart_count = sanitized_num_optarg; cl_log(LOG_INFO, "Servant restart count set to %d", (int)servant_restart_count); break; case 'r': if (timeout_action) { free(timeout_action); } timeout_action = strdup(sanitized_optarg); break; case 'h': usage(); goto out; break; default: exit_status = -2; goto out; break; } } if (disk_count == 0) { /* if we already have disks from commandline then it is probably undesirable to add those from environment (general rule cmdline has precedence) */ value = get_env_option("SBD_DEVICE"); if ((value) && strlen(value)) { #if SUPPORT_SHARED_DISK int devices = parse_device_line(value); if(devices < 1) { fprintf(stderr, "Invalid device line: %s\n", value); exit_status = -1; goto out; } #else fprintf(stderr, "Shared disk functionality not supported\n"); exit_status = -2; goto out; #endif } } if (watchdogdev == NULL || strcmp(watchdogdev, "/dev/null") == 0) { watchdog_use = 0; } else if (W_count > 0) { watchdog_use = arg_enabled(W_count); } if (watchdog_use) { cl_log(LOG_INFO, "Watchdog enabled."); } else { cl_log(LOG_INFO, "Watchdog disabled."); } if (c_count > 0) { check_cluster = arg_enabled(c_count); } if (P_count > 0) { int check_pcmk_arg = arg_enabled(P_count); if (has_check_pcmk_env && check_pcmk_arg != check_pcmk) { cl_log(LOG_WARNING, "Pacemaker integration is %s: " "SBD_PACEMAKER=%s is overridden by %s option. " "It's recommended to only use SBD_PACEMAKER.", check_pcmk_arg? "enabled" : "disabled", check_pcmk? "yes" : "no", check_pcmk_arg? 
"-P" : "-PP"); } check_pcmk = check_pcmk_arg; } if ((disk_count > 0) && (strlen(local_uname) > SECTOR_NAME_MAX)) { fprintf(stderr, "Node name mustn't be longer than %d chars.\n", SECTOR_NAME_MAX); fprintf(stderr, "If uname is longer define a name to be used by sbd.\n"); exit_status = -1; goto out; } if (disk_count > 3) { fprintf(stderr, "You can specify up to 3 devices via the -d option.\n"); exit_status = -1; goto out; } /* There must at least be one command following the options: */ if ((argc - optind) < 1) { fprintf(stderr, "Not enough arguments.\n"); exit_status = -2; goto out; } if (init_set_proc_title(argc, argv, envp) < 0) { fprintf(stderr, "Allocation of proc title failed.\n"); exit_status = -1; goto out; } if (timeout_action) { char *p[2]; int i; char c; int nrflags = sscanf(timeout_action, "%m[a-z],%m[a-z]%c", &p[0], &p[1], &c); bool parse_error = (nrflags < 1) || (nrflags > 2); for (i = 0; (i < nrflags) && (i < 2); i++) { if (!strcmp(p[i], "reboot")) { timeout_sysrq_char = 'b'; } else if (!strcmp(p[i], "crashdump")) { timeout_sysrq_char = 'c'; } else if (!strcmp(p[i], "off")) { timeout_sysrq_char = 'o'; } else if (!strcmp(p[i], "flush")) { do_flush = true; } else if (!strcmp(p[i], "noflush")) { do_flush = false; } else { parse_error = true; } free(p[i]); } if (parse_error) { fprintf(stderr, "Failed to parse timeout-action \"%s\".\n", timeout_action); exit_status = -1; goto out; } } if (strcmp(argv[optind], "watch") == 0) { value = get_env_option("SBD_SYNC_RESOURCE_STARTUP"); sync_resource_startup = crm_is_true(value?value:SBD_SYNC_RESOURCE_STARTUP_DEFAULT); #if !USE_PACEMAKERD_API if (sync_resource_startup) { fprintf(stderr, "Failed to sync resource-startup as " "SBD was built against pacemaker not supporting pacemakerd-API.\n"); exit_status = -1; goto out; } #else if (check_pcmk && !sync_resource_startup) { cl_log(LOG_WARNING, "SBD built against pacemaker supporting " "pacemakerd-API. 
Should think about enabling " "SBD_SYNC_RESOURCE_STARTUP."); } else if (!check_pcmk && sync_resource_startup) { fprintf(stderr, "Set SBD_PACEMAKER=yes to allow resource startup syncing. " "Otherwise explicitly set SBD_SYNC_RESOURCE_STARTUP=no if to intentionally disable.\n"); exit_status = -1; goto out; } #endif } #if SUPPORT_SHARED_DISK if (strcmp(argv[optind], "create") == 0) { exit_status = init_devices(servants_leader); } else if (strcmp(argv[optind], "dump") == 0) { exit_status = dump_headers(servants_leader); } else if (strcmp(argv[optind], "allocate") == 0) { exit_status = allocate_slots(argv[optind + 1], servants_leader); } else if (strcmp(argv[optind], "list") == 0) { exit_status = list_slots(servants_leader); } else if (strcmp(argv[optind], "message") == 0) { exit_status = messenger(argv[optind + 1], argv[optind + 2], servants_leader); } else if (strcmp(argv[optind], "ping") == 0) { exit_status = ping_via_slots(argv[optind + 1], servants_leader); } else #endif if (strcmp(argv[optind], "query-watchdog") == 0) { exit_status = watchdog_info(); } else if (strcmp(argv[optind], "test-watchdog") == 0) { exit_status = watchdog_test(); } else if (strcmp(argv[optind], "watch") == 0) { /* sleep $(sbd $SBD_DEVICE_ARGS dump | grep -m 1 msgwait | awk '{print $4}') 2>/dev/null */ const char *delay_source = delay ? 
"SBD_DELAY_START" : ""; #if SUPPORT_SHARED_DISK if(disk_count > 0) { /* If no devices are specified, its not an error to be unable to find one */ open_any_device(servants_leader); if (delay_start && delay <= 0) { delay = get_first_msgwait(servants_leader); if (delay > 0) { delay_source = "msgwait"; } else { cl_log(LOG_WARNING, "No 'msgwait' value from disk, using '2 * watchdog-timeout' for 'delay' starting"); } } } #endif /* Re-calculate timeout_watchdog_warn based on any timeout_watchdog from: * SBD_WATCHDOG_TIMEOUT, -1 option or on-disk setting read with open_any_device() */ if (do_calculate_timeout_watchdog_warn) { timeout_watchdog_warn = calculate_timeout_watchdog_warn(timeout_watchdog); } if (delay_start) { /* diskless mode or disk read issues causing get_first_msgwait() to return a 0 for delay */ if (delay <= 0) { delay = 2 * timeout_watchdog; delay_source = "watchdog-timeout * 2"; } cl_log(LOG_DEBUG, "Delay start (yes), (delay: %ld), (delay source: %s)", delay, delay_source); sleep((unsigned long) delay); } else { cl_log(LOG_DEBUG, "Delay start (no)"); } /* We only want this to have an effect during watch right now; * pinging and fencing would be too confused */ cl_log(LOG_INFO, "Turning on pacemaker checks: %d", check_pcmk); if (check_pcmk) { recruit_servant("pcmk", 0); #if SUPPORT_PLUGIN check_cluster = 1; #endif } cl_log(LOG_INFO, "Turning on cluster checks: %d", check_cluster); if (check_cluster) { recruit_servant("cluster", 0); } cl_log(LOG_NOTICE, "%s flush + write \'%c\' to sysrq in case of timeout", do_flush?"Do":"Skip", timeout_sysrq_char); exit_status = inquisitor(); } else { exit_status = -2; } out: if (timeout_action) { free(timeout_action); } if (exit_status < 0) { if (exit_status == -2) { usage(); } else { fprintf(stderr, "sbd failed; please check the logs.\n"); } return (1); } return (0); } diff --git a/src/sbd-md.c b/src/sbd-md.c index 2a237ad..3a2e82d 100644 --- a/src/sbd-md.c +++ b/src/sbd-md.c @@ -1,1288 +1,1286 @@ /* * Copyright (C) 2013 
Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include "sbd.h" #define SBD_MSG_EMPTY 0x00 #define SBD_MSG_TEST 0x01 #define SBD_MSG_RESET 0x02 #define SBD_MSG_OFF 0x03 #define SBD_MSG_EXIT 0x04 #define SBD_MSG_CRASHDUMP 0x05 #define SLOT_TO_SECTOR(slot) (1+slot*2) #define MBOX_TO_SECTOR(mbox) (2+mbox*2) extern int disk_count; /* These have to match the values in the header of the partition */ static char sbd_magic[8] = "SBD_SBD_"; static char sbd_version = 0x02; struct slot_msg_arg_t { const char* name; const char* msg; }; static signed char cmd2char(const char *cmd) { if (strcmp("clear", cmd) == 0) { return SBD_MSG_EMPTY; } else if (strcmp("test", cmd) == 0) { return SBD_MSG_TEST; } else if (strcmp("reset", cmd) == 0) { return SBD_MSG_RESET; } else if (strcmp("off", cmd) == 0) { return SBD_MSG_OFF; } else if (strcmp("exit", cmd) == 0) { return SBD_MSG_EXIT; } else if (strcmp("crashdump", cmd) == 0) { return SBD_MSG_CRASHDUMP; } return -1; } static const char* char2cmd(const char cmd) { switch (cmd) { case SBD_MSG_EMPTY: return "clear"; break; case SBD_MSG_TEST: return "test"; break; case SBD_MSG_RESET: return "reset"; break; case SBD_MSG_OFF: return "off"; break; case SBD_MSG_EXIT: return "exit"; break; case SBD_MSG_CRASHDUMP: return "crashdump"; break; default: return "undefined"; break; } } 
static void close_device(struct sbd_context *st) { if (!st) { return; } if (st->ioctx) { io_destroy(st->ioctx); } if (st->devfd >= 0) { close(st->devfd); } free(st->buffer); free(st); } static struct sbd_context * open_device(const char* devname, int loglevel) { struct sbd_context *st; if (!devname) return NULL; st = calloc(1, sizeof(struct sbd_context)); if (!st) { return NULL; } st->devfd = -1; if (io_setup(1, &st->ioctx) != 0) { cl_perror("io_setup failed"); goto out; } st->devfd = open(devname, O_SYNC|O_RDWR|O_DIRECT); if (st->devfd == -1) { if (loglevel == LOG_DEBUG) { DBGLOG(loglevel, "Opening device %s failed.", devname); } else { cl_log(loglevel, "Opening device %s failed.", devname); } goto out; } ioctl(st->devfd, BLKSSZGET, &sector_size); if (sector_size == 0) { cl_perror("Get sector size failed.\n"); goto out; } if (posix_memalign(&st->buffer, sector_size, sector_size)) { cl_perror("Couldn't allocate sector-buffer."); goto out; } return st; out: close_device(st); return NULL; } static void * sector_alloc(void) { void *x; x = calloc(1, sector_size); if (!x) { exit(1); } return x; } static int sector_io(struct sbd_context *st, int sector, void *data, int rw) { struct timespec timeout; struct io_event event; struct iocb *ios[1] = { &st->io }; - long r; + int r; timeout.tv_sec = timeout_io; timeout.tv_nsec = 0; memset(&st->io, 0, sizeof(struct iocb)); if (rw) { memcpy(st->buffer, data, sector_size); io_prep_pwrite(&st->io, st->devfd, st->buffer, sector_size, (long long) sector_size * sector); } else { memset(st->buffer, 0, sector_size); io_prep_pread(&st->io, st->devfd, st->buffer, sector_size, (long long) sector_size * sector); } if (io_submit(st->ioctx, 1, ios) != 1) { cl_log(LOG_ERR, "Failed to submit IO request! 
(rw=%d)", rw); return -1; } errno = 0; r = io_getevents(st->ioctx, 1L, 1L, &event, &timeout); if (r < 0 ) { cl_log(LOG_ERR, "Failed to retrieve IO events (rw=%d)", rw); return -1; - } else if (r < 1L) { - cl_log(LOG_INFO, "Cancelling IO request due to timeout (rw=%d, r=%ld)", rw, r); + } else if (r < 1) { + cl_log(LOG_INFO, "Cancelling IO request due to timeout (rw=%d, r=%d)", rw, r); r = io_cancel(st->ioctx, ios[0], &event); if (r) { DBGLOG(LOG_INFO, "Could not cancel IO request (rw=%d)", rw); /* Doesn't really matter, debugging information. */ } return -1; - } else if (r > 1L) { - cl_log(LOG_ERR, "More than one IO was returned (r=%ld)", r); + } else if (r > 1) { + cl_log(LOG_ERR, "More than one IO was returned (r=%d)", r); return -1; } /* IO is happy */ if (event.res == sector_size) { if (!rw) { memcpy(data, st->buffer, sector_size); } return 0; } else { cl_log(LOG_ERR, "Short IO (rw=%d, res=%lu, sector_size=%d)", rw, event.res, sector_size); return -1; } } static int sector_write(struct sbd_context *st, int sector, void *data) { return sector_io(st, sector, data, 1); } static int sector_read(struct sbd_context *st, int sector, void *data) { return sector_io(st, sector, data, 0); } static int slot_read(struct sbd_context *st, int slot, struct sector_node_s *s_node) { return sector_read(st, SLOT_TO_SECTOR(slot), s_node); } static int slot_write(struct sbd_context *st, int slot, struct sector_node_s *s_node) { return sector_write(st, SLOT_TO_SECTOR(slot), s_node); } static int mbox_write(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox) { return sector_write(st, MBOX_TO_SECTOR(mbox), s_mbox); } static int mbox_read(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox) { return sector_read(st, MBOX_TO_SECTOR(mbox), s_mbox); } static int mbox_write_verify(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox) { void *data; int rc = 0; if (sector_write(st, MBOX_TO_SECTOR(mbox), s_mbox) < 0) return -1; data = sector_alloc(); if 
(sector_read(st, MBOX_TO_SECTOR(mbox), data) < 0) { rc = -1; goto out; } if (memcmp(s_mbox, data, sector_size) != 0) { cl_log(LOG_ERR, "Write verification failed!"); rc = -1; goto out; } rc = 0; out: free(data); return rc; } static int header_write(struct sbd_context *st, struct sector_header_s *s_header) { s_header->sector_size = htonl(s_header->sector_size); s_header->timeout_watchdog = htonl(s_header->timeout_watchdog); s_header->timeout_allocate = htonl(s_header->timeout_allocate); s_header->timeout_loop = htonl(s_header->timeout_loop); s_header->timeout_msgwait = htonl(s_header->timeout_msgwait); return sector_write(st, 0, s_header); } static int header_read(struct sbd_context *st, struct sector_header_s *s_header) { if (sector_read(st, 0, s_header) < 0) return -1; s_header->sector_size = ntohl(s_header->sector_size); s_header->timeout_watchdog = ntohl(s_header->timeout_watchdog); s_header->timeout_allocate = ntohl(s_header->timeout_allocate); s_header->timeout_loop = ntohl(s_header->timeout_loop); s_header->timeout_msgwait = ntohl(s_header->timeout_msgwait); /* This sets the global defaults: */ timeout_watchdog = s_header->timeout_watchdog; timeout_allocate = s_header->timeout_allocate; timeout_loop = s_header->timeout_loop; timeout_msgwait = s_header->timeout_msgwait; return 0; } static int valid_header(const struct sector_header_s *s_header) { if (memcmp(s_header->magic, sbd_magic, sizeof(s_header->magic)) != 0) { cl_log(LOG_ERR, "Header magic does not match."); return -1; } if (s_header->version != sbd_version) { cl_log(LOG_ERR, "Header version does not match."); return -1; } if (s_header->sector_size != sector_size) { cl_log(LOG_ERR, "Header sector size does not match."); return -1; } return 0; } static struct sector_header_s * header_get(struct sbd_context *st) { struct sector_header_s *s_header; s_header = sector_alloc(); if (header_read(st, s_header) < 0) { cl_log(LOG_ERR, "Unable to read header from device %d", st->devfd); free(s_header); return NULL; 
} if (valid_header(s_header) < 0) { cl_log(LOG_ERR, "header on device %d is not valid.", st->devfd); free(s_header); return NULL; } /* cl_log(LOG_INFO, "Found version %d header with %d slots", s_header->version, s_header->slots); */ return s_header; } static int header_dump(struct sbd_context *st) { struct sector_header_s *s_header; char uuid[37]; s_header = header_get(st); if (s_header == NULL) return -1; printf("Header version : %u.%u\n", s_header->version, s_header->minor_version); if (s_header->minor_version > 0) { uuid_unparse_lower(s_header->uuid, uuid); printf("UUID : %s\n", uuid); } printf("Number of slots : %u\n", s_header->slots); printf("Sector size : %lu\n", (unsigned long)s_header->sector_size); printf("Timeout (watchdog) : %lu\n", (unsigned long)s_header->timeout_watchdog); printf("Timeout (allocate) : %lu\n", (unsigned long)s_header->timeout_allocate); printf("Timeout (loop) : %lu\n", (unsigned long)s_header->timeout_loop); printf("Timeout (msgwait) : %lu\n", (unsigned long)s_header->timeout_msgwait); free(s_header); return 0; } static int init_device(struct sbd_context *st) { struct sector_header_s *s_header; struct sector_node_s *s_node; struct sector_mbox_s *s_mbox; char uuid[37]; int i; int rc = 0; s_header = sector_alloc(); s_node = sector_alloc(); s_mbox = sector_alloc(); memcpy(s_header->magic, sbd_magic, sizeof(s_header->magic)); s_header->version = sbd_version; s_header->slots = 255; s_header->sector_size = sector_size; s_header->timeout_watchdog = timeout_watchdog; s_header->timeout_allocate = timeout_allocate; s_header->timeout_loop = timeout_loop; s_header->timeout_msgwait = timeout_msgwait; s_header->minor_version = 1; uuid_generate(s_header->uuid); uuid_unparse_lower(s_header->uuid, uuid); cl_log(LOG_INFO, "Creating version %d.%d header on device %d (uuid: %s)", s_header->version, s_header->minor_version, st->devfd, uuid); fprintf(stdout, "Creating version %d.%d header on device %d (uuid: %s)\n", s_header->version, 
s_header->minor_version, st->devfd, uuid); if (header_write(st, s_header) < 0) { rc = -1; goto out; } cl_log(LOG_INFO, "Initializing %d slots on device %d", s_header->slots, st->devfd); fprintf(stdout, "Initializing %d slots on device %d\n", s_header->slots, st->devfd); for (i=0;i < s_header->slots;i++) { if (slot_write(st, i, s_node) < 0) { rc = -1; goto out; } if (mbox_write(st, i, s_mbox) < 0) { rc = -1; goto out; } } out: free(s_mbox); free(s_node); free(s_header); return(rc); } /* Check if there already is a slot allocated to said name; returns the * slot number. If not found, returns -1. * This is necessary because slots might not be continuous. */ static int slot_lookup(struct sbd_context *st, const struct sector_header_s *s_header, const char *name) { struct sector_node_s *s_node = NULL; int i; int rc = -1; if (!name) { cl_log(LOG_ERR, "slot_lookup(): No name specified.\n"); goto out; } s_node = sector_alloc(); for (i=0; i < s_header->slots; i++) { if (slot_read(st, i, s_node) < 0) { rc = -2; goto out; } if (s_node->in_use != 0) { if (strncasecmp(s_node->name, name, SECTOR_NAME_MAX) == 0) { DBGLOG(LOG_INFO, "%s owns slot %d", name, i); rc = i; goto out; } } } out: free(s_node); return rc; } static int slot_unused(struct sbd_context *st, const struct sector_header_s *s_header) { struct sector_node_s *s_node; int i; int rc = -1; s_node = sector_alloc(); for (i=0; i < s_header->slots; i++) { if (slot_read(st, i, s_node) < 0) { rc = -1; goto out; } if (s_node->in_use == 0) { rc = i; goto out; } } out: free(s_node); return rc; } static int slot_allocate(struct sbd_context *st, const char *name) { struct sector_header_s *s_header = NULL; struct sector_node_s *s_node = NULL; struct sector_mbox_s *s_mbox = NULL; int i; int rc = 0; if (!name) { cl_log(LOG_ERR, "slot_allocate(): No name specified.\n"); fprintf(stderr, "slot_allocate(): No name specified.\n"); rc = -1; goto out; } s_header = header_get(st); if (!s_header) { rc = -1; goto out; } s_node = 
sector_alloc(); s_mbox = sector_alloc(); while (1) { i = slot_lookup(st, s_header, name); if ((i >= 0) || (i == -2)) { /* -1 is "no slot found", in which case we * proceed to allocate a new one. * -2 is "read error during lookup", in which * case we error out too * >= 0 is "slot already allocated" */ rc = i; goto out; } i = slot_unused(st, s_header); if (i >= 0) { cl_log(LOG_INFO, "slot %d is unused - trying to own", i); fprintf(stdout, "slot %d is unused - trying to own\n", i); memset(s_node, 0, sizeof(*s_node)); s_node->in_use = 1; strncpy(s_node->name, name, SECTOR_NAME_MAX); if (slot_write(st, i, s_node) < 0) { rc = -1; goto out; } sleep(timeout_allocate); } else { cl_log(LOG_ERR, "No more free slots."); fprintf(stderr, "No more free slots.\n"); rc = -1; goto out; } } out: free(s_mbox); free(s_node); free(s_header); return(rc); } static int slot_list(struct sbd_context *st) { struct sector_header_s *s_header = NULL; struct sector_node_s *s_node = NULL; struct sector_mbox_s *s_mbox = NULL; int i; int rc = 0; s_header = header_get(st); if (!s_header) { rc = -1; goto out; } s_node = sector_alloc(); s_mbox = sector_alloc(); for (i=0; i < s_header->slots; i++) { if (slot_read(st, i, s_node) < 0) { rc = -1; goto out; } if (s_node->in_use > 0) { if (mbox_read(st, i, s_mbox) < 0) { rc = -1; goto out; } printf("%d\t%s\t%s\t%s\n", i, s_node->name, char2cmd(s_mbox->cmd), s_mbox->from); } } out: free(s_mbox); free(s_node); free(s_header); return rc; } static int slot_msg(struct sbd_context *st, const char *name, const char *cmd) { struct sector_header_s *s_header = NULL; struct sector_mbox_s *s_mbox = NULL; int mbox; int rc = 0; char uuid[37]; if (!name || !cmd) { cl_log(LOG_ERR, "slot_msg(): No recipient / cmd specified.\n"); rc = -1; goto out; } s_header = header_get(st); if (!s_header) { rc = -1; goto out; } if (strcmp(name, "LOCAL") == 0) { name = local_uname; } if (s_header->minor_version > 0) { uuid_unparse_lower(s_header->uuid, uuid); cl_log(LOG_INFO, "Device UUID: 
%s", uuid); } mbox = slot_lookup(st, s_header, name); if (mbox < 0) { cl_log(LOG_ERR, "slot_msg(): No slot found for %s.", name); rc = -1; goto out; } s_mbox = sector_alloc(); s_mbox->cmd = cmd2char(cmd); if (s_mbox->cmd < 0) { cl_log(LOG_ERR, "slot_msg(): Invalid command %s.", cmd); rc = -1; goto out; } strncpy(s_mbox->from, local_uname, SECTOR_NAME_MAX); cl_log(LOG_INFO, "Writing %s to node slot %s", cmd, name); if (mbox_write_verify(st, mbox, s_mbox) < -1) { rc = -1; goto out; } if (strcasecmp(cmd, "exit") != 0) { cl_log(LOG_INFO, "Messaging delay: %d", (int)timeout_msgwait); sleep(timeout_msgwait); } cl_log(LOG_INFO, "%s successfully delivered to %s", cmd, name); out: free(s_mbox); free(s_header); return rc; } static int slot_ping(struct sbd_context *st, const char *name) { struct sector_header_s *s_header = NULL; struct sector_mbox_s *s_mbox = NULL; int mbox; int waited = 0; int rc = 0; if (!name) { cl_log(LOG_ERR, "slot_ping(): No recipient specified.\n"); rc = -1; goto out; } s_header = header_get(st); if (!s_header) { rc = -1; goto out; } if (strcmp(name, "LOCAL") == 0) { name = local_uname; } mbox = slot_lookup(st, s_header, name); if (mbox < 0) { cl_log(LOG_ERR, "slot_msg(): No slot found for %s.", name); rc = -1; goto out; } s_mbox = sector_alloc(); s_mbox->cmd = SBD_MSG_TEST; strncpy(s_mbox->from, local_uname, SECTOR_NAME_MAX); DBGLOG(LOG_DEBUG, "Pinging node %s", name); if (mbox_write(st, mbox, s_mbox) < -1) { rc = -1; goto out; } rc = -1; while (waited <= timeout_msgwait) { if (mbox_read(st, mbox, s_mbox) < 0) break; if (s_mbox->cmd != SBD_MSG_TEST) { rc = 0; break; } sleep(1); waited++; } if (rc == 0) { cl_log(LOG_DEBUG, "%s successfully pinged.", name); } else { cl_log(LOG_ERR, "%s failed to ping.", name); } out: free(s_mbox); free(s_header); return rc; } int init_devices(struct servants_list_item *servants) { int rc = 0; struct sbd_context *st; struct servants_list_item *s; for (s = servants; s; s = s->next) { fprintf(stdout, "Initializing device 
%s\n", s->devname); st = open_device(s->devname, LOG_ERR); if (!st) { return -1; } rc = init_device(st); close_device(st); if (rc == -1) { fprintf(stderr, "Failed to init device %s\n", s->devname); return rc; } fprintf(stdout, "Device %s is initialized.\n", s->devname); } fprintf(stdout, "Did you check sbd service down on all nodes before? If not do so now and restart afterwards.\n"); return 0; } static int slot_msg_wrapper(const char* devname, int mode, const void* argp) { int rc = 0; struct sbd_context *st; const struct slot_msg_arg_t* arg = (const struct slot_msg_arg_t*)argp; st = open_device(devname, LOG_WARNING); if (!st) return -1; cl_log(LOG_INFO, "Delivery process handling %s", devname); rc = slot_msg(st, arg->name, arg->msg); close_device(st); return rc; } static int slot_ping_wrapper(const char* devname, int mode, const void* argp) { int rc = 0; const char* name = (const char*)argp; struct sbd_context *st; st = open_device(devname, LOG_WARNING); if (!st) return -1; rc = slot_ping(st, name); close_device(st); return rc; } int allocate_slots(const char *name, struct servants_list_item *servants) { int rc = 0; struct sbd_context *st; struct servants_list_item *s; for (s = servants; s; s = s->next) { fprintf(stdout, "Trying to allocate slot for %s on device %s.\n", name, s->devname); st = open_device(s->devname, LOG_WARNING); if (!st) { return -1; } rc = slot_allocate(st, name); close_device(st); if (rc < 0) return rc; fprintf(stdout, "Slot for %s has been allocated on %s.\n", name, s->devname); } return 0; } int list_slots(struct servants_list_item *servants) { int rc = 0; struct servants_list_item *s; struct sbd_context *st; for (s = servants; s; s = s->next) { int rv = 0; st = open_device(s->devname, LOG_WARNING); if (!st) { rc = -1; fprintf(stderr, "== disk %s unreadable!\n", s->devname); continue; } rv = slot_list(st); close_device(st); if (rv == -1) { rc = -1; fprintf(stderr, "== Slots on disk %s NOT dumped\n", s->devname); } } return rc; } int 
ping_via_slots(const char *name, struct servants_list_item *servants) { int sig = 0; pid_t pid = 0; int status = 0; int servants_finished = 0; sigset_t procmask; siginfo_t sinfo; struct servants_list_item *s; sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigprocmask(SIG_BLOCK, &procmask, NULL); for (s = servants; s; s = s->next) { if(sbd_is_disk(s)) { s->pid = assign_servant(s->devname, &slot_ping_wrapper, 0, (const void*)name); } } while (servants_finished < disk_count) { sig = sigwaitinfo(&procmask, &sinfo); if (sig == SIGCHLD) { while ((pid = wait(&status))) { if (pid == -1 && errno == ECHILD) { break; } else { s = lookup_servant_by_pid(pid); if (sbd_is_disk(s)) { servants_finished++; } } } } } return 0; } int quorum_write(int good_servants) { return (good_servants > disk_count/2); } int messenger(const char *name, const char *msg, struct servants_list_item *servants) { int sig = 0; pid_t pid = 0; int status = 0; int servants_finished = 0; int successful_delivery = 0; sigset_t procmask; siginfo_t sinfo; struct servants_list_item *s; struct slot_msg_arg_t slot_msg_arg = {name, msg}; sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigprocmask(SIG_BLOCK, &procmask, NULL); for (s = servants; s; s = s->next) { s->pid = assign_servant(s->devname, &slot_msg_wrapper, 0, &slot_msg_arg); } while (!(quorum_write(successful_delivery) || (servants_finished == disk_count))) { sig = sigwaitinfo(&procmask, &sinfo); if (sig == SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } else { servants_finished++; if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { DBGLOG(LOG_INFO, "Process %d succeeded.", (int)pid); successful_delivery++; } else { cl_log(LOG_WARNING, "Process %d failed to deliver!", (int)pid); } } } } } if (quorum_write(successful_delivery)) { cl_log(LOG_INFO, "Message successfully delivered."); return 0; } else { cl_log(LOG_ERR, "Message is not delivered via more then a half of devices"); return -1; } 
} unsigned long get_first_msgwait(struct servants_list_item *servants) { unsigned long msgwait = 0; struct servants_list_item *s = servants; for (s = servants; s; s = s->next) { struct sbd_context *st; struct sector_header_s *s_header; st = open_device(s->devname, LOG_WARNING); if (!st) { continue; } s_header = header_get(st); if (s_header != NULL) { msgwait = (unsigned long)s_header->timeout_msgwait; close_device(st); free(s_header); return msgwait; } close_device(st); } return msgwait; } int dump_headers(struct servants_list_item *servants) { int rc = 0; struct servants_list_item *s = servants; struct sbd_context *st; for (s = servants; s; s = s->next) { int rv; fprintf(stdout, "==Dumping header on disk %s\n", s->devname); st = open_device(s->devname, LOG_WARNING); if (st) { rv = header_dump(st); close_device(st); } else { fprintf(stderr, "== disk %s unreadable!\n", s->devname); rv = -1; } if (rv == -1) { rc = -1; fprintf(stderr, "==Header on disk %s NOT dumped\n", s->devname); } else { fprintf(stdout, "==Header on disk %s is dumped\n", s->devname); } } return rc; } void open_any_device(struct servants_list_item *servants) { struct sector_header_s *hdr_cur = NULL; struct timespec t_0; int t_wait = 0; bool logged_once = false; clock_gettime(CLOCK_MONOTONIC, &t_0); while (!hdr_cur && t_wait < timeout_startup) { struct timespec t_now; struct servants_list_item* s; for (s = servants; s; s = s->next) { struct sbd_context *st = open_device(s->devname, LOG_DEBUG); if (!st) { if (logged_once == false) { cl_log(LOG_WARNING, "Failed to open %s. " "Trying any other configured devices, " "otherwise retrying every %ds within %ds", s->devname, timeout_loop, timeout_startup); logged_once = true; } continue; } hdr_cur = header_get(st); close_device(st); if (hdr_cur) { break; } else { if (logged_once == false) { cl_log(LOG_WARNING, "Failed to read header from %s. 
" "Trying any other configured devices, " "otherwise retrying every %ds within %ds", s->devname, timeout_loop, timeout_startup); logged_once = true; } } } clock_gettime(CLOCK_MONOTONIC, &t_now); - t_wait = t_now.tv_sec - t_0.tv_sec; + t_wait = seconds_diff_timespec(&t_now, &t_0); if (!hdr_cur) { sleep(timeout_loop); } } if (hdr_cur) { timeout_watchdog = hdr_cur->timeout_watchdog; timeout_allocate = hdr_cur->timeout_allocate; timeout_loop = hdr_cur->timeout_loop; timeout_msgwait = hdr_cur->timeout_msgwait; } else { cl_log(LOG_ERR, "No devices were available at start-up within %i seconds.", timeout_startup); exit(1); } free(hdr_cur); return; } /* ::-::-::-::-::-::-::-::-::-::-::-::-:: Begin disk based servant code ::-::-::-::-::-::-::-::-::-::-::-::-:: */ static int servant_check_timeout_inconsistent(struct sector_header_s *hdr) { if (timeout_watchdog != hdr->timeout_watchdog) { cl_log(LOG_WARNING, "watchdog timeout: %d versus %d on this device", - (int)timeout_watchdog, (int)hdr->timeout_watchdog); + timeout_watchdog, (int)hdr->timeout_watchdog); return -1; } if (timeout_allocate != hdr->timeout_allocate) { cl_log(LOG_WARNING, "allocate timeout: %d versus %d on this device", (int)timeout_allocate, (int)hdr->timeout_allocate); return -1; } if (timeout_loop != hdr->timeout_loop) { cl_log(LOG_WARNING, "loop timeout: %d versus %d on this device", (int)timeout_loop, (int)hdr->timeout_loop); return -1; } if (timeout_msgwait != hdr->timeout_msgwait) { cl_log(LOG_WARNING, "msgwait timeout: %d versus %d on this device", (int)timeout_msgwait, (int)hdr->timeout_msgwait); return -1; } return 0; } int servant_md(const char *diskname, int mode, const void* argp) { struct sector_mbox_s *s_mbox = NULL; struct sector_node_s *s_node = NULL; struct sector_header_s *s_header = NULL; int mbox; int rc = 0; - time_t t0, t1, latency; - union sigval signal_value; + time_t t0, t1; + int latency; sigset_t servant_masks; struct sbd_context *st; pid_t ppid; char uuid[37]; const struct 
servants_list_item *s = argp; cl_log(LOG_INFO, "Servant starting for device %s", diskname); /* Block most of the signals */ sigfillset(&servant_masks); sigdelset(&servant_masks, SIGKILL); sigdelset(&servant_masks, SIGFPE); sigdelset(&servant_masks, SIGILL); sigdelset(&servant_masks, SIGSEGV); sigdelset(&servant_masks, SIGBUS); sigdelset(&servant_masks, SIGALRM); /* FIXME: check error */ sigprocmask(SIG_SETMASK, &servant_masks, NULL); st = open_device(diskname, LOG_WARNING); if (!st) { exit(EXIT_MD_SERVANT_IO_FAIL); } s_header = header_get(st); if (!s_header) { cl_log(LOG_ERR, "Not a valid header on %s", diskname); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (servant_check_timeout_inconsistent(s_header) < 0) { cl_log(LOG_ERR, "Timeouts on %s do not match first device", diskname); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (s_header->minor_version > 0) { uuid_unparse_lower(s_header->uuid, uuid); cl_log(LOG_INFO, "Device %s uuid: %s", diskname, uuid); } mbox = slot_allocate(st, local_uname); if (mbox < 0) { cl_log(LOG_ERR, "No slot allocated, and automatic allocation failed for disk %s.", diskname); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } s_node = sector_alloc(); if (slot_read(st, mbox, s_node) < 0) { cl_log(LOG_ERR, "Unable to read node entry on %s", diskname); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } cl_log(LOG_NOTICE, "Monitoring slot %d on disk %s", mbox, diskname); if (s_header->minor_version == 0) { set_proc_title("sbd: watcher: %s - slot: %d", diskname, mbox); } else { set_proc_title("sbd: watcher: %s - slot: %d - uuid: %s", diskname, mbox, uuid); } s_mbox = sector_alloc(); if (s->first_start) { if (mode > 0) { if (mbox_read(st, mbox, s_mbox) < 0) { cl_log(LOG_ERR, "mbox read failed during start-up in servant."); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (s_mbox->cmd != SBD_MSG_EXIT && s_mbox->cmd != SBD_MSG_EMPTY) { /* Not a clean stop. Abort start-up */ cl_log(LOG_WARNING, "Found fencing message - aborting start-up. 
Manual intervention required!"); ppid = getppid(); - sigqueue(ppid, SIG_EXITREQ, signal_value); + sigqueue_zero(ppid, SIG_EXITREQ); rc = 0; goto out; } } DBGLOG(LOG_INFO, "First servant start - zeroing inbox"); memset(s_mbox, 0, sizeof(*s_mbox)); if (mbox_write(st, mbox, s_mbox) < 0) { rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } } - memset(&signal_value, 0, sizeof(signal_value)); - while (1) { struct sector_header_s *s_header_retry = NULL; struct sector_node_s *s_node_retry = NULL; t0 = time(NULL); sleep(timeout_loop); ppid = getppid(); if (ppid == 1) { /* Our parent died unexpectedly. Triggering * self-fence. */ do_timeout_action(); } /* These attempts are, by definition, somewhat racy. If * the device is wiped out or corrupted between here and * us reading our mbox, there is nothing we can do about * that. But at least we tried. */ s_header_retry = header_get(st); if (!s_header_retry) { cl_log(LOG_ERR, "No longer found a valid header on %s", diskname); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (memcmp(s_header, s_header_retry, sizeof(*s_header)) != 0) { cl_log(LOG_ERR, "Header on %s changed since start-up!", diskname); free(s_header_retry); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } free(s_header_retry); s_node_retry = sector_alloc(); if (slot_read(st, mbox, s_node_retry) < 0) { cl_log(LOG_ERR, "slot read failed in servant."); free(s_node_retry); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (memcmp(s_node, s_node_retry, sizeof(*s_node)) != 0) { cl_log(LOG_ERR, "Node entry on %s changed since start-up!", diskname); free(s_node_retry); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } free(s_node_retry); if (mbox_read(st, mbox, s_mbox) < 0) { cl_log(LOG_ERR, "mbox read failed in servant."); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (s_mbox->cmd > 0) { cl_log(LOG_NOTICE, "Received command %s from %s on disk %s", char2cmd(s_mbox->cmd), s_mbox->from, diskname); switch (s_mbox->cmd) { case SBD_MSG_TEST: memset(s_mbox, 0, sizeof(*s_mbox)); mbox_write(st, mbox, s_mbox); - 
sigqueue(ppid, SIG_TEST, signal_value); + sigqueue_zero(ppid, SIG_TEST); break; case SBD_MSG_RESET: rc = EXIT_MD_SERVANT_REQUEST_RESET; goto out; case SBD_MSG_OFF: rc = EXIT_MD_SERVANT_REQUEST_SHUTOFF; goto out; case SBD_MSG_EXIT: - sigqueue(ppid, SIG_EXITREQ, signal_value); + sigqueue_zero(ppid, SIG_EXITREQ); break; case SBD_MSG_CRASHDUMP: rc = EXIT_MD_SERVANT_REQUEST_CRASHDUMP; goto out; default: /* FIXME: An "unknown" message might result from a partial write. log it and clear the slot. */ cl_log(LOG_ERR, "Unknown message on disk %s", diskname); memset(s_mbox, 0, sizeof(*s_mbox)); mbox_write(st, mbox, s_mbox); break; } } - sigqueue(ppid, SIG_LIVENESS, signal_value); + sigqueue_zero(ppid, SIG_LIVENESS); t1 = time(NULL); - latency = t1 - t0; + latency = seconds_diff_time_t(t1, t0); if (timeout_watchdog_warn && (latency > timeout_watchdog_warn)) { cl_log(LOG_WARNING, "Latency: %ds exceeded watchdog warning timeout %ds on disk %s", - (int)latency, (int)timeout_watchdog_warn, + latency, timeout_watchdog_warn, diskname); } else if (debug) { - DBGLOG(LOG_DEBUG, "Latency: %ds on disk %s", (int)latency, + DBGLOG(LOG_DEBUG, "Latency: %ds on disk %s", latency, diskname); } } out: free(s_node); free(s_mbox); free(s_header); close_device(st); exit(rc); } diff --git a/src/sbd-watchdog.c b/src/sbd-watchdog.c new file mode 100644 index 0000000..b0dd5d6 --- /dev/null +++ b/src/sbd-watchdog.c @@ -0,0 +1,601 @@ +/* + * Copyright (C) 2013 Lars Marowsky-Bree + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "sbd.h" +#ifdef __GLIBC__ +#include +#endif +#include +#include + +/* possibly tunable defaults regarding watchdog operation + are found in sbd-common.c + */ + +/* Global, non-tunable variables: */ +int watchdogfd = -1; +char *watchdogdev = NULL; +bool watchdogdev_is_default = false; +bool do_calculate_timeout_watchdog_warn = true; +int timeout_watchdog_warn = + calculate_timeout_watchdog_warn(SBD_WATCHDOG_TIMEOUT_DEFAULT); + +#define MAX_WATCHDOGS 64 +#define SYS_CLASS_WATCHDOG "/sys/class/watchdog" +#define SYS_CHAR_DEV_DIR "/sys/dev/char" +#define WATCHDOG_NODEDIR "/dev/" + +static bool +is_watchdog(dev_t device) +{ + static int num_watchdog_devs = 0; + static dev_t watchdog_devs[MAX_WATCHDOGS]; + struct dirent *entry; + int i; + + /* populate on first call */ + if (num_watchdog_devs == 0) { + DIR *dp; + + watchdog_devs[0] = makedev(10,130); + num_watchdog_devs = 1; + + /* get additional devices from /sys/class/watchdog */ + dp = opendir(SYS_CLASS_WATCHDOG); + if (dp) { + while ((entry = readdir(dp))) { + if (entry->d_type == DT_LNK) { + FILE *file; + char entry_name[NAME_MAX+sizeof(SYS_CLASS_WATCHDOG)+5]; + + snprintf(entry_name, sizeof(entry_name), + SYS_CLASS_WATCHDOG "/%s/dev", entry->d_name); + file = fopen(entry_name, "r"); + if (file) { + int major, minor; + + if (fscanf(file, "%d:%d", &major, &minor) == 2) { + watchdog_devs[num_watchdog_devs++] = + makedev(major, minor); + } + fclose(file); + if (num_watchdog_devs == MAX_WATCHDOGS) { + break; + } + } + } + } + closedir(dp); + } + } + + for (i=0; i < num_watchdog_devs; i++) { + if (device == watchdog_devs[i]) { + return true; + } + } + return false; +} + +static int +watchdog_init_interval_fd(int wdfd, int timeout) +{ + if (ioctl(wdfd, WDIOC_SETTIMEOUT, &timeout) < 0) { + 
cl_perror( "WDIOC_SETTIMEOUT" + ": Failed to set watchdog timer to %d seconds.", + timeout); + cl_log(LOG_CRIT, "Please validate your watchdog configuration!"); + cl_log(LOG_CRIT, "Choose a different watchdog driver or specify " + "-T to skip this if you are completely sure."); + return -1; + } + return 0; +} + +int +watchdog_init_interval(void) +{ + if (watchdogfd < 0) { + return 0; + } + + if (watchdog_set_timeout == 0) { + cl_log(LOG_INFO, + "NOT setting watchdog timeout on explicit user request!"); + return 0; + } + + if (watchdog_init_interval_fd(watchdogfd, timeout_watchdog) < 0) { + return -1; + } + cl_log(LOG_INFO, "Set watchdog timeout to %d seconds.", timeout_watchdog); + return 0; +} + +static int +watchdog_tickle_fd(int wdfd, char *wddev) +{ + if (write(wdfd, "", 1) != 1) { + cl_perror("Watchdog write failure: %s!", wddev); + return -1; + } + return 0; +} + +int +watchdog_tickle(void) +{ + if (watchdogfd >= 0) { + return watchdog_tickle_fd(watchdogfd, watchdogdev); + } + return 0; +} + +static int +watchdog_init_fd(char *wddev, int timeout) +{ + int wdfd; + + wdfd = open(wddev, O_WRONLY); + if (wdfd >= 0) { + if (((timeout >= 0) && + (watchdog_init_interval_fd(wdfd, timeout) < 0)) || + (watchdog_tickle_fd(wdfd, wddev) < 0)) { + close(wdfd); + return -1; + } + } else { + struct stat statbuf; + + if(!stat(wddev, &statbuf) && S_ISCHR(statbuf.st_mode) && + is_watchdog(statbuf.st_rdev)) { + cl_perror("Cannot open watchdog device '%s'", wddev); + } else { + cl_perror("Seems as if '%s' isn't a valid watchdog-device", wddev); + } + return -1; + } + return wdfd; +} + +int +watchdog_init(void) +{ + if (watchdogfd < 0 && watchdogdev != NULL) { + int timeout = timeout_watchdog; + + if (watchdog_set_timeout == 0) { + cl_log(LOG_INFO, + "NOT setting watchdog timeout on explicit user request!"); + timeout = -1; + } + watchdogfd = watchdog_init_fd(watchdogdev, timeout); + if (watchdogfd >= 0) { + cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev); + if 
(watchdog_set_timeout) { + cl_log(LOG_INFO, "Set watchdog timeout to %d seconds.", + timeout_watchdog); + } + } else { + return -1; + } + } + return 0; +} + +static void +watchdog_close_fd(int wdfd, char *wddev, bool disarm) +{ + if (disarm) { + int r; + int flags = WDIOS_DISABLECARD;; + + /* Explicitly disarm it */ + r = ioctl(wdfd, WDIOC_SETOPTIONS, &flags); + if (r < 0) { + cl_perror("Failed to disable hardware watchdog %s", wddev); + } + + /* To be sure, use magic close logic, too */ + for (;;) { + if (write(wdfd, "V", 1) > 0) { + break; + } + cl_perror("Cannot disable watchdog device %s", wddev); + } + } + + if (close(wdfd) < 0) { + cl_perror("Watchdog close(%d) failed", wdfd); + } +} + +void +watchdog_close(bool disarm) +{ + if (watchdogfd < 0) { + return; + } + + watchdog_close_fd(watchdogfd, watchdogdev, disarm); + watchdogfd = -1; +} + +struct watchdog_list_item { + dev_t dev; + char *dev_node; + char *dev_ident; + char *dev_driver; + pid_t busy_pid; + char *busy_name; + struct watchdog_list_item *next; +}; + +struct link_list_item { + char *dev_node; + char *link_name; + struct link_list_item *next; +}; + +static struct watchdog_list_item *watchdog_list = NULL; +static int watchdog_list_items = 0; + +static void +watchdog_populate_list(void) +{ + struct dirent *entry; + char entry_name[sizeof(WATCHDOG_NODEDIR)+NAME_MAX]; + DIR *dp; + char buf[NAME_MAX+sizeof(WATCHDOG_NODEDIR)] = ""; + struct link_list_item *link_list = NULL; + + if (watchdog_list != NULL) { + return; + } + + /* search for watchdog nodes in /dev */ + dp = opendir(WATCHDOG_NODEDIR); + if (dp) { + /* first go for links and memorize them */ + while ((entry = readdir(dp))) { + if (entry->d_type == DT_LNK) { + int len; + + snprintf(entry_name, sizeof(entry_name), + WATCHDOG_NODEDIR "%s", entry->d_name); + + /* realpath(entry_name, buf) unfortunately does a stat on + * target so we can't really use it to check if links stay + * within /dev without triggering e.g. 
AVC-logs (with + * SELinux policy that just allows stat within /dev). + * Without canonicalization that doesn't actually touch the + * filesystem easily available introduce some limitations + * for simplicity: + * - just simple path without '..' + * - just one level of symlinks (avoid e.g. loop-checking) + */ + len = readlink(entry_name, buf, sizeof(buf) - 1); + if ((len < 1) || + (len > sizeof(buf) - sizeof(WATCHDOG_NODEDIR) -1 - 1)) { + continue; + } + buf[len] = '\0'; + if (buf[0] != '/') { + memmove(&buf[sizeof(WATCHDOG_NODEDIR)-1], buf, len+1); + memcpy(buf, WATCHDOG_NODEDIR, sizeof(WATCHDOG_NODEDIR)-1); + len += sizeof(WATCHDOG_NODEDIR)-1; + } + if (strstr(buf, "/../") || + strncmp(WATCHDOG_NODEDIR, buf, + sizeof(WATCHDOG_NODEDIR)-1)) { + continue; + } else { + /* just memorize to avoid statting the target - SELinux */ + struct link_list_item *lli = + calloc(1, sizeof(struct link_list_item)); + + if (lli == NULL) { + break; + } + lli->dev_node = strdup(buf); + lli->link_name = strdup(entry_name); + if ((lli->dev_node == NULL) || (lli->link_name == NULL)) { + free(lli->dev_node); + free(lli->link_name); + free(lli); + break; + } + lli->next = link_list; + link_list = lli; + } + } + } + + rewinddir(dp); + + while ((entry = readdir(dp))) { + if (entry->d_type == DT_CHR) { + struct stat statbuf; + + snprintf(entry_name, sizeof(entry_name), + WATCHDOG_NODEDIR "%s", entry->d_name); + if(!stat(entry_name, &statbuf) && S_ISCHR(statbuf.st_mode) && + is_watchdog(statbuf.st_rdev)) { + + int wdfd; + struct watchdog_list_item *wdg = + calloc(1, sizeof(struct watchdog_list_item)); + int len; + struct link_list_item *tmp_list = NULL; + + if (wdg == NULL) { + break; + } + + wdg->dev = statbuf.st_rdev; + wdg->dev_node = strdup(entry_name); + if (wdg->dev_node == NULL) { + free(wdg); + break; + } + wdg->next = watchdog_list; + watchdog_list = wdg; + watchdog_list_items++; + + wdfd = watchdog_init_fd(entry_name, -1); + if (wdfd >= 0) { + struct watchdog_info ident; + + 
ident.identity[0] = '\0'; + ioctl(wdfd, WDIOC_GETSUPPORT, &ident); + watchdog_close_fd(wdfd, entry_name, true); + if (ident.identity[0]) { + wdg->dev_ident = strdup((char *) ident.identity); + } + } + + snprintf(entry_name, sizeof(entry_name), + SYS_CHAR_DEV_DIR "/%d:%d/device/driver", + major(wdg->dev), minor(wdg->dev)); + len = readlink(entry_name, buf, sizeof(buf) - 1); + if (len > 0) { + buf[len] = '\0'; + wdg->dev_driver = strdup(basename(buf)); + } else if ((wdg->dev_ident) && + (strcmp(wdg->dev_ident, + "Software Watchdog") == 0)) { + wdg->dev_driver = strdup("softdog"); + } + + /* create dupes if we have memorized links + * to this node + */ + for (tmp_list = link_list; tmp_list; + tmp_list = tmp_list->next) { + if (!strcmp(tmp_list->dev_node, + wdg->dev_node)) { + struct watchdog_list_item *dupe_wdg = + calloc(1, sizeof(struct watchdog_list_item)); + + if (dupe_wdg == NULL) { + break; + } + /* as long as we never purge watchdog_list + * there is no need to dupe strings + */ + *dupe_wdg = *wdg; + dupe_wdg->dev_node = strdup(tmp_list->link_name); + if (dupe_wdg->dev_node == NULL) { + free(dupe_wdg); + break; + } + dupe_wdg->next = watchdog_list; + watchdog_list = dupe_wdg; + watchdog_list_items++; + } + /* for performance reasons we could remove + * the link_list entry + */ + } + } + } + } + + closedir(dp); + } + + /* cleanup link list */ + while (link_list) { + struct link_list_item *tmp_list = link_list; + + link_list = link_list->next; + free(tmp_list->dev_node); + free(tmp_list->link_name); + free(tmp_list); + } +} + +static void +watchdog_checkbusy() +{ + DIR *dproc; + struct dirent *entry; + + dproc = opendir("/proc"); + if (!dproc) { + /* no proc directory to search through */ + return; + } + + while ((entry = readdir(dproc)) != NULL) { + pid_t local_pid; + char *leftover; + DIR *dpid; + char procpath[NAME_MAX+10] = { 0 }; + + if (entry->d_name[0] == '.') { + continue; + } + + local_pid = strtol(entry->d_name, &leftover, 10); + if (leftover[0] != 
'\0') + continue; + + snprintf(procpath, sizeof(procpath), "/proc/%s/fd", entry->d_name); + dpid = opendir(procpath); + if (!dpid) { + /* silently continue - might be just a race */ + continue; + } + while ((entry = readdir(dpid)) != NULL) { + struct watchdog_list_item *wdg; + char entry_name[sizeof(procpath)+NAME_MAX+1] = { 0 }; + char buf[NAME_MAX+1] = { 0 }; + int len; + + if (entry->d_type != DT_LNK) { + continue; + } + snprintf(entry_name, sizeof(entry_name), + "%s/%s", procpath, entry->d_name); + len = readlink(entry_name, buf, sizeof(buf) - 1); + if (len < 1) { + continue; + } + buf[len] = '\0'; + for (wdg = watchdog_list; wdg != NULL; wdg = wdg->next) { + if (!strcmp(buf, wdg->dev_node)) { + char name[16]; + FILE *file; + + wdg->busy_pid = local_pid; + snprintf(procpath, sizeof(procpath), "/proc/%d/status", + local_pid); + file = fopen(procpath, "r"); + if (file) { + if (fscanf(file, "Name:\t%15[a-zA-Z0-9 _-]", + name) == 1) { + wdg->busy_name = strdup(name); + } + fclose(file); + } + } + } + } + closedir(dpid); + } + + closedir(dproc); + + return; +} + +int watchdog_info(void) +{ + struct watchdog_list_item *wdg; + int wdg_cnt = 0; + + watchdog_populate_list(); + watchdog_checkbusy(); + printf("\nDiscovered %d watchdog devices:\n", watchdog_list_items); + for (wdg = watchdog_list; wdg != NULL; wdg = wdg->next) { + wdg_cnt++; + if (wdg->busy_pid) { + printf("\n[%d] %s\nIdentity: Busy: PID %d (%s)\nDriver: %s\n", + wdg_cnt, wdg->dev_node, + wdg->busy_pid, + wdg->busy_name?wdg->busy_name:"", + wdg->dev_driver?wdg->dev_driver:""); + } else { + printf("\n[%d] %s\nIdentity: %s\nDriver: %s\n", + wdg_cnt, wdg->dev_node, + wdg->dev_ident?wdg->dev_ident: + "Error: device hogged via alias major/minor?", + wdg->dev_driver?wdg->dev_driver:""); + } + if ((wdg->dev_driver) && (strcmp(wdg->dev_driver, "softdog") == 0)) { + printf("CAUTION: Not recommended for use with sbd.\n"); + } + } + + return 0; +} + +int watchdog_test(void) +{ + int i; + + if ((watchdog_set_timeout 
== 0) || !watchdog_use) { + printf("\nWatchdog is disabled - aborting test!!!\n"); + return 0; + } + if (watchdogdev_is_default) { + watchdog_populate_list(); + if (watchdog_list_items > 1) { + printf("\nError: Multiple watchdog devices discovered." + "\n Use -w or SBD_WATCHDOG_DEV to specify" + "\n which device to reset the system with\n"); + watchdog_info(); + return -1; + } + } + if ((isatty(fileno(stdin)))) { + char buffer[16]; + printf("\n"); + printf( + "WARNING: This operation is expected to force-reboot this system\n" + " without following any shutdown procedures.\n\n" + "Proceed? [NO/Proceed] "); + + if ((fgets(buffer, 16, stdin) == NULL) || + strcmp(buffer, "Proceed\n")) { + printf("\nAborting watchdog test!!!\n"); + return 0; + } + printf("\n"); + } + printf("Initializing %s with a reset countdown of %d seconds ...\n", + watchdogdev, (int) timeout_watchdog); + if ((watchdog_init() < 0) || (watchdog_init_interval() < 0)) { + printf("Failed to initialize watchdog!!!\n"); + watchdog_info(); + return -1; + } + printf("\n"); + printf( + "NOTICE: The watchdog device is expected to reset the system\n" + " in %d seconds. If system remains active beyond that time,\n" + " watchdog may not be functional.\n\n", timeout_watchdog); + for (i=timeout_watchdog; i>1; i--) { + printf("Reset countdown ... 
%d seconds\n", i); + sleep(1); + } + for (i=2; i>0; i--) { + printf("System expected to reset any moment ...\n"); + sleep(1); + } + for (i=5; i>0; i--) { + printf("System should have reset ...\n"); + sleep(1); + } + printf("Error: The watchdog device has failed to reboot the system,\n" + " and it may not be suitable for usage with sbd.\n"); + + /* test should trigger a reboot thus returning is actually bad */ + return -1; +} diff --git a/src/sbd.h b/src/sbd.h index bbdc6f1..ffeace9 100644 --- a/src/sbd.h +++ b/src/sbd.h @@ -1,219 +1,223 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* signals reserved for multi-disk sbd */ #define SIG_LIVENESS (SIGRTMIN + 1) /* report liveness of the disk */ #define SIG_EXITREQ (SIGRTMIN + 2) /* exit request to inquisitor */ #define SIG_TEST (SIGRTMIN + 3) /* trigger self test */ #define SIG_RESTART (SIGRTMIN + 4) /* trigger restart of all failed disk */ #define SIG_PCMK_UNHEALTHY (SIGRTMIN + 5) /* FIXME: should add dynamic check of SIG_XX >= SIGRTMAX */ /* exit status for disk-servant */ #define EXIT_MD_SERVANT_IO_FAIL 20 #define EXIT_MD_SERVANT_REQUEST_RESET 21 #define EXIT_MD_SERVANT_REQUEST_SHUTOFF 22 #define EXIT_MD_SERVANT_REQUEST_CRASHDUMP 23 /* exit status for pcmk-servant */ #define EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN 30 #define HOG_CHAR 0xff #define SECTOR_NAME_MAX 63 /* Sector data types */ struct sector_header_s { char magic[8]; unsigned char version; unsigned char slots; /* Caveat: stored in network byte-order */ uint32_t sector_size; uint32_t timeout_watchdog; uint32_t timeout_allocate; uint32_t timeout_loop; uint32_t timeout_msgwait; /* Minor version for extensions to the core data set: * compatible and optional values. 
*/ unsigned char minor_version; uuid_t uuid; /* 16 bytes */ }; struct sector_mbox_s { signed char cmd; char from[SECTOR_NAME_MAX+1]; }; struct sector_node_s { /* slots will be created with in_use == 0 */ char in_use; char name[SECTOR_NAME_MAX+1]; }; struct servants_list_item { const char* devname; pid_t pid; int restarts; int restart_blocked; int outdated; int first_start; struct timespec t_last, t_started; struct servants_list_item *next; }; struct sbd_context { int devfd; io_context_t ioctx; struct iocb io; void *buffer; }; enum pcmk_health { pcmk_health_unknown, pcmk_health_pending, pcmk_health_transient, pcmk_health_unclean, pcmk_health_shutdown, pcmk_health_online, pcmk_health_noquorum, }; void usage(void); int watchdog_init_interval(void); int watchdog_tickle(void); int watchdog_init(void); void sysrq_init(void); void watchdog_close(bool disarm); int watchdog_info(void); int watchdog_test(void); void sysrq_trigger(char t); void do_crashdump(void); void do_reset(void); void do_off(void); void do_timeout_action(void); pid_t make_daemon(void); void maximize_priority(void); void sbd_get_uname(void); void sbd_set_format_string(int method, const char *daemon); +int sigqueue_zero(pid_t pid, int sig); void notify_parent(void); /* Tunable defaults: */ -extern unsigned long timeout_watchdog; -extern unsigned long timeout_watchdog_warn; -extern bool do_calculate_timeout_watchdog_warn; -extern unsigned long timeout_watchdog_crashdump; +extern int timeout_watchdog; +extern int timeout_watchdog_warn; +extern bool do_calculate_timeout_watchdog_warn; +extern int timeout_watchdog_crashdump; extern int timeout_allocate; extern int timeout_loop; extern int timeout_msgwait; extern int timeout_io; extern int timeout_startup; extern int watchdog_use; extern int watchdog_set_timeout; extern int skip_rt; extern int debug; extern int debug_mode; extern char *watchdogdev; extern bool watchdogdev_is_default; extern char* local_uname; extern bool do_flush; extern char 
timeout_sysrq_char; extern bool move_to_root_cgroup; extern bool enforce_moving_to_root_cgroup; extern bool sync_resource_startup; /* Global, non-tunable variables: */ extern int sector_size; extern int watchdogfd; extern const char* cmdname; typedef int (*functionp_t)(const char* devname, int mode, const void* argp); int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp); #if SUPPORT_SHARED_DISK void open_any_device(struct servants_list_item *servants); int init_devices(struct servants_list_item *servants); int allocate_slots(const char *name, struct servants_list_item *servants); int list_slots(struct servants_list_item *servants); int ping_via_slots(const char *name, struct servants_list_item *servants); int dump_headers(struct servants_list_item *servants); unsigned long get_first_msgwait(struct servants_list_item *servants); int messenger(const char *name, const char *msg, struct servants_list_item *servants); int servant_md(const char *diskname, int mode, const void* argp); #endif int servant_pcmk(const char *diskname, int mode, const void* argp); int servant_cluster(const char *diskname, int mode, const void* argp); struct servants_list_item *lookup_servant_by_dev(const char *devname); struct servants_list_item *lookup_servant_by_pid(pid_t pid); int init_set_proc_title(int argc, char *argv[], char *envp[]); void set_proc_title(const char *fmt,...); #define cl_log(level, fmt, args...) qb_log_from_external_source( __func__, __FILE__, fmt, level, __LINE__, 0, ##args) # define cl_perror(fmt, args...) do { \ const char *err = strerror(errno); \ cl_log(LOG_ERR, fmt ": %s (%d)", ##args, err, errno); \ } while(0) #define DBGLOG(lvl, fmt, args...) do { \ if (debug > 0) cl_log(lvl, fmt, ##args); \ } while(0) extern int servant_health; void set_servant_health(enum pcmk_health state, int level, char const *format, ...) 
__attribute__ ((__format__ (__printf__, 3, 4))); bool sbd_is_disk(struct servants_list_item *servant); bool sbd_is_pcmk(struct servants_list_item *servant); bool sbd_is_cluster(struct servants_list_item *servant); #define calculate_timeout_watchdog_warn(timeout) \ (timeout < 5 ? 2 : \ - (timeout < (ULONG_MAX / 3) ? \ - (((unsigned long) timeout) * 3 / 5) : (((unsigned long) timeout) / 5 * 3))) + (timeout < (INT_MAX / 3) ? \ + (((int) timeout) * 3 / 5) : (((int) timeout) / 5 * 3))) + +int seconds_diff_time_t(time_t a, time_t b); +int seconds_diff_timespec(struct timespec *a, struct timespec *b);