diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c index 52ede8a..962725e 100644 --- a/src/sbd-inquisitor.c +++ b/src/sbd-inquisitor.c @@ -1,1278 +1,1298 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include #include "sbd.h" #define LOCKSTRLEN 11 static struct servants_list_item *servants_leader = NULL; int disk_priority = 1; int check_pcmk = 1; int check_cluster = 1; int disk_count = 0; int servant_count = 0; int servant_restart_interval = 5; int servant_restart_count = 1; int start_mode = 0; char* pidfile = NULL; bool do_flush = true; char timeout_sysrq_char = 'b'; bool move_to_root_cgroup = true; bool enforce_moving_to_root_cgroup = false; +bool sync_resource_startup = false; int parse_device_line(const char *line); void recruit_servant(const char *devname, pid_t pid) { struct servants_list_item *s = servants_leader; struct servants_list_item *newbie; if (lookup_servant_by_dev(devname)) { cl_log(LOG_DEBUG, "Servant %s already exists", devname); return; } newbie = malloc(sizeof(*newbie)); if (newbie) { memset(newbie, 0, sizeof(*newbie)); newbie->devname = strdup(devname); newbie->pid = pid; newbie->first_start = 1; } if (!newbie || !newbie->devname) { fprintf(stderr, "heap allocation failed in recruit_servant.\n"); exit(1); } /* some sanity-check on our newbie 
*/ if (sbd_is_disk(newbie)) { cl_log(LOG_INFO, "Monitoring %s", devname); disk_count++; } else if (sbd_is_pcmk(newbie) || sbd_is_cluster(newbie)) { /* alive just after pcmk and cluster servants have shown up */ newbie->outdated = 1; } else { /* toss our newbie */ cl_log(LOG_ERR, "Refusing to recruit unrecognized servant %s", devname); free((void *) newbie->devname); free(newbie); return; } if (!s) { servants_leader = newbie; } else { while (s->next) s = s->next; s->next = newbie; } servant_count++; } int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp) { pid_t pid = 0; int rc = 0; pid = fork(); if (pid == 0) { /* child */ maximize_priority(); sbd_set_format_string(QB_LOG_SYSLOG, devname); rc = (*functionp)(devname, mode, argp); if (rc == -1) exit(1); else exit(0); } else if (pid != -1) { /* parent */ return pid; } else { cl_log(LOG_ERR,"Failed to fork servant"); exit(1); } } struct servants_list_item *lookup_servant_by_dev(const char *devname) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { if (strcasecmp(s->devname, devname) == 0) break; } return s; } struct servants_list_item *lookup_servant_by_pid(pid_t pid) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { if (s->pid == pid) break; } return s; } int check_all_dead(void) { struct servants_list_item *s; int r = 0; union sigval svalue; for (s = servants_leader; s; s = s->next) { if (s->pid != 0) { r = sigqueue(s->pid, 0, svalue); if (r == -1 && errno == ESRCH) continue; return 0; } } return 1; } void servant_start(struct servants_list_item *s) { int r = 0; union sigval svalue; if (s->pid != 0) { r = sigqueue(s->pid, 0, svalue); if ((r != -1 || errno != ESRCH)) return; } s->restarts++; if (sbd_is_disk(s)) { #if SUPPORT_SHARED_DISK DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname); s->pid = assign_servant(s->devname, servant_md, start_mode, s); #else cl_log(LOG_ERR, "Shared disk functionality not supported"); 
return; #endif } else if(sbd_is_pcmk(s)) { DBGLOG(LOG_INFO, "Starting Pacemaker servant"); s->pid = assign_servant(s->devname, servant_pcmk, start_mode, NULL); } else if(sbd_is_cluster(s)) { DBGLOG(LOG_INFO, "Starting Cluster servant"); s->pid = assign_servant(s->devname, servant_cluster, start_mode, NULL); } else { cl_log(LOG_ERR, "Unrecognized servant: %s", s->devname); } clock_gettime(CLOCK_MONOTONIC, &s->t_started); return; } void servants_start(void) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { s->restarts = 0; servant_start(s); } } void servants_kill(void) { struct servants_list_item *s; union sigval svalue; for (s = servants_leader; s; s = s->next) { if (s->pid != 0) sigqueue(s->pid, SIGKILL, svalue); } } static inline void cleanup_servant_by_pid(pid_t pid) { struct servants_list_item* s; s = lookup_servant_by_pid(pid); if (s) { cl_log(LOG_WARNING, "Servant for %s (pid: %i) has terminated", s->devname, s->pid); s->pid = 0; } else { /* This most likely is a stray signal from somewhere, or * a SIGCHLD for a process that has previously * explicitly disconnected. */ DBGLOG(LOG_INFO, "cleanup_servant: Nothing known about pid %i", pid); } } int inquisitor_decouple(void) { pid_t ppid = getppid(); union sigval signal_value; /* During start-up, we only arm the watchdog once we've got * quorum at least once. 
*/ if (watchdog_use) { if (watchdog_init() < 0) { return -1; } } if (ppid > 1) { sigqueue(ppid, SIG_LIVENESS, signal_value); } return 0; } static int sbd_lock_running(long pid) { int rc = 0; long mypid; int running = 0; char proc_path[PATH_MAX], exe_path[PATH_MAX], myexe_path[PATH_MAX]; /* check if pid is running */ if (kill(pid, 0) < 0 && errno == ESRCH) { goto bail; } #ifndef HAVE_PROC_PID return 1; #endif /* check to make sure pid hasn't been reused by another process */ snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", pid); rc = readlink(proc_path, exe_path, PATH_MAX-1); if(rc < 0) { cl_perror("Could not read from %s", proc_path); goto bail; } exe_path[rc] = 0; mypid = (unsigned long) getpid(); snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", mypid); rc = readlink(proc_path, myexe_path, PATH_MAX-1); if(rc < 0) { cl_perror("Could not read from %s", proc_path); goto bail; } myexe_path[rc] = 0; if(strcmp(exe_path, myexe_path) == 0) { running = 1; } bail: return running; } static int sbd_lock_pidfile(const char *filename) { char lf_name[256], tf_name[256], buf[LOCKSTRLEN+1]; int fd; long pid, mypid; int rc; struct stat sbuf; if (filename == NULL) { errno = EFAULT; return -1; } mypid = (unsigned long) getpid(); snprintf(lf_name, sizeof(lf_name), "%s",filename); snprintf(tf_name, sizeof(tf_name), "%s.%lu", filename, mypid); if ((fd = open(lf_name, O_RDONLY)) >= 0) { if (fstat(fd, &sbuf) >= 0 && sbuf.st_size < LOCKSTRLEN) { sleep(1); /* if someone was about to create one, * give'm a sec to do so * Though if they follow our protocol, * this won't happen. They should really * put the pid in, then link, not the * other way around. 
*/ } if (read(fd, buf, sizeof(buf)) < 1) { /* lockfile empty -> rm it and go on */; } else { if (sscanf(buf, "%ld", &pid) < 1) { /* lockfile screwed up -> rm it and go on */ } else { if (pid > 1 && (getpid() != pid) && sbd_lock_running(pid)) { /* is locked by existing process * -> give up */ close(fd); return -1; } else { /* stale lockfile -> rm it and go on */ } } } unlink(lf_name); close(fd); } if ((fd = open(tf_name, O_CREAT | O_WRONLY | O_EXCL, 0644)) < 0) { /* Hmmh, why did we fail? Anyway, nothing we can do about it */ return -3; } /* Slight overkill with the %*d format ;-) */ snprintf(buf, sizeof(buf), "%*lu\n", LOCKSTRLEN-1, mypid); if (write(fd, buf, LOCKSTRLEN) != LOCKSTRLEN) { /* Again, nothing we can do about this */ rc = -3; close(fd); goto out; } close(fd); switch (link(tf_name, lf_name)) { case 0: if (stat(tf_name, &sbuf) < 0) { /* something weird happened */ rc = -3; break; } if (sbuf.st_nlink < 2) { /* somehow, it didn't get through - NFS trouble? */ rc = -2; break; } rc = 0; break; case EEXIST: rc = -1; break; default: rc = -3; } out: unlink(tf_name); return rc; } /* * Unlock a file (remove its lockfile) * do we need to check, if its (still) ours? 
No, IMHO, if someone else * locked our line, it's his fault -tho * returns 0 on success * <0 if some failure occured */ static int sbd_unlock_pidfile(const char *filename) { char lf_name[256]; if (filename == NULL) { errno = EFAULT; return -1; } snprintf(lf_name, sizeof(lf_name), "%s", filename); return unlink(lf_name); } int cluster_alive(bool all) { int alive = 1; struct servants_list_item* s; if(servant_count == disk_count) { return 0; } for (s = servants_leader; s; s = s->next) { if (sbd_is_cluster(s) || sbd_is_pcmk(s)) { if(s->outdated) { alive = 0; } else if(all == false) { return 1; } } } return alive; } int quorum_read(int good_servants) { if (disk_count > 2) return (good_servants > disk_count/2); else return (good_servants > 0); } void inquisitor_child(void) { int sig, pid; sigset_t procmask; siginfo_t sinfo; int status; struct timespec timeout; int exiting = 0; int decoupled = 0; int cluster_appeared = 0; int pcmk_override = 0; time_t latency; struct timespec t_last_tickle, t_now; struct servants_list_item* s; if (debug_mode) { cl_log(LOG_ERR, "DEBUG MODE %d IS ACTIVE - DO NOT RUN IN PRODUCTION!", debug_mode); } set_proc_title("sbd: inquisitor"); if (pidfile) { if (sbd_lock_pidfile(pidfile) < 0) { exit(1); } } sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigaddset(&procmask, SIGTERM); sigaddset(&procmask, SIG_LIVENESS); sigaddset(&procmask, SIG_EXITREQ); sigaddset(&procmask, SIG_TEST); sigaddset(&procmask, SIG_PCMK_UNHEALTHY); sigaddset(&procmask, SIG_RESTART); sigaddset(&procmask, SIGUSR1); sigaddset(&procmask, SIGUSR2); sigprocmask(SIG_BLOCK, &procmask, NULL); servants_start(); timeout.tv_sec = timeout_loop; timeout.tv_nsec = 0; clock_gettime(CLOCK_MONOTONIC, &t_last_tickle); while (1) { bool tickle = 0; bool can_detach = 0; int good_servants = 0; sig = sigtimedwait(&procmask, &sinfo, &timeout); clock_gettime(CLOCK_MONOTONIC, &t_now); if (sig == SIG_EXITREQ || sig == SIGTERM) { servants_kill(); watchdog_close(true); exiting = 1; } else if (sig 
== SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } else { s = lookup_servant_by_pid(pid); if (sbd_is_disk(s)) { if (WIFEXITED(status)) { switch(WEXITSTATUS(status)) { case EXIT_MD_SERVANT_IO_FAIL: DBGLOG(LOG_INFO, "Servant for %s requests to be disowned", s->devname); break; case EXIT_MD_SERVANT_REQUEST_RESET: cl_log(LOG_WARNING, "%s requested a reset", s->devname); do_reset(); break; case EXIT_MD_SERVANT_REQUEST_SHUTOFF: cl_log(LOG_WARNING, "%s requested a shutoff", s->devname); do_off(); break; case EXIT_MD_SERVANT_REQUEST_CRASHDUMP: cl_log(LOG_WARNING, "%s requested a crashdump", s->devname); do_crashdump(); break; default: break; } } } else if (sbd_is_pcmk(s)) { if (WIFEXITED(status)) { switch(WEXITSTATUS(status)) { case EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN: DBGLOG(LOG_INFO, "PCMK-Servant has exited gracefully"); /* revert to state prior to pacemaker-detection */ s->restarts = 0; s->restart_blocked = 0; cluster_appeared = 0; s->outdated = 1; s->t_last.tv_sec = 0; break; default: break; } } } cleanup_servant_by_pid(pid); } } } else if (sig == SIG_PCMK_UNHEALTHY) { s = lookup_servant_by_pid(sinfo.si_pid); if (sbd_is_cluster(s) || sbd_is_pcmk(s)) { if (s->outdated == 0) { cl_log(LOG_WARNING, "%s health check: UNHEALTHY", s->devname); } s->t_last.tv_sec = 1; } else { cl_log(LOG_WARNING, "Ignoring SIG_PCMK_UNHEALTHY from unknown source"); } } else if (sig == SIG_LIVENESS) { s = lookup_servant_by_pid(sinfo.si_pid); if (s) { s->first_start = 0; clock_gettime(CLOCK_MONOTONIC, &s->t_last); } } else if (sig == SIG_TEST) { } else if (sig == SIGUSR1) { if (exiting) continue; servants_start(); } if (exiting) { if (check_all_dead()) { if (pidfile) { sbd_unlock_pidfile(pidfile); } exit(0); } else continue; } good_servants = 0; for (s = servants_leader; s; s = s->next) { int age = t_now.tv_sec - s->t_last.tv_sec; if (!s->t_last.tv_sec) continue; if (age < (int)(timeout_io+timeout_loop)) { if (sbd_is_disk(s)) { 
good_servants++; } if (s->outdated) { cl_log(LOG_NOTICE, "Servant %s is healthy (age: %d)", s->devname, age); } s->outdated = 0; } else if (!s->outdated) { if (!s->restart_blocked) { cl_log(LOG_WARNING, "Servant %s is outdated (age: %d)", s->devname, age); } s->outdated = 1; } } if(disk_count == 0) { /* NO disks, everything is up to the cluster */ if(cluster_alive(true)) { /* We LIVE! */ if(cluster_appeared == false) { cl_log(LOG_INFO, "Active cluster detected"); } tickle = 1; can_detach = 1; cluster_appeared = 1; } else if(cluster_alive(false)) { if(!decoupled) { /* On the way up, detach and arm the watchdog */ cl_log(LOG_INFO, "Partial cluster detected, detaching"); } can_detach = 1; tickle = !cluster_appeared; } else if(!decoupled) { /* Stay alive until the cluster comes up */ tickle = !cluster_appeared; } } else if(disk_priority == 1 || servant_count == disk_count) { if (quorum_read(good_servants)) { /* There are disks and we're connected to the majority of them */ tickle = 1; can_detach = 1; pcmk_override = 0; } else if (servant_count > disk_count && cluster_alive(true)) { tickle = 1; if(!pcmk_override) { cl_log(LOG_WARNING, "Majority of devices lost - surviving on pacemaker"); pcmk_override = 1; /* Only log this message once */ } } } else if(cluster_alive(true) && quorum_read(good_servants)) { /* Both disk and cluster servants are healthy */ tickle = 1; can_detach = 1; cluster_appeared = 1; } else if(quorum_read(good_servants)) { /* The cluster takes priority but only once * connected for the first time. * * Until then, we tickle based on disk quorum. 
*/ can_detach = 1; tickle = !cluster_appeared; } /* cl_log(LOG_DEBUG, "Tickle: q=%d, g=%d, p=%d, s=%d", */ /* quorum_read(good_servants), good_servants, tickle, disk_count); */ if(tickle) { watchdog_tickle(); clock_gettime(CLOCK_MONOTONIC, &t_last_tickle); } if (!decoupled && can_detach) { /* We only do this at the point either the disk or * cluster servants become healthy */ cl_log(LOG_DEBUG, "Decoupling"); if (inquisitor_decouple() < 0) { servants_kill(); exiting = 1; continue; } else { decoupled = 1; } } /* Note that this can actually be negative, since we set * last_tickle after we set now. */ latency = t_now.tv_sec - t_last_tickle.tv_sec; if (timeout_watchdog && (latency > (int)timeout_watchdog)) { if (!decoupled) { /* We're still being watched by our * parent. We don't fence, but exit. */ cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up."); servants_kill(); exiting = 1; continue; } if (debug_mode < 2) { /* At level 2 or above, we do nothing, but expect * things to eventually return to * normal. 
*/ do_timeout_action(); } else { cl_log(LOG_ERR, "SBD: DEBUG MODE: Would have fenced due to timeout!"); } } if (timeout_watchdog_warn && (latency > (int)timeout_watchdog_warn)) { cl_log(LOG_WARNING, "Latency: No liveness for %d s exceeds threshold of %d s (healthy servants: %d)", (int)latency, (int)timeout_watchdog_warn, good_servants); if (debug_mode && watchdog_use) { /* In debug mode, trigger a reset before the watchdog can panic the machine */ do_timeout_action(); } } for (s = servants_leader; s; s = s->next) { int age = t_now.tv_sec - s->t_started.tv_sec; if (age > servant_restart_interval) { s->restarts = 0; s->restart_blocked = 0; } if (servant_restart_count && (s->restarts >= servant_restart_count) && !s->restart_blocked) { if (servant_restart_count > 1) { cl_log(LOG_WARNING, "Max retry count (%d) reached: not restarting servant for %s", (int)servant_restart_count, s->devname); } s->restart_blocked = 1; } if (!s->restart_blocked) { servant_start(s); } } } /* not reached */ exit(0); } int inquisitor(void) { int sig, pid, inquisitor_pid; int status; sigset_t procmask; siginfo_t sinfo; /* Where's the best place for sysrq init ?*/ sysrq_init(); sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigaddset(&procmask, SIG_LIVENESS); sigprocmask(SIG_BLOCK, &procmask, NULL); inquisitor_pid = make_daemon(); if (inquisitor_pid == 0) { inquisitor_child(); } /* We're the parent. Wait for a happy signal from our child * before we proceed - we either get "SIG_LIVENESS" when the * inquisitor has completed the first successful round, or * ECHLD when it exits with an error. */ while (1) { sig = sigwaitinfo(&procmask, &sinfo); if (sig == SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } /* We got here because the inquisitor * did not succeed. */ return -1; } } else if (sig == SIG_LIVENESS) { /* Inquisitor started up properly. 
*/ return 0; } else { fprintf(stderr, "Nobody expected the spanish inquisition!\n"); continue; } } /* not reached */ return -1; } int parse_device_line(const char *line) { size_t lpc = 0; size_t last = 0; size_t max = 0; int found = 0; bool skip_space = true; int space_run = 0; if (!line) { return 0; } max = strlen(line); cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", (int) max, line); for (lpc = 0; lpc <= max; lpc++) { if (isspace(line[lpc])) { if (skip_space) { last = lpc + 1; } else { space_run++; } continue; } skip_space = false; if (line[lpc] == ';' || line[lpc] == 0) { int rc = 0; char *entry = calloc(1, 1 + lpc - last); if (entry) { rc = sscanf(line + last, "%[^;]", entry); } else { fprintf(stderr, "Heap allocation failed parsing device-line.\n"); exit(1); } if (rc != 1) { cl_log(LOG_WARNING, "Could not parse: '%s'", line + last); } else { entry[strlen(entry)-space_run] = '\0'; cl_log(LOG_DEBUG, "Adding '%s'", entry); recruit_servant(entry, 0); found++; } free(entry); skip_space = true; last = lpc + 1; } space_run = 0; } return found; } #define SBD_SOURCE_FILES "sbd-cluster.c,sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,setproctitle.c" static void sbd_log_filter_ctl(const char *files, uint8_t priority) { if (files == NULL) { files = SBD_SOURCE_FILES; } qb_log_filter_ctl(QB_LOG_SYSLOG, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority); qb_log_filter_ctl(QB_LOG_STDERR, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority); } int arg_enabled(int arg_count) { return arg_count % 2; } int main(int argc, char **argv, char **envp) { int exit_status = 0; int c; int W_count = 0; int c_count = 0; int P_count = 0; int qb_facility; const char *value = NULL; bool delay_start = false; long delay = 0; char *timeout_action = NULL; if ((cmdname = strrchr(argv[0], '/')) == NULL) { cmdname = argv[0]; } else { ++cmdname; } watchdogdev = strdup("/dev/watchdog"); watchdogdev_is_default = true; qb_facility = qb_log_facility2int("daemon"); qb_log_init(cmdname, 
qb_facility, LOG_WARNING); sbd_set_format_string(QB_LOG_SYSLOG, "sbd"); qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE); qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE); sbd_log_filter_ctl(NULL, LOG_NOTICE); sbd_get_uname(); value = getenv("SBD_PACEMAKER"); if(value) { check_pcmk = crm_is_true(value); check_cluster = crm_is_true(value); } cl_log(LOG_INFO, "Enable pacemaker checks: %d (%s)", (int)check_pcmk, value?value:"default"); value = getenv("SBD_STARTMODE"); if(value == NULL) { } else if(strcmp(value, "clean") == 0) { start_mode = 1; } else if(strcmp(value, "always") == 0) { start_mode = 0; } cl_log(LOG_INFO, "Start mode set to: %d (%s)", (int)start_mode, value?value:"default"); value = getenv("SBD_WATCHDOG_DEV"); if(value) { free(watchdogdev); watchdogdev = strdup(value); watchdogdev_is_default = false; } /* SBD_WATCHDOG has been dropped from sbd.sysconfig example. * This is for backward compatibility. */ value = getenv("SBD_WATCHDOG"); if(value) { watchdog_use = crm_is_true(value); } value = getenv("SBD_WATCHDOG_TIMEOUT"); if(value) { timeout_watchdog = crm_get_msec(value) / 1000; if(timeout_watchdog > 5) { timeout_watchdog_warn = (int)timeout_watchdog / 5 * 3; } } value = getenv("SBD_PIDFILE"); if(value) { pidfile = strdup(value); cl_log(LOG_INFO, "pidfile set to %s", pidfile); } value = getenv("SBD_DELAY_START"); if(value) { delay_start = crm_is_true(value); if (!delay_start) { delay = crm_get_msec(value) / 1000; if (delay > 0) { delay_start = true; } } } cl_log(LOG_DEBUG, "Delay start: %s%s%s", delay_start? "yes (" : "no", delay_start? (delay > 0 ? value: "msgwait") : "", delay_start? 
")" : ""); value = getenv("SBD_TIMEOUT_ACTION"); if(value) { timeout_action = strdup(value); } value = getenv("SBD_MOVE_TO_ROOT_CGROUP"); if(value) { move_to_root_cgroup = crm_is_true(value); if (move_to_root_cgroup) { enforce_moving_to_root_cgroup = true; } else { if (strcmp(value, "auto") == 0) { move_to_root_cgroup = true; } } } + value = getenv("SBD_SYNC_RESOURCE_STARTUP"); + if(value) { + sync_resource_startup = crm_is_true(value); + } +#if !USE_PACEMAKERD_API + if (sync_resource_startup) { + fprintf(stderr, "Failed to sync resource-startup as " + "SBD was built against pacemaker not supporting pacemakerd-API.\n"); + exit_status = -1; + goto out; + } +#else + if (!sync_resource_startup) { + cl_log(LOG_WARNING, "SBD built against pacemaker supporting " + "pacemakerd-API. Should think about enabling " + "SBD_SYNC_RESOURCE_STARTUP."); + } +#endif + while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) { switch (c) { case 'D': break; case 'Z': debug_mode++; cl_log(LOG_INFO, "Debug mode now at level %d", (int)debug_mode); break; case 'R': skip_rt = 1; cl_log(LOG_INFO, "Realtime mode deactivated."); break; case 'S': start_mode = atoi(optarg); cl_log(LOG_INFO, "Start mode set to: %d", (int)start_mode); break; case 's': timeout_startup = atoi(optarg); cl_log(LOG_INFO, "Start timeout set to: %d", (int)timeout_startup); break; case 'v': debug++; if(debug == 1) { sbd_log_filter_ctl(NULL, LOG_INFO); cl_log(LOG_INFO, "Verbose mode enabled."); } else if(debug == 2) { sbd_log_filter_ctl(NULL, LOG_DEBUG); cl_log(LOG_INFO, "Debug mode enabled."); } else if(debug == 3) { /* Go nuts, turn on pacemaker's logging too */ sbd_log_filter_ctl("*", LOG_DEBUG); cl_log(LOG_INFO, "Debug library mode enabled."); } break; case 'T': watchdog_set_timeout = 0; cl_log(LOG_INFO, "Setting watchdog timeout disabled; using defaults."); break; case 'W': W_count++; break; case 'w': cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev); free(watchdogdev); 
watchdogdev = strdup(optarg); watchdogdev_is_default = false; break; case 'd': #if SUPPORT_SHARED_DISK recruit_servant(optarg, 0); #else fprintf(stderr, "Shared disk functionality not supported\n"); exit_status = -2; goto out; #endif break; case 'c': c_count++; break; case 'P': P_count++; break; case 'z': disk_priority = 0; break; case 'n': local_uname = strdup(optarg); cl_log(LOG_INFO, "Overriding local hostname to %s", local_uname); break; case 'p': pidfile = strdup(optarg); cl_log(LOG_INFO, "pidfile set to %s", pidfile); break; case 'C': timeout_watchdog_crashdump = atoi(optarg); cl_log(LOG_INFO, "Setting crashdump watchdog timeout to %d", (int)timeout_watchdog_crashdump); break; case '1': timeout_watchdog = atoi(optarg); if(timeout_watchdog > 5) { timeout_watchdog_warn = (int)timeout_watchdog / 5 * 3; } break; case '2': timeout_allocate = atoi(optarg); break; case '3': timeout_loop = atoi(optarg); break; case '4': timeout_msgwait = atoi(optarg); break; case '5': timeout_watchdog_warn = atoi(optarg); cl_log(LOG_INFO, "Setting latency warning to %d", (int)timeout_watchdog_warn); break; case 't': servant_restart_interval = atoi(optarg); cl_log(LOG_INFO, "Setting servant restart interval to %d", (int)servant_restart_interval); break; case 'I': timeout_io = atoi(optarg); cl_log(LOG_INFO, "Setting IO timeout to %d", (int)timeout_io); break; case 'F': servant_restart_count = atoi(optarg); cl_log(LOG_INFO, "Servant restart count set to %d", (int)servant_restart_count); break; case 'r': if (timeout_action) { free(timeout_action); } timeout_action = strdup(optarg); break; case 'h': usage(); goto out; break; default: exit_status = -2; goto out; break; } } if (disk_count == 0) { /* if we already have disks from commandline then it is probably undesirable to add those from environment (general rule cmdline has precedence) */ value = getenv("SBD_DEVICE"); if ((value) && strlen(value)) { #if SUPPORT_SHARED_DISK int devices = parse_device_line(value); if(devices < 1) { 
fprintf(stderr, "Invalid device line: %s\n", value); exit_status = -2; goto out; } #else fprintf(stderr, "Shared disk functionality not supported\n"); exit_status = -2; goto out; #endif } } if (watchdogdev == NULL || strcmp(watchdogdev, "/dev/null") == 0) { watchdog_use = 0; } else if (W_count > 0) { watchdog_use = arg_enabled(W_count); } if (watchdog_use) { cl_log(LOG_INFO, "Watchdog enabled."); } else { cl_log(LOG_INFO, "Watchdog disabled."); } if (c_count > 0) { check_cluster = arg_enabled(c_count); } if (P_count > 0) { check_pcmk = arg_enabled(P_count); } if ((disk_count > 0) && (strlen(local_uname) > SECTOR_NAME_MAX)) { fprintf(stderr, "Node name mustn't be longer than %d chars.\n", SECTOR_NAME_MAX); fprintf(stderr, "If uname is longer define a name to be used by sbd.\n"); exit_status = -1; goto out; } if (disk_count > 3) { fprintf(stderr, "You can specify up to 3 devices via the -d option.\n"); exit_status = -1; goto out; } /* There must at least be one command following the options: */ if ((argc - optind) < 1) { fprintf(stderr, "Not enough arguments.\n"); exit_status = -2; goto out; } if (init_set_proc_title(argc, argv, envp) < 0) { fprintf(stderr, "Allocation of proc title failed.\n"); exit_status = -1; goto out; } if (timeout_action) { char *p[2]; int i; char c; int nrflags = sscanf(timeout_action, "%m[a-z],%m[a-z]%c", &p[0], &p[1], &c); bool parse_error = (nrflags < 1) || (nrflags > 2); for (i = 0; (i < nrflags) && (i < 2); i++) { if (!strcmp(p[i], "reboot")) { timeout_sysrq_char = 'b'; } else if (!strcmp(p[i], "crashdump")) { timeout_sysrq_char = 'c'; } else if (!strcmp(p[i], "off")) { timeout_sysrq_char = 'o'; } else if (!strcmp(p[i], "flush")) { do_flush = true; } else if (!strcmp(p[i], "noflush")) { do_flush = false; } else { parse_error = true; } free(p[i]); } if (parse_error) { fprintf(stderr, "Failed to parse timeout-action \"%s\".\n", timeout_action); exit_status = -1; goto out; } } #if SUPPORT_SHARED_DISK if (strcmp(argv[optind], "create") == 0) 
{ exit_status = init_devices(servants_leader); } else if (strcmp(argv[optind], "dump") == 0) { exit_status = dump_headers(servants_leader); } else if (strcmp(argv[optind], "allocate") == 0) { exit_status = allocate_slots(argv[optind + 1], servants_leader); } else if (strcmp(argv[optind], "list") == 0) { exit_status = list_slots(servants_leader); } else if (strcmp(argv[optind], "message") == 0) { exit_status = messenger(argv[optind + 1], argv[optind + 2], servants_leader); } else if (strcmp(argv[optind], "ping") == 0) { exit_status = ping_via_slots(argv[optind + 1], servants_leader); } else if (strcmp(argv[optind], "watch") == 0) { if(disk_count > 0) { /* If no devices are specified, its not an error to be unable to find one */ open_any_device(servants_leader); } if (delay_start) { if (delay <= 0) { delay = get_first_msgwait(servants_leader); } sleep((unsigned long) delay); } } else { exit_status = -2; } #endif if (strcmp(argv[optind], "query-watchdog") == 0) { exit_status = watchdog_info(); } else if (strcmp(argv[optind], "test-watchdog") == 0) { exit_status = watchdog_test(); } else if (strcmp(argv[optind], "watch") == 0) { /* sleep $(sbd $SBD_DEVICE_ARGS dump | grep -m 1 msgwait | awk '{print $4}') 2>/dev/null */ /* We only want this to have an effect during watch right now; * pinging and fencing would be too confused */ cl_log(LOG_INFO, "Turning on pacemaker checks: %d", check_pcmk); if (check_pcmk) { recruit_servant("pcmk", 0); #if SUPPORT_PLUGIN check_cluster = 1; #endif } cl_log(LOG_INFO, "Turning on cluster checks: %d", check_cluster); if (check_cluster) { recruit_servant("cluster", 0); } cl_log(LOG_NOTICE, "%s flush + write \'%c\' to sysrq in case of timeout", do_flush?"Do":"Skip", timeout_sysrq_char); exit_status = inquisitor(); } out: if (timeout_action) { free(timeout_action); } if (exit_status < 0) { if (exit_status == -2) { usage(); } else { fprintf(stderr, "sbd failed; please check the logs.\n"); } return (1); } return (0); } diff --git 
a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c index 1243bfc..aa1fb57 100644 --- a/src/sbd-pacemaker.c +++ b/src/sbd-pacemaker.c @@ -1,699 +1,699 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * Based on crm_mon.c, which was: * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ /* TODO list: * * - Trying to shutdown a node if no devices are up will fail, since SBD * currently uses a message via the disk to achieve this. * * - Shutting down cluster nodes while the majority of devices is down * will eventually take the cluster below the quorum threshold, at which * time the remaining cluster nodes will all immediately suicide. 
* */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sbd.h" #ifndef HAVE_PE_NEW_WORKING_SET #define pe_reset_working_set(data_set) cleanup_calculations(data_set) static pe_working_set_t * pe_new_working_set() { pe_working_set_t *data_set = calloc(1, sizeof(pe_working_set_t)); if (data_set != NULL) { set_working_set_defaults(data_set); } return data_set; } static void pe_free_working_set(pe_working_set_t *data_set) { if (data_set != NULL) { pe_reset_working_set(data_set); free(data_set); } } #endif static void clean_up(int rc); #if USE_PACEMAKERD_API #include static pcmk_ipc_api_t *pacemakerd_api = NULL; static time_t last_ok = (time_t) 0; static void pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api, enum pcmk_ipc_event event_type, crm_exit_t status, void *event_data, void *user_data) { pcmk_pacemakerd_api_reply_t *reply = event_data; switch (event_type) { case pcmk_ipc_event_disconnect: /* Unexpected */ cl_log(LOG_ERR, "Lost connection to pacemakerd\n"); return; case pcmk_ipc_event_reply: break; default: return; } if (status != CRM_EX_OK) { cl_log(LOG_ERR, "Bad reply from pacemakerd: %s", crm_exit_str(status)); return; } if (reply->reply_type != pcmk_pacemakerd_reply_ping) { cl_log(LOG_ERR, "Unknown reply type %d from pacemakerd\n", reply->reply_type); } else { if ((reply->data.ping.last_good != (time_t) 0) && (reply->data.ping.status == pcmk_rc_ok)) { switch (reply->data.ping.state) { case pcmk_pacemakerd_state_running: case pcmk_pacemakerd_state_shutting_down: last_ok = reply->data.ping.last_good; break; case pcmk_pacemakerd_state_shutdown_complete: clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN); break; default: break; } } } } #endif extern int disk_count; static void clean_up(int rc); static void crm_diff_update(const char *event, xmlNode * msg); static int cib_connect(gboolean full); static void 
compute_status(pe_working_set_t * data_set); static gboolean mon_refresh_state(gpointer user_data); static GMainLoop *mainloop = NULL; static guint timer_id_reconnect = 0; static guint timer_id_notify = 0; static int reconnect_msec = 1000; static int cib_connected = 0; static cib_t *cib = NULL; static xmlNode *current_cib = NULL; static pe_working_set_t *data_set = NULL; static long last_refresh = 0; static int pcmk_clean_shutdown = 0; static int pcmk_shutdown = 0; static gboolean mon_timer_reconnect(gpointer data) { int rc = 0; if (timer_id_reconnect > 0) { g_source_remove(timer_id_reconnect); } rc = cib_connect(TRUE); if (rc != 0) { cl_log(LOG_WARNING, "CIB reconnect failed: %d", rc); timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL); } else { cl_log(LOG_INFO, "CIB reconnect successful"); } return FALSE; } static void mon_cib_connection_destroy(gpointer user_data) { if (cib) { cib->cmds->signoff(cib); /* retrigger as last one might have been skipped */ mon_refresh_state(NULL); -#if !USE_PACEMAKERD_API - if (pcmk_clean_shutdown) { + + if ((pcmk_clean_shutdown) && (!sync_resource_startup)) { /* assume a graceful pacemaker-shutdown */ clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN); } -#endif + /* getting here we aren't sure about the pacemaker-state so try to use the timeout to reconnect and get everything sorted out again */ pcmk_shutdown = 0; set_servant_health(pcmk_health_transient, LOG_WARNING, "Disconnected from CIB"); timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL); } cib_connected = 0; /* no sense in looking into outdated cib, trying to apply patch, ... 
*/ if (current_cib) { free_xml(current_cib); current_cib = NULL; } return; } static void mon_retrieve_current_cib() { xmlNode *xml_cib = NULL; int options = cib_scope_local | cib_sync_call; int rc = pcmk_ok; free_xml(current_cib); current_cib = NULL; rc = cib->cmds->query(cib, NULL, &xml_cib, options); if (rc != pcmk_ok) { crm_err("Couldn't retrieve the CIB: %s (%d)", pcmk_strerror(rc), rc); free_xml(xml_cib); return; } else if (xml_cib == NULL) { crm_err("Couldn't retrieve the CIB: empty result"); return; } if (safe_str_eq(crm_element_name(xml_cib), XML_TAG_CIB)) { current_cib = xml_cib; } else { free_xml(xml_cib); } return; } static gboolean mon_timer_notify(gpointer data) { static int counter = 0; int counter_max = timeout_watchdog / timeout_loop / 2; if (timer_id_notify > 0) { g_source_remove(timer_id_notify); } #if USE_PACEMAKERD_API { time_t now = time(NULL); if ((last_ok <= now) && (now - last_ok < timeout_watchdog)) { #endif if (cib_connected) { if (counter == counter_max) { mon_retrieve_current_cib(); mon_refresh_state(NULL); counter = 0; } else { cib->cmds->noop(cib, 0); notify_parent(); counter++; } } #if USE_PACEMAKERD_API } } if (pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_main) == pcmk_rc_ok) { pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name); } #endif timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL); return FALSE; } /* * Mainloop signal handler. 
*/ static void mon_shutdown(int nsig) { clean_up(0); } static int cib_connect(gboolean full) { int rc = 0; CRM_CHECK(cib != NULL, return -EINVAL); cib_connected = 0; crm_xml_init(); if (cib->state != cib_connected_query && cib->state != cib_connected_command) { rc = cib->cmds->signon(cib, crm_system_name, cib_query); if (rc != 0) { return rc; } mon_retrieve_current_cib(); mon_refresh_state(NULL); if (full) { if (rc == 0) { rc = cib->cmds->set_connection_dnotify(cib, mon_cib_connection_destroy); if (rc == -EPROTONOSUPPORT) { /* Notification setup failed, won't be able to reconnect after failure */ rc = 0; } } if (rc == 0) { cib->cmds->del_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); rc = cib->cmds->add_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); } if (rc != 0) { /* Notification setup failed, could not monitor CIB actions */ clean_up(-rc); } } } if (!rc) { cib_connected = 1; } return rc; } static void compute_status(pe_working_set_t * data_set) { static int updates = 0; static int ever_had_quorum = FALSE; node_t *node = NULL; updates++; if (data_set->dc_node == NULL) { set_servant_health(pcmk_health_transient, LOG_INFO, "We don't have a DC right now."); notify_parent(); return; } node = pe_find_node(data_set->nodes, local_uname); if ((node == NULL) || (node->details == NULL)) { set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: %s is UNKNOWN", local_uname); notify_parent(); return; } if (node->details->online == FALSE) { set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: OFFLINE"); } else if (node->details->unclean) { set_servant_health(pcmk_health_unclean, LOG_WARNING, "Node state: UNCLEAN"); } else if (node->details->pending) { set_servant_health(pcmk_health_pending, LOG_WARNING, "Node state: pending"); } else if (data_set->flags & pe_flag_have_quorum) { set_servant_health(pcmk_health_online, LOG_INFO, "Node state: online"); ever_had_quorum = TRUE; } else if(disk_count > 0) { 
set_servant_health(pcmk_health_noquorum, LOG_WARNING, "Quorum lost"); } else if(ever_had_quorum == FALSE) { set_servant_health(pcmk_health_online, LOG_INFO, "We do not have quorum yet"); } else { /* We lost quorum, and there are no disks present * Setting healthy > 2 here will result in us self-fencing */ switch (data_set->no_quorum_policy) { case no_quorum_freeze: set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Freeze resources"); break; #if HAVE_ENUM_NO_QUORUM_DEMOTE case no_quorum_demote: set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Demote promotable resources and stop others"); break; #endif case no_quorum_stop: set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Stop ALL resources"); break; case no_quorum_ignore: set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Ignore"); break; default: /* immediate reboot is the most excessive action we take use for no_quorum_suicide and everything we don't know yet */ set_servant_health(pcmk_health_unclean, LOG_INFO, "Quorum lost: Self-fence"); break; } } /* If we are in shutdown-state once this will go on till the end. * If we've on top reached a state of 0 locally running resources * we can assume a clean shutdown. * Tricky are the situations where the node is in maintenance-mode * or resources are unmanaged. So if the node is in maintenance or * all left-over running resources are unmanaged we assume intention. 
*/ if (node->details->shutdown) { pcmk_shutdown = 1; } if (pcmk_shutdown) { pcmk_clean_shutdown = 1; if (!(node->details->maintenance)) { GListPtr iter; for (iter = node->details->running_rsc; iter != NULL; iter = iter->next) { resource_t *rsc = (resource_t *) iter->data; if (is_set(rsc->flags, pe_rsc_managed)) { pcmk_clean_shutdown = 0; crm_debug("not clean as %s managed and still running", rsc->id); break; } } if (pcmk_clean_shutdown) { crm_debug("pcmk_clean_shutdown because " "all managed resources down"); } } else { crm_debug("pcmk_clean_shutdown because node is in maintenance"); } } notify_parent(); return; } static crm_trigger_t *refresh_trigger = NULL; static gboolean mon_trigger_refresh(gpointer user_data) { mainloop_set_trigger(refresh_trigger); mon_refresh_state(NULL); return FALSE; } #define XPATH_SHUTDOWN "//" XML_CIB_TAG_STATE "[@uname='%s']/" \ XML_TAG_TRANSIENT_NODEATTRS "/" XML_TAG_ATTR_SETS "/" \ XML_CIB_TAG_NVPAIR "[@name='" XML_CIB_ATTR_SHUTDOWN "']" static gboolean shutdown_attr_in_cib(void) { xmlNode *match = NULL; char *xpath_string; xpath_string = crm_strdup_printf(XPATH_SHUTDOWN, local_uname); if (xpath_string) { match = get_xpath_object(xpath_string, current_cib, LOG_TRACE); free(xpath_string); } return (match != NULL); } static void crm_diff_update(const char *event, xmlNode * msg) { int rc = -1; const char *op = NULL; long now = time(NULL); static int updates = 0; static mainloop_timer_t *refresh_timer = NULL; if(refresh_timer == NULL) { refresh_timer = mainloop_timer_add("refresh", reconnect_msec, FALSE, mon_trigger_refresh, NULL); refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_state, refresh_timer); } if (current_cib != NULL) { xmlNode *cib_last = current_cib; current_cib = NULL; rc = cib_apply_patch_event(msg, cib_last, &current_cib, LOG_DEBUG); free_xml(cib_last); switch(rc) { case -pcmk_err_diff_resync: case -pcmk_err_diff_failed: crm_warn("[%s] %s Patch aborted: %s (%d)", event, op, pcmk_strerror(rc), rc); break; case 
pcmk_ok: updates++; break; default: crm_notice("[%s] %s ABORTED: %s (%d)", event, op, pcmk_strerror(rc), rc); break; } } if (current_cib == NULL) { mon_retrieve_current_cib(); } /* Refresh * - immediately if the last update was more than 1s ago * - every 10 updates * - at most 1s after the last update * - shutdown attribute for our node set for the first time */ if ((!pcmk_shutdown && shutdown_attr_in_cib()) || (updates > 10 || (now - last_refresh) > (reconnect_msec / 1000))) { mon_refresh_state(refresh_timer); updates = 0; } else { mainloop_set_trigger(refresh_trigger); mainloop_timer_start(refresh_timer); } } static gboolean mon_refresh_state(gpointer user_data) { xmlNode *cib_copy = NULL; if(current_cib == NULL) { return FALSE; } if(user_data) { mainloop_timer_t *timer = user_data; mainloop_timer_stop(timer); } cib_copy = copy_xml(current_cib); if (cli_config_update(&cib_copy, NULL, FALSE) == FALSE) { cl_log(LOG_WARNING, "cli_config_update() failed - forcing reconnect to CIB"); if (cib) { cib->cmds->signoff(cib); } } else { last_refresh = time(NULL); data_set->input = cib_copy; data_set->flags |= pe_flag_have_stonith_resource; cluster_status(data_set); compute_status(data_set); pe_reset_working_set(data_set); } return FALSE; } static void clean_up(int rc) { if (timer_id_reconnect > 0) { g_source_remove(timer_id_reconnect); timer_id_reconnect = 0; } if (timer_id_notify > 0) { g_source_remove(timer_id_notify); timer_id_notify = 0; } if (data_set != NULL) { pe_free_working_set(data_set); data_set = NULL; } if (cib != NULL) { cib->cmds->signoff(cib); cib_delete(cib); cib = NULL; } #if USE_PACEMAKERD_API if (pacemakerd_api != NULL) { pcmk_ipc_api_t *capi = pacemakerd_api; pacemakerd_api = NULL; // Ensure we can't free this twice pcmk_free_ipc_api(capi); } #endif if (rc >= 0) { exit(rc); } return; } int servant_pcmk(const char *diskname, int mode, const void* argp) { int exit_code = 0; crm_system_name = strdup("sbd:pcmk"); cl_log(LOG_NOTICE, "Monitoring Pacemaker 
health"); set_proc_title("sbd: watcher: Pacemaker"); setenv("PCMK_watchdog", "true", 1); if(debug == 0) { /* We don't want any noisy crm messages */ set_crm_log_level(LOG_CRIT); } if (data_set == NULL) { data_set = pe_new_working_set(); } if (data_set == NULL) { return -1; } #if USE_PACEMAKERD_API { int rc; rc = pcmk_new_ipc_api(&pacemakerd_api, pcmk_ipc_pacemakerd); if (pacemakerd_api == NULL) { cl_log(LOG_ERR, "Could not connect to pacemakerd: %s\n", pcmk_rc_str(rc)); return -1; } pcmk_register_ipc_callback(pacemakerd_api, pacemakerd_event_cb, NULL); do { rc = pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_main); if (rc != pcmk_rc_ok) { cl_log(LOG_DEBUG, "Could not connect to pacemakerd: %s\n", pcmk_rc_str(rc)); sleep(reconnect_msec / 1000); } } while (rc != pcmk_rc_ok); /* send a ping to pacemakerd to wake it up */ pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name); /* cib should come up now as well so it's time * to have the inquisitor have a closer look */ notify_parent(); } #endif if (current_cib == NULL) { cib = cib_new(); do { exit_code = cib_connect(TRUE); if (exit_code != 0) { sleep(reconnect_msec / 1000); } } while (exit_code == -ENOTCONN); if (exit_code != 0) { clean_up(-exit_code); } } mainloop = g_main_loop_new(NULL, FALSE); mainloop_add_signal(SIGTERM, mon_shutdown); mainloop_add_signal(SIGINT, mon_shutdown); timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL); g_main_loop_run(mainloop); g_main_loop_unref(mainloop); clean_up(0); return 0; /* never reached */ } diff --git a/src/sbd.h b/src/sbd.h index 382e553..3b6647c 100644 --- a/src/sbd.h +++ b/src/sbd.h @@ -1,211 +1,212 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. 
* * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* signals reserved for multi-disk sbd */ #define SIG_LIVENESS (SIGRTMIN + 1) /* report liveness of the disk */ #define SIG_EXITREQ (SIGRTMIN + 2) /* exit request to inquisitor */ #define SIG_TEST (SIGRTMIN + 3) /* trigger self test */ #define SIG_RESTART (SIGRTMIN + 4) /* trigger restart of all failed disk */ #define SIG_PCMK_UNHEALTHY (SIGRTMIN + 5) /* FIXME: should add dynamic check of SIG_XX >= SIGRTMAX */ /* exit status for disk-servant */ #define EXIT_MD_SERVANT_IO_FAIL 20 #define EXIT_MD_SERVANT_REQUEST_RESET 21 #define EXIT_MD_SERVANT_REQUEST_SHUTOFF 22 #define EXIT_MD_SERVANT_REQUEST_CRASHDUMP 23 /* exit status for pcmk-servant */ #define EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN 30 #define HOG_CHAR 0xff #define SECTOR_NAME_MAX 63 /* Sector data types */ struct sector_header_s { char magic[8]; unsigned char version; unsigned char slots; /* Caveat: stored in network byte-order */ uint32_t sector_size; uint32_t timeout_watchdog; uint32_t timeout_allocate; uint32_t timeout_loop; uint32_t timeout_msgwait; /* Minor version for extensions to the core data set: * compatible and optional values. 
*/ unsigned char minor_version; uuid_t uuid; /* 16 bytes */ }; struct sector_mbox_s { signed char cmd; char from[SECTOR_NAME_MAX+1]; }; struct sector_node_s { /* slots will be created with in_use == 0 */ char in_use; char name[SECTOR_NAME_MAX+1]; }; struct servants_list_item { const char* devname; pid_t pid; int restarts; int restart_blocked; int outdated; int first_start; struct timespec t_last, t_started; struct servants_list_item *next; }; struct sbd_context { int devfd; io_context_t ioctx; struct iocb io; }; enum pcmk_health { pcmk_health_unknown, pcmk_health_pending, pcmk_health_transient, pcmk_health_unclean, pcmk_health_shutdown, pcmk_health_online, pcmk_health_noquorum, }; void usage(void); int watchdog_init_interval(void); int watchdog_tickle(void); int watchdog_init(void); void sysrq_init(void); void watchdog_close(bool disarm); int watchdog_info(void); int watchdog_test(void); void sysrq_trigger(char t); void do_crashdump(void); void do_reset(void); void do_off(void); void do_timeout_action(void); pid_t make_daemon(void); void maximize_priority(void); void sbd_get_uname(void); void sbd_set_format_string(int method, const char *daemon); void notify_parent(void); /* Tunable defaults: */ extern unsigned long timeout_watchdog; extern unsigned long timeout_watchdog_warn; extern unsigned long timeout_watchdog_crashdump; extern int timeout_allocate; extern int timeout_loop; extern int timeout_msgwait; extern int timeout_io; extern int timeout_startup; extern int watchdog_use; extern int watchdog_set_timeout; extern int skip_rt; extern int debug; extern int debug_mode; extern char *watchdogdev; extern bool watchdogdev_is_default; extern char* local_uname; extern bool do_flush; extern char timeout_sysrq_char; extern bool move_to_root_cgroup; extern bool enforce_moving_to_root_cgroup; +extern bool sync_resource_startup; /* Global, non-tunable variables: */ extern int sector_size; extern int watchdogfd; extern const char* cmdname; typedef int (*functionp_t)(const 
char* devname, int mode, const void* argp); int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp); #if SUPPORT_SHARED_DISK void open_any_device(struct servants_list_item *servants); int init_devices(struct servants_list_item *servants); int allocate_slots(const char *name, struct servants_list_item *servants); int list_slots(struct servants_list_item *servants); int ping_via_slots(const char *name, struct servants_list_item *servants); int dump_headers(struct servants_list_item *servants); unsigned long get_first_msgwait(struct servants_list_item *servants); int messenger(const char *name, const char *msg, struct servants_list_item *servants); int servant_md(const char *diskname, int mode, const void* argp); #endif int servant_pcmk(const char *diskname, int mode, const void* argp); int servant_cluster(const char *diskname, int mode, const void* argp); struct servants_list_item *lookup_servant_by_dev(const char *devname); struct servants_list_item *lookup_servant_by_pid(pid_t pid); int init_set_proc_title(int argc, char *argv[], char *envp[]); void set_proc_title(const char *fmt,...); #define cl_log(level, fmt, args...) qb_log_from_external_source( __func__, __FILE__, fmt, level, __LINE__, 0, ##args) # define cl_perror(fmt, args...) do { \ const char *err = strerror(errno); \ cl_log(LOG_ERR, fmt ": %s (%d)", ##args, err, errno); \ } while(0) #define DBGLOG(lvl, fmt, args...) do { \ if (debug > 0) cl_log(lvl, fmt, ##args); \ } while(0) extern int servant_health; void set_servant_health(enum pcmk_health state, int level, char const *format, ...) 
__attribute__ ((__format__ (__printf__, 3, 4))); bool sbd_is_disk(struct servants_list_item *servant); bool sbd_is_pcmk(struct servants_list_item *servant); bool sbd_is_cluster(struct servants_list_item *servant); diff --git a/src/sbd.sysconfig b/src/sbd.sysconfig index 33b50d0..b32e826 100644 --- a/src/sbd.sysconfig +++ b/src/sbd.sysconfig @@ -1,114 +1,128 @@ ## Type: string ## Default: "" # # SBD_DEVICE specifies the devices to use for exchanging sbd messages # and to monitor. If specifying more than one path, use ";" as # separator. # #SBD_DEVICE="" ## Type: yesno ## Default: yes # # Whether to enable the pacemaker integration. # SBD_PACEMAKER=yes ## Type: always / clean ## Default: always # # Specify the start mode for sbd. Setting this to "clean" will only # allow sbd to start if it was not previously fenced. See the -S option # in the man page. # SBD_STARTMODE=always ## Type: yesno / integer ## Default: no # # Whether to delay after starting sbd on boot for "msgwait" seconds. # This may be necessary if your cluster nodes reboot so fast that the # other nodes are still waiting in the fence acknowledgement phase. # This is an occasional issue with virtual machines. # # This can also be enabled by being set to a specific delay value, in # seconds. Sometimes a longer delay than the default, "msgwait", is # needed, for example in the cases where it's considered to be safer to # wait longer than: # corosync token timeout + consensus timeout + pcmk_delay_max + msgwait # # Be aware that the special value "1" means "yes" rather than "1s". # # Consider that you might have to adapt the startup-timeout accordingly # if the default isn't sufficient. (TimeoutStartSec for systemd) # # This option may be ignored at a later point, once pacemaker handles # this case better. # SBD_DELAY_START=no ## Type: string ## Default: /dev/watchdog # # Watchdog device to use. If set to /dev/null, no watchdog device will # be used. 
# SBD_WATCHDOG_DEV=/dev/watchdog ## Type: integer ## Default: 5 # # How long, in seconds, the watchdog will wait before panicking the # node if no-one tickles it. # # This depends mostly on your storage latency; the majority of devices # must be successfully read within this time, or else the node will # self-fence. # # If your sbd device(s) reside on a multipath setup or iSCSI, this # should be the time required to detect a path failure. # # Be aware that watchdog timeout set in the on-disk metadata takes # precedence. # SBD_WATCHDOG_TIMEOUT=5 ## Type: string ## Default: "flush,reboot" # # Actions to be executed when the watchers don't timely report to the sbd # master process or one of the watchers detects that the master process # has died. # # Set timeout-action to comma-separated combination of # noflush|flush plus reboot|crashdump|off. # If just one of both is given the other stays at the default. # # This doesn't affect actions like off, crashdump, reboot explicitly # triggered via message slots. # And it does as well not configure the action a watchdog would # trigger should it run off (there is no generic interface). # SBD_TIMEOUT_ACTION=flush,reboot ## Type: yesno / auto ## Default: auto # # If CPUAccounting is enabled default is not to assign any RT-budget # to the system.slice which prevents sbd from running RR-scheduled. # # One way to escape that issue is to move sbd-processes from the # slice they were originally started to root-slice. # Of course starting sbd in a certain slice might be intentional. # Thus in auto-mode sbd will check if the slice has RT-budget assigned. # If that is the case sbd will stay in that slice while it will # be moved to root-slice otherwise. # SBD_MOVE_TO_ROOT_CGROUP=auto +## Type: yesno +## Default: no +# +# If resource startup syncing is enabled then pacemakerd is +# going to wait to be pinged via IPC before it starts resources. 
+# On shutdown pacemakerd is going to wait in a state where it +# has cleanly shut down resources until sbd fetches that state. +# +# Default is 'no' to prevent pacemaker from waiting for a +# ping that will never come when working together with an sbd +# version that doesn't support the feature. +# +SBD_SYNC_RESOURCE_STARTUP=no + ## Type: string ## Default: "" # # Additional options for starting sbd # SBD_OPTS=