diff --git a/src/sbd-common.c b/src/sbd-common.c index f9827b9..36d3710 100644 --- a/src/sbd-common.c +++ b/src/sbd-common.c @@ -1,1265 +1,594 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "sbd.h" #include #include #include #ifdef _POSIX_MEMLOCK # include #endif -/* These have to match the values in the header of the partition */ -static char sbd_magic[8] = "SBD_SBD_"; -static char sbd_version = 0x02; - /* Tunable defaults: */ unsigned long timeout_watchdog = 5; unsigned long timeout_watchdog_warn = 3; int timeout_allocate = 2; int timeout_loop = 1; int timeout_msgwait = 10; int timeout_io = 3; int timeout_startup = 120; int watchdog_use = 1; int watchdog_set_timeout = 1; unsigned long timeout_watchdog_crashdump = 240; int skip_rt = 0; -int check_pcmk = 0; int debug = 0; int debug_mode = 0; const char *watchdogdev = "/dev/watchdog"; char * local_uname; /* Global, non-tunable variables: */ int sector_size = 0; int watchdogfd = -1; /*const char *devname;*/ const char *cmdname; void usage(void) { fprintf(stderr, "Shared storage fencing tool.\n" "Syntax:\n" " %s \n" "Options:\n" "-d Block device to use (mandatory; can be specified up to 3 times)\n" "-h Display this help.\n" "-n Set local node name; defaults to uname -n (optional)\n" "\n" "-R Do NOT enable realtime 
priority (debugging only)\n" "-W Use watchdog (recommended) (watch only)\n" "-w Specify watchdog device (optional) (watch only)\n" "-T Do NOT initialize the watchdog timeout (watch only)\n" "-S <0|1> Set start mode if the node was previously fenced (watch only)\n" "-p Write pidfile to the specified path (watch only)\n" "-v Enable some verbose debug logging (optional)\n" "\n" "-1 Set watchdog timeout to N seconds (optional, create only)\n" "-2 Set slot allocation timeout to N seconds (optional, create only)\n" "-3 Set daemon loop timeout to N seconds (optional, create only)\n" "-4 Set msgwait timeout to N seconds (optional, create only)\n" "-5 Warn if loop latency exceeds threshold (optional, watch only)\n" " (default is 3, set to 0 to disable)\n" "-C Watchdog timeout to set before crashdumping (def: 240s, optional)\n" "-I Async IO read timeout (defaults to 3 * loop timeout, optional)\n" "-s Timeout to wait for devices to become available (def: 120s)\n" "-t Dampening delay before faulty servants are restarted (optional)\n" " (default is 5, set to 0 to disable)\n" "-F # of failures before a servant is considered faulty (optional)\n" " (default is 1, set to 0 to disable)\n" "-P Check Pacemaker quorum and node health (optional, watch only)\n" "-Z Enable trace mode. 
WARNING: UNSAFE FOR PRODUCTION!\n" "Commands:\n" "create initialize N slots on - OVERWRITES DEVICE!\n" "list List all allocated slots on device, and messages.\n" "dump Dump meta-data header from device.\n" "watch Loop forever, monitoring own slot\n" "allocate \n" " Allocate a slot for node (optional)\n" "message (test|reset|off|clear|exit)\n" " Writes the specified message to node's slot.\n" , cmdname); } int watchdog_init_interval(void) { int timeout = timeout_watchdog; if (watchdogfd < 0) { return 0; } if (watchdog_set_timeout == 0) { cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!"); return 0; } if (ioctl(watchdogfd, WDIOC_SETTIMEOUT, &timeout) < 0) { cl_perror( "WDIOC_SETTIMEOUT" ": Failed to set watchdog timer to %u seconds.", timeout); cl_log(LOG_CRIT, "Please validate your watchdog configuration!"); cl_log(LOG_CRIT, "Choose a different watchdog driver or specify -T to skip this if you are completely sure."); return -1; } else { cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", timeout); } return 0; } int watchdog_tickle(void) { if (watchdogfd >= 0) { if (write(watchdogfd, "", 1) != 1) { cl_perror("Watchdog write failure: %s!", watchdogdev); return -1; } } return 0; } int watchdog_init(void) { if (watchdogfd < 0 && watchdogdev != NULL) { watchdogfd = open(watchdogdev, O_WRONLY); if (watchdogfd >= 0) { cl_log(LOG_NOTICE, "Using watchdog device: %s", watchdogdev); if ((watchdog_init_interval() < 0) || (watchdog_tickle() < 0)) { return -1; } }else{ cl_perror("Cannot open watchdog device: %s", watchdogdev); return -1; } } return 0; } void watchdog_close(bool disarm) { if (watchdogfd < 0) { return; } if (disarm) { int r; int flags = WDIOS_DISABLECARD;; /* Explicitly disarm it */ r = ioctl(watchdogfd, WDIOC_SETOPTIONS, &flags); if (r < 0) { cl_perror("Failed to disable hardware watchdog %s", watchdogdev); } /* To be sure, use magic close logic, too */ for (;;) { if (write(watchdogfd, "V", 1) > 0) { break; } cl_perror("Cannot disable 
watchdog device %s", watchdogdev); } } if (close(watchdogfd) < 0) { cl_perror("Watchdog close(%d) failed", watchdogfd); } watchdogfd = -1; } /* This duplicates some code from linux/ioprio.h since these are not included * even in linux-kernel-headers. Sucks. See also * /usr/src/linux/Documentation/block/ioprio.txt and ioprio_set(2) */ extern int sys_ioprio_set(int, int, int); int ioprio_set(int which, int who, int ioprio); inline int ioprio_set(int which, int who, int ioprio) { return syscall(__NR_ioprio_set, which, who, ioprio); } enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE, }; enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, }; #define IOPRIO_BITS (16) #define IOPRIO_CLASS_SHIFT (13) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) #define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) #define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) #define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) static unsigned char sbd_stack_hogger(unsigned char * inbuf, int kbytes) { unsigned char buf[1024]; if(kbytes <= 0) { return HOG_CHAR; } if (inbuf == NULL) { memset(buf, HOG_CHAR, sizeof(buf)); } else { memcpy(buf, inbuf, sizeof(buf)); } if (kbytes > 0) { return sbd_stack_hogger(buf, kbytes-1); } else { return buf[sizeof(buf)-1]; } } static void sbd_malloc_hogger(int kbytes) { int j; void**chunks; int chunksize = 1024; if(kbytes <= 0) { return; } /* * We could call mallopt(M_MMAP_MAX, 0) to disable it completely, * but we've already called mlockall() * * We could also call mallopt(M_TRIM_THRESHOLD, -1) to prevent malloc * from giving memory back to the system, but we've already called * mlockall(MCL_FUTURE), so there's no need. 
*/ chunks = malloc(kbytes * sizeof(void *)); if (chunks == NULL) { cl_log(LOG_WARNING, "Could not preallocate chunk array"); return; } for (j=0; j < kbytes; ++j) { chunks[j] = malloc(chunksize); if (chunks[j] == NULL) { cl_log(LOG_WARNING, "Could not preallocate block %d", j); } else { memset(chunks[j], 0, chunksize); } } for (j=0; j < kbytes; ++j) { free(chunks[j]); } free(chunks); } static void sbd_memlock(int stackgrowK, int heapgrowK) { #ifdef _POSIX_MEMLOCK /* * We could call setrlimit(RLIMIT_MEMLOCK,...) with a large * number, but the mcp runs as root and mlock(2) says: * * Since Linux 2.6.9, no limits are placed on the amount of memory * that a privileged process may lock, and this limit instead * governs the amount of memory that an unprivileged process may * lock. */ if (mlockall(MCL_CURRENT|MCL_FUTURE) >= 0) { cl_log(LOG_INFO, "Locked ourselves in memory"); /* Now allocate some extra pages (MCL_FUTURE will ensure they stay around) */ sbd_malloc_hogger(heapgrowK); sbd_stack_hogger(NULL, stackgrowK); } else { cl_perror("Unable to lock ourselves into memory"); } #else cl_log(LOG_ERR, "Unable to lock ourselves into memory"); #endif } void sbd_make_realtime(int priority, int stackgrowK, int heapgrowK) { if(priority < 0) { return; } #ifdef SCHED_RR { int pcurrent = 0; int pmin = sched_get_priority_min(SCHED_RR); int pmax = sched_get_priority_max(SCHED_RR); if (priority == 0) { priority = pmax; } else if (priority < pmin) { priority = pmin; } else if (priority > pmax) { priority = pmax; } pcurrent = sched_getscheduler(0); if (pcurrent < 0) { cl_perror("Unable to get scheduler priority"); } else if(pcurrent < priority) { struct sched_param sp; memset(&sp, 0, sizeof(sp)); sp.sched_priority = priority; if (sched_setscheduler(0, SCHED_RR, &sp) < 0) { cl_perror("Unable to set scheduler priority to %d", priority); } else { cl_log(LOG_INFO, "Scheduler priority is now %d", priority); } } } #else cl_log(LOG_ERR, "System does not support updating the scheduler priority"); 
#endif sbd_memlock(heapgrowK, stackgrowK); } void maximize_priority(void) { if (skip_rt) { cl_log(LOG_INFO, "Not elevating to realtime (-R specified)."); return; } sbd_make_realtime(0, 256, 256); if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 1)) != 0) { cl_perror("ioprio_set() call failed."); } } - -void -close_device(struct sbd_context *st) -{ - close(st->devfd); - free(st); -} - -struct sbd_context * -open_device(const char* devname, int loglevel) -{ - struct sbd_context *st; - - if (!devname) - return NULL; - - st = malloc(sizeof(struct sbd_context)); - if (!st) - return NULL; - memset(st, 0, sizeof(struct sbd_context)); - - if (io_setup(1, &st->ioctx) != 0) { - cl_perror("io_setup failed"); - free(st); - return NULL; - } - - st->devfd = open(devname, O_SYNC|O_RDWR|O_DIRECT); - - if (st->devfd == -1) { - if (loglevel == LOG_DEBUG) { - DBGLOG(loglevel, "Opening device %s failed.", devname); - } else { - cl_log(loglevel, "Opening device %s failed.", devname); - } - free(st); - return NULL; - } - - ioctl(st->devfd, BLKSSZGET, §or_size); - - if (sector_size == 0) { - cl_perror("Get sector size failed.\n"); - close_device(st); - return NULL; - } - - return st; -} - -signed char -cmd2char(const char *cmd) -{ - if (strcmp("clear", cmd) == 0) { - return SBD_MSG_EMPTY; - } else if (strcmp("test", cmd) == 0) { - return SBD_MSG_TEST; - } else if (strcmp("reset", cmd) == 0) { - return SBD_MSG_RESET; - } else if (strcmp("off", cmd) == 0) { - return SBD_MSG_OFF; - } else if (strcmp("exit", cmd) == 0) { - return SBD_MSG_EXIT; - } else if (strcmp("crashdump", cmd) == 0) { - return SBD_MSG_CRASHDUMP; - } - return -1; -} - -void * -sector_alloc(void) -{ - void *x; - - x = valloc(sector_size); - if (!x) { - exit(1); - } - memset(x, 0, sector_size); - - return x; -} - -const char* -char2cmd(const char cmd) -{ - switch (cmd) { - case SBD_MSG_EMPTY: - return "clear"; - break; - case SBD_MSG_TEST: - return "test"; - break; - case SBD_MSG_RESET: - 
return "reset"; - break; - case SBD_MSG_OFF: - return "off"; - break; - case SBD_MSG_EXIT: - return "exit"; - break; - case SBD_MSG_CRASHDUMP: - return "crashdump"; - break; - default: - return "undefined"; - break; - } -} - -static int -sector_io(struct sbd_context *st, int sector, void *data, int rw) -{ - struct timespec timeout; - struct io_event event; - struct iocb *ios[1] = { &st->io }; - long r; - - timeout.tv_sec = timeout_io; - timeout.tv_nsec = 0; - - memset(&st->io, 0, sizeof(struct iocb)); - if (rw) { - io_prep_pwrite(&st->io, st->devfd, data, sector_size, sector_size * sector); - } else { - io_prep_pread(&st->io, st->devfd, data, sector_size, sector_size * sector); - } - - if (io_submit(st->ioctx, 1, ios) != 1) { - cl_log(LOG_ERR, "Failed to submit IO request! (rw=%d)", rw); - return -1; - } - - errno = 0; - r = io_getevents(st->ioctx, 1L, 1L, &event, &timeout); - - if (r < 0 ) { - cl_log(LOG_ERR, "Failed to retrieve IO events (rw=%d)", rw); - return -1; - } else if (r < 1L) { - cl_log(LOG_INFO, "Cancelling IO request due to timeout (rw=%d)", rw); - r = io_cancel(st->ioctx, ios[0], &event); - if (r) { - DBGLOG(LOG_INFO, "Could not cancel IO request (rw=%d)", rw); - /* Doesn't really matter, debugging information. 
- */ - } - return -1; - } else if (r > 1L) { - cl_log(LOG_ERR, "More than one IO was returned (r=%ld)", r); - return -1; - } - - - /* IO is happy */ - if (event.res == sector_size) { - return 0; - } else { - cl_log(LOG_ERR, "Short IO (rw=%d, res=%lu, sector_size=%d)", - rw, event.res, sector_size); - return -1; - } -} - -int -sector_write(struct sbd_context *st, int sector, void *data) -{ - return sector_io(st, sector, data, 1); -} - -int -sector_read(struct sbd_context *st, int sector, void *data) -{ - return sector_io(st, sector, data, 0); -} - -int -slot_read(struct sbd_context *st, int slot, struct sector_node_s *s_node) -{ - return sector_read(st, SLOT_TO_SECTOR(slot), s_node); -} - -int -slot_write(struct sbd_context *st, int slot, struct sector_node_s *s_node) -{ - return sector_write(st, SLOT_TO_SECTOR(slot), s_node); -} - -int -mbox_write(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox) -{ - return sector_write(st, MBOX_TO_SECTOR(mbox), s_mbox); -} - -int -mbox_read(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox) -{ - return sector_read(st, MBOX_TO_SECTOR(mbox), s_mbox); -} - -int -mbox_write_verify(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox) -{ - void *data; - int rc = 0; - - if (sector_write(st, MBOX_TO_SECTOR(mbox), s_mbox) < 0) - return -1; - - data = sector_alloc(); - if (sector_read(st, MBOX_TO_SECTOR(mbox), data) < 0) { - rc = -1; - goto out; - } - - - if (memcmp(s_mbox, data, sector_size) != 0) { - cl_log(LOG_ERR, "Write verification failed!"); - rc = -1; - goto out; - } - rc = 0; -out: - free(data); - return rc; -} - -int header_write(struct sbd_context *st, struct sector_header_s *s_header) -{ - s_header->sector_size = htonl(s_header->sector_size); - s_header->timeout_watchdog = htonl(s_header->timeout_watchdog); - s_header->timeout_allocate = htonl(s_header->timeout_allocate); - s_header->timeout_loop = htonl(s_header->timeout_loop); - s_header->timeout_msgwait = 
htonl(s_header->timeout_msgwait); - return sector_write(st, 0, s_header); -} - -int -header_read(struct sbd_context *st, struct sector_header_s *s_header) -{ - if (sector_read(st, 0, s_header) < 0) - return -1; - - s_header->sector_size = ntohl(s_header->sector_size); - s_header->timeout_watchdog = ntohl(s_header->timeout_watchdog); - s_header->timeout_allocate = ntohl(s_header->timeout_allocate); - s_header->timeout_loop = ntohl(s_header->timeout_loop); - s_header->timeout_msgwait = ntohl(s_header->timeout_msgwait); - /* This sets the global defaults: */ - timeout_watchdog = s_header->timeout_watchdog; - timeout_allocate = s_header->timeout_allocate; - timeout_loop = s_header->timeout_loop; - timeout_msgwait = s_header->timeout_msgwait; - - return 0; -} - -int -valid_header(const struct sector_header_s *s_header) -{ - if (memcmp(s_header->magic, sbd_magic, sizeof(s_header->magic)) != 0) { - cl_log(LOG_ERR, "Header magic does not match."); - return -1; - } - if (s_header->version != sbd_version) { - cl_log(LOG_ERR, "Header version does not match."); - return -1; - } - if (s_header->sector_size != sector_size) { - cl_log(LOG_ERR, "Header sector size does not match."); - return -1; - } - return 0; -} - -struct sector_header_s * -header_get(struct sbd_context *st) -{ - struct sector_header_s *s_header; - s_header = sector_alloc(); - - if (header_read(st, s_header) < 0) { - cl_log(LOG_ERR, "Unable to read header from device %d", st->devfd); - return NULL; - } - - if (valid_header(s_header) < 0) { - cl_log(LOG_ERR, "header on device %d is not valid.", st->devfd); - return NULL; - } - - /* cl_log(LOG_INFO, "Found version %d header with %d slots", - s_header->version, s_header->slots); */ - - return s_header; -} - -int -init_device(struct sbd_context *st) -{ - struct sector_header_s *s_header; - struct sector_node_s *s_node; - struct sector_mbox_s *s_mbox; - struct stat s; - char uuid[37]; - int i; - int rc = 0; - - s_header = sector_alloc(); - s_node = sector_alloc(); - 
s_mbox = sector_alloc(); - memcpy(s_header->magic, sbd_magic, sizeof(s_header->magic)); - s_header->version = sbd_version; - s_header->slots = 255; - s_header->sector_size = sector_size; - s_header->timeout_watchdog = timeout_watchdog; - s_header->timeout_allocate = timeout_allocate; - s_header->timeout_loop = timeout_loop; - s_header->timeout_msgwait = timeout_msgwait; - - s_header->minor_version = 1; - uuid_generate(s_header->uuid); - uuid_unparse_lower(s_header->uuid, uuid); - - fstat(st->devfd, &s); - /* printf("st_size = %ld, st_blksize = %ld, st_blocks = %ld\n", - s.st_size, s.st_blksize, s.st_blocks); */ - - cl_log(LOG_INFO, "Creating version %d.%d header on device %d (uuid: %s)", - s_header->version, s_header->minor_version, - st->devfd, uuid); - fprintf(stdout, "Creating version %d.%d header on device %d (uuid: %s)\n", - s_header->version, s_header->minor_version, - st->devfd, uuid); - if (header_write(st, s_header) < 0) { - rc = -1; goto out; - } - cl_log(LOG_INFO, "Initializing %d slots on device %d", - s_header->slots, - st->devfd); - fprintf(stdout, "Initializing %d slots on device %d\n", - s_header->slots, - st->devfd); - for (i=0;i < s_header->slots;i++) { - if (slot_write(st, i, s_node) < 0) { - rc = -1; goto out; - } - if (mbox_write(st, i, s_mbox) < 0) { - rc = -1; goto out; - } - } - -out: free(s_node); - free(s_header); - free(s_mbox); - return(rc); -} - -/* Check if there already is a slot allocated to said name; returns the - * slot number. If not found, returns -1. - * This is necessary because slots might not be continuous. 
*/ -int -slot_lookup(struct sbd_context *st, const struct sector_header_s *s_header, const char *name) -{ - struct sector_node_s *s_node = NULL; - int i; - int rc = -1; - - if (!name) { - cl_log(LOG_ERR, "slot_lookup(): No name specified.\n"); - goto out; - } - - s_node = sector_alloc(); - - for (i=0; i < s_header->slots; i++) { - if (slot_read(st, i, s_node) < 0) { - rc = -2; goto out; - } - if (s_node->in_use != 0) { - if (strncasecmp(s_node->name, name, - sizeof(s_node->name)) == 0) { - DBGLOG(LOG_INFO, "%s owns slot %d", name, i); - rc = i; goto out; - } - } - } - -out: free(s_node); - return rc; -} - -int -slot_unused(struct sbd_context *st, const struct sector_header_s *s_header) -{ - struct sector_node_s *s_node; - int i; - int rc = -1; - - s_node = sector_alloc(); - - for (i=0; i < s_header->slots; i++) { - if (slot_read(st, i, s_node) < 0) { - rc = -1; goto out; - } - if (s_node->in_use == 0) { - rc = i; goto out; - } - } - -out: free(s_node); - return rc; -} - - -int -slot_allocate(struct sbd_context *st, const char *name) -{ - struct sector_header_s *s_header = NULL; - struct sector_node_s *s_node = NULL; - struct sector_mbox_s *s_mbox = NULL; - int i; - int rc = 0; - - if (!name) { - cl_log(LOG_ERR, "slot_allocate(): No name specified.\n"); - fprintf(stderr, "slot_allocate(): No name specified.\n"); - rc = -1; goto out; - } - - s_header = header_get(st); - if (!s_header) { - rc = -1; goto out; - } - - s_node = sector_alloc(); - s_mbox = sector_alloc(); - - while (1) { - i = slot_lookup(st, s_header, name); - if ((i >= 0) || (i == -2)) { - /* -1 is "no slot found", in which case we - * proceed to allocate a new one. 
- * -2 is "read error during lookup", in which - * case we error out too - * >= 0 is "slot already allocated" */ - rc = i; goto out; - } - - i = slot_unused(st, s_header); - if (i >= 0) { - cl_log(LOG_INFO, "slot %d is unused - trying to own", i); - fprintf(stdout, "slot %d is unused - trying to own\n", i); - memset(s_node, 0, sizeof(*s_node)); - s_node->in_use = 1; - strncpy(s_node->name, name, sizeof(s_node->name)); - if (slot_write(st, i, s_node) < 0) { - rc = -1; goto out; - } - sleep(timeout_allocate); - } else { - cl_log(LOG_ERR, "No more free slots."); - fprintf(stderr, "No more free slots.\n"); - rc = -1; goto out; - } - } - -out: free(s_node); - free(s_header); - free(s_mbox); - return(rc); -} - -int -slot_list(struct sbd_context *st) -{ - struct sector_header_s *s_header = NULL; - struct sector_node_s *s_node = NULL; - struct sector_mbox_s *s_mbox = NULL; - int i; - int rc = 0; - - s_header = header_get(st); - if (!s_header) { - rc = -1; goto out; - } - - s_node = sector_alloc(); - s_mbox = sector_alloc(); - - for (i=0; i < s_header->slots; i++) { - if (slot_read(st, i, s_node) < 0) { - rc = -1; goto out; - } - if (s_node->in_use > 0) { - if (mbox_read(st, i, s_mbox) < 0) { - rc = -1; goto out; - } - printf("%d\t%s\t%s\t%s\n", - i, s_node->name, char2cmd(s_mbox->cmd), - s_mbox->from); - } - } - -out: free(s_node); - free(s_header); - free(s_mbox); - return rc; -} - -int -slot_msg(struct sbd_context *st, const char *name, const char *cmd) -{ - struct sector_header_s *s_header = NULL; - struct sector_mbox_s *s_mbox = NULL; - int mbox; - int rc = 0; - char uuid[37]; - - if (!name || !cmd) { - cl_log(LOG_ERR, "slot_msg(): No recipient / cmd specified.\n"); - rc = -1; goto out; - } - - s_header = header_get(st); - if (!s_header) { - rc = -1; goto out; - } - - if (strcmp(name, "LOCAL") == 0) { - name = local_uname; - } - - if (s_header->minor_version > 0) { - uuid_unparse_lower(s_header->uuid, uuid); - cl_log(LOG_INFO, "Device UUID: %s", uuid); - } - - mbox = 
slot_lookup(st, s_header, name); - if (mbox < 0) { - cl_log(LOG_ERR, "slot_msg(): No slot found for %s.", name); - rc = -1; goto out; - } - - s_mbox = sector_alloc(); - - s_mbox->cmd = cmd2char(cmd); - if (s_mbox->cmd < 0) { - cl_log(LOG_ERR, "slot_msg(): Invalid command %s.", cmd); - rc = -1; goto out; - } - - strncpy(s_mbox->from, local_uname, sizeof(s_mbox->from)-1); - - cl_log(LOG_INFO, "Writing %s to node slot %s", - cmd, name); - if (mbox_write_verify(st, mbox, s_mbox) < -1) { - rc = -1; goto out; - } - if (strcasecmp(cmd, "exit") != 0) { - cl_log(LOG_INFO, "Messaging delay: %d", - (int)timeout_msgwait); - sleep(timeout_msgwait); - } - cl_log(LOG_INFO, "%s successfully delivered to %s", - cmd, name); - -out: free(s_mbox); - free(s_header); - return rc; -} - -int -slot_ping(struct sbd_context *st, const char *name) -{ - struct sector_header_s *s_header = NULL; - struct sector_mbox_s *s_mbox = NULL; - int mbox; - int waited = 0; - int rc = 0; - - if (!name) { - cl_log(LOG_ERR, "slot_ping(): No recipient specified.\n"); - rc = -1; goto out; - } - - s_header = header_get(st); - if (!s_header) { - rc = -1; goto out; - } - - if (strcmp(name, "LOCAL") == 0) { - name = local_uname; - } - - mbox = slot_lookup(st, s_header, name); - if (mbox < 0) { - cl_log(LOG_ERR, "slot_msg(): No slot found for %s.", name); - rc = -1; goto out; - } - - s_mbox = sector_alloc(); - s_mbox->cmd = SBD_MSG_TEST; - - strncpy(s_mbox->from, local_uname, sizeof(s_mbox->from)-1); - - DBGLOG(LOG_DEBUG, "Pinging node %s", name); - if (mbox_write(st, mbox, s_mbox) < -1) { - rc = -1; goto out; - } - - rc = -1; - while (waited <= timeout_msgwait) { - if (mbox_read(st, mbox, s_mbox) < 0) - break; - if (s_mbox->cmd != SBD_MSG_TEST) { - rc = 0; - break; - } - sleep(1); - waited++; - } - - if (rc == 0) { - cl_log(LOG_DEBUG, "%s successfully pinged.", name); - } else { - cl_log(LOG_ERR, "%s failed to ping.", name); - } - -out: free(s_mbox); - free(s_header); - return rc; -} - void sysrq_init(void) { 
FILE* procf; int c; procf = fopen("/proc/sys/kernel/sysrq", "r"); if (!procf) { cl_perror("cannot open /proc/sys/kernel/sysrq for read."); return; } if (fscanf(procf, "%d", &c) != 1) { cl_perror("Parsing sysrq failed"); c = 0; } fclose(procf); if (c == 1) return; /* 8 for debugging dumps of processes, 128 for reboot/poweroff */ c |= 136; procf = fopen("/proc/sys/kernel/sysrq", "w"); if (!procf) { cl_perror("cannot open /proc/sys/kernel/sysrq for writing"); return; } fprintf(procf, "%d", c); fclose(procf); return; } void sysrq_trigger(char t) { FILE *procf; procf = fopen("/proc/sysrq-trigger", "a"); if (!procf) { cl_perror("Opening sysrq-trigger failed."); return; } cl_log(LOG_INFO, "sysrq-trigger: %c\n", t); fprintf(procf, "%c\n", t); fclose(procf); return; } static void do_exit(char kind) { /* TODO: Turn debug_mode into a bit field? Delay + kdump for example */ const char *reason = NULL; if (kind == 'c') { cl_log(LOG_NOTICE, "Initiating kdump"); } else if (debug_mode == 1) { cl_log(LOG_WARNING, "Initiating kdump instead of panicing the node (debug mode)"); kind = 'c'; } if (debug_mode == 2) { cl_log(LOG_WARNING, "Shutting down SBD instead of panicing the node (debug mode)"); watchdog_close(true); exit(0); } if (debug_mode == 3) { /* Give the system some time to flush logs to disk before rebooting. 
*/ cl_log(LOG_WARNING, "Delaying node panic by 10s (debug mode)"); watchdog_close(true); sync(); sleep(10); } switch(kind) { case 'b': reason = "reboot"; break; case 'c': reason = "crashdump"; break; case 'o': reason = "off"; break; default: reason = "unknown"; break; } cl_log(LOG_EMERG, "Rebooting system: %s", reason); sync(); if(kind == 'c') { watchdog_close(true); sysrq_trigger(kind); } else { watchdog_close(false); sysrq_trigger(kind); if(reboot(RB_AUTOBOOT) < 0) { cl_perror("Reboot failed"); } } exit(1); } void do_crashdump(void) { do_exit('c'); } void do_reset(void) { do_exit('b'); } void do_off(void) { do_exit('o'); } /* * Change directory to the directory our core file needs to go in * Call after you establish the userid you're running under. */ int sbd_cdtocoredir(void) { int rc; struct passwd* pwent; static const char *dir = NULL; if (dir == NULL) { dir = HA_COREDIR; } if ((rc=chdir(dir)) < 0) { int errsave = errno; cl_perror("Cannot chdir to [%s]", dir); errno = errsave; return rc; } pwent = getpwuid(getuid()); if (pwent == NULL) { int errsave = errno; cl_perror("Cannot get name for uid [%d]", getuid()); errno = errsave; return -1; } if ((rc=chdir(pwent->pw_name)) < 0) { int errsave = errno; cl_perror("Cannot chdir to [%s/%s]", dir, pwent->pw_name); errno = errsave; } return rc; } pid_t make_daemon(void) { pid_t pid; const char * devnull = "/dev/null"; pid = fork(); if (pid < 0) { cl_log(LOG_ERR, "%s: could not start daemon\n", cmdname); cl_perror("fork"); exit(1); }else if (pid > 0) { return pid; } qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE); /* This is the child; ensure privileges have not been lost. 
*/ maximize_priority(); sysrq_init(); umask(022); close(0); (void)open(devnull, O_RDONLY); close(1); (void)open(devnull, O_WRONLY); close(2); (void)open(devnull, O_WRONLY); sbd_cdtocoredir(); return 0; } -int -header_dump(struct sbd_context *st) -{ - struct sector_header_s *s_header; - char uuid[37]; - - s_header = header_get(st); - if (s_header == NULL) - return -1; - - printf("Header version : %u.%u\n", s_header->version, - s_header->minor_version); - if (s_header->minor_version > 0) { - uuid_unparse_lower(s_header->uuid, uuid); - printf("UUID : %s\n", uuid); - } - - printf("Number of slots : %u\n", s_header->slots); - printf("Sector size : %lu\n", - (unsigned long)s_header->sector_size); - printf("Timeout (watchdog) : %lu\n", - (unsigned long)s_header->timeout_watchdog); - printf("Timeout (allocate) : %lu\n", - (unsigned long)s_header->timeout_allocate); - printf("Timeout (loop) : %lu\n", - (unsigned long)s_header->timeout_loop); - printf("Timeout (msgwait) : %lu\n", - (unsigned long)s_header->timeout_msgwait); - return 0; -} - void sbd_get_uname(void) { struct utsname uname_buf; int i; if (uname(&uname_buf) < 0) { cl_perror("uname() failed?"); exit(1); } local_uname = strdup(uname_buf.nodename); for (i = 0; i < strlen(local_uname); i++) local_uname[i] = tolower(local_uname[i]); } diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c index cdfa226..aa40050 100644 --- a/src/sbd-inquisitor.c +++ b/src/sbd-inquisitor.c @@ -1,807 +1,804 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "sbd.h" #define LOCKSTRLEN 11 -struct servants_list_item *servants_leader = NULL; - -extern int check_pcmk; +static struct servants_list_item *servants_leader = NULL; +int check_pcmk = 0; int servant_count = 0; int servant_restart_interval = 5; int servant_restart_count = 1; int start_mode = 0; char* pidfile = NULL; void recruit_servant(const char *devname, pid_t pid) { struct servants_list_item *s = servants_leader; struct servants_list_item *newbie; newbie = malloc(sizeof(*newbie)); if (!newbie) { fprintf(stderr, "malloc failed in recruit_servant.\n"); exit(1); } memset(newbie, 0, sizeof(*newbie)); newbie->devname = strdup(devname); newbie->pid = pid; newbie->first_start = 1; if (!s) { servants_leader = newbie; } else { while (s->next) s = s->next; s->next = newbie; } servant_count++; } int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp) { pid_t pid = 0; int rc = 0; pid = fork(); if (pid == 0) { /* child */ maximize_priority(); rc = (*functionp)(devname, mode, argp); if (rc == -1) exit(1); else exit(0); } else if (pid != -1) { /* parent */ return pid; } else { cl_log(LOG_ERR,"Failed to fork servant"); exit(1); } } struct servants_list_item *lookup_servant_by_dev(const char *devname) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { if (strncasecmp(s->devname, devname, strlen(s->devname))) break; } return s; } struct servants_list_item *lookup_servant_by_pid(pid_t pid) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { if (s->pid == pid) break; } return s; } int check_all_dead(void) { struct servants_list_item *s; int r = 0; union sigval svalue; for (s = servants_leader; s; s = s->next) { if (s->pid != 0) { 
r = sigqueue(s->pid, 0, svalue); if (r == -1 && errno == ESRCH) continue; return 0; } } return 1; } - void servant_start(struct servants_list_item *s) { int r = 0; union sigval svalue; if (s->pid != 0) { r = sigqueue(s->pid, 0, svalue); if ((r != -1 || errno != ESRCH)) return; } s->restarts++; if (strcmp("pcmk",s->devname) == 0) { DBGLOG(LOG_INFO, "Starting Pacemaker servant"); s->pid = assign_servant(s->devname, servant_pcmk, start_mode, NULL); } else { - DBGLOG(LOG_INFO, "Starting servant for device %s", - s->devname); + DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname); s->pid = assign_servant(s->devname, servant, start_mode, s); } clock_gettime(CLOCK_MONOTONIC, &s->t_started); return; } void servants_start(void) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { s->restarts = 0; servant_start(s); } } void servants_kill(void) { struct servants_list_item *s; union sigval svalue; for (s = servants_leader; s; s = s->next) { if (s->pid != 0) sigqueue(s->pid, SIGKILL, svalue); } } inline void cleanup_servant_by_pid(pid_t pid) { struct servants_list_item* s; s = lookup_servant_by_pid(pid); if (s) { cl_log(LOG_WARNING, "Servant for %s (pid: %i) has terminated", s->devname, s->pid); s->pid = 0; } else { /* This most likely is a stray signal from somewhere, or * a SIGCHLD for a process that has previously * explicitly disconnected. */ DBGLOG(LOG_INFO, "cleanup_servant: Nothing known about pid %i", pid); } } int inquisitor_decouple(void) { pid_t ppid = getppid(); union sigval signal_value; /* During start-up, we only arm the watchdog once we've got * quorum at least once. 
*/ if (watchdog_use) { if (watchdog_init() < 0) { return -1; } } if (ppid > 1) { sigqueue(ppid, SIG_LIVENESS, signal_value); } return 0; } static int sbd_lock_running(long pid) { int rc = 0; long mypid; int running = 0; char proc_path[PATH_MAX], exe_path[PATH_MAX], myexe_path[PATH_MAX]; /* check if pid is running */ if (kill(pid, 0) < 0 && errno == ESRCH) { goto bail; } #ifndef HAVE_PROC_PID return 1; #endif /* check to make sure pid hasn't been reused by another process */ snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", pid); rc = readlink(proc_path, exe_path, PATH_MAX-1); if(rc < 0) { cl_perror("Could not read from %s", proc_path); goto bail; } exe_path[rc] = 0; mypid = (unsigned long) getpid(); snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", mypid); rc = readlink(proc_path, myexe_path, PATH_MAX-1); if(rc < 0) { cl_perror("Could not read from %s", proc_path); goto bail; } myexe_path[rc] = 0; if(strcmp(exe_path, myexe_path) == 0) { running = 1; } bail: return running; } static int sbd_lock_pidfile(const char *filename) { char lf_name[256], tf_name[256], buf[LOCKSTRLEN+1]; int fd; long pid, mypid; int rc; struct stat sbuf; if (filename == NULL) { errno = EFAULT; return -1; } mypid = (unsigned long) getpid(); snprintf(lf_name, sizeof(lf_name), "%s",filename); snprintf(tf_name, sizeof(tf_name), "%s.%lu", filename, mypid); if ((fd = open(lf_name, O_RDONLY)) >= 0) { if (fstat(fd, &sbuf) >= 0 && sbuf.st_size < LOCKSTRLEN) { sleep(1); /* if someone was about to create one, * give'm a sec to do so * Though if they follow our protocol, * this won't happen. They should really * put the pid in, then link, not the * other way around. 
*/ } if (read(fd, buf, sizeof(buf)) < 1) { /* lockfile empty -> rm it and go on */; } else { if (sscanf(buf, "%lu", &pid) < 1) { /* lockfile screwed up -> rm it and go on */ } else { if (pid > 1 && (getpid() != pid) && sbd_lock_running(pid)) { /* is locked by existing process * -> give up */ close(fd); return -1; } else { /* stale lockfile -> rm it and go on */ } } } unlink(lf_name); close(fd); } if ((fd = open(tf_name, O_CREAT | O_WRONLY | O_EXCL, 0644)) < 0) { /* Hmmh, why did we fail? Anyway, nothing we can do about it */ return -3; } /* Slight overkill with the %*d format ;-) */ snprintf(buf, sizeof(buf), "%*lu\n", LOCKSTRLEN-1, mypid); if (write(fd, buf, LOCKSTRLEN) != LOCKSTRLEN) { /* Again, nothing we can do about this */ rc = -3; close(fd); goto out; } close(fd); switch (link(tf_name, lf_name)) { case 0: if (stat(tf_name, &sbuf) < 0) { /* something weird happened */ rc = -3; break; } if (sbuf.st_nlink < 2) { /* somehow, it didn't get through - NFS trouble? */ rc = -2; break; } rc = 0; break; case EEXIST: rc = -1; break; default: rc = -3; } out: unlink(tf_name); return rc; } /* * Unlock a file (remove its lockfile) * do we need to check, if its (still) ours? 
No, IMHO, if someone else * locked our line, it's his fault -tho * returns 0 on success * <0 if some failure occured */ static int sbd_unlock_pidfile(const char *filename) { char lf_name[256]; if (filename == NULL) { errno = EFAULT; return -1; } snprintf(lf_name, sizeof(lf_name), "%s", filename); return unlink(lf_name); } void inquisitor_child(void) { int sig, pid; sigset_t procmask; siginfo_t sinfo; int status; struct timespec timeout; int exiting = 0; int decoupled = 0; int pcmk_healthy = 0; int pcmk_override = 0; time_t latency; struct timespec t_last_tickle, t_now; struct servants_list_item* s; if (debug_mode) { cl_log(LOG_ERR, "DEBUG MODE IS ACTIVE - DO NOT RUN IN PRODUCTION!"); } set_proc_title("sbd: inquisitor"); if (pidfile) { if (sbd_lock_pidfile(pidfile) < 0) { exit(1); } } sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigaddset(&procmask, SIG_LIVENESS); sigaddset(&procmask, SIG_EXITREQ); sigaddset(&procmask, SIG_TEST); sigaddset(&procmask, SIG_IO_FAIL); sigaddset(&procmask, SIG_PCMK_UNHEALTHY); sigaddset(&procmask, SIG_RESTART); sigaddset(&procmask, SIGUSR1); sigaddset(&procmask, SIGUSR2); sigprocmask(SIG_BLOCK, &procmask, NULL); servants_start(); timeout.tv_sec = timeout_loop; timeout.tv_nsec = 0; clock_gettime(CLOCK_MONOTONIC, &t_last_tickle); while (1) { int good_servants = 0; sig = sigtimedwait(&procmask, &sinfo, &timeout); clock_gettime(CLOCK_MONOTONIC, &t_now); if (sig == SIG_EXITREQ) { servants_kill(); watchdog_close(true); exiting = 1; } else if (sig == SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } else { cleanup_servant_by_pid(pid); } } } else if (sig == SIG_PCMK_UNHEALTHY) { s = lookup_servant_by_pid(sinfo.si_pid); if (s && strcmp(s->devname, "pcmk") == 0) { if (pcmk_healthy != 0) { cl_log(LOG_WARNING, "Pacemaker health check: UNHEALTHY"); } pcmk_healthy = 0; clock_gettime(CLOCK_MONOTONIC, &s->t_last); } else { cl_log(LOG_WARNING, "Ignoring SIG_PCMK_UNHEALTHY from unknown 
source"); } } else if (sig == SIG_IO_FAIL) { s = lookup_servant_by_pid(sinfo.si_pid); if (s) { DBGLOG(LOG_INFO, "Servant for %s requests to be disowned", s->devname); cleanup_servant_by_pid(sinfo.si_pid); } } else if (sig == SIG_LIVENESS) { s = lookup_servant_by_pid(sinfo.si_pid); if (s) { if (strcmp(s->devname, "pcmk") == 0) { if (pcmk_healthy != 1) { cl_log(LOG_INFO, "Pacemaker health check: OK"); } pcmk_healthy = 1; }; s->first_start = 0; clock_gettime(CLOCK_MONOTONIC, &s->t_last); } } else if (sig == SIG_TEST) { } else if (sig == SIGUSR1) { if (exiting) continue; servants_start(); } if (exiting) { if (check_all_dead()) { if (pidfile) { sbd_unlock_pidfile(pidfile); } exit(0); } else continue; } good_servants = 0; for (s = servants_leader; s; s = s->next) { int age = t_now.tv_sec - s->t_last.tv_sec; if (!s->t_last.tv_sec) continue; if (age < (int)(timeout_io+timeout_loop)) { if (strcmp(s->devname, "pcmk") != 0) { good_servants++; } s->outdated = 0; } else if (!s->outdated) { if (strcmp(s->devname, "pcmk") == 0) { /* If the state is outdated, we * override the last reported * state */ pcmk_healthy = 0; cl_log(LOG_WARNING, "Pacemaker state outdated (age: %d)", age); } else if (!s->restart_blocked) { cl_log(LOG_WARNING, "Servant for %s outdated (age: %d)", s->devname, age); } s->outdated = 1; } } if (quorum_read(good_servants) || pcmk_healthy) { if (!decoupled) { if (inquisitor_decouple() < 0) { servants_kill(); exiting = 1; continue; } else { decoupled = 1; } } if (!quorum_read(good_servants)) { if (!pcmk_override) { cl_log(LOG_WARNING, "Majority of devices lost - surviving on pacemaker"); pcmk_override = 1; /* Just to ensure the message is only logged once */ } } else { pcmk_override = 0; } watchdog_tickle(); clock_gettime(CLOCK_MONOTONIC, &t_last_tickle); } /* Note that this can actually be negative, since we set * last_tickle after we set now. 
*/ latency = t_now.tv_sec - t_last_tickle.tv_sec; if (timeout_watchdog && (latency > (int)timeout_watchdog)) { if (!decoupled) { /* We're still being watched by our * parent. We don't fence, but exit. */ cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up."); servants_kill(); exiting = 1; continue; } if (debug_mode < 2) { /* At level 2 or above, we do nothing, but expect * things to eventually return to * normal. */ do_reset(); } else { cl_log(LOG_ERR, "SBD: DEBUG MODE: Would have fenced due to timeout!"); } } if (timeout_watchdog_warn && (latency > (int)timeout_watchdog_warn)) { cl_log(LOG_WARNING, "Latency: No liveness for %d s exceeds threshold of %d s (healthy servants: %d)", (int)latency, (int)timeout_watchdog_warn, good_servants); } for (s = servants_leader; s; s = s->next) { int age = t_now.tv_sec - s->t_started.tv_sec; if (age > servant_restart_interval) { s->restarts = 0; s->restart_blocked = 0; } if (servant_restart_count && (s->restarts >= servant_restart_count) && !s->restart_blocked) { if (servant_restart_count > 1) { cl_log(LOG_WARNING, "Max retry count (%d) reached: not restarting servant for %s", (int)servant_restart_count, s->devname); } s->restart_blocked = 1; } if (!s->restart_blocked) { servant_start(s); } } } /* not reached */ exit(0); } int inquisitor(void) { int sig, pid, inquisitor_pid; int status; sigset_t procmask; siginfo_t sinfo; /* Where's the best place for sysrq init ?*/ sysrq_init(); sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigaddset(&procmask, SIG_LIVENESS); sigprocmask(SIG_BLOCK, &procmask, NULL); inquisitor_pid = make_daemon(); if (inquisitor_pid == 0) { inquisitor_child(); } /* We're the parent. Wait for a happy signal from our child * before we proceed - we either get "SIG_LIVENESS" when the * inquisitor has completed the first successful round, or * ECHLD when it exits with an error. 
*/ while (1) { sig = sigwaitinfo(&procmask, &sinfo); if (sig == SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } /* We got here because the inquisitor * did not succeed. */ return -1; } } else if (sig == SIG_LIVENESS) { /* Inquisitor started up properly. */ return 0; } else { fprintf(stderr, "Nobody expected the spanish inquisition!\n"); continue; } } /* not reached */ return -1; } int main(int argc, char **argv, char **envp) { int exit_status = 0; int c; int w = 0; int qb_facility; if ((cmdname = strrchr(argv[0], '/')) == NULL) { cmdname = argv[0]; } else { ++cmdname; } qb_facility = qb_log_facility2int("daemon"); qb_log_init(cmdname, qb_facility, LOG_ERR); qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE); qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE); sbd_get_uname(); while ((c = getopt(argc, argv, "C:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:")) != -1) { switch (c) { case 'D': break; case 'Z': debug_mode++; cl_log(LOG_INFO, "Debug mode now at level %d", (int)debug_mode); break; case 'R': skip_rt = 1; cl_log(LOG_INFO, "Realtime mode deactivated."); break; case 'S': start_mode = atoi(optarg); cl_log(LOG_INFO, "Start mode set to: %d", (int)start_mode); break; case 's': timeout_startup = atoi(optarg); cl_log(LOG_INFO, "Start timeout set to: %d", (int)timeout_startup); break; case 'v': debug = 1; cl_log(LOG_INFO, "Verbose mode enabled."); break; case 'T': watchdog_set_timeout = 0; cl_log(LOG_INFO, "Setting watchdog timeout disabled; using defaults."); break; case 'W': w++; break; case 'w': watchdogdev = strdup(optarg); break; case 'd': recruit_servant(optarg, 0); break; case 'P': check_pcmk = 1; break; case 'n': local_uname = strdup(optarg); cl_log(LOG_INFO, "Overriding local hostname to %s", local_uname); break; case 'p': pidfile = strdup(optarg); cl_log(LOG_INFO, "pidfile set to %s", pidfile); break; case 'C': timeout_watchdog_crashdump = atoi(optarg); cl_log(LOG_INFO, "Setting crashdump watchdog 
timeout to %d", (int)timeout_watchdog_crashdump); break; case '1': timeout_watchdog = atoi(optarg); break; case '2': timeout_allocate = atoi(optarg); break; case '3': timeout_loop = atoi(optarg); break; case '4': timeout_msgwait = atoi(optarg); break; case '5': timeout_watchdog_warn = atoi(optarg); cl_log(LOG_INFO, "Setting latency warning to %d", (int)timeout_watchdog_warn); break; case 't': servant_restart_interval = atoi(optarg); cl_log(LOG_INFO, "Setting servant restart interval to %d", (int)servant_restart_interval); break; case 'I': timeout_io = atoi(optarg); cl_log(LOG_INFO, "Setting IO timeout to %d", (int)timeout_io); break; case 'F': servant_restart_count = atoi(optarg); cl_log(LOG_INFO, "Servant restart count set to %d", (int)servant_restart_count); break; case 'h': usage(); return (0); default: exit_status = -2; goto out; break; } } if (w > 0) { watchdog_use = w % 2; } if (watchdog_use) { cl_log(LOG_INFO, "Watchdog enabled."); } else { cl_log(LOG_INFO, "Watchdog disabled."); } if (servant_count < 1 || servant_count > 3) { fprintf(stderr, "You must specify 1 to 3 devices via the -d option.\n"); exit_status = -1; goto out; } /* There must at least be one command following the options: */ if ((argc - optind) < 1) { fprintf(stderr, "Not enough arguments.\n"); exit_status = -2; goto out; } if (init_set_proc_title(argc, argv, envp) < 0) { fprintf(stderr, "Allocation of proc title failed.\n"); exit_status = -1; goto out; } maximize_priority(); if (strcmp(argv[optind], "create") == 0) { exit_status = init_devices(servants_leader); } else if (strcmp(argv[optind], "dump") == 0) { exit_status = dump_headers(servants_leader); } else if (strcmp(argv[optind], "allocate") == 0) { exit_status = allocate_slots(argv[optind + 1], servants_leader); } else if (strcmp(argv[optind], "list") == 0) { exit_status = list_slots(servants_leader); } else if (strcmp(argv[optind], "message") == 0) { exit_status = messenger(argv[optind + 1], argv[optind + 2], servants_leader); } else 
if (strcmp(argv[optind], "ping") == 0) { exit_status = ping_via_slots(argv[optind + 1], servants_leader); } else if (strcmp(argv[optind], "watch") == 0) { open_any_device(servants_leader); /* We only want this to have an effect during watch right now; * pinging and fencing would be too confused */ if (check_pcmk) { recruit_servant("pcmk", 0); servant_count--; } exit_status = inquisitor(); } else { exit_status = -2; } out: if (exit_status < 0) { if (exit_status == -2) { usage(); } else { fprintf(stderr, "sbd failed; please check the logs.\n"); } return (1); } return (0); } diff --git a/src/sbd-md.c b/src/sbd-md.c index bb961a0..6d69322 100644 --- a/src/sbd-md.c +++ b/src/sbd-md.c @@ -1,552 +1,1231 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "sbd.h" +#define SBD_MSG_EMPTY 0x00 +#define SBD_MSG_TEST 0x01 +#define SBD_MSG_RESET 0x02 +#define SBD_MSG_OFF 0x03 +#define SBD_MSG_EXIT 0x04 +#define SBD_MSG_CRASHDUMP 0x05 + +#define SLOT_TO_SECTOR(slot) (1+slot*2) +#define MBOX_TO_SECTOR(mbox) (2+mbox*2) + extern int servant_count; static int servant_inform_parent = 0; +/* These have to match the values in the header of the partition */ +static char sbd_magic[8] = "SBD_SBD_"; +static char sbd_version = 0x02; + +static signed char +cmd2char(const char *cmd) +{ + if (strcmp("clear", cmd) == 0) { + return SBD_MSG_EMPTY; + } else if (strcmp("test", cmd) == 0) { + return SBD_MSG_TEST; + } else if (strcmp("reset", cmd) == 0) { + return SBD_MSG_RESET; + } else if (strcmp("off", cmd) == 0) { + return SBD_MSG_OFF; + } else if (strcmp("exit", cmd) == 0) { + return SBD_MSG_EXIT; + } else if (strcmp("crashdump", cmd) == 0) { + return SBD_MSG_CRASHDUMP; + } + return -1; +} + +static const char* +char2cmd(const char cmd) +{ + switch (cmd) { + case SBD_MSG_EMPTY: + return "clear"; + break; + case SBD_MSG_TEST: + return "test"; + break; + case SBD_MSG_RESET: + return "reset"; + break; + case SBD_MSG_OFF: + return "off"; + break; + case SBD_MSG_EXIT: + return "exit"; + break; + case SBD_MSG_CRASHDUMP: + return "crashdump"; + break; + default: + return "undefined"; + break; + } +} + +static void +close_device(struct sbd_context *st) +{ + close(st->devfd); + free(st); +} + +static struct sbd_context * +open_device(const char* devname, int loglevel) +{ + struct sbd_context *st; + + if (!devname) + return NULL; + + st = malloc(sizeof(struct sbd_context)); + if (!st) + return NULL; + memset(st, 0, sizeof(struct sbd_context)); + + if (io_setup(1, &st->ioctx) != 0) { + cl_perror("io_setup failed"); + free(st); + return 
NULL; + } + + st->devfd = open(devname, O_SYNC|O_RDWR|O_DIRECT); + + if (st->devfd == -1) { + if (loglevel == LOG_DEBUG) { + DBGLOG(loglevel, "Opening device %s failed.", devname); + } else { + cl_log(loglevel, "Opening device %s failed.", devname); + } + /* io_setup() already succeeded: release the AIO context as well */ + io_destroy(st->ioctx); + free(st); + return NULL; + } + + /* Query the device's sector size into the global sector_size */ + ioctl(st->devfd, BLKSSZGET, &sector_size); + + if (sector_size == 0) { + cl_perror("Get sector size failed.\n"); + close_device(st); + return NULL; + } + + return st; +} + +static void * +sector_alloc(void) +{ + void *x; + + x = valloc(sector_size); + if (!x) { + exit(1); + } + memset(x, 0, sector_size); + + return x; +} + +static int +sector_io(struct sbd_context *st, int sector, void *data, int rw) +{ + struct timespec timeout; + struct io_event event; + struct iocb *ios[1] = { &st->io }; + long r; + + timeout.tv_sec = timeout_io; + timeout.tv_nsec = 0; + + memset(&st->io, 0, sizeof(struct iocb)); + if (rw) { + io_prep_pwrite(&st->io, st->devfd, data, sector_size, sector_size * sector); + } else { + io_prep_pread(&st->io, st->devfd, data, sector_size, sector_size * sector); + } + + if (io_submit(st->ioctx, 1, ios) != 1) { + cl_log(LOG_ERR, "Failed to submit IO request! (rw=%d)", rw); + return -1; + } + + errno = 0; + r = io_getevents(st->ioctx, 1L, 1L, &event, &timeout); + + if (r < 0 ) { + cl_log(LOG_ERR, "Failed to retrieve IO events (rw=%d)", rw); + return -1; + } else if (r < 1L) { + cl_log(LOG_INFO, "Cancelling IO request due to timeout (rw=%d)", rw); + r = io_cancel(st->ioctx, ios[0], &event); + if (r) { + DBGLOG(LOG_INFO, "Could not cancel IO request (rw=%d)", rw); + /* Doesn't really matter, debugging information. 
+ */ + } + return -1; + } else if (r > 1L) { + cl_log(LOG_ERR, "More than one IO was returned (r=%ld)", r); + return -1; + } + + + /* IO is happy */ + if (event.res == sector_size) { + return 0; + } else { + cl_log(LOG_ERR, "Short IO (rw=%d, res=%lu, sector_size=%d)", + rw, event.res, sector_size); + return -1; + } +} + +static int +sector_write(struct sbd_context *st, int sector, void *data) +{ + return sector_io(st, sector, data, 1); +} + +static int +sector_read(struct sbd_context *st, int sector, void *data) +{ + return sector_io(st, sector, data, 0); +} + +static int +slot_read(struct sbd_context *st, int slot, struct sector_node_s *s_node) +{ + return sector_read(st, SLOT_TO_SECTOR(slot), s_node); +} + +static int +slot_write(struct sbd_context *st, int slot, struct sector_node_s *s_node) +{ + return sector_write(st, SLOT_TO_SECTOR(slot), s_node); +} + +static int +mbox_write(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox) +{ + return sector_write(st, MBOX_TO_SECTOR(mbox), s_mbox); +} + +static int +mbox_read(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox) +{ + return sector_read(st, MBOX_TO_SECTOR(mbox), s_mbox); +} + +static int +mbox_write_verify(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox) +{ + void *data; + int rc = 0; + + if (sector_write(st, MBOX_TO_SECTOR(mbox), s_mbox) < 0) + return -1; + + data = sector_alloc(); + if (sector_read(st, MBOX_TO_SECTOR(mbox), data) < 0) { + rc = -1; + goto out; + } + + + if (memcmp(s_mbox, data, sector_size) != 0) { + cl_log(LOG_ERR, "Write verification failed!"); + rc = -1; + goto out; + } + rc = 0; +out: + free(data); + return rc; +} + +static int header_write(struct sbd_context *st, struct sector_header_s *s_header) +{ + s_header->sector_size = htonl(s_header->sector_size); + s_header->timeout_watchdog = htonl(s_header->timeout_watchdog); + s_header->timeout_allocate = htonl(s_header->timeout_allocate); + s_header->timeout_loop = htonl(s_header->timeout_loop); + 
s_header->timeout_msgwait = htonl(s_header->timeout_msgwait); + return sector_write(st, 0, s_header); +} + +static int +header_read(struct sbd_context *st, struct sector_header_s *s_header) +{ + if (sector_read(st, 0, s_header) < 0) + return -1; + + s_header->sector_size = ntohl(s_header->sector_size); + s_header->timeout_watchdog = ntohl(s_header->timeout_watchdog); + s_header->timeout_allocate = ntohl(s_header->timeout_allocate); + s_header->timeout_loop = ntohl(s_header->timeout_loop); + s_header->timeout_msgwait = ntohl(s_header->timeout_msgwait); + /* This sets the global defaults: */ + timeout_watchdog = s_header->timeout_watchdog; + timeout_allocate = s_header->timeout_allocate; + timeout_loop = s_header->timeout_loop; + timeout_msgwait = s_header->timeout_msgwait; + + return 0; +} + +static int +valid_header(const struct sector_header_s *s_header) +{ + if (memcmp(s_header->magic, sbd_magic, sizeof(s_header->magic)) != 0) { + cl_log(LOG_ERR, "Header magic does not match."); + return -1; + } + if (s_header->version != sbd_version) { + cl_log(LOG_ERR, "Header version does not match."); + return -1; + } + if (s_header->sector_size != sector_size) { + cl_log(LOG_ERR, "Header sector size does not match."); + return -1; + } + return 0; +} + +/* Read the header sector from the device and validate it. + * Returns a buffer from sector_alloc() that the caller must free(), + * or NULL if the header could not be read or is not valid. */ +static struct sector_header_s * +header_get(struct sbd_context *st) +{ + struct sector_header_s *s_header; + s_header = sector_alloc(); + + if (header_read(st, s_header) < 0) { + cl_log(LOG_ERR, "Unable to read header from device %d", st->devfd); + /* do not leak the sector buffer on failure */ + free(s_header); + return NULL; + } + + if (valid_header(s_header) < 0) { + cl_log(LOG_ERR, "header on device %d is not valid.", st->devfd); + free(s_header); + return NULL; + } + + /* cl_log(LOG_INFO, "Found version %d header with %d slots", + s_header->version, s_header->slots); */ + + return s_header; +} + +static int +header_dump(struct sbd_context *st) +{ + struct sector_header_s *s_header; + char uuid[37]; + + s_header = header_get(st); + if (s_header == NULL) + return -1; + + printf("Header version : %u.%u\n", 
s_header->version, + s_header->minor_version); + if (s_header->minor_version > 0) { + uuid_unparse_lower(s_header->uuid, uuid); + printf("UUID : %s\n", uuid); + } + + printf("Number of slots : %u\n", s_header->slots); + printf("Sector size : %lu\n", + (unsigned long)s_header->sector_size); + printf("Timeout (watchdog) : %lu\n", + (unsigned long)s_header->timeout_watchdog); + printf("Timeout (allocate) : %lu\n", + (unsigned long)s_header->timeout_allocate); + printf("Timeout (loop) : %lu\n", + (unsigned long)s_header->timeout_loop); + printf("Timeout (msgwait) : %lu\n", + (unsigned long)s_header->timeout_msgwait); + return 0; +} + +static int +init_device(struct sbd_context *st) +{ + struct sector_header_s *s_header; + struct sector_node_s *s_node; + struct sector_mbox_s *s_mbox; + struct stat s; + char uuid[37]; + int i; + int rc = 0; + + s_header = sector_alloc(); + s_node = sector_alloc(); + s_mbox = sector_alloc(); + memcpy(s_header->magic, sbd_magic, sizeof(s_header->magic)); + s_header->version = sbd_version; + s_header->slots = 255; + s_header->sector_size = sector_size; + s_header->timeout_watchdog = timeout_watchdog; + s_header->timeout_allocate = timeout_allocate; + s_header->timeout_loop = timeout_loop; + s_header->timeout_msgwait = timeout_msgwait; + + s_header->minor_version = 1; + uuid_generate(s_header->uuid); + uuid_unparse_lower(s_header->uuid, uuid); + + fstat(st->devfd, &s); + /* printf("st_size = %ld, st_blksize = %ld, st_blocks = %ld\n", + s.st_size, s.st_blksize, s.st_blocks); */ + + cl_log(LOG_INFO, "Creating version %d.%d header on device %d (uuid: %s)", + s_header->version, s_header->minor_version, + st->devfd, uuid); + fprintf(stdout, "Creating version %d.%d header on device %d (uuid: %s)\n", + s_header->version, s_header->minor_version, + st->devfd, uuid); + if (header_write(st, s_header) < 0) { + rc = -1; goto out; + } + cl_log(LOG_INFO, "Initializing %d slots on device %d", + s_header->slots, + st->devfd); + fprintf(stdout, "Initializing 
%d slots on device %d\n", + s_header->slots, + st->devfd); + for (i=0;i < s_header->slots;i++) { + if (slot_write(st, i, s_node) < 0) { + rc = -1; goto out; + } + if (mbox_write(st, i, s_mbox) < 0) { + rc = -1; goto out; + } + } + +out: free(s_node); + free(s_header); + free(s_mbox); + return(rc); +} + +/* Check if there already is a slot allocated to said name; returns the + * slot number. If not found, returns -1. + * This is necessary because slots might not be continuous. */ +static int +slot_lookup(struct sbd_context *st, const struct sector_header_s *s_header, const char *name) +{ + struct sector_node_s *s_node = NULL; + int i; + int rc = -1; + + if (!name) { + cl_log(LOG_ERR, "slot_lookup(): No name specified.\n"); + goto out; + } + + s_node = sector_alloc(); + + for (i=0; i < s_header->slots; i++) { + if (slot_read(st, i, s_node) < 0) { + rc = -2; goto out; + } + if (s_node->in_use != 0) { + if (strncasecmp(s_node->name, name, + sizeof(s_node->name)) == 0) { + DBGLOG(LOG_INFO, "%s owns slot %d", name, i); + rc = i; goto out; + } + } + } + +out: free(s_node); + return rc; +} + +static int +slot_unused(struct sbd_context *st, const struct sector_header_s *s_header) +{ + struct sector_node_s *s_node; + int i; + int rc = -1; + + s_node = sector_alloc(); + + for (i=0; i < s_header->slots; i++) { + if (slot_read(st, i, s_node) < 0) { + rc = -1; goto out; + } + if (s_node->in_use == 0) { + rc = i; goto out; + } + } + +out: free(s_node); + return rc; +} + + +static int +slot_allocate(struct sbd_context *st, const char *name) +{ + struct sector_header_s *s_header = NULL; + struct sector_node_s *s_node = NULL; + struct sector_mbox_s *s_mbox = NULL; + int i; + int rc = 0; + + if (!name) { + cl_log(LOG_ERR, "slot_allocate(): No name specified.\n"); + fprintf(stderr, "slot_allocate(): No name specified.\n"); + rc = -1; goto out; + } + + s_header = header_get(st); + if (!s_header) { + rc = -1; goto out; + } + + s_node = sector_alloc(); + s_mbox = sector_alloc(); + + 
while (1) { + i = slot_lookup(st, s_header, name); + if ((i >= 0) || (i == -2)) { + /* -1 is "no slot found", in which case we + * proceed to allocate a new one. + * -2 is "read error during lookup", in which + * case we error out too + * >= 0 is "slot already allocated" */ + rc = i; goto out; + } + + i = slot_unused(st, s_header); + if (i >= 0) { + cl_log(LOG_INFO, "slot %d is unused - trying to own", i); + fprintf(stdout, "slot %d is unused - trying to own\n", i); + memset(s_node, 0, sizeof(*s_node)); + s_node->in_use = 1; + strncpy(s_node->name, name, sizeof(s_node->name)); + if (slot_write(st, i, s_node) < 0) { + rc = -1; goto out; + } + sleep(timeout_allocate); + } else { + cl_log(LOG_ERR, "No more free slots."); + fprintf(stderr, "No more free slots.\n"); + rc = -1; goto out; + } + } + +out: free(s_node); + free(s_header); + free(s_mbox); + return(rc); +} + +static int +slot_list(struct sbd_context *st) +{ + struct sector_header_s *s_header = NULL; + struct sector_node_s *s_node = NULL; + struct sector_mbox_s *s_mbox = NULL; + int i; + int rc = 0; + + s_header = header_get(st); + if (!s_header) { + rc = -1; goto out; + } + + s_node = sector_alloc(); + s_mbox = sector_alloc(); + + for (i=0; i < s_header->slots; i++) { + if (slot_read(st, i, s_node) < 0) { + rc = -1; goto out; + } + if (s_node->in_use > 0) { + if (mbox_read(st, i, s_mbox) < 0) { + rc = -1; goto out; + } + printf("%d\t%s\t%s\t%s\n", + i, s_node->name, char2cmd(s_mbox->cmd), + s_mbox->from); + } + } + +out: free(s_node); + free(s_header); + free(s_mbox); + return rc; +} + +static int +slot_msg(struct sbd_context *st, const char *name, const char *cmd) +{ + struct sector_header_s *s_header = NULL; + struct sector_mbox_s *s_mbox = NULL; + int mbox; + int rc = 0; + char uuid[37]; + + if (!name || !cmd) { + cl_log(LOG_ERR, "slot_msg(): No recipient / cmd specified.\n"); + rc = -1; goto out; + } + + s_header = header_get(st); + if (!s_header) { + rc = -1; goto out; + } + + if (strcmp(name, "LOCAL") == 
0) { + name = local_uname; + } + + if (s_header->minor_version > 0) { + uuid_unparse_lower(s_header->uuid, uuid); + cl_log(LOG_INFO, "Device UUID: %s", uuid); + } + + mbox = slot_lookup(st, s_header, name); + if (mbox < 0) { + cl_log(LOG_ERR, "slot_msg(): No slot found for %s.", name); + rc = -1; goto out; + } + + s_mbox = sector_alloc(); + + s_mbox->cmd = cmd2char(cmd); + if (s_mbox->cmd < 0) { + cl_log(LOG_ERR, "slot_msg(): Invalid command %s.", cmd); + rc = -1; goto out; + } + + strncpy(s_mbox->from, local_uname, sizeof(s_mbox->from)-1); + + cl_log(LOG_INFO, "Writing %s to node slot %s", + cmd, name); + if (mbox_write_verify(st, mbox, s_mbox) < -1) { + rc = -1; goto out; + } + if (strcasecmp(cmd, "exit") != 0) { + cl_log(LOG_INFO, "Messaging delay: %d", + (int)timeout_msgwait); + sleep(timeout_msgwait); + } + cl_log(LOG_INFO, "%s successfully delivered to %s", + cmd, name); + +out: free(s_mbox); + free(s_header); + return rc; +} + +static int +slot_ping(struct sbd_context *st, const char *name) +{ + struct sector_header_s *s_header = NULL; + struct sector_mbox_s *s_mbox = NULL; + int mbox; + int waited = 0; + int rc = 0; + + if (!name) { + cl_log(LOG_ERR, "slot_ping(): No recipient specified.\n"); + rc = -1; goto out; + } + + s_header = header_get(st); + if (!s_header) { + rc = -1; goto out; + } + + if (strcmp(name, "LOCAL") == 0) { + name = local_uname; + } + + mbox = slot_lookup(st, s_header, name); + if (mbox < 0) { + cl_log(LOG_ERR, "slot_msg(): No slot found for %s.", name); + rc = -1; goto out; + } + + s_mbox = sector_alloc(); + s_mbox->cmd = SBD_MSG_TEST; + + strncpy(s_mbox->from, local_uname, sizeof(s_mbox->from)-1); + + DBGLOG(LOG_DEBUG, "Pinging node %s", name); + if (mbox_write(st, mbox, s_mbox) < -1) { + rc = -1; goto out; + } + + rc = -1; + while (waited <= timeout_msgwait) { + if (mbox_read(st, mbox, s_mbox) < 0) + break; + if (s_mbox->cmd != SBD_MSG_TEST) { + rc = 0; + break; + } + sleep(1); + waited++; + } + + if (rc == 0) { + cl_log(LOG_DEBUG, 
"%s successfully pinged.", name); + } else { + cl_log(LOG_ERR, "%s failed to ping.", name); + } + +out: free(s_mbox); + free(s_header); + return rc; +} + int init_devices(struct servants_list_item *servants) { int rc = 0; struct sbd_context *st; struct servants_list_item *s; for (s = servants; s; s = s->next) { fprintf(stdout, "Initializing device %s\n", s->devname); st = open_device(s->devname, LOG_ERR); if (!st) { return -1; } rc = init_device(st); close_device(st); if (rc == -1) { fprintf(stderr, "Failed to init device %s\n", s->devname); return rc; } fprintf(stdout, "Device %s is initialized.\n", s->devname); } return 0; } static int slot_msg_wrapper(const char* devname, int mode, const void* argp) { int rc = 0; struct sbd_context *st; const struct slot_msg_arg_t* arg = (const struct slot_msg_arg_t*)argp; st = open_device(devname, LOG_WARNING); if (!st) return -1; cl_log(LOG_INFO, "Delivery process handling %s", devname); rc = slot_msg(st, arg->name, arg->msg); close_device(st); return rc; } static int slot_ping_wrapper(const char* devname, int mode, const void* argp) { int rc = 0; const char* name = (const char*)argp; struct sbd_context *st; st = open_device(devname, LOG_WARNING); if (!st) return -1; rc = slot_ping(st, name); close_device(st); return rc; } int allocate_slots(const char *name, struct servants_list_item *servants) { int rc = 0; struct sbd_context *st; struct servants_list_item *s; for (s = servants; s; s = s->next) { fprintf(stdout, "Trying to allocate slot for %s on device %s.\n", name, s->devname); st = open_device(s->devname, LOG_WARNING); if (!st) { return -1; } rc = slot_allocate(st, name); close_device(st); if (rc < 0) return rc; fprintf(stdout, "Slot for %s has been allocated on %s.\n", name, s->devname); } return 0; } int list_slots(struct servants_list_item *servants) { int rc = 0; struct servants_list_item *s; struct sbd_context *st; for (s = servants; s; s = s->next) { st = open_device(s->devname, LOG_WARNING); if (!st) { 
fprintf(stdout, "== disk %s unreadable!\n", s->devname); continue; } rc = slot_list(st); close_device(st); if (rc == -1) { fprintf(stdout, "== Slots on disk %s NOT dumped\n", s->devname); } } return 0; } int ping_via_slots(const char *name, struct servants_list_item *servants) { int sig = 0; pid_t pid = 0; int status = 0; int servants_finished = 0; sigset_t procmask; siginfo_t sinfo; struct servants_list_item *s; sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigprocmask(SIG_BLOCK, &procmask, NULL); for (s = servants; s; s = s->next) { s->pid = assign_servant(s->devname, &slot_ping_wrapper, 0, (const void*)name); } while (servants_finished < servant_count) { sig = sigwaitinfo(&procmask, &sinfo); if (sig == SIGCHLD) { while ((pid = wait(&status))) { if (pid == -1 && errno == ECHILD) { break; } else { s = lookup_servant_by_pid(pid); if (s) { servants_finished++; } } } } } return 0; } int quorum_write(int good_servants) { return (good_servants > servant_count/2); } int quorum_read(int good_servants) { if (servant_count >= 3) return (good_servants > servant_count/2); else return (good_servants >= 1); } int messenger(const char *name, const char *msg, struct servants_list_item *servants) { int sig = 0; pid_t pid = 0; int status = 0; int servants_finished = 0; int successful_delivery = 0; sigset_t procmask; siginfo_t sinfo; struct servants_list_item *s; struct slot_msg_arg_t slot_msg_arg = {name, msg}; sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigprocmask(SIG_BLOCK, &procmask, NULL); for (s = servants; s; s = s->next) { s->pid = assign_servant(s->devname, &slot_msg_wrapper, 0, &slot_msg_arg); } while (!(quorum_write(successful_delivery) || (servants_finished == servant_count))) { sig = sigwaitinfo(&procmask, &sinfo); if (sig == SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } else { servants_finished++; if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { DBGLOG(LOG_INFO, "Process %d succeeded.", 
(int)pid); successful_delivery++; } else { cl_log(LOG_WARNING, "Process %d failed to deliver!", (int)pid); } } } } } if (quorum_write(successful_delivery)) { cl_log(LOG_INFO, "Message successfully delivered."); return 0; } else { cl_log(LOG_ERR, "Message is not delivered via more then a half of devices"); return -1; } } int dump_headers(struct servants_list_item *servants) { int rc = 0; struct servants_list_item *s = servants; struct sbd_context *st; for (s = servants; s; s = s->next) { fprintf(stdout, "==Dumping header on disk %s\n", s->devname); st = open_device(s->devname, LOG_WARNING); if (!st) { fprintf(stdout, "== disk %s unreadable!\n", s->devname); continue; } rc = header_dump(st); close_device(st); if (rc == -1) { fprintf(stdout, "==Header on disk %s NOT dumped\n", s->devname); } else { fprintf(stdout, "==Header on disk %s is dumped\n", s->devname); } } return rc; } void open_any_device(struct servants_list_item *servants) { struct sector_header_s *hdr_cur = NULL; struct timespec t_0; int t_wait = 0; clock_gettime(CLOCK_MONOTONIC, &t_0); while (!hdr_cur && t_wait < timeout_startup) { struct timespec t_now; struct servants_list_item* s; for (s = servants; s; s = s->next) { struct sbd_context *st = open_device(s->devname, LOG_DEBUG); if (!st) continue; hdr_cur = header_get(st); close_device(st); if (hdr_cur) break; } clock_gettime(CLOCK_MONOTONIC, &t_now); t_wait = t_now.tv_sec - t_0.tv_sec; if (!hdr_cur) { sleep(timeout_loop); } } if (hdr_cur) { timeout_watchdog = hdr_cur->timeout_watchdog; timeout_allocate = hdr_cur->timeout_allocate; timeout_loop = hdr_cur->timeout_loop; timeout_msgwait = hdr_cur->timeout_msgwait; } else { cl_log(LOG_ERR, "No devices were available at start-up within %i seconds.", timeout_startup); exit(1); } free(hdr_cur); return; } /* ::-::-::-::-::-::-::-::-::-::-::-::-:: Begin disk based servant code ::-::-::-::-::-::-::-::-::-::-::-::-:: */ static int servant_check_timeout_inconsistent(struct sector_header_s *hdr) { if 
(timeout_watchdog != hdr->timeout_watchdog) { cl_log(LOG_WARNING, "watchdog timeout: %d versus %d on this device", (int)timeout_watchdog, (int)hdr->timeout_watchdog); return -1; } if (timeout_allocate != hdr->timeout_allocate) { cl_log(LOG_WARNING, "allocate timeout: %d versus %d on this device", (int)timeout_allocate, (int)hdr->timeout_allocate); return -1; } if (timeout_loop != hdr->timeout_loop) { cl_log(LOG_WARNING, "loop timeout: %d versus %d on this device", (int)timeout_loop, (int)hdr->timeout_loop); return -1; } if (timeout_msgwait != hdr->timeout_msgwait) { cl_log(LOG_WARNING, "msgwait timeout: %d versus %d on this device", (int)timeout_msgwait, (int)hdr->timeout_msgwait); return -1; } return 0; } /* This is a bit hackish, but the easiest way to rewire all process * exits to send the desired signal to the parent. */ void servant_exit(void) { pid_t ppid; union sigval signal_value; ppid = getppid(); if (servant_inform_parent) { memset(&signal_value, 0, sizeof(signal_value)); sigqueue(ppid, SIG_IO_FAIL, signal_value); } } int servant(const char *diskname, int mode, const void* argp) { struct sector_mbox_s *s_mbox = NULL; struct sector_node_s *s_node = NULL; struct sector_header_s *s_header = NULL; int mbox; int rc = 0; time_t t0, t1, latency; union sigval signal_value; sigset_t servant_masks; struct sbd_context *st; pid_t ppid; char uuid[37]; const struct servants_list_item *s = argp; if (!diskname) { cl_log(LOG_ERR, "Empty disk name %s.", diskname); return -1; } cl_log(LOG_INFO, "Servant starting for device %s", diskname); /* Block most of the signals */ sigfillset(&servant_masks); sigdelset(&servant_masks, SIGKILL); sigdelset(&servant_masks, SIGFPE); sigdelset(&servant_masks, SIGILL); sigdelset(&servant_masks, SIGSEGV); sigdelset(&servant_masks, SIGBUS); sigdelset(&servant_masks, SIGALRM); /* FIXME: check error */ sigprocmask(SIG_SETMASK, &servant_masks, NULL); atexit(servant_exit); servant_inform_parent = 1; st = open_device(diskname, LOG_WARNING); if 
(!st) { return -1; } s_header = header_get(st); if (!s_header) { cl_log(LOG_ERR, "Not a valid header on %s", diskname); return -1; } if (servant_check_timeout_inconsistent(s_header) < 0) { cl_log(LOG_ERR, "Timeouts on %s do not match first device", diskname); return -1; } if (s_header->minor_version > 0) { uuid_unparse_lower(s_header->uuid, uuid); cl_log(LOG_INFO, "Device %s uuid: %s", diskname, uuid); } mbox = slot_allocate(st, local_uname); if (mbox < 0) { cl_log(LOG_ERR, "No slot allocated, and automatic allocation failed for disk %s.", diskname); rc = -1; goto out; } s_node = sector_alloc(); if (slot_read(st, mbox, s_node) < 0) { cl_log(LOG_ERR, "Unable to read node entry on %s", diskname); exit(1); } DBGLOG(LOG_INFO, "Monitoring slot %d on disk %s", mbox, diskname); if (s_header->minor_version == 0) { set_proc_title("sbd: watcher: %s - slot: %d", diskname, mbox); } else { set_proc_title("sbd: watcher: %s - slot: %d - uuid: %s", diskname, mbox, uuid); } s_mbox = sector_alloc(); if (s->first_start) { if (mode > 0) { if (mbox_read(st, mbox, s_mbox) < 0) { cl_log(LOG_ERR, "mbox read failed during start-up in servant."); rc = -1; goto out; } if (s_mbox->cmd != SBD_MSG_EXIT && s_mbox->cmd != SBD_MSG_EMPTY) { /* Not a clean stop. Abort start-up */ cl_log(LOG_WARNING, "Found fencing message - aborting start-up. Manual intervention required!"); ppid = getppid(); sigqueue(ppid, SIG_EXITREQ, signal_value); rc = 0; goto out; } } DBGLOG(LOG_INFO, "First servant start - zeroing inbox"); memset(s_mbox, 0, sizeof(*s_mbox)); if (mbox_write(st, mbox, s_mbox) < 0) { rc = -1; goto out; } } memset(&signal_value, 0, sizeof(signal_value)); while (1) { struct sector_header_s *s_header_retry = NULL; struct sector_node_s *s_node_retry = NULL; t0 = time(NULL); sleep(timeout_loop); ppid = getppid(); if (ppid == 1) { /* Our parent died unexpectedly. Triggering * self-fence. */ do_reset(); } /* These attempts are, by definition, somewhat racy. 
If * the device is wiped out or corrupted between here and * us reading our mbox, there is nothing we can do about * that. But at least we tried. */ s_header_retry = header_get(st); if (!s_header_retry) { cl_log(LOG_ERR, "No longer found a valid header on %s", diskname); exit(1); } if (memcmp(s_header, s_header_retry, sizeof(*s_header)) != 0) { cl_log(LOG_ERR, "Header on %s changed since start-up!", diskname); exit(1); } free(s_header_retry); s_node_retry = sector_alloc(); if (slot_read(st, mbox, s_node_retry) < 0) { cl_log(LOG_ERR, "slot read failed in servant."); exit(1); } if (memcmp(s_node, s_node_retry, sizeof(*s_node)) != 0) { cl_log(LOG_ERR, "Node entry on %s changed since start-up!", diskname); exit(1); } free(s_node_retry); if (mbox_read(st, mbox, s_mbox) < 0) { cl_log(LOG_ERR, "mbox read failed in servant."); exit(1); } if (s_mbox->cmd > 0) { cl_log(LOG_INFO, "Received command %s from %s on disk %s", char2cmd(s_mbox->cmd), s_mbox->from, diskname); switch (s_mbox->cmd) { case SBD_MSG_TEST: memset(s_mbox, 0, sizeof(*s_mbox)); mbox_write(st, mbox, s_mbox); sigqueue(ppid, SIG_TEST, signal_value); break; case SBD_MSG_RESET: do_reset(); break; case SBD_MSG_OFF: do_off(); break; case SBD_MSG_EXIT: sigqueue(ppid, SIG_EXITREQ, signal_value); break; case SBD_MSG_CRASHDUMP: do_crashdump(); break; default: /* FIXME: An "unknown" message might result from a partial write. log it and clear the slot. 
*/ cl_log(LOG_ERR, "Unknown message on disk %s", diskname); memset(s_mbox, 0, sizeof(*s_mbox)); mbox_write(st, mbox, s_mbox); break; } } sigqueue(ppid, SIG_LIVENESS, signal_value); t1 = time(NULL); latency = t1 - t0; if (timeout_watchdog_warn && (latency > timeout_watchdog_warn)) { cl_log(LOG_WARNING, "Latency: %d exceeded threshold %d on disk %s", (int)latency, (int)timeout_watchdog_warn, diskname); } else if (debug) { DBGLOG(LOG_INFO, "Latency: %d on disk %s", (int)latency, diskname); } } out: free(s_mbox); close_device(st); if (rc == 0) { servant_inform_parent = 0; } return rc; } diff --git a/src/sbd.h b/src/sbd.h index 7f19f55..07ff101 100644 --- a/src/sbd.h +++ b/src/sbd.h @@ -1,224 +1,173 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
*
* You should have received a copy of the GNU General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

/* NOTE(review): the header names of the include directives below are
 * missing in this copy of the file. */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* signals reserved for multi-disk sbd */
#define SIG_LIVENESS (SIGRTMIN + 1) /* report liveness of the disk */
#define SIG_EXITREQ (SIGRTMIN + 2) /* exit request to inquisitor */
#define SIG_TEST (SIGRTMIN + 3) /* trigger self test */
#define SIG_RESTART (SIGRTMIN + 4) /* trigger restart of all failed disk */
#define SIG_IO_FAIL (SIGRTMIN + 5) /* the IO child requests to be considered failed */
#define SIG_PCMK_UNHEALTHY (SIGRTMIN + 6) /* presumably: Pacemaker servant reports an unhealthy state - TODO confirm */
/* FIXME: should add dynamic check of SIG_XX >= SIGRTMAX */

#define HOG_CHAR 0xff
#define HA_COREDIR "/var/lib/heartbeat/cores"

/* Sector data types */

/* Device header.  The four timeout_* fields are the ones that
 * open_any_device() copies into the global tunables and that
 * servant_check_timeout_inconsistent() compares against them. */
struct sector_header_s {
	char magic[8];
	unsigned char version;
	unsigned char slots;
	/* Caveat: stored in network byte-order */
	uint32_t sector_size;
	uint32_t timeout_watchdog;
	uint32_t timeout_allocate;
	uint32_t timeout_loop;
	uint32_t timeout_msgwait;
	/* Minor version for extensions to the core data set:
	 * compatible and optional values. */
	unsigned char minor_version;
	/* servant() only reads and logs the uuid when minor_version > 0 */
	uuid_t uuid; /* 16 bytes */
};

/* Mailbox sector of a slot: servant() dispatches on cmd (SBD_MSG_* codes)
 * and logs `from` as the sender of a received command. */
struct sector_mbox_s {
	signed char cmd;
	char from[64];
};

/* Node entry of a slot. */
struct sector_node_s {
	/* slots will be created with in_use == 0 */
	char in_use;
	char name[64];
};

/* One entry of the singly linked servant list (chained through `next`). */
struct servants_list_item {
	const char* devname;	/* device name handed to open_device() */
	pid_t pid;		/* set from the return value of assign_servant() */
	int restarts;
	int restart_blocked;
	int outdated;
	int first_start;	/* read by servant() to decide on the start-up mailbox handling */
	struct timespec t_last, t_started;
	struct servants_list_item *next;
};

/* Handle of an opened device as returned by open_device().
 * NOTE(review): ioctx/io look like asynchronous-I/O state - confirm. */
struct sbd_context {
	int devfd;
	io_context_t ioctx;
	struct iocb io;
};

-#define SBD_MSG_EMPTY 0x00
-#define SBD_MSG_TEST 0x01
-#define SBD_MSG_RESET 0x02
-#define SBD_MSG_OFF 0x03
-#define SBD_MSG_EXIT 0x04
-#define SBD_MSG_CRASHDUMP 0x05
-
-#define SLOT_TO_SECTOR(slot) (1+slot*2)
-#define MBOX_TO_SECTOR(mbox) (2+mbox*2)
-
void usage(void);
int watchdog_init_interval(void);
int watchdog_tickle(void);
int watchdog_init(void);
void sysrq_init(void);
void watchdog_close(bool disarm);
-struct sbd_context *open_device(const char* devname, int loglevel);
-void open_any_device(struct servants_list_item *servants);
-void close_device(struct sbd_context *st);
-signed char cmd2char(const char *cmd);
-void * sector_alloc(void);
-const char* char2cmd(const char cmd);
-int sector_write(struct sbd_context *st, int sector, void *data);
-int sector_read(struct sbd_context *st, int sector, void *data);
-int slot_read(struct sbd_context *st, int slot, struct sector_node_s *s_node);
-int slot_write(struct sbd_context *st, int slot, struct sector_node_s *s_node);
-int mbox_write(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox);
-int mbox_read(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox);
-int mbox_write_verify(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox);
-/* After a call to header_write(), certain data fields will have been
- * converted to on-disk byte-order; the header should not be accessed
- * afterwards anymore! */
-int header_write(struct sbd_context *st, struct sector_header_s *s_header);
-int header_read(struct sbd_context *st, struct sector_header_s *s_header);
-int valid_header(const struct sector_header_s *s_header);
-struct sector_header_s * header_get(struct sbd_context *st);
-int init_device(struct sbd_context *st);
-int slot_lookup(struct sbd_context *st, const struct sector_header_s *s_header, const char *name);
-int slot_unused(struct sbd_context *st, const struct sector_header_s *s_header);
-int slot_allocate(struct sbd_context *st, const char *name);
-int slot_list(struct sbd_context *st);
-int slot_ping(struct sbd_context *st, const char *name);
-int slot_msg(struct sbd_context *st, const char *name, const char *cmd);
-int header_dump(struct sbd_context *st);
void sysrq_trigger(char t);
void do_crashdump(void);
void do_reset(void);
void do_off(void);
pid_t make_daemon(void);
void maximize_priority(void);
void sbd_get_uname(void);

/* Tunable defaults: */
extern unsigned long timeout_watchdog;
extern unsigned long timeout_watchdog_warn;
extern unsigned long timeout_watchdog_crashdump;
extern int timeout_allocate;
extern int timeout_loop;
extern int timeout_msgwait;
extern int timeout_io;
extern int timeout_startup;
extern int watchdog_use;
extern int watchdog_set_timeout;
extern int skip_rt;
extern int debug;
extern int debug_mode;
extern const char *watchdogdev;
extern char* local_uname;

/* Global, non-tunable variables: */
extern int sector_size;
extern int watchdogfd;
extern const char* cmdname;

/* Entry point of a servant child as handed to assign_servant(); servant()
 * and servant_pcmk() below have this signature. */
typedef int (*functionp_t)(const char* devname, int mode, const void* argp);
int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp);
int init_devices(struct servants_list_item *servants);

/* Argument bundle that messenger() builds from its name/msg parameters and
 * passes to the message-delivery children. */
struct slot_msg_arg_t {
	const char* name;
	const char* msg;
};

+void open_any_device(struct servants_list_item *servants);
int allocate_slots(const char *name, struct servants_list_item *servants);
int list_slots(struct servants_list_item *servants);
int ping_via_slots(const char *name, struct servants_list_item *servants);
int dump_headers(struct servants_list_item *servants);
+int messenger(const char *name, const char *msg, struct servants_list_item *servants);
-int check_all_dead(void);
-void servant_exit(void);
int servant(const char *diskname, int mode, const void* argp);
-void recruit_servant(const char *devname, pid_t pid);
+int servant_pcmk(const char *diskname, int mode, const void* argp);
+
struct servants_list_item *lookup_servant_by_dev(const char *devname);
struct servants_list_item *lookup_servant_by_pid(pid_t pid);
-void servants_kill(void);
-void servants_start(void);
-void servant_start(struct servants_list_item *s);
-void inquisitor_child(void);
-int inquisitor(void);
-int inquisitor_decouple(void);
-int messenger(const char *name, const char *msg, struct servants_list_item *servants);
-void cleanup_servant_by_pid(pid_t pid);
-int quorum_write(int good_servants);
-int quorum_read(int good_servants);
-
-int pcmk_have_quorum(void);
-int servant_pcmk(const char *diskname, int mode, const void* argp);

int init_set_proc_title(int argc, char *argv[], char *envp[]);
void set_proc_title(const char *fmt,...);
-
/* Log through qb_log_from_external_source(), tagging the message with the
 * calling function, file and line. */
#define cl_log(level, fmt, args...) qb_log_from_external_source( __func__, __FILE__, fmt, level, __LINE__, 0, ##args)

/* Like cl_log(LOG_ERR, ...), with strerror(errno) and the numeric errno
 * appended to the message. */
# define cl_perror(fmt, args...) do { \
	const char *err = strerror(errno); \
	cl_log(LOG_ERR, fmt ": %s (%d)", ##args, err, errno); \
	} while(0)

/* Log only when the global `debug` is greater than 0. */
#define DBGLOG(lvl, fmt, args...) do { \
	if (debug > 0) cl_log(lvl, fmt, ##args); \
	} while(0)