diff --git a/sbd.spec b/sbd.spec index e9ff7bc..b9e7f72 100644 --- a/sbd.spec +++ b/sbd.spec @@ -1,339 +1,339 @@ # # spec file for package sbd # # Copyright (c) 2014 SUSE LINUX Products GmbH, Nuernberg, Germany. # Copyright (c) 2013 Lars Marowsky-Bree # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed # upon. The license for this file, and modifications and additions to the # file, is the same license as for the pristine package itself (unless the # license for the pristine package is not an Open Source License, in which # case the license is the MIT License). An "Open Source License" is a # license that conforms to the Open Source Definition (Version 1.9) # published by the Open Source Initiative. # Please submit bugfixes or comments via http://bugs.opensuse.org/ # %global longcommit 7bcdf69597042c31ea0b4a523e732d4bbb99b3a0 %global shortcommit %(echo %{longcommit}|cut -c1-8) %global modified %(echo %{longcommit}-|cut -f2 -d-) %global github_owner Clusterlabs %global commit_counter 0 %global build_counter 0 %global buildnum %(expr %{commit_counter} + %{build_counter}) %ifarch s390x s390 # minimum timeout on LPAR diag288 watchdog is 15s %global watchdog_timeout_default 15 %else %global watchdog_timeout_default 5 %endif # Be careful with sync_resource_startup_default # being enabled. This configuration has # to be in sync with configuration in pacemaker # where it is called sbd_sync - assure by e.g. # mutual rpm dependencies. %bcond_without sync_resource_startup_default # Syncing enabled per default will lead to # syncing enabled on upgrade without adaption # of the config. # Setting can still be overruled via sysconfig. # The setting in the config-template packaged # will follow the default if below is is left # empty. But it is possible to have the setting # in the config-template deviate from the default # by setting below to an explicit 'yes' or 'no'. %global sync_resource_startup_sysconfig "" Name: sbd Summary: Storage-based death -License: GPLv2+ +License: GPL-2.0-or-later Group: System Environment/Daemons Version: 1.5.2 Release: 99.%{buildnum}.%{shortcommit}.%{modified}git%{?dist} Url: https://github.com/%{github_owner}/%{name} Source0: https://github.com/%{github_owner}/%{name}/archive/%{longcommit}/%{name}-%{longcommit}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-build BuildRequires: autoconf BuildRequires: automake BuildRequires: libuuid-devel BuildRequires: glib2-devel BuildRequires: libaio-devel BuildRequires: corosync-devel %if 0%{?suse_version} %if 0%{?suse_version} > 1500 BuildRequires: libpacemaker3-devel %else BuildRequires: libpacemaker-devel %endif %else BuildRequires: pacemaker-libs-devel %endif BuildRequires: libtool BuildRequires: libuuid-devel BuildRequires: libxml2-devel BuildRequires: pkgconfig BuildRequires: make Conflicts: fence-agents-sbd < 4.5.0 %if 0%{?rhel} > 0 ExclusiveArch: i686 x86_64 s390x aarch64 ppc64le %endif %if %{defined systemd_requires} %systemd_requires %endif %description This package contains the storage-based death functionality. Available rpmbuild rebuild options: --with(out) : sync_resource_startup_default %package tests Summary: Storage-based death environment for regression tests -License: GPLv2+ +License: GPL-2.0-or-later Group: System Environment/Daemons %description tests This package provides an environment + testscripts for regression-testing sbd. %prep ########################################################### # %setup -n sbd-%{version} -q %setup -q -n %{name}-%{longcommit} ########################################################### %build ./autogen.sh export CFLAGS="$RPM_OPT_FLAGS -Wall -Werror" %configure --with-watchdog-timeout-default=%{watchdog_timeout_default} \ --with-sync-resource-startup-default=%{?with_sync_resource_startup_default:yes}%{!?with_sync_resource_startup_default:no} \ --with-sync-resource-startup-sysconfig=%{sync_resource_startup_sysconfig} \ --with-runstatedir=%{_rundir} make %{?_smp_mflags} ########################################################### %install ########################################################### make DESTDIR=$RPM_BUILD_ROOT LIBDIR=%{_libdir} install rm -rf ${RPM_BUILD_ROOT}%{_libdir}/stonith install -D -m 0755 src/sbd.sh $RPM_BUILD_ROOT/usr/share/sbd/sbd.sh install -D -m 0755 tests/regressions.sh $RPM_BUILD_ROOT/usr/share/sbd/regressions.sh %if %{defined _unitdir} install -D -m 0644 src/sbd.service $RPM_BUILD_ROOT/%{_unitdir}/sbd.service install -D -m 0644 src/sbd_remote.service $RPM_BUILD_ROOT/%{_unitdir}/sbd_remote.service %endif mkdir -p ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig install -m 644 src/sbd.sysconfig ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig/sbd # Don't package static libs find %{buildroot} -name '*.a' -type f -print0 | xargs -0 rm -f find %{buildroot} -name '*.la' -type f -print0 | xargs -0 rm -f %clean rm -rf %{buildroot} %if %{defined _unitdir} %post %systemd_post sbd.service %systemd_post sbd_remote.service %preun %systemd_preun sbd.service %systemd_preun sbd_remote.service %postun %systemd_postun sbd.service %systemd_postun sbd_remote.service %endif %files ########################################################### %defattr(-,root,root) %config(noreplace) %{_sysconfdir}/sysconfig/sbd %{_sbindir}/sbd %{_datadir}/sbd %{_datadir}/pkgconfig/sbd.pc %exclude %{_datadir}/sbd/regressions.sh %doc %{_mandir}/man8/sbd* %if %{defined _unitdir} %{_unitdir}/sbd.service %{_unitdir}/sbd_remote.service %endif %doc COPYING %files tests %defattr(-,root,root) %dir %{_datadir}/sbd %{_datadir}/sbd/regressions.sh %{_libdir}/libsbdtestbed* %changelog * Thu Jan 5 2023 - 1.5.2-99.0.8ec8e011.git - fail startup if pacemaker integration is disabled while SBD_SYNC_RESOURCE_STARTUP is conflicting (+ hint to overcome) - improve logs - when logging state of SBD_PACEMAKER tell it is just that as this might still be overridden via cmdline options - log a warning if SBD_PACEMAKER is overridden by -P or -PP option - do not warn about startup syncing with pacemaker integration disabled - when watchdog-device is busy give a hint on who is hogging it - improve build environment - have --with-runstatedir overrule --runstatedir - use new package name for pacemaker devel on opensuse - make config location configurable for man-page-creation - reverse alloc/de-alloc order to make gcc-12 static analysis happy - improve test environment - have image-files in /dev/shm to assure they are in memory and sbd opening the files with O_SYNC doesn't trigger unnecessary syncs on a heavily loaded test-machine fallback to /tmp if /dev/shm doesn't exist - wrapping away libaio and usage of device-mapper for block-device simulation can now be passed into make via SBD_USE_DM & SBD_TRANSLATE_AIO - have variables that configure test-environment be printed out prior to running tests - finally assure we clean environment when interrupted by a signal (bash should have done it with just setting EXIT handler - but avoiding bashism might come handy one day) * Mon Nov 15 2021 - 1.5.1-99.0.7bcdf695.git - improve/fix cmdline handling - tell the actual watchdog device specified with -w - tolerate and strip any leading spaces of commandline option values - Sanitize numeric arguments - if start-delay enabled, not explicitly given and msgwait can't be read from disk (diskless) use 2 * watchdog-timeout - avoid using deprecated valloc for disk-io-buffers - avoid frequent alloc/free of aligned buffers to prevent fragmentation - fix memory-leak in one-time-allocations of sector-buffers - fix AIO-API usage: properly destroy io-context - improve/fix build environment - validate configure options for paths - remove unneeded complexity of configure.ac hierarchy - correctly derive package version from git (regression since 1.5.0) - make runstatedir configurable and derive from distribution * Tue Jun 8 2021 - 1.5.0-99.0.2a00ac70.git - default to resource-syncing with pacemaker in spec-file and configure.ac This default has to match between sbd and pacemaker and thus qualifies this release for a minor-version-bump - fix some regressions introduced by adding configurability previously - adapt description of startup/shutdown sync with pacemaker - make watchdog warning messages more understandable * Wed Dec 2 2020 - 1.4.2-99.1.bfeee963.git - improve build/CI-friendlyness - * travis: switch to F32 as build-host - switch to F32 & leap-15.2 - changes for mock-2.0 - turn off loop-devices & device-mapper on x86_64 targets because - of changes in GCE - * regressions.sh: get timeouts from disk-header to go with proper defaults - for architecture - * use configure for watchdog-default-timeout & others - * ship sbd.pc with basic sbd build information for downstream packages - to use - * add number of commits since version-tag to build-counter - add robustness against misconfiguration / improve documentation - * add environment section to man-page previously just available in - template-config - * inform the user to restart the sbd service after disk-initialization - * refuse to start if any of the configured device names is invalid - * add handshake to sync startup/shutdown with pacemakerd - Previously sbd just waited for the cib-connnection to show up/go away - which isn't robust at all. - The new feature needs new pacemakerd-api as counterpart. - Thus build checks for presence of pacemakerd-api. - To simplify downstream adoption behavior is configurable at runtime - via configure-file with a build-time-configurable default. - * refuse to start if qdevice-sync_timeout doesn't match watchdog-timeout - Needed in particular as qdevice-sync_timeout delays quorum-state-update - and has a default of 30s that doesn't match the 5s watchdog-timeout - default. - Fix: sbd-pacemaker: handle new no_quorum_demote + robustness against new - policies added - Fix: agent: correctly compare string values when calculating timeout - Fix: scheduling: overhaul the whole thing - * prevent possible lockup when format in proc changes - * properly get and handle scheduler policy & prio - * on SCHED_RR failing push to the max with SCHED_OTHER * Tue Nov 19 2019 - 1.4.1-99.1.aca7907c.git - improvements/clarifications in documentation - properly finalize cmap connection when disconnected from cluster - make handling of cib-connection loss more robust - silence some coverity findings - overhaul log for reasonable prios and details - if current slice doesn't have rt-budget move to root-slice - periodically ping corosync daemon for liveness - actually use crashdump timeout if configured - avoid deprecated names for g_main-loop-funcitons - conflict with fence-agents-sbd < 4.5.0 - rather require corosync-devel provided by most distributions - make devices on cmdline overrule those coming via SBD_DEVICE - make 15s timeout on s390 be used consistently - improve build/test for CI-friendlyness - * add autogen.sh - * enable/improve out-of-tree-building - * make tar generation smarter - * don't modify sbd.spec - * make distcheck-target work - * Add tests/regressions.sh to check-target - * use unique devmapper names for multiple tests in parallel - * consistently use serial test-harness for visible progress - * package tests into separate package (not packaged before) - * add preload-library to intercept reboots while testing - * add tests for sbd in daemon-mode & watchdog-dev-handling - * make tests work in non-privileged containers * Mon Jan 14 2019 - 1.4.0-0.1.2d595fdd.git - updated travis-CI (ppc64le-build, fedora29, remove need for alectolytic-build-container) - make watchdog-device-query easier to be handled by an SELinux-policy - configurable delay value for SBD_DELAY_START - use pacemaker's new pe api with constructors/destructors - make timeout-action executed by sbd configurable - init script for sysv systems - version bump to v1.4.0 to denote Pacemaker 2.0.0 compatibility * Fri Jun 29 2018 - 1.3.1-0.1.e102d9ed.git - removed unneeded python-devel build-requirement - changed legacy corosync-devel to corosynclib-devel * Fri Nov 3 2017 - 1.3.1-0.1.a180176c.git - Add commands to test/query watchdogs - Allow 2-node-operation with a single shared-disk - Overhaul of the command-line options & config-file - Proper handling of off instead of reboot - Refactored disk-servant for more robust communication with parent - Fix config for Debian + configurable location of config - Fixes in sbd.sh - multiple SBD devices and others * Sun Mar 27 2016 - 1.3.0-0.1.4ee36fa3.git - Changes since v1.2.0 like adding the possibility to have a watchdog-only setup without shared-block-devices legitimate a bump to v1.3.0. * Mon Oct 13 2014 - 1.2.1-0.4.3de531ed.git - Fixes for suitability to the el7 environment * Tue Sep 30 2014 - 1.2.1-0.3.8f912945.git - Only build on archs supported by the HA Add-on * Fri Aug 29 2014 - 1.2.1-0.2.8f912945.git - Remove some additional SUSE-isms * Fri Aug 29 2014 - 1.2.1-0.1.8f912945.git - Prepare for package review Resolves: rhbz#1134245 diff --git a/src/sbd-common.c b/src/sbd-common.c index f3f226a..3abf75f 100644 --- a/src/sbd-common.c +++ b/src/sbd-common.c @@ -1,1330 +1,1355 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include "sbd.h" #include #include #ifdef __GLIBC__ #include #endif #include #include #include #include #include #include #include #ifdef _POSIX_MEMLOCK # include #endif /* Tunable defaults: */ unsigned long timeout_watchdog = SBD_WATCHDOG_TIMEOUT_DEFAULT; int timeout_msgwait = 2 * SBD_WATCHDOG_TIMEOUT_DEFAULT; unsigned long timeout_watchdog_warn = calculate_timeout_watchdog_warn(SBD_WATCHDOG_TIMEOUT_DEFAULT); bool do_calculate_timeout_watchdog_warn = true; int timeout_allocate = 2; int timeout_loop = 1; int timeout_io = 3; int timeout_startup = 120; int watchdog_use = 1; int watchdog_set_timeout = 1; unsigned long timeout_watchdog_crashdump = 0; int skip_rt = 0; int debug = 0; int debug_mode = 0; char *watchdogdev = NULL; bool watchdogdev_is_default = false; char * local_uname; /* Global, non-tunable variables: */ int sector_size = 0; int watchdogfd = -1; int servant_health = 0; /*const char *devname;*/ const char *cmdname; void usage(void) { fprintf(stderr, "Shared storage fencing tool.\n" "Syntax:\n" " %s \n" "Options:\n" "-d Block device to use (mandatory; can be specified up to 3 times)\n" "-h Display this help.\n" "-n Set local node name; defaults to uname -n (optional)\n" "\n" "-R Do NOT enable realtime priority (debugging only)\n" "-W Use watchdog (recommended) (watch only)\n" "-w Specify watchdog device (optional) (watch only)\n" "-T Do NOT initialize the watchdog timeout (watch only)\n" "-S <0|1> Set start mode if the node was previously fenced (watch only)\n" "-p Write pidfile to the specified path (watch only)\n" "-v|-vv|-vvv Enable verbose|debug|debug-library logging (optional)\n" "\n" "-1 Set watchdog timeout to N seconds (optional, create only)\n" "-2 Set slot allocation timeout to N seconds (optional, create only)\n" "-3 Set daemon loop timeout to N seconds (optional, create only)\n" "-4 Set msgwait timeout to N seconds (optional, create only)\n" "-5 Warn if loop latency exceeds threshold (optional, watch only)\n" " (default is 3, set to 0 to disable)\n" "-C Watchdog timeout to set before crashdumping\n" " (def: 0s = disable gracefully, optional)\n" "-I Async IO read timeout (defaults to 3 * loop timeout, optional)\n" "-s Timeout to wait for devices to become available (def: 120s)\n" "-t Dampening delay before faulty servants are restarted (optional)\n" " (default is 5, set to 0 to disable)\n" "-F # of failures before a servant is considered faulty (optional)\n" " (default is 1, set to 0 to disable)\n" "-P Check Pacemaker quorum and node health (optional, watch only)\n" "-Z Enable trace mode. WARNING: UNSAFE FOR PRODUCTION!\n" "-r Set timeout-action to comma-separated combination of\n" " noflush|flush plus reboot|crashdump|off (default is flush,reboot)\n" "Commands:\n" #if SUPPORT_SHARED_DISK "create initialize N slots on - OVERWRITES DEVICE!\n" "list List all allocated slots on device, and messages.\n" "dump Dump meta-data header from device.\n" "allocate \n" " Allocate a slot for node (optional)\n" "message (test|reset|off|crashdump|clear|exit)\n" " Writes the specified message to node's slot.\n" #endif "watch Loop forever, monitoring own slot\n" "query-watchdog Check for available watchdog-devices and print some info\n" "test-watchdog Test the watchdog-device selected.\n" " Attention: This will arm the watchdog and have your system reset\n" " in case your watchdog is working properly!\n" , cmdname); } #define MAX_WATCHDOGS 64 #define SYS_CLASS_WATCHDOG "/sys/class/watchdog" #define SYS_CHAR_DEV_DIR "/sys/dev/char" #define WATCHDOG_NODEDIR "/dev/" static bool is_watchdog(dev_t device) { static int num_watchdog_devs = 0; static dev_t watchdog_devs[MAX_WATCHDOGS]; struct dirent *entry; int i; /* populate on first call */ if (num_watchdog_devs == 0) { DIR *dp; watchdog_devs[0] = makedev(10,130); num_watchdog_devs = 1; /* get additional devices from /sys/class/watchdog */ dp = opendir(SYS_CLASS_WATCHDOG); if (dp) { while ((entry = readdir(dp))) { if (entry->d_type == DT_LNK) { FILE *file; char entry_name[NAME_MAX+sizeof(SYS_CLASS_WATCHDOG)+5]; snprintf(entry_name, sizeof(entry_name), SYS_CLASS_WATCHDOG "/%s/dev", entry->d_name); file = fopen(entry_name, "r"); if (file) { int major, minor; if (fscanf(file, "%d:%d", &major, &minor) == 2) { watchdog_devs[num_watchdog_devs++] = makedev(major, minor); } fclose(file); if (num_watchdog_devs == MAX_WATCHDOGS) { break; } } } } closedir(dp); } } for (i=0; i < num_watchdog_devs; i++) { if (device == watchdog_devs[i]) { return true; } } return false; } static int watchdog_init_interval_fd(int wdfd, int timeout) { if (ioctl(wdfd, WDIOC_SETTIMEOUT, &timeout) < 0) { cl_perror( "WDIOC_SETTIMEOUT" ": Failed to set watchdog timer to %u seconds.", timeout); cl_log(LOG_CRIT, "Please validate your watchdog configuration!"); cl_log(LOG_CRIT, "Choose a different watchdog driver or specify -T to skip this if you are completely sure."); return -1; } return 0; } int watchdog_init_interval(void) { if (watchdogfd < 0) { return 0; } if (watchdog_set_timeout == 0) { cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!"); return 0; } if (watchdog_init_interval_fd(watchdogfd, timeout_watchdog) < 0) { return -1; } cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", (int) timeout_watchdog); return 0; } static int watchdog_tickle_fd(int wdfd, char *wddev) { if (write(wdfd, "", 1) != 1) { cl_perror("Watchdog write failure: %s!", wddev); return -1; } return 0; } int watchdog_tickle(void) { if (watchdogfd >= 0) { return watchdog_tickle_fd(watchdogfd, watchdogdev); } return 0; } static int watchdog_init_fd(char *wddev, int timeout) { int wdfd; wdfd = open(wddev, O_WRONLY); if (wdfd >= 0) { if (((timeout >= 0) && (watchdog_init_interval_fd(wdfd, timeout) < 0)) || (watchdog_tickle_fd(wdfd, wddev) < 0)) { close(wdfd); return -1; } } else { struct stat statbuf; if(!stat(wddev, &statbuf) && S_ISCHR(statbuf.st_mode) && is_watchdog(statbuf.st_rdev)) { cl_perror("Cannot open watchdog device '%s'", wddev); } else { cl_perror("Seems as if '%s' isn't a valid watchdog-device", wddev); } return -1; } return wdfd; } int watchdog_init(void) { if (watchdogfd < 0 && watchdogdev != NULL) { int timeout = timeout_watchdog; if (watchdog_set_timeout == 0) { cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!"); timeout = -1; } watchdogfd = watchdog_init_fd(watchdogdev, timeout); if (watchdogfd >= 0) { cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev); if (watchdog_set_timeout) { cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", (int) timeout_watchdog); } } else { return -1; } } return 0; } static void watchdog_close_fd(int wdfd, char *wddev, bool disarm) { if (disarm) { int r; int flags = WDIOS_DISABLECARD;; /* Explicitly disarm it */ r = ioctl(wdfd, WDIOC_SETOPTIONS, &flags); if (r < 0) { cl_perror("Failed to disable hardware watchdog %s", wddev); } /* To be sure, use magic close logic, too */ for (;;) { if (write(wdfd, "V", 1) > 0) { break; } cl_perror("Cannot disable watchdog device %s", wddev); } } if (close(wdfd) < 0) { cl_perror("Watchdog close(%d) failed", wdfd); } } void watchdog_close(bool disarm) { if (watchdogfd < 0) { return; } watchdog_close_fd(watchdogfd, watchdogdev, disarm); watchdogfd = -1; } struct watchdog_list_item { dev_t dev; char *dev_node; char *dev_ident; char *dev_driver; pid_t busy_pid; char *busy_name; struct watchdog_list_item *next; }; struct link_list_item { char *dev_node; char *link_name; struct link_list_item *next; }; static struct watchdog_list_item *watchdog_list = NULL; static int watchdog_list_items = 0; static void watchdog_populate_list(void) { struct dirent *entry; char entry_name[sizeof(WATCHDOG_NODEDIR)+NAME_MAX]; DIR *dp; char buf[NAME_MAX+sizeof(WATCHDOG_NODEDIR)] = ""; struct link_list_item *link_list = NULL; if (watchdog_list != NULL) { return; } /* search for watchdog nodes in /dev */ dp = opendir(WATCHDOG_NODEDIR); if (dp) { /* first go for links and memorize them */ while ((entry = readdir(dp))) { if (entry->d_type == DT_LNK) { int len; snprintf(entry_name, sizeof(entry_name), WATCHDOG_NODEDIR "%s", entry->d_name); /* realpath(entry_name, buf) unfortunately does a stat on * target so we can't really use it to check if links stay * within /dev without triggering e.g. AVC-logs (with * SELinux policy that just allows stat within /dev). * Without canonicalization that doesn't actually touch the * filesystem easily available introduce some limitations * for simplicity: * - just simple path without '..' * - just one level of symlinks (avoid e.g. loop-checking) */ len = readlink(entry_name, buf, sizeof(buf) - 1); if ((len < 1) || (len > sizeof(buf) - sizeof(WATCHDOG_NODEDIR) -1 - 1)) { continue; } buf[len] = '\0'; if (buf[0] != '/') { memmove(&buf[sizeof(WATCHDOG_NODEDIR)-1], buf, len+1); memcpy(buf, WATCHDOG_NODEDIR, sizeof(WATCHDOG_NODEDIR)-1); len += sizeof(WATCHDOG_NODEDIR)-1; } if (strstr(buf, "/../") || strncmp(WATCHDOG_NODEDIR, buf, sizeof(WATCHDOG_NODEDIR)-1)) { continue; } else { /* just memorize to avoid statting the target - SELinux */ struct link_list_item *lli = calloc(1, sizeof(struct link_list_item)); + if (lli == NULL) { + break; + } lli->dev_node = strdup(buf); lli->link_name = strdup(entry_name); + if ((lli->dev_node == NULL) || (lli->link_name == NULL)) { + free(lli->dev_node); + free(lli->link_name); + free(lli); + break; + } lli->next = link_list; link_list = lli; } } } rewinddir(dp); while ((entry = readdir(dp))) { if (entry->d_type == DT_CHR) { struct stat statbuf; snprintf(entry_name, sizeof(entry_name), WATCHDOG_NODEDIR "%s", entry->d_name); if(!stat(entry_name, &statbuf) && S_ISCHR(statbuf.st_mode) && is_watchdog(statbuf.st_rdev)) { - int wdfd = watchdog_init_fd(entry_name, -1); + int wdfd; struct watchdog_list_item *wdg = calloc(1, sizeof(struct watchdog_list_item)); int len; struct link_list_item *tmp_list = NULL; + if (wdg == NULL) { + break; + } + wdg->dev = statbuf.st_rdev; wdg->dev_node = strdup(entry_name); + if (wdg->dev_node == NULL) { + free(wdg); + break; + } wdg->next = watchdog_list; watchdog_list = wdg; watchdog_list_items++; + wdfd = watchdog_init_fd(entry_name, -1); if (wdfd >= 0) { struct watchdog_info ident; ident.identity[0] = '\0'; ioctl(wdfd, WDIOC_GETSUPPORT, &ident); watchdog_close_fd(wdfd, entry_name, true); if (ident.identity[0]) { wdg->dev_ident = strdup((char *) ident.identity); } } snprintf(entry_name, sizeof(entry_name), SYS_CHAR_DEV_DIR "/%d:%d/device/driver", major(wdg->dev), minor(wdg->dev)); len = readlink(entry_name, buf, sizeof(buf) - 1); if (len > 0) { buf[len] = '\0'; wdg->dev_driver = strdup(basename(buf)); } else if ((wdg->dev_ident) && (strcmp(wdg->dev_ident, "Software Watchdog") == 0)) { wdg->dev_driver = strdup("softdog"); } /* create dupes if we have memorized links * to this node */ for (tmp_list = link_list; tmp_list; tmp_list = tmp_list->next) { if (!strcmp(tmp_list->dev_node, wdg->dev_node)) { struct watchdog_list_item *dupe_wdg = calloc(1, sizeof(struct watchdog_list_item)); + if (dupe_wdg == NULL) { + break; + } /* as long as we never purge watchdog_list * there is no need to dupe strings */ *dupe_wdg = *wdg; dupe_wdg->dev_node = strdup(tmp_list->link_name); + if (dupe_wdg->dev_node == NULL) { + free(dupe_wdg); + break; + } dupe_wdg->next = watchdog_list; watchdog_list = dupe_wdg; watchdog_list_items++; } /* for performance reasons we could remove * the link_list entry */ } } } } closedir(dp); } /* cleanup link list */ while (link_list) { struct link_list_item *tmp_list = link_list; link_list = link_list->next; free(tmp_list->dev_node); free(tmp_list->link_name); free(tmp_list); } } static void watchdog_checkbusy() { DIR *dproc; struct dirent *entry; dproc = opendir("/proc"); if (!dproc) { /* no proc directory to search through */ return; } while ((entry = readdir(dproc)) != NULL) { pid_t local_pid; char *leftover; DIR *dpid; char procpath[NAME_MAX+10] = { 0 }; if (entry->d_name[0] == '.') { continue; } local_pid = strtol(entry->d_name, &leftover, 10); if (leftover[0] != '\0') continue; snprintf(procpath, sizeof(procpath), "/proc/%s/fd", entry->d_name); dpid = opendir(procpath); if (!dpid) { /* silently continue - might be just a race */ continue; } while ((entry = readdir(dpid)) != NULL) { struct watchdog_list_item *wdg; char entry_name[sizeof(procpath)+NAME_MAX+1] = { 0 }; char buf[NAME_MAX+1] = { 0 }; int len; if (entry->d_type != DT_LNK) { continue; } snprintf(entry_name, sizeof(entry_name), "%s/%s", procpath, entry->d_name); len = readlink(entry_name, buf, sizeof(buf) - 1); if (len < 1) { continue; } buf[len] = '\0'; for (wdg = watchdog_list; wdg != NULL; wdg = wdg->next) { if (!strcmp(buf, wdg->dev_node)) { char name[16]; FILE *file; wdg->busy_pid = local_pid; snprintf(procpath, sizeof(procpath), "/proc/%d/status", local_pid); file = fopen(procpath, "r"); if (file) { if (fscanf(file, "Name:\t%15[a-zA-Z0-9 _-]", name) == 1) { wdg->busy_name = strdup(name); } fclose(file); } } } } closedir(dpid); } closedir(dproc); return; } int watchdog_info(void) { struct watchdog_list_item *wdg; int wdg_cnt = 0; watchdog_populate_list(); watchdog_checkbusy(); printf("\nDiscovered %d watchdog devices:\n", watchdog_list_items); for (wdg = watchdog_list; wdg != NULL; wdg = wdg->next) { wdg_cnt++; if (wdg->busy_pid) { printf("\n[%d] %s\nIdentity: Busy: PID %d (%s)\nDriver: %s\n", wdg_cnt, wdg->dev_node, wdg->busy_pid, wdg->busy_name?wdg->busy_name:"", wdg->dev_driver?wdg->dev_driver:""); } else { printf("\n[%d] %s\nIdentity: %s\nDriver: %s\n", wdg_cnt, wdg->dev_node, wdg->dev_ident?wdg->dev_ident: "Error: device hogged via alias major/minor?", wdg->dev_driver?wdg->dev_driver:""); } if ((wdg->dev_driver) && (strcmp(wdg->dev_driver, "softdog") == 0)) { printf("CAUTION: Not recommended for use with sbd.\n"); } } return 0; } int watchdog_test(void) { int i; if ((watchdog_set_timeout == 0) || !watchdog_use) { printf("\nWatchdog is disabled - aborting test!!!\n"); return 0; } if (watchdogdev_is_default) { watchdog_populate_list(); if (watchdog_list_items > 1) { printf("\nError: Multiple watchdog devices discovered.\n" " Use -w or SBD_WATCHDOG_DEV to specify\n" " which device to reset the system with\n"); watchdog_info(); return -1; } } if ((isatty(fileno(stdin)))) { char buffer[16]; printf("\nWARNING: This operation is expected to force-reboot this system\n" " without following any shutdown procedures.\n\n" "Proceed? [NO/Proceed] "); if ((fgets(buffer, 16, stdin) == NULL) || strcmp(buffer, "Proceed\n")) { printf("\nAborting watchdog test!!!\n"); return 0; } printf("\n"); } printf("Initializing %s with a reset countdown of %d seconds ...\n", watchdogdev, (int) timeout_watchdog); if ((watchdog_init() < 0) || (watchdog_init_interval() < 0)) { printf("Failed to initialize watchdog!!!\n"); watchdog_info(); return -1; } printf("\n"); printf("NOTICE: The watchdog device is expected to reset the system\n" " in %d seconds. If system remains active beyond that time,\n" " watchdog may not be functional.\n\n", (int) timeout_watchdog); for (i=timeout_watchdog; i>1; i--) { printf("Reset countdown ... %d seconds\n", i); sleep(1); } for (i=2; i>0; i--) { printf("System expected to reset any moment ...\n"); sleep(1); } for (i=5; i>0; i--) { printf("System should have reset ...\n"); sleep(1); } printf("Error: The watchdog device has failed to reboot the system,\n" " and it may not be suitable for usage with sbd.\n"); /* test should trigger a reboot thus returning is actually bad */ return -1; } /* This duplicates some code from linux/ioprio.h since these are not included * even in linux-kernel-headers. Sucks. See also * /usr/src/linux/Documentation/block/ioprio.txt and ioprio_set(2) */ extern int sys_ioprio_set(int, int, int); int ioprio_set(int which, int who, int ioprio); inline int ioprio_set(int which, int who, int ioprio) { return syscall(__NR_ioprio_set, which, who, ioprio); } enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE, }; enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, }; #define IOPRIO_BITS (16) #define IOPRIO_CLASS_SHIFT (13) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) #define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) #define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) #define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) static void sbd_stack_hogger(unsigned char * inbuf, int kbytes) { unsigned char buf[1024]; if(kbytes <= 0) { return; } if (inbuf == NULL) { memset(buf, HOG_CHAR, sizeof(buf)); } else { memcpy(buf, inbuf, sizeof(buf)); } if (kbytes > 0) { sbd_stack_hogger(buf, kbytes-1); } return; } static void sbd_malloc_hogger(int kbytes) { int j; void**chunks; int chunksize = 1024; if(kbytes <= 0) { return; } /* * We could call mallopt(M_MMAP_MAX, 0) to disable it completely, * but we've already called mlockall() * * We could also call mallopt(M_TRIM_THRESHOLD, -1) to prevent malloc * from giving memory back to the system, but we've already called * mlockall(MCL_FUTURE), so there's no need. */ chunks = malloc(kbytes * sizeof(void *)); if (chunks == NULL) { cl_log(LOG_WARNING, "Could not preallocate chunk array"); return; } for (j=0; j < kbytes; ++j) { chunks[j] = malloc(chunksize); if (chunks[j] == NULL) { cl_log(LOG_WARNING, "Could not preallocate block %d", j); } else { memset(chunks[j], 0, chunksize); } } for (j=0; j < kbytes; ++j) { free(chunks[j]); } free(chunks); } static void sbd_memlock(int stackgrowK, int heapgrowK) { #ifdef _POSIX_MEMLOCK /* * We could call setrlimit(RLIMIT_MEMLOCK,...) with a large * number, but the mcp runs as root and mlock(2) says: * * Since Linux 2.6.9, no limits are placed on the amount of memory * that a privileged process may lock, and this limit instead * governs the amount of memory that an unprivileged process may * lock. */ if (mlockall(MCL_CURRENT|MCL_FUTURE) >= 0) { cl_log(LOG_INFO, "Locked ourselves in memory"); /* Now allocate some extra pages (MCL_FUTURE will ensure they stay around) */ sbd_malloc_hogger(heapgrowK); sbd_stack_hogger(NULL, stackgrowK); } else { cl_perror("Unable to lock ourselves into memory"); } #else cl_log(LOG_ERR, "Unable to lock ourselves into memory"); #endif } static int get_realtime_budget(void) { FILE *f; char fname[PATH_MAX]; int res = -1, lnum = 0, num; char *cgroup = NULL, *namespecs = NULL; snprintf(fname, PATH_MAX, "/proc/%jd/cgroup", (intmax_t)getpid()); f = fopen(fname, "rt"); if (f == NULL) { cl_log(LOG_WARNING, "Can't open cgroup file for pid=%jd", (intmax_t)getpid()); goto exit_res; } while( (num = fscanf(f, "%d:%m[^:]:%m[^\n]\n", &lnum, &namespecs, &cgroup)) !=EOF ) { if (namespecs && strstr(namespecs, "cpuacct")) { free(namespecs); break; } if (cgroup) { free(cgroup); cgroup = NULL; } if (namespecs) { free(namespecs); namespecs = NULL; } /* not to get stuck if format changes */ if ((num < 3) && ((fscanf(f, "%*[^\n]") == EOF) || (fscanf(f, "\n") == EOF))) { break; } } fclose(f); if (cgroup == NULL) { cl_log(LOG_WARNING, "Failed getting cgroup for pid=%jd", (intmax_t)getpid()); goto exit_res; } snprintf(fname, PATH_MAX, "/sys/fs/cgroup/cpu%s/cpu.rt_runtime_us", cgroup); f = fopen(fname, "rt"); if (f == NULL) { cl_log(LOG_WARNING, "cpu.rt_runtime_us existed for root-slice but " "doesn't for '%s'", cgroup); goto exit_res; } if (fscanf(f, "%d", &res) != 1) { cl_log(LOG_WARNING, "failed reading rt-budget from %s", fname); } else { cl_log(LOG_INFO, "slice='%s' has rt-budget=%d", cgroup, res); } fclose(f); exit_res: if (cgroup) { free(cgroup); } return res; } /* stolen from corosync */ static int sbd_move_to_root_cgroup(bool enforce_root_cgroup) { FILE *f; int res = -1; /* * /sys/fs/cgroup is hardcoded, because most of Linux distributions are now * using systemd and systemd uses hardcoded path of cgroup mount point. * * This feature is expected to be removed as soon as systemd gets support * for managing RT configuration. */ f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt"); if (f == NULL) { cl_log(LOG_DEBUG, "cpu.rt_runtime_us doesn't exist -> " "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED"); res = 0; goto exit_res; } fclose(f); if ((!enforce_root_cgroup) && (get_realtime_budget() > 0)) { cl_log(LOG_DEBUG, "looks as if we have rt-budget in the slice we are " "-> skip moving to root-slice"); res = 0; goto exit_res; } f = fopen("/sys/fs/cgroup/cpu/tasks", "w"); if (f == NULL) { cl_log(LOG_WARNING, "Can't open cgroups tasks file for writing"); goto exit_res; } if (fprintf(f, "%jd\n", (intmax_t)getpid()) <= 0) { cl_log(LOG_WARNING, "Can't write sbd pid into cgroups tasks file"); goto close_and_exit_res; } close_and_exit_res: if (fclose(f) != 0) { cl_log(LOG_WARNING, "Can't close cgroups tasks file"); goto exit_res; } exit_res: return (res); } void sbd_make_realtime(int priority, int stackgrowK, int heapgrowK) { if(priority < 0) { return; } do { #ifdef SCHED_RR if (move_to_root_cgroup) { sbd_move_to_root_cgroup(enforce_moving_to_root_cgroup); } { int pmin = sched_get_priority_min(SCHED_RR); int pmax = sched_get_priority_max(SCHED_RR); struct sched_param sp; int pcurrent; if (priority == 0) { priority = pmax; } else if (priority < pmin) { priority = pmin; } else if (priority > pmax) { priority = pmax; } if (sched_getparam(0, &sp) < 0) { cl_perror("Unable to get scheduler priority"); } else if ((pcurrent = sched_getscheduler(0)) < 0) { cl_perror("Unable to get scheduler policy"); } else if ((pcurrent == SCHED_RR) && (sp.sched_priority >= priority)) { cl_log(LOG_INFO, "Stay with priority (%d) for policy SCHED_RR", sp.sched_priority); break; } else { memset(&sp, 0, sizeof(sp)); sp.sched_priority = priority; if (sched_setscheduler(0, SCHED_RR, &sp) < 0) { cl_perror( "Unable to set scheduler policy to SCHED_RR priority %d", priority); } else { cl_log(LOG_INFO, "Scheduler policy is now SCHED_RR priority %d", priority); break; } } } #else cl_log(LOG_ERR, "System does not support updating the scheduler policy"); #endif #ifdef PRIO_PGRP if (setpriority(PRIO_PGRP, 0, INT_MIN) < 0) { cl_perror("Unable to raise the scheduler priority"); } else { cl_log(LOG_INFO, "Scheduler priority raised to the maximum"); } #else cl_perror("System does not support setting the scheduler priority"); #endif } while (0); sbd_memlock(heapgrowK, stackgrowK); } void maximize_priority(void) { if (skip_rt) { cl_log(LOG_INFO, "Not elevating to realtime (-R specified)."); return; } sbd_make_realtime(0, 256, 256); if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 1)) != 0) { cl_perror("ioprio_set() call failed."); } } void sysrq_init(void) { FILE* procf; int c; procf = fopen("/proc/sys/kernel/sysrq", "r"); if (!procf) { cl_perror("cannot open /proc/sys/kernel/sysrq for read."); return; } if (fscanf(procf, "%d", &c) != 1) { cl_perror("Parsing sysrq failed"); c = 0; } fclose(procf); if (c == 1) return; /* 8 for debugging dumps of processes, 128 for reboot/poweroff */ c |= 136; procf = fopen("/proc/sys/kernel/sysrq", "w"); if (!procf) { cl_perror("cannot open /proc/sys/kernel/sysrq for writing"); return; } fprintf(procf, "%d", c); fclose(procf); return; } void sysrq_trigger(char t) { FILE *procf; procf = fopen("/proc/sysrq-trigger", "a"); if (!procf) { cl_perror("Opening sysrq-trigger failed."); return; } cl_log(LOG_INFO, "sysrq-trigger: %c\n", t); fprintf(procf, "%c\n", t); fclose(procf); return; } static void do_exit(char kind, bool do_flush) { /* TODO: Turn debug_mode into a bit field? Delay + kdump for example */ const char *reason = NULL; if (kind == 'c') { cl_log(LOG_NOTICE, "Initiating kdump"); } else if (debug_mode == 1) { cl_log(LOG_WARNING, "Initiating kdump instead of panicking the node (debug mode)"); kind = 'c'; } if (debug_mode == 2) { cl_log(LOG_WARNING, "Shutting down SBD instead of panicking the node (debug mode)"); watchdog_close(true); exit(0); } if (debug_mode == 3) { /* Give the system some time to flush logs to disk before rebooting. */ cl_log(LOG_WARNING, "Delaying node panic by 10s (debug mode)"); watchdog_close(true); sync(); sleep(10); } switch(kind) { case 'b': reason = "reboot"; break; case 'c': reason = "crashdump"; break; case 'o': reason = "off"; break; default: reason = "unknown"; break; } cl_log(LOG_EMERG, "Rebooting system: %s", reason); if (do_flush) { sync(); } if (kind == 'c') { if (timeout_watchdog_crashdump) { if (timeout_watchdog != timeout_watchdog_crashdump) { timeout_watchdog = timeout_watchdog_crashdump; watchdog_init_interval(); } watchdog_close(false); } else { watchdog_close(true); } sysrq_trigger(kind); } else { watchdog_close(false); sysrq_trigger(kind); if (reboot((kind == 'o')?RB_POWER_OFF:RB_AUTOBOOT) < 0) { cl_perror("%s failed", (kind == 'o')?"Poweroff":"Reboot"); } } exit(1); } void do_crashdump(void) { do_exit('c', true); } void do_reset(void) { do_exit('b', true); } void do_off(void) { do_exit('o', true); } void do_timeout_action(void) { do_exit(timeout_sysrq_char, do_flush); } /* * Change directory to the directory our core file needs to go in * Call after you establish the userid you're running under. */ int sbd_cdtocoredir(void) { int rc; static const char *dir = NULL; if (dir == NULL) { dir = CRM_CORE_DIR; } if ((rc=chdir(dir)) < 0) { int errsave = errno; cl_perror("Cannot chdir to [%s]", dir); errno = errsave; } return rc; } pid_t make_daemon(void) { pid_t pid; const char * devnull = "/dev/null"; pid = fork(); if (pid < 0) { cl_log(LOG_ERR, "%s: could not start daemon\n", cmdname); cl_perror("fork"); exit(1); }else if (pid > 0) { return pid; } qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE); /* This is the child; ensure privileges have not been lost. */ maximize_priority(); sysrq_init(); umask(022); close(0); (void)open(devnull, O_RDONLY); close(1); (void)open(devnull, O_WRONLY); close(2); (void)open(devnull, O_WRONLY); sbd_cdtocoredir(); return 0; } void sbd_get_uname(void) { struct utsname uname_buf; int i; if (uname(&uname_buf) < 0) { cl_perror("uname() failed?"); exit(1); } local_uname = strdup(uname_buf.nodename); for (i = 0; i < strlen(local_uname); i++) local_uname[i] = tolower(local_uname[i]); } #define FMT_MAX 256 void sbd_set_format_string(int method, const char *daemon) { int offset = 0; char fmt[FMT_MAX]; struct utsname res; switch(method) { case QB_LOG_STDERR: break; case QB_LOG_SYSLOG: if(daemon && strcmp(daemon, "sbd") != 0) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%10s: ", daemon); } break; default: /* When logging to a file */ if (uname(&res) == 0) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%t [%d] %s %10s: ", getpid(), res.nodename, daemon); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%t [%d] %10s: ", getpid(), daemon); } } if (debug && method >= QB_LOG_STDERR) { offset += snprintf(fmt + offset, FMT_MAX - offset, "(%%-12f:%%5l %%g) %%-7p: %%n: "); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%g %%-7p: %%n: "); } if (method == QB_LOG_SYSLOG) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%b"); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "\t%%b"); } if(offset > 0) { qb_log_format_set(method, fmt); } } void notify_parent(void) { pid_t ppid; union sigval signal_value; memset(&signal_value, 0, sizeof(signal_value)); ppid = getppid(); if (ppid == 1) { /* Our parent died unexpectedly. Triggering * self-fence. */ cl_log(LOG_WARNING, "Our parent is dead."); do_timeout_action(); } switch (servant_health) { case pcmk_health_pending: case pcmk_health_shutdown: case pcmk_health_transient: DBGLOG(LOG_DEBUG, "Not notifying parent: state transient (%d)", servant_health); break; case pcmk_health_unknown: case pcmk_health_unclean: case pcmk_health_noquorum: DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY (%d)", servant_health); sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); break; case pcmk_health_online: DBGLOG(LOG_DEBUG, "Notifying parent: healthy"); sigqueue(ppid, SIG_LIVENESS, signal_value); break; default: DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY %d", servant_health); sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); break; } } void set_servant_health(enum pcmk_health state, int level, char const *format, ...) { if (servant_health != state) { va_list ap; int len = 0; char *string = NULL; servant_health = state; va_start(ap, format); len = vasprintf (&string, format, ap); if(len > 0) { cl_log(level, "%s", string); } va_end(ap); free(string); } } bool sbd_is_disk(struct servants_list_item *servant) { if ((servant != NULL) && (servant->devname != NULL) && (servant->devname[0] == '/')) { return true; } return false; } bool sbd_is_cluster(struct servants_list_item *servant) { if ((servant != NULL) && (servant->devname != NULL) && (strcmp("cluster", servant->devname) == 0)) { return true; } return false; } bool sbd_is_pcmk(struct servants_list_item *servant) { if ((servant != NULL) && (servant->devname != NULL) && (strcmp("pcmk", servant->devname) == 0)) { return true; } return false; } diff --git a/src/sbd-md.c b/src/sbd-md.c index 7a37522..2a237ad 100644 --- a/src/sbd-md.c +++ b/src/sbd-md.c @@ -1,1289 +1,1288 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include "sbd.h" #define SBD_MSG_EMPTY 0x00 #define SBD_MSG_TEST 0x01 #define SBD_MSG_RESET 0x02 #define SBD_MSG_OFF 0x03 #define SBD_MSG_EXIT 0x04 #define SBD_MSG_CRASHDUMP 0x05 #define SLOT_TO_SECTOR(slot) (1+slot*2) #define MBOX_TO_SECTOR(mbox) (2+mbox*2) extern int disk_count; /* These have to match the values in the header of the partition */ static char sbd_magic[8] = "SBD_SBD_"; static char sbd_version = 0x02; struct slot_msg_arg_t { const char* name; const char* msg; }; static signed char cmd2char(const char *cmd) { if (strcmp("clear", cmd) == 0) { return SBD_MSG_EMPTY; } else if (strcmp("test", cmd) == 0) { return SBD_MSG_TEST; } else if (strcmp("reset", cmd) == 0) { return SBD_MSG_RESET; } else if (strcmp("off", cmd) == 0) { return SBD_MSG_OFF; } else if (strcmp("exit", cmd) == 0) { return SBD_MSG_EXIT; } else if (strcmp("crashdump", cmd) == 0) { return SBD_MSG_CRASHDUMP; } return -1; } static const char* char2cmd(const char cmd) { switch (cmd) { case SBD_MSG_EMPTY: return "clear"; break; case SBD_MSG_TEST: return "test"; break; case SBD_MSG_RESET: return "reset"; break; case SBD_MSG_OFF: return "off"; break; case SBD_MSG_EXIT: return "exit"; break; case SBD_MSG_CRASHDUMP: return "crashdump"; break; default: return "undefined"; break; } } static void close_device(struct sbd_context *st) { if (!st) { return; } if (st->ioctx) { io_destroy(st->ioctx); } if (st->devfd >= 0) { close(st->devfd); } free(st->buffer); free(st); } static struct sbd_context * open_device(const char* devname, int loglevel) { struct sbd_context *st; if (!devname) return NULL; st = calloc(1, sizeof(struct sbd_context)); if (!st) { return NULL; } st->devfd = -1; if (io_setup(1, &st->ioctx) != 0) { cl_perror("io_setup failed"); goto out; } st->devfd = open(devname, O_SYNC|O_RDWR|O_DIRECT); if (st->devfd == -1) { if (loglevel == LOG_DEBUG) { DBGLOG(loglevel, "Opening device %s failed.", devname); } else { cl_log(loglevel, "Opening device %s failed.", devname); } goto out; } ioctl(st->devfd, BLKSSZGET, §or_size); if (sector_size == 0) { cl_perror("Get sector size failed.\n"); goto out; } if (posix_memalign(&st->buffer, sector_size, sector_size)) { cl_perror("Couldn't allocate sector-buffer."); goto out; } return st; out: close_device(st); return NULL; } static void * sector_alloc(void) { void *x; x = calloc(1, sector_size); if (!x) { exit(1); } return x; } static int sector_io(struct sbd_context *st, int sector, void *data, int rw) { struct timespec timeout; struct io_event event; struct iocb *ios[1] = { &st->io }; long r; timeout.tv_sec = timeout_io; timeout.tv_nsec = 0; memset(&st->io, 0, sizeof(struct iocb)); if (rw) { memcpy(st->buffer, data, sector_size); io_prep_pwrite(&st->io, st->devfd, st->buffer, sector_size, (long long) sector_size * sector); } else { memset(st->buffer, 0, sector_size); io_prep_pread(&st->io, st->devfd, st->buffer, sector_size, (long long) sector_size * sector); } if (io_submit(st->ioctx, 1, ios) != 1) { cl_log(LOG_ERR, "Failed to submit IO request! (rw=%d)", rw); return -1; } errno = 0; r = io_getevents(st->ioctx, 1L, 1L, &event, &timeout); if (r < 0 ) { cl_log(LOG_ERR, "Failed to retrieve IO events (rw=%d)", rw); return -1; } else if (r < 1L) { cl_log(LOG_INFO, "Cancelling IO request due to timeout (rw=%d, r=%ld)", rw, r); r = io_cancel(st->ioctx, ios[0], &event); if (r) { DBGLOG(LOG_INFO, "Could not cancel IO request (rw=%d)", rw); /* Doesn't really matter, debugging information. */ } return -1; } else if (r > 1L) { cl_log(LOG_ERR, "More than one IO was returned (r=%ld)", r); return -1; } /* IO is happy */ if (event.res == sector_size) { if (!rw) { memcpy(data, st->buffer, sector_size); } return 0; } else { cl_log(LOG_ERR, "Short IO (rw=%d, res=%lu, sector_size=%d)", rw, event.res, sector_size); return -1; } } static int sector_write(struct sbd_context *st, int sector, void *data) { return sector_io(st, sector, data, 1); } static int sector_read(struct sbd_context *st, int sector, void *data) { return sector_io(st, sector, data, 0); } static int slot_read(struct sbd_context *st, int slot, struct sector_node_s *s_node) { return sector_read(st, SLOT_TO_SECTOR(slot), s_node); } static int slot_write(struct sbd_context *st, int slot, struct sector_node_s *s_node) { return sector_write(st, SLOT_TO_SECTOR(slot), s_node); } static int mbox_write(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox) { return sector_write(st, MBOX_TO_SECTOR(mbox), s_mbox); } static int mbox_read(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox) { return sector_read(st, MBOX_TO_SECTOR(mbox), s_mbox); } static int mbox_write_verify(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox) { void *data; int rc = 0; if (sector_write(st, MBOX_TO_SECTOR(mbox), s_mbox) < 0) return -1; data = sector_alloc(); if (sector_read(st, MBOX_TO_SECTOR(mbox), data) < 0) { rc = -1; goto out; } if (memcmp(s_mbox, data, sector_size) != 0) { cl_log(LOG_ERR, "Write verification failed!"); rc = -1; goto out; } rc = 0; out: free(data); return rc; } static int header_write(struct sbd_context *st, struct sector_header_s *s_header) { s_header->sector_size = htonl(s_header->sector_size); s_header->timeout_watchdog = htonl(s_header->timeout_watchdog); s_header->timeout_allocate = htonl(s_header->timeout_allocate); s_header->timeout_loop = htonl(s_header->timeout_loop); s_header->timeout_msgwait = htonl(s_header->timeout_msgwait); return sector_write(st, 0, s_header); } static int header_read(struct sbd_context *st, struct sector_header_s *s_header) { if (sector_read(st, 0, s_header) < 0) return -1; s_header->sector_size = ntohl(s_header->sector_size); s_header->timeout_watchdog = ntohl(s_header->timeout_watchdog); s_header->timeout_allocate = ntohl(s_header->timeout_allocate); s_header->timeout_loop = ntohl(s_header->timeout_loop); s_header->timeout_msgwait = ntohl(s_header->timeout_msgwait); /* This sets the global defaults: */ timeout_watchdog = s_header->timeout_watchdog; timeout_allocate = s_header->timeout_allocate; timeout_loop = s_header->timeout_loop; timeout_msgwait = s_header->timeout_msgwait; return 0; } static int valid_header(const struct sector_header_s *s_header) { if (memcmp(s_header->magic, sbd_magic, sizeof(s_header->magic)) != 0) { cl_log(LOG_ERR, "Header magic does not match."); return -1; } if (s_header->version != sbd_version) { cl_log(LOG_ERR, "Header version does not match."); return -1; } if (s_header->sector_size != sector_size) { cl_log(LOG_ERR, "Header sector size does not match."); return -1; } return 0; } static struct sector_header_s * header_get(struct sbd_context *st) { struct sector_header_s *s_header; s_header = sector_alloc(); if (header_read(st, s_header) < 0) { cl_log(LOG_ERR, "Unable to read header from device %d", st->devfd); free(s_header); return NULL; } if (valid_header(s_header) < 0) { cl_log(LOG_ERR, "header on device %d is not valid.", st->devfd); free(s_header); return NULL; } /* cl_log(LOG_INFO, "Found version %d header with %d slots", s_header->version, s_header->slots); */ return s_header; } static int header_dump(struct sbd_context *st) { struct sector_header_s *s_header; char uuid[37]; s_header = header_get(st); if (s_header == NULL) return -1; printf("Header version : %u.%u\n", s_header->version, s_header->minor_version); if (s_header->minor_version > 0) { uuid_unparse_lower(s_header->uuid, uuid); printf("UUID : %s\n", uuid); } printf("Number of slots : %u\n", s_header->slots); printf("Sector size : %lu\n", (unsigned long)s_header->sector_size); printf("Timeout (watchdog) : %lu\n", (unsigned long)s_header->timeout_watchdog); printf("Timeout (allocate) : %lu\n", (unsigned long)s_header->timeout_allocate); printf("Timeout (loop) : %lu\n", (unsigned long)s_header->timeout_loop); printf("Timeout (msgwait) : %lu\n", (unsigned long)s_header->timeout_msgwait); free(s_header); return 0; } static int init_device(struct sbd_context *st) { struct sector_header_s *s_header; struct sector_node_s *s_node; struct sector_mbox_s *s_mbox; char uuid[37]; int i; int rc = 0; s_header = sector_alloc(); s_node = sector_alloc(); s_mbox = sector_alloc(); memcpy(s_header->magic, sbd_magic, sizeof(s_header->magic)); s_header->version = sbd_version; s_header->slots = 255; s_header->sector_size = sector_size; s_header->timeout_watchdog = timeout_watchdog; s_header->timeout_allocate = timeout_allocate; s_header->timeout_loop = timeout_loop; s_header->timeout_msgwait = timeout_msgwait; s_header->minor_version = 1; uuid_generate(s_header->uuid); uuid_unparse_lower(s_header->uuid, uuid); cl_log(LOG_INFO, "Creating version %d.%d header on device %d (uuid: %s)", s_header->version, s_header->minor_version, st->devfd, uuid); fprintf(stdout, "Creating version %d.%d header on device %d (uuid: %s)\n", s_header->version, s_header->minor_version, st->devfd, uuid); if (header_write(st, s_header) < 0) { rc = -1; goto out; } cl_log(LOG_INFO, "Initializing %d slots on device %d", s_header->slots, st->devfd); fprintf(stdout, "Initializing %d slots on device %d\n", s_header->slots, st->devfd); for (i=0;i < s_header->slots;i++) { if (slot_write(st, i, s_node) < 0) { rc = -1; goto out; } if (mbox_write(st, i, s_mbox) < 0) { rc = -1; goto out; } } -out: free(s_node); +out: free(s_mbox); + free(s_node); free(s_header); - free(s_mbox); return(rc); } /* Check if there already is a slot allocated to said name; returns the * slot number. If not found, returns -1. * This is necessary because slots might not be continuous. */ static int slot_lookup(struct sbd_context *st, const struct sector_header_s *s_header, const char *name) { struct sector_node_s *s_node = NULL; int i; int rc = -1; if (!name) { cl_log(LOG_ERR, "slot_lookup(): No name specified.\n"); goto out; } s_node = sector_alloc(); for (i=0; i < s_header->slots; i++) { if (slot_read(st, i, s_node) < 0) { rc = -2; goto out; } if (s_node->in_use != 0) { if (strncasecmp(s_node->name, name, SECTOR_NAME_MAX) == 0) { DBGLOG(LOG_INFO, "%s owns slot %d", name, i); rc = i; goto out; } } } out: free(s_node); return rc; } static int slot_unused(struct sbd_context *st, const struct sector_header_s *s_header) { struct sector_node_s *s_node; int i; int rc = -1; s_node = sector_alloc(); for (i=0; i < s_header->slots; i++) { if (slot_read(st, i, s_node) < 0) { rc = -1; goto out; } if (s_node->in_use == 0) { rc = i; goto out; } } out: free(s_node); return rc; } static int slot_allocate(struct sbd_context *st, const char *name) { struct sector_header_s *s_header = NULL; struct sector_node_s *s_node = NULL; struct sector_mbox_s *s_mbox = NULL; int i; int rc = 0; if (!name) { cl_log(LOG_ERR, "slot_allocate(): No name specified.\n"); fprintf(stderr, "slot_allocate(): No name specified.\n"); rc = -1; goto out; } s_header = header_get(st); if (!s_header) { rc = -1; goto out; } s_node = sector_alloc(); s_mbox = sector_alloc(); while (1) { i = slot_lookup(st, s_header, name); if ((i >= 0) || (i == -2)) { /* -1 is "no slot found", in which case we * proceed to allocate a new one. * -2 is "read error during lookup", in which * case we error out too * >= 0 is "slot already allocated" */ rc = i; goto out; } i = slot_unused(st, s_header); if (i >= 0) { cl_log(LOG_INFO, "slot %d is unused - trying to own", i); fprintf(stdout, "slot %d is unused - trying to own\n", i); memset(s_node, 0, sizeof(*s_node)); s_node->in_use = 1; strncpy(s_node->name, name, SECTOR_NAME_MAX); if (slot_write(st, i, s_node) < 0) { rc = -1; goto out; } sleep(timeout_allocate); } else { cl_log(LOG_ERR, "No more free slots."); fprintf(stderr, "No more free slots.\n"); rc = -1; goto out; } } -out: free(s_node); +out: free(s_mbox); + free(s_node); free(s_header); - free(s_mbox); return(rc); } static int slot_list(struct sbd_context *st) { struct sector_header_s *s_header = NULL; struct sector_node_s *s_node = NULL; struct sector_mbox_s *s_mbox = NULL; int i; int rc = 0; s_header = header_get(st); if (!s_header) { rc = -1; goto out; } s_node = sector_alloc(); s_mbox = sector_alloc(); for (i=0; i < s_header->slots; i++) { if (slot_read(st, i, s_node) < 0) { rc = -1; goto out; } if (s_node->in_use > 0) { if (mbox_read(st, i, s_mbox) < 0) { rc = -1; goto out; } printf("%d\t%s\t%s\t%s\n", i, s_node->name, char2cmd(s_mbox->cmd), s_mbox->from); } } out: free(s_mbox); free(s_node); free(s_header); return rc; } static int slot_msg(struct sbd_context *st, const char *name, const char *cmd) { struct sector_header_s *s_header = NULL; struct sector_mbox_s *s_mbox = NULL; int mbox; int rc = 0; char uuid[37]; if (!name || !cmd) { cl_log(LOG_ERR, "slot_msg(): No recipient / cmd specified.\n"); rc = -1; goto out; } s_header = header_get(st); if (!s_header) { rc = -1; goto out; } if (strcmp(name, "LOCAL") == 0) { name = local_uname; } if (s_header->minor_version > 0) { uuid_unparse_lower(s_header->uuid, uuid); cl_log(LOG_INFO, "Device UUID: %s", uuid); } mbox = slot_lookup(st, s_header, name); if (mbox < 0) { cl_log(LOG_ERR, "slot_msg(): No slot found for %s.", name); rc = -1; goto out; } s_mbox = sector_alloc(); s_mbox->cmd = cmd2char(cmd); if (s_mbox->cmd < 0) { cl_log(LOG_ERR, "slot_msg(): Invalid command %s.", cmd); rc = -1; goto out; } strncpy(s_mbox->from, local_uname, SECTOR_NAME_MAX); cl_log(LOG_INFO, "Writing %s to node slot %s", cmd, name); if (mbox_write_verify(st, mbox, s_mbox) < -1) { rc = -1; goto out; } if (strcasecmp(cmd, "exit") != 0) { cl_log(LOG_INFO, "Messaging delay: %d", (int)timeout_msgwait); sleep(timeout_msgwait); } cl_log(LOG_INFO, "%s successfully delivered to %s", cmd, name); out: free(s_mbox); free(s_header); return rc; } static int slot_ping(struct sbd_context *st, const char *name) { struct sector_header_s *s_header = NULL; struct sector_mbox_s *s_mbox = NULL; int mbox; int waited = 0; int rc = 0; if (!name) { cl_log(LOG_ERR, "slot_ping(): No recipient specified.\n"); rc = -1; goto out; } s_header = header_get(st); if (!s_header) { rc = -1; goto out; } if (strcmp(name, "LOCAL") == 0) { name = local_uname; } mbox = slot_lookup(st, s_header, name); if (mbox < 0) { cl_log(LOG_ERR, "slot_msg(): No slot found for %s.", name); rc = -1; goto out; } s_mbox = sector_alloc(); s_mbox->cmd = SBD_MSG_TEST; strncpy(s_mbox->from, local_uname, SECTOR_NAME_MAX); DBGLOG(LOG_DEBUG, "Pinging node %s", name); if (mbox_write(st, mbox, s_mbox) < -1) { rc = -1; goto out; } rc = -1; while (waited <= timeout_msgwait) { if (mbox_read(st, mbox, s_mbox) < 0) break; if (s_mbox->cmd != SBD_MSG_TEST) { rc = 0; break; } sleep(1); waited++; } if (rc == 0) { cl_log(LOG_DEBUG, "%s successfully pinged.", name); } else { cl_log(LOG_ERR, "%s failed to ping.", name); } out: free(s_mbox); free(s_header); return rc; } int init_devices(struct servants_list_item *servants) { int rc = 0; struct sbd_context *st; struct servants_list_item *s; for (s = servants; s; s = s->next) { fprintf(stdout, "Initializing device %s\n", s->devname); st = open_device(s->devname, LOG_ERR); if (!st) { return -1; } rc = init_device(st); close_device(st); if (rc == -1) { fprintf(stderr, "Failed to init device %s\n", s->devname); return rc; } fprintf(stdout, "Device %s is initialized.\n", s->devname); } fprintf(stdout, "Did you check sbd service down on all nodes before? If not do so now and restart afterwards.\n"); return 0; } static int slot_msg_wrapper(const char* devname, int mode, const void* argp) { int rc = 0; struct sbd_context *st; const struct slot_msg_arg_t* arg = (const struct slot_msg_arg_t*)argp; st = open_device(devname, LOG_WARNING); if (!st) return -1; cl_log(LOG_INFO, "Delivery process handling %s", devname); rc = slot_msg(st, arg->name, arg->msg); close_device(st); return rc; } static int slot_ping_wrapper(const char* devname, int mode, const void* argp) { int rc = 0; const char* name = (const char*)argp; struct sbd_context *st; st = open_device(devname, LOG_WARNING); if (!st) return -1; rc = slot_ping(st, name); close_device(st); return rc; } int allocate_slots(const char *name, struct servants_list_item *servants) { int rc = 0; struct sbd_context *st; struct servants_list_item *s; for (s = servants; s; s = s->next) { fprintf(stdout, "Trying to allocate slot for %s on device %s.\n", name, s->devname); st = open_device(s->devname, LOG_WARNING); if (!st) { return -1; } rc = slot_allocate(st, name); close_device(st); if (rc < 0) return rc; fprintf(stdout, "Slot for %s has been allocated on %s.\n", name, s->devname); } return 0; } int list_slots(struct servants_list_item *servants) { int rc = 0; struct servants_list_item *s; struct sbd_context *st; for (s = servants; s; s = s->next) { int rv = 0; st = open_device(s->devname, LOG_WARNING); if (!st) { rc = -1; fprintf(stderr, "== disk %s unreadable!\n", s->devname); continue; } rv = slot_list(st); close_device(st); if (rv == -1) { rc = -1; fprintf(stderr, "== Slots on disk %s NOT dumped\n", s->devname); } } return rc; } int ping_via_slots(const char *name, struct servants_list_item *servants) { int sig = 0; pid_t pid = 0; int status = 0; int servants_finished = 0; sigset_t procmask; siginfo_t sinfo; struct servants_list_item *s; sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigprocmask(SIG_BLOCK, &procmask, NULL); for (s = servants; s; s = s->next) { if(sbd_is_disk(s)) { s->pid = assign_servant(s->devname, &slot_ping_wrapper, 0, (const void*)name); } } while (servants_finished < disk_count) { sig = sigwaitinfo(&procmask, &sinfo); if (sig == SIGCHLD) { while ((pid = wait(&status))) { if (pid == -1 && errno == ECHILD) { break; } else { s = lookup_servant_by_pid(pid); if (sbd_is_disk(s)) { servants_finished++; } } } } } return 0; } int quorum_write(int good_servants) { return (good_servants > disk_count/2); } int messenger(const char *name, const char *msg, struct servants_list_item *servants) { int sig = 0; pid_t pid = 0; int status = 0; int servants_finished = 0; int successful_delivery = 0; sigset_t procmask; siginfo_t sinfo; struct servants_list_item *s; struct slot_msg_arg_t slot_msg_arg = {name, msg}; sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigprocmask(SIG_BLOCK, &procmask, NULL); for (s = servants; s; s = s->next) { s->pid = assign_servant(s->devname, &slot_msg_wrapper, 0, &slot_msg_arg); } while (!(quorum_write(successful_delivery) || (servants_finished == disk_count))) { sig = sigwaitinfo(&procmask, &sinfo); if (sig == SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } else { servants_finished++; if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { DBGLOG(LOG_INFO, "Process %d succeeded.", (int)pid); successful_delivery++; } else { cl_log(LOG_WARNING, "Process %d failed to deliver!", (int)pid); } } } } } if (quorum_write(successful_delivery)) { cl_log(LOG_INFO, "Message successfully delivered."); return 0; } else { cl_log(LOG_ERR, "Message is not delivered via more then a half of devices"); return -1; } } unsigned long get_first_msgwait(struct servants_list_item *servants) { unsigned long msgwait = 0; struct servants_list_item *s = servants; for (s = servants; s; s = s->next) { struct sbd_context *st; struct sector_header_s *s_header; st = open_device(s->devname, LOG_WARNING); if (!st) { continue; } s_header = header_get(st); if (s_header != NULL) { msgwait = (unsigned long)s_header->timeout_msgwait; close_device(st); free(s_header); return msgwait; } close_device(st); } return msgwait; } int dump_headers(struct servants_list_item *servants) { int rc = 0; struct servants_list_item *s = servants; struct sbd_context *st; for (s = servants; s; s = s->next) { int rv; fprintf(stdout, "==Dumping header on disk %s\n", s->devname); st = open_device(s->devname, LOG_WARNING); if (st) { rv = header_dump(st); close_device(st); } else { fprintf(stderr, "== disk %s unreadable!\n", s->devname); rv = -1; } if (rv == -1) { rc = -1; fprintf(stderr, "==Header on disk %s NOT dumped\n", s->devname); } else { fprintf(stdout, "==Header on disk %s is dumped\n", s->devname); } } return rc; } void open_any_device(struct servants_list_item *servants) { struct sector_header_s *hdr_cur = NULL; struct timespec t_0; int t_wait = 0; bool logged_once = false; clock_gettime(CLOCK_MONOTONIC, &t_0); while (!hdr_cur && t_wait < timeout_startup) { struct timespec t_now; struct servants_list_item* s; for (s = servants; s; s = s->next) { struct sbd_context *st = open_device(s->devname, LOG_DEBUG); if (!st) { if (logged_once == false) { cl_log(LOG_WARNING, "Failed to open %s. " "Trying any other configured devices, " "otherwise retrying every %ds within %ds", s->devname, timeout_loop, timeout_startup); logged_once = true; } continue; } hdr_cur = header_get(st); close_device(st); if (hdr_cur) { break; } else { if (logged_once == false) { cl_log(LOG_WARNING, "Failed to read header from %s. " "Trying any other configured devices, " "otherwise retrying every %ds within %ds", s->devname, timeout_loop, timeout_startup); logged_once = true; } } } clock_gettime(CLOCK_MONOTONIC, &t_now); t_wait = t_now.tv_sec - t_0.tv_sec; if (!hdr_cur) { sleep(timeout_loop); } } if (hdr_cur) { timeout_watchdog = hdr_cur->timeout_watchdog; timeout_allocate = hdr_cur->timeout_allocate; timeout_loop = hdr_cur->timeout_loop; timeout_msgwait = hdr_cur->timeout_msgwait; } else { cl_log(LOG_ERR, "No devices were available at start-up within %i seconds.", timeout_startup); exit(1); } free(hdr_cur); return; } /* ::-::-::-::-::-::-::-::-::-::-::-::-:: Begin disk based servant code ::-::-::-::-::-::-::-::-::-::-::-::-:: */ static int servant_check_timeout_inconsistent(struct sector_header_s *hdr) { if (timeout_watchdog != hdr->timeout_watchdog) { cl_log(LOG_WARNING, "watchdog timeout: %d versus %d on this device", (int)timeout_watchdog, (int)hdr->timeout_watchdog); return -1; } if (timeout_allocate != hdr->timeout_allocate) { cl_log(LOG_WARNING, "allocate timeout: %d versus %d on this device", (int)timeout_allocate, (int)hdr->timeout_allocate); return -1; } if (timeout_loop != hdr->timeout_loop) { cl_log(LOG_WARNING, "loop timeout: %d versus %d on this device", (int)timeout_loop, (int)hdr->timeout_loop); return -1; } if (timeout_msgwait != hdr->timeout_msgwait) { cl_log(LOG_WARNING, "msgwait timeout: %d versus %d on this device", (int)timeout_msgwait, (int)hdr->timeout_msgwait); return -1; } return 0; } int servant_md(const char *diskname, int mode, const void* argp) { struct sector_mbox_s *s_mbox = NULL; struct sector_node_s *s_node = NULL; struct sector_header_s *s_header = NULL; int mbox; int rc = 0; time_t t0, t1, latency; union sigval signal_value; sigset_t servant_masks; struct sbd_context *st; pid_t ppid; char uuid[37]; const struct servants_list_item *s = argp; cl_log(LOG_INFO, "Servant starting for device %s", diskname); /* Block most of the signals */ sigfillset(&servant_masks); sigdelset(&servant_masks, SIGKILL); sigdelset(&servant_masks, SIGFPE); sigdelset(&servant_masks, SIGILL); sigdelset(&servant_masks, SIGSEGV); sigdelset(&servant_masks, SIGBUS); sigdelset(&servant_masks, SIGALRM); /* FIXME: check error */ sigprocmask(SIG_SETMASK, &servant_masks, NULL); st = open_device(diskname, LOG_WARNING); if (!st) { exit(EXIT_MD_SERVANT_IO_FAIL); } s_header = header_get(st); if (!s_header) { cl_log(LOG_ERR, "Not a valid header on %s", diskname); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (servant_check_timeout_inconsistent(s_header) < 0) { cl_log(LOG_ERR, "Timeouts on %s do not match first device", diskname); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (s_header->minor_version > 0) { uuid_unparse_lower(s_header->uuid, uuid); cl_log(LOG_INFO, "Device %s uuid: %s", diskname, uuid); } mbox = slot_allocate(st, local_uname); if (mbox < 0) { cl_log(LOG_ERR, "No slot allocated, and automatic allocation failed for disk %s.", diskname); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } s_node = sector_alloc(); if (slot_read(st, mbox, s_node) < 0) { cl_log(LOG_ERR, "Unable to read node entry on %s", diskname); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } cl_log(LOG_NOTICE, "Monitoring slot %d on disk %s", mbox, diskname); if (s_header->minor_version == 0) { set_proc_title("sbd: watcher: %s - slot: %d", diskname, mbox); } else { set_proc_title("sbd: watcher: %s - slot: %d - uuid: %s", diskname, mbox, uuid); } s_mbox = sector_alloc(); if (s->first_start) { if (mode > 0) { if (mbox_read(st, mbox, s_mbox) < 0) { cl_log(LOG_ERR, "mbox read failed during start-up in servant."); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (s_mbox->cmd != SBD_MSG_EXIT && s_mbox->cmd != SBD_MSG_EMPTY) { /* Not a clean stop. Abort start-up */ cl_log(LOG_WARNING, "Found fencing message - aborting start-up. Manual intervention required!"); ppid = getppid(); sigqueue(ppid, SIG_EXITREQ, signal_value); rc = 0; goto out; } } DBGLOG(LOG_INFO, "First servant start - zeroing inbox"); memset(s_mbox, 0, sizeof(*s_mbox)); if (mbox_write(st, mbox, s_mbox) < 0) { rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } } memset(&signal_value, 0, sizeof(signal_value)); while (1) { struct sector_header_s *s_header_retry = NULL; struct sector_node_s *s_node_retry = NULL; t0 = time(NULL); sleep(timeout_loop); ppid = getppid(); if (ppid == 1) { /* Our parent died unexpectedly. Triggering * self-fence. */ do_timeout_action(); } /* These attempts are, by definition, somewhat racy. If * the device is wiped out or corrupted between here and * us reading our mbox, there is nothing we can do about * that. But at least we tried. */ s_header_retry = header_get(st); if (!s_header_retry) { cl_log(LOG_ERR, "No longer found a valid header on %s", diskname); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (memcmp(s_header, s_header_retry, sizeof(*s_header)) != 0) { cl_log(LOG_ERR, "Header on %s changed since start-up!", diskname); free(s_header_retry); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } free(s_header_retry); s_node_retry = sector_alloc(); if (slot_read(st, mbox, s_node_retry) < 0) { cl_log(LOG_ERR, "slot read failed in servant."); free(s_node_retry); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (memcmp(s_node, s_node_retry, sizeof(*s_node)) != 0) { cl_log(LOG_ERR, "Node entry on %s changed since start-up!", diskname); free(s_node_retry); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } free(s_node_retry); if (mbox_read(st, mbox, s_mbox) < 0) { cl_log(LOG_ERR, "mbox read failed in servant."); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (s_mbox->cmd > 0) { cl_log(LOG_NOTICE, "Received command %s from %s on disk %s", char2cmd(s_mbox->cmd), s_mbox->from, diskname); switch (s_mbox->cmd) { case SBD_MSG_TEST: memset(s_mbox, 0, sizeof(*s_mbox)); mbox_write(st, mbox, s_mbox); sigqueue(ppid, SIG_TEST, signal_value); break; case SBD_MSG_RESET: rc = EXIT_MD_SERVANT_REQUEST_RESET; goto out; case SBD_MSG_OFF: rc = EXIT_MD_SERVANT_REQUEST_SHUTOFF; goto out; case SBD_MSG_EXIT: sigqueue(ppid, SIG_EXITREQ, signal_value); break; case SBD_MSG_CRASHDUMP: rc = EXIT_MD_SERVANT_REQUEST_CRASHDUMP; goto out; default: /* FIXME: An "unknown" message might result from a partial write. log it and clear the slot. */ cl_log(LOG_ERR, "Unknown message on disk %s", diskname); memset(s_mbox, 0, sizeof(*s_mbox)); mbox_write(st, mbox, s_mbox); break; } } sigqueue(ppid, SIG_LIVENESS, signal_value); t1 = time(NULL); latency = t1 - t0; if (timeout_watchdog_warn && (latency > timeout_watchdog_warn)) { cl_log(LOG_WARNING, "Latency: %ds exceeded watchdog warning timeout %ds on disk %s", (int)latency, (int)timeout_watchdog_warn, diskname); } else if (debug) { DBGLOG(LOG_DEBUG, "Latency: %ds on disk %s", (int)latency, diskname); } } out: - free(s_header); free(s_node); free(s_mbox); + free(s_header); close_device(st); exit(rc); } -