Page MenuHomeClusterLabs Projects

No OneTemporary

diff --git a/src/sbd-cluster.c b/src/sbd-cluster.c
index ae4750e..c7328af 100644
--- a/src/sbd-cluster.c
+++ b/src/sbd-cluster.c
@@ -1,552 +1,553 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* Based on crm_mon.c, which was:
* Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <config.h>
#include <crm_config.h>
#include <crm/cluster.h>
#include <crm/common/mainloop.h>
#if CHECK_TWO_NODE
#include <glib-unix.h>
#endif
#include "sbd.h"
//undef SUPPORT_PLUGIN
//define SUPPORT_PLUGIN 1
/* Set once find_pacemaker_remote() has located a Pacemaker Remote process;
 * from then on notify_timer_cb() uses sbd_remote_check(). */
static bool remote_node = false;
/* PID recorded by find_pacemaker_remote(); 0 until one has been found. */
static pid_t remoted_pid = 0;
/* Delay between connection attempts in sbd_membership_connect(), in ms. */
static int reconnect_msec = 1000;
/* Main loop created and run by servant_cluster(). */
static GMainLoop *mainloop = NULL;
/* Source id returned by g_timeout_add() for notify_timer_cb(). */
static guint notify_timer = 0;
/* Cluster connection handle passed to crm_cluster_connect(). */
static crm_cluster_t cluster;
/* Forward declarations for helpers defined further down in this file. */
static gboolean sbd_remote_check(gpointer user_data);
static long unsigned int find_pacemaker_remote(void);
static void sbd_membership_destroy(gpointer user_data);
#if SUPPORT_PLUGIN
/*
 * CPG deliver callback installed when plugin support is compiled in.
 *
 * A message with a non-zero length marks the servant as online, an empty
 * one marks it unclean.  The parent is notified in both cases.  Only
 * msg_len is inspected; the other arguments are unused.
 */
static void
sbd_plugin_membership_dispatch(cpg_handle_t handle,
                               const struct cpg_name *groupName,
                               uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
{
    if (msg_len == 0) {
        set_servant_health(pcmk_health_unclean, LOG_WARNING,
                           "Broken %s message", name_for_cluster_type(get_cluster_type()));
    } else {
        set_servant_health(pcmk_health_online, LOG_INFO,
                           "Connected to %s", name_for_cluster_type(get_cluster_type()));
    }

    notify_parent();
}
#endif
#if SUPPORT_COROSYNC
static bool two_node = false;
static bool ever_seen_both = false;
static int cpg_membership_entries = -1;
#if CHECK_TWO_NODE
#include <corosync/cmap.h>
static cmap_handle_t cmap_handle = 0;
static cmap_track_handle_t track_handle = 0;
static GSource *cmap_source = NULL;
#endif
/*
 * Re-evaluate the servant health from the last known CPG member count
 * (cpg_membership_entries) and the two_node / ever_seen_both flags.
 *
 *  - no members known            -> unclean
 *  - two-node cluster that has seen both nodes but now sees only
 *    one member                  -> noquorum
 *  - anything else               -> online
 *
 * ever_seen_both latches to true the first time more than one member is
 * observed.
 */
void
sbd_cpg_membership_health_update()
{
    bool lone_survivor;

    if (cpg_membership_entries <= 0) {
        set_servant_health(pcmk_health_unclean, LOG_WARNING,
                           "Empty %s membership", name_for_cluster_type(get_cluster_type()));
        return;
    }

    lone_survivor = two_node && ever_seen_both && (cpg_membership_entries == 1);

    if (lone_survivor) {
        /* Asking votequorum for the number of votes would be the
         * alternative.  Taking the number of active nodes from
         * pacemaker's cpg avoids linking yet another library, needs
         * less code, and spares us from combining data from three
         * sources (cmap, cpg & votequorum) in a potentially racy
         * environment.
         */
        set_servant_health(pcmk_health_noquorum, LOG_WARNING,
                           "Connected to %s but requires both nodes present",
                           name_for_cluster_type(get_cluster_type()));
    } else {
        set_servant_health(pcmk_health_online, LOG_INFO,
                           "Connected to %s (%u members)",
                           name_for_cluster_type(get_cluster_type()),
                           cpg_membership_entries);
    }

    if (cpg_membership_entries > 1) {
        ever_seen_both = true;
    }
}
/*
 * CPG configuration-change callback: remember the current number of group
 * members, recompute the servant health from it and notify the parent.
 * Only member_list_entries is used; the individual lists are ignored.
 */
void
sbd_cpg_membership_dispatch(cpg_handle_t handle,
                            const struct cpg_name *groupName,
                            const struct cpg_address *member_list, size_t member_list_entries,
                            const struct cpg_address *left_list, size_t left_list_entries,
                            const struct cpg_address *joined_list, size_t joined_list_entries)
{
    cpg_membership_entries = (int) member_list_entries;

    sbd_cpg_membership_health_update();
    notify_parent();
}
#if CHECK_TWO_NODE
/*
 * cmap tracker callback registered for "quorum.two_node".
 *
 * Events whose new value is not a uint8 are ignored.  ADD/MODIFY copy the
 * new value into two_node, DELETE clears it; any other event is ignored.
 * After an accepted event the health is recomputed and the parent notified.
 */
static void sbd_cmap_notify_fn(
    cmap_handle_t cmap_handle,
    cmap_track_handle_t cmap_track_handle,
    int32_t event,
    const char *key_name,
    struct cmap_notify_value new_val,
    struct cmap_notify_value old_val,
    void *user_data)
{
    if (new_val.type != CMAP_VALUETYPE_UINT8) {
        return;
    }

    if ((event == CMAP_TRACK_ADD) || (event == CMAP_TRACK_MODIFY)) {
        two_node = *((const uint8_t *) new_val.data);
    } else if (event == CMAP_TRACK_DELETE) {
        two_node = false;
    } else {
        return;
    }

    sbd_cpg_membership_health_update();
    notify_parent();
}
/*
 * Main loop callback attached to the cmap file-descriptor source in
 * sbd_get_two_node(): dispatches all pending cmap events.
 * Always returns TRUE; user_data is unused.
 */
static gboolean
cmap_dispatch_callback (gpointer user_data)
{
cmap_dispatch(cmap_handle, CS_DISPATCH_ALL);
return TRUE;
}
/*
 * Read "quorum.two_node" from cmap into two_node and, on first use (no
 * track_handle yet), set up change tracking for that key: initialize
 * cmap, add the tracker with sbd_cmap_notify_fn() as callback and attach
 * the cmap file descriptor to the default main context.
 *
 * Returns TRUE when the setup is in place, whether or not the key is
 * currently present in cmap.  Returns FALSE if any setup step failed; in
 * that case whatever was set up so far is torn down again under "out".
 */
static gboolean
sbd_get_two_node(void)
{
uint8_t two_node_u8 = 0;
int cmap_fd;
/* one-time setup; skipped once a tracker handle exists */
if (!track_handle) {
if (cmap_initialize(&cmap_handle) != CS_OK) {
cl_log(LOG_WARNING, "Cannot initialize CMAP service\n");
goto out;
}
if (cmap_track_add(cmap_handle, "quorum.two_node",
CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
cl_log(LOG_WARNING, "Failed adding CMAP tracker for 2Node-mode\n");
goto out;
}
/* add the tracker to mainloop */
if (cmap_fd_get(cmap_handle, &cmap_fd) != CS_OK) {
cl_log(LOG_WARNING, "Failed to get a file handle for cmap\n");
goto out;
}
if (!(cmap_source = g_unix_fd_source_new (cmap_fd, G_IO_IN))) {
cl_log(LOG_WARNING, "Couldn't create source for cmap\n");
goto out;
}
g_source_set_callback(cmap_source, cmap_dispatch_callback, NULL, NULL);
g_source_attach(cmap_source, NULL);
}
/* fetch the current value; a missing key leaves two_node untouched */
if (cmap_get_uint8(cmap_handle, "quorum.two_node", &two_node_u8) == CS_OK) {
- cl_log(LOG_NOTICE, "Corosync is%s in 2Node-mode", two_node_u8?"":" not");
+ cl_log(two_node_u8? LOG_NOTICE : LOG_INFO,
+ "Corosync is%s in 2Node-mode", two_node_u8?"":" not");
two_node = two_node_u8;
} else {
- cl_log(LOG_NOTICE, "quorum.two_node present in cmap\n");
+ cl_log(LOG_INFO, "quorum.two_node not present in cmap\n");
}
return TRUE;
out:
/* failure path: release in reverse order of acquisition */
if (cmap_source) {
g_source_destroy(cmap_source);
cmap_source = NULL;
}
if (track_handle) {
cmap_track_delete(cmap_handle, track_handle);
track_handle = 0;
}
if (cmap_handle) {
cmap_finalize(cmap_handle);
cmap_handle = 0;
}
return FALSE;
}
#endif
#endif
/*
 * Periodic timer callback: refresh the servant's view of the cluster.
 *
 * On a remote node this re-runs sbd_remote_check().  Otherwise the action
 * depends on the cluster type: classic AIS sends a quorum request,
 * corosync (and cman, when compiled in) simply notifies the parent, and
 * any other type does nothing.
 *
 * Always returns TRUE; data is unused.
 */
static gboolean
notify_timer_cb(gpointer data)
{
    const char *scope = remote_node ? "remote " : "";

    cl_log(LOG_DEBUG, "Refreshing %sstate", scope);

    if (remote_node) {
        sbd_remote_check(NULL);
        return TRUE;
    }

    switch (get_cluster_type()) {
#if HAVE_DECL_PCMK_CLUSTER_CLASSIC_AIS
        case pcmk_cluster_classic_ais:
            send_cluster_text(crm_class_quorum, NULL, TRUE, NULL, crm_msg_ais);
            break;
#endif

        case pcmk_cluster_corosync:
#if HAVE_DECL_PCMK_CLUSTER_CMAN
        case pcmk_cluster_cman:
#endif
            /* TODO - Make a CPG call and only call notify_parent() when we get a reply */
            notify_parent();
            break;

        default:
            break;
    }

    return TRUE;
}
/*
 * Block until a cluster (or Pacemaker Remote) connection is established.
 *
 * Each pass checks the cluster type: if it is unknown, /proc is searched
 * for a Pacemaker Remote process; otherwise crm_cluster_connect() is
 * attempted (only after sbd_get_two_node() succeeded when built with
 * SUPPORT_COROSYNC && CHECK_TWO_NODE).  Failed passes sleep for
 * reconnect_msec before retrying.
 *
 * Once connected the health is set to transient, the parent is notified
 * and notify_timer_cb() is run once immediately.
 */
static void
sbd_membership_connect(void)
{
bool connected = false;
- cl_log(LOG_NOTICE, "Attempting cluster connection");
+ cl_log(LOG_INFO, "Attempting cluster connection");
/* register the callbacks before connecting */
cluster.destroy = sbd_membership_destroy;
#if SUPPORT_PLUGIN
cluster.cpg.cpg_deliver_fn = sbd_plugin_membership_dispatch;
#endif
#if SUPPORT_COROSYNC
cluster.cpg.cpg_confchg_fn = sbd_cpg_membership_dispatch;
#endif
while(connected == false) {
enum cluster_type_e stack = get_cluster_type();
if(get_cluster_type() == pcmk_cluster_unknown) {
crm_debug("Attempting pacemaker remote connection");
/* Nothing is up, go looking for the pacemaker remote process */
if(find_pacemaker_remote() > 0) {
connected = true;
}
} else {
cl_log(LOG_INFO, "Attempting connection to %s", name_for_cluster_type(stack));
#if SUPPORT_COROSYNC && CHECK_TWO_NODE
if (sbd_get_two_node()) {
#endif
if(crm_cluster_connect(&cluster)) {
connected = true;
}
#if SUPPORT_COROSYNC && CHECK_TWO_NODE
}
#endif
}
if(connected == false) {
cl_log(LOG_INFO, "Failed, retrying in %ds", reconnect_msec / 1000);
sleep(reconnect_msec / 1000);
}
}
- set_servant_health(pcmk_health_transient, LOG_NOTICE, "Connected, waiting for initial membership");
+ set_servant_health(pcmk_health_transient, LOG_INFO, "Connected, waiting for initial membership");
notify_parent();
notify_timer_cb(NULL);
}
/*
 * Destroy callback installed as cluster.destroy: invoked when the cluster
 * connection goes away.  Marks the servant unclean, notifies the parent
 * and then blocks in sbd_membership_connect() until reconnected.
 * user_data is unused.
 */
static void
sbd_membership_destroy(gpointer user_data)
{
cl_log(LOG_WARNING, "Lost connection to %s", name_for_cluster_type(get_cluster_type()));
set_servant_health(pcmk_health_unclean, LOG_ERR, "Cluster connection terminated");
notify_parent();
/* Attempt to reconnect, the watchdog will take the node down if the problem isn't transient */
sbd_membership_connect();
}
/*
 * \internal
 * \brief Look up the PID and process name behind a /proc directory entry
 *
 * \param[in]  entry  Directory entry (must come from readdir() on /proc)
 * \param[out] name   If not NULL, a char[64] receiving the process name
 * \param[out] pid    If not NULL, receives the process ID of the entry
 *
 * \return 0 on success, -1 if the entry is not a process directory or the
 *         requested information could not be read
 *
 * \note Linux only: other systems with a /proc do not necessarily store
 *       process names and IDs the same way.
 *       Copied from the Pacemaker implementation.
 */
int
sbd_procfs_process_info(struct dirent *entry, char *name, int *pid)
{
    char key[16] = { 0 };
    char procpath[128] = { 0 };
    struct stat statbuf;
    FILE *status_file;
    int entry_pid;
    int dir_fd;
    int stat_rc;
    int rc = -1;

    /* Only entries named after a PID matter, so reject anything that is
     * not a positive number or that is too long.
     *
     * 114 = 128 - strlen("/proc/") - strlen("/status") - 1
     */
    entry_pid = atoi(entry->d_name);
    if (entry_pid <= 0) {
        return -1;
    }
    if (strlen(entry->d_name) > 114) {
        return -1;
    }

    if (pid != NULL) {
        *pid = entry_pid;
    }

    /* The entry has to be a directory */
    snprintf(procpath, sizeof(procpath), "/proc/%s", entry->d_name);

    dir_fd = open(procpath, O_RDONLY);
    if (dir_fd < 0) {
        return -1;
    }
    stat_rc = fstat(dir_fd, &statbuf);
    close(dir_fd);
    if (stat_rc < 0) {
        return -1;
    }
    if (!S_ISDIR(statbuf.st_mode)) {
        return -1;
    }

    if (name == NULL) {
        return 0;
    }

    /* The process name is the first entry ("Name:") of the status file.
     * Parsing cmdline instead would cover the valgrind case, but is more
     * trouble than it is worth.
     */
    strcat(procpath, "/status");
    status_file = fopen(procpath, "r");
    if (status_file == NULL) {
        return -1;
    }

    if ((fscanf(status_file, "%15s%63s", key, name) == 2)
        && !safe_str_neq(key, "Name:")) {
        rc = 0;
    }
    fclose(status_file);

    return rc;
}
/*
 * Check whether the Pacemaker Remote process recorded in remoted_pid is
 * still alive and update the servant health accordingly.
 *
 * Where /proc/<pid>/exe is readable, the executable path is compared with
 * SBINDIR "/pacemaker_remoted" to catch PID reuse by another program;
 * otherwise a successful kill(pid, 0) probe is taken as proof of life.
 *
 * The parent is always notified.  If the process is not (or not known to
 * be) running, sbd_membership_connect() is called to look for it again.
 *
 * Always returns TRUE; user_data is unused.
 */
static gboolean
sbd_remote_check(gpointer user_data)
{
    /* 0: not probed yet, 1: /proc/<pid>/exe is readable, -1: it is not */
    static int have_proc_pid = 0;
    int running = 0;

    cl_log(LOG_DEBUG, "Checking pacemaker remote connection: %d/%d", have_proc_pid, remoted_pid);

    if (have_proc_pid == 0) {
        char proc_path[PATH_MAX], exe_path[PATH_MAX];

        /* probe with our own pid whether /proc/<pid>/exe can be read at all */
        snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", (long unsigned int)getpid());
        have_proc_pid = 1;
        if (readlink(proc_path, exe_path, sizeof(exe_path) - 1) < 0) {
            have_proc_pid = -1;
        }
    }

    if (remoted_pid <= 0) {
        set_servant_health(pcmk_health_transient, LOG_WARNING, "No Pacemaker Remote connection");
        goto notify;

    } else if (kill(remoted_pid, 0) < 0 && errno == ESRCH) {
        /* Not running */

    } else if (have_proc_pid == -1) {
        running = 1;
        cl_log(LOG_DEBUG, "Process %ld is active", (long)remoted_pid);

    } else {
        ssize_t len = 0;
        char proc_path[PATH_MAX], exe_path[PATH_MAX], expected_path[PATH_MAX];

        /* check to make sure pid hasn't been reused by another process */
        snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", (long unsigned int)remoted_pid);

        /* readlink() does not NUL-terminate, hence the reserved byte */
        len = readlink(proc_path, exe_path, sizeof(exe_path) - 1);
        if (len < 0) {
            crm_perror(LOG_ERR, "Could not read from %s", proc_path);
            goto done;
        }
        exe_path[len] = '\0';

        /* snprintf() always terminates within the size it is given, so no
         * manual termination (which could index past the buffer on
         * truncation) is needed */
        snprintf(expected_path, sizeof(expected_path), "%s/pacemaker_remoted", SBINDIR);

        if (strcmp(exe_path, expected_path) == 0) {
            cl_log(LOG_DEBUG, "Process %s (%ld) is active",
                   exe_path, (long)remoted_pid);
            running = 1;
        }
    }

  done:
    if (running) {
        set_servant_health(pcmk_health_online, LOG_INFO,
                           "Connected to Pacemaker Remote %lu", (long unsigned int)remoted_pid);
    } else {
        set_servant_health(pcmk_health_unclean, LOG_WARNING,
                           "Connection to Pacemaker Remote %lu lost", (long unsigned int)remoted_pid);
    }

  notify:
    notify_parent();
    if (running == 0) {
        sbd_membership_connect();
    }

    return TRUE;
}
/*
 * Scan /proc for a process named "pacemaker_remot" (the process name as
 * reported by the kernel, truncated to 15 characters).  On a match the PID
 * is stored in remoted_pid and remote_node is set.
 *
 * Returns remoted_pid, or 0 if /proc cannot be opened.
 *
 * NOTE(review): remoted_pid is not reset before scanning, so a PID found
 * by an earlier call is returned again even if this scan finds nothing -
 * confirm that this is intended.
 */
static long unsigned int
find_pacemaker_remote(void)
{
    DIR *dp;
    char entry_name[64];
    struct dirent *entry;

    dp = opendir("/proc");
    if (!dp) {
        /* no proc directory to search through */
        cl_log(LOG_NOTICE, "Can not read /proc directory to track existing components");
        return 0;
    }

    while ((entry = readdir(dp)) != NULL) {
        int pid;

        if (sbd_procfs_process_info(entry, entry_name, &pid) < 0) {
            continue;
        }

        /* entry_name is truncated to 16 characters including the nul terminator */
        cl_log(LOG_DEBUG, "Found %s at %d", entry_name, pid);
        if (strcmp(entry_name, "pacemaker_remot") == 0) {
            cl_log(LOG_NOTICE, "Found Pacemaker Remote at PID %d", pid);
            remoted_pid = pid;
            remote_node = true;
            break;
        }
    }

    closedir(dp);

    return remoted_pid;
}
/* Cleanup hook; currently a no-op. rc is ignored. */
static void
clean_up(int rc)
{
return;
}
/* Signal handler registered for SIGTERM and SIGINT in servant_cluster();
 * only invokes the (currently empty) clean_up(). nsig is ignored. */
static void
cluster_shutdown(int nsig)
{
clean_up(0);
}
/*
 * Entry point of the cluster-monitoring servant.
 *
 * Connects to the cluster (blocking until that succeeds), installs the
 * periodic notify_timer_cb() timer with a period of timeout_loop seconds
 * plus SIGTERM/SIGINT handlers, then runs the GLib main loop.
 *
 * diskname, mode and argp are not used in this function.
 * Returns 0 after the main loop has ended.
 */
int
servant_cluster(const char *diskname, int mode, const void* argp)
{
enum cluster_type_e cluster_stack = get_cluster_type();
crm_system_name = strdup("sbd:cluster");
- cl_log(LOG_INFO, "Monitoring %s cluster health", name_for_cluster_type(cluster_stack));
+ cl_log(LOG_NOTICE, "Monitoring %s cluster health", name_for_cluster_type(cluster_stack));
set_proc_title("sbd: watcher: Cluster");
sbd_membership_connect();
/* stonith_our_uname = cluster.uname; */
/* stonith_our_uuid = cluster.uuid; */
mainloop = g_main_new(FALSE);
/* timeout_loop is in seconds, g_timeout_add() takes milliseconds */
notify_timer = g_timeout_add(timeout_loop * 1000, notify_timer_cb, NULL);
mainloop_add_signal(SIGTERM, cluster_shutdown);
mainloop_add_signal(SIGINT, cluster_shutdown);
g_main_run(mainloop);
g_main_destroy(mainloop);
clean_up(0);
return 0; /* never reached */
}
diff --git a/src/sbd-common.c b/src/sbd-common.c
index f22c4f2..0ce6478 100644
--- a/src/sbd-common.c
+++ b/src/sbd-common.c
@@ -1,971 +1,971 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "sbd.h"
#include <sys/reboot.h>
#include <sys/types.h>
#ifdef __GLIBC__
#include <sys/sysmacros.h>
#endif
#include <sys/stat.h>
#include <pwd.h>
#include <unistd.h>
#include <dirent.h>
#ifdef _POSIX_MEMLOCK
# include <sys/mman.h>
#endif
/* Tunable defaults: */
/* Timeouts are in seconds; s390 gets larger watchdog/msgwait defaults. */
#if defined(__s390__) || defined(__s390x__)
unsigned long timeout_watchdog = 15;
int timeout_msgwait = 30;
#else
unsigned long timeout_watchdog = 5;
int timeout_msgwait = 10;
#endif
unsigned long timeout_watchdog_warn = 3;
int timeout_allocate = 2;
int timeout_loop = 1;
int timeout_io = 3;
int timeout_startup = 120;
int watchdog_use = 1;
/* 0 means: leave the watchdog's timeout as it is (see watchdog_init()) */
int watchdog_set_timeout = 1;
unsigned long timeout_watchdog_crashdump = 240;
/* non-zero: maximize_priority() does not switch to realtime scheduling */
int skip_rt = 0;
int debug = 0;
/* 1..3 alter what do_exit() does instead of resetting right away */
int debug_mode = 0;
char *watchdogdev = NULL;
bool watchdogdev_is_default = false;
/* lower-cased node name, filled in by sbd_get_uname() */
char * local_uname;
/* Global, non-tunable variables: */
int sector_size = 0;
/* file descriptor of the opened watchdog device, -1 while closed */
int watchdogfd = -1;
int servant_health = 0;
/*const char *devname;*/
const char *cmdname;
/*
 * Print the command line help to stderr.  cmdname is substituted for the
 * program name; the shared-disk commands are only listed when built with
 * SUPPORT_SHARED_DISK.
 */
void
usage(void)
{
fprintf(stderr,
"Shared storage fencing tool.\n"
"Syntax:\n"
" %s <options> <command> <cmdarguments>\n"
"Options:\n"
"-d <devname> Block device to use (mandatory; can be specified up to 3 times)\n"
"-h Display this help.\n"
"-n <node> Set local node name; defaults to uname -n (optional)\n"
"\n"
"-R Do NOT enable realtime priority (debugging only)\n"
"-W Use watchdog (recommended) (watch only)\n"
"-w <dev> Specify watchdog device (optional) (watch only)\n"
"-T Do NOT initialize the watchdog timeout (watch only)\n"
"-S <0|1> Set start mode if the node was previously fenced (watch only)\n"
"-p <path> Write pidfile to the specified path (watch only)\n"
"-v Enable some verbose debug logging (optional)\n"
"\n"
"-1 <N> Set watchdog timeout to N seconds (optional, create only)\n"
"-2 <N> Set slot allocation timeout to N seconds (optional, create only)\n"
"-3 <N> Set daemon loop timeout to N seconds (optional, create only)\n"
"-4 <N> Set msgwait timeout to N seconds (optional, create only)\n"
"-5 <N> Warn if loop latency exceeds threshold (optional, watch only)\n"
" (default is 3, set to 0 to disable)\n"
"-C <N> Watchdog timeout to set before crashdumping (def: 240s, optional)\n"
"-I <N> Async IO read timeout (defaults to 3 * loop timeout, optional)\n"
"-s <N> Timeout to wait for devices to become available (def: 120s)\n"
"-t <N> Dampening delay before faulty servants are restarted (optional)\n"
" (default is 5, set to 0 to disable)\n"
"-F <N> # of failures before a servant is considered faulty (optional)\n"
" (default is 1, set to 0 to disable)\n"
"-P Check Pacemaker quorum and node health (optional, watch only)\n"
"-Z Enable trace mode. WARNING: UNSAFE FOR PRODUCTION!\n"
"Commands:\n"
#if SUPPORT_SHARED_DISK
"create initialize N slots on <dev> - OVERWRITES DEVICE!\n"
"list List all allocated slots on device, and messages.\n"
"dump Dump meta-data header from device.\n"
"allocate <node>\n"
" Allocate a slot for node (optional)\n"
"message <node> (test|reset|off|clear|exit)\n"
" Writes the specified message to node's slot.\n"
#endif
"watch Loop forever, monitoring own slot\n"
"query-watchdog Check for available watchdog-devices and print some info\n"
"test-watchdog Test the watchdog-device selected.\n"
" Attention: This will arm the watchdog and have your system reset\n"
" in case your watchdog is working properly!\n"
, cmdname);
}
/*
 * Program the timeout (in seconds) of the watchdog open on wdfd.
 * Returns 0 on success, -1 (after logging advice) if the ioctl fails.
 */
static int
watchdog_init_interval_fd(int wdfd, int timeout)
{
    int rc = ioctl(wdfd, WDIOC_SETTIMEOUT, &timeout);

    if (rc >= 0) {
        return 0;
    }

    cl_perror( "WDIOC_SETTIMEOUT"
               ": Failed to set watchdog timer to %u seconds.",
               timeout);
    cl_log(LOG_CRIT, "Please validate your watchdog configuration!");
    cl_log(LOG_CRIT, "Choose a different watchdog driver or specify -T to skip this if you are completely sure.");
    return -1;
}
/*
 * Apply timeout_watchdog to the already opened watchdog device.
 *
 * Nothing is done (and 0 returned) if no watchdog is open or if setting
 * the timeout was disabled by the user.  Returns -1 if the timeout could
 * not be set, 0 otherwise.
 */
int
watchdog_init_interval(void)
{
    int rc = 0;

    if (watchdogfd < 0) {
        return rc;
    }

    if (watchdog_set_timeout == 0) {
        cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!");
    } else if (watchdog_init_interval_fd(watchdogfd, timeout_watchdog) < 0) {
        rc = -1;
    } else {
        cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", (int) timeout_watchdog);
    }

    return rc;
}
/*
 * Tickle the watchdog open on wdfd by writing a single byte to it.
 * wddev is only used for the error message.
 * Returns 0 on success, -1 if the write did not go through.
 */
static int
watchdog_tickle_fd(int wdfd, char *wddev)
{
    ssize_t written = write(wdfd, "", 1);

    if (written == 1) {
        return 0;
    }

    cl_perror("Watchdog write failure: %s!", wddev);
    return -1;
}
/*
 * Tickle the global watchdog device, if one is open.
 * Returns 0 when no watchdog is open, otherwise the result of the write.
 */
int
watchdog_tickle(void)
{
    if (watchdogfd < 0) {
        return 0;
    }

    return watchdog_tickle_fd(watchdogfd, watchdogdev);
}
/*
 * Open the watchdog device wddev, optionally set its timeout, and tickle
 * it once.
 *
 * timeout: seconds to program into the device; a negative value leaves
 *          the device's timeout untouched.
 *
 * Returns the open file descriptor, or -1 if opening, setting the timeout
 * or the initial tickle failed (the descriptor is closed again then).
 */
static int
watchdog_init_fd(char *wddev, int timeout)
{
    int wdfd = open(wddev, O_WRONLY);

    if (wdfd < 0) {
        cl_perror("Cannot open watchdog device '%s'", wddev);
        return -1;
    }

    if ((timeout >= 0) && (watchdog_init_interval_fd(wdfd, timeout) < 0)) {
        close(wdfd);
        return -1;
    }

    if (watchdog_tickle_fd(wdfd, wddev) < 0) {
        close(wdfd);
        return -1;
    }

    return wdfd;
}
/*
 * Open and arm the configured watchdog device unless that already
 * happened or no device is configured.
 *
 * The timeout is set to timeout_watchdog unless watchdog_set_timeout is 0.
 * Returns 0 on success (or if there was nothing to do), -1 if the device
 * could not be initialized.
 */
int
watchdog_init(void)
{
    int timeout;

    if ((watchdogfd >= 0) || (watchdogdev == NULL)) {
        return 0;
    }

    timeout = timeout_watchdog;
    if (watchdog_set_timeout == 0) {
        cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!");
        timeout = -1;
    }

    watchdogfd = watchdog_init_fd(watchdogdev, timeout);
    if (watchdogfd < 0) {
        return -1;
    }

    cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev);
    if (watchdog_set_timeout) {
        cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", (int) timeout_watchdog);
    }

    return 0;
}
/*
 * Close the watchdog open on wdfd.
 *
 * With disarm set, the device is first disabled via WDIOC_SETOPTIONS and
 * then the magic-close character 'V' is written, retrying until the write
 * succeeds.  wddev is only used in messages.
 */
static void
watchdog_close_fd(int wdfd, char *wddev, bool disarm)
{
    if (disarm) {
        int flags = WDIOS_DISABLECARD;

        /* Explicitly disarm it */
        if (ioctl(wdfd, WDIOC_SETOPTIONS, &flags) < 0) {
            cl_perror("Failed to disable hardware watchdog %s", wddev);
        }

        /* To be sure, use magic close logic, too */
        while (write(wdfd, "V", 1) <= 0) {
            cl_perror("Cannot disable watchdog device %s", wddev);
        }
    }

    if (close(wdfd) < 0) {
        cl_perror("Watchdog close(%d) failed", wdfd);
    }
}
/*
 * Close the global watchdog device if it is open, disarming it first when
 * disarm is true, and mark it as closed.
 */
void
watchdog_close(bool disarm)
{
    if (watchdogfd >= 0) {
        watchdog_close_fd(watchdogfd, watchdogdev, disarm);
        watchdogfd = -1;
    }
}
/* Upper bound on device numbers collected by watchdog_populate_list() */
#define MAX_WATCHDOGS 64
/* Directories consulted when discovering watchdog devices */
#define SYS_CLASS_WATCHDOG "/sys/class/watchdog"
#define SYS_CHAR_DEV_DIR "/sys/dev/char"
#define WATCHDOG_NODEDIR "/dev"
/* One discovered watchdog device node; entries form a singly linked list */
struct watchdog_list_item {
dev_t dev; /* device number */
char *dev_node; /* path of the device node, strdup()ed */
char *dev_ident; /* identity reported by WDIOC_GETSUPPORT, or NULL */
char *dev_driver; /* driver name, or NULL if it could not be determined */
struct watchdog_list_item *next;
};
/* Head of the list built by watchdog_populate_list() and its length */
static struct watchdog_list_item *watchdog_list = NULL;
static int watchdog_list_items = 0;
static void
watchdog_populate_list(void)
{
dev_t watchdogs[MAX_WATCHDOGS + 1] =
{makedev(10,130), 0};
int num_watchdogs = 1;
struct dirent *entry;
char entry_name[280];
DIR *dp;
char buf[256] = "";
if (watchdog_list != NULL) {
return;
}
/* get additional devices from /sys/class/watchdog */
dp = opendir(SYS_CLASS_WATCHDOG);
if (dp) {
while ((entry = readdir(dp))) {
if (entry->d_type == DT_LNK) {
FILE *file;
snprintf(entry_name, sizeof(entry_name),
SYS_CLASS_WATCHDOG "/%s/dev", entry->d_name);
file = fopen(entry_name, "r");
if (file) {
int major, minor;
if (fscanf(file, "%d:%d", &major, &minor) == 2) {
watchdogs[num_watchdogs++] = makedev(major, minor);
}
fclose(file);
if (num_watchdogs == MAX_WATCHDOGS) {
break;
}
}
}
}
closedir(dp);
}
/* search for watchdog nodes in /dev */
dp = opendir(WATCHDOG_NODEDIR);
if (dp) {
while ((entry = readdir(dp))) {
if ((entry->d_type == DT_CHR) || (entry->d_type == DT_LNK)) {
struct stat statbuf;
snprintf(entry_name, sizeof(entry_name),
WATCHDOG_NODEDIR "/%s", entry->d_name);
if(!stat(entry_name, &statbuf) && S_ISCHR(statbuf.st_mode)) {
int i;
for (i=0; i<num_watchdogs; i++) {
if (statbuf.st_rdev == watchdogs[i]) {
int wdfd = watchdog_init_fd(entry_name, -1);
struct watchdog_list_item *wdg =
calloc(1, sizeof(struct watchdog_list_item));
wdg->dev = watchdogs[i];
wdg->dev_node = strdup(entry_name);
wdg->next = watchdog_list;
watchdog_list = wdg;
watchdog_list_items++;
if (wdfd >= 0) {
struct watchdog_info ident;
ident.identity[0] = '\0';
ioctl(wdfd, WDIOC_GETSUPPORT, &ident);
watchdog_close_fd(wdfd, entry_name, true);
if (ident.identity[0]) {
wdg->dev_ident = strdup((char *) ident.identity);
}
}
snprintf(entry_name, sizeof(entry_name),
SYS_CHAR_DEV_DIR "/%d:%d/device/driver",
major(watchdogs[i]), minor(watchdogs[i]));
if (readlink(entry_name, buf, sizeof(buf)) > 0) {
wdg->dev_driver = strdup(basename(buf));
} else if ((wdg->dev_ident) &&
(strcmp(wdg->dev_ident,
"Software Watchdog") == 0)) {
wdg->dev_driver = strdup("softdog");
}
break;
}
}
}
}
}
closedir(dp);
}
}
/*
 * Print every discovered watchdog device with its identity and driver to
 * stdout, adding a caution for devices driven by softdog.
 * Always returns 0.
 */
int watchdog_info(void)
{
    struct watchdog_list_item *item;
    int index = 0;

    watchdog_populate_list();
    printf("\nDiscovered %d watchdog devices:\n", watchdog_list_items);

    item = watchdog_list;
    while (item != NULL) {
        const char *ident = "Error: Check if hogged by e.g. sbd-daemon!";
        const char *driver = "<unknown>";

        if (item->dev_ident) {
            ident = item->dev_ident;
        }
        if (item->dev_driver) {
            driver = item->dev_driver;
        }

        index++;
        printf("\n[%d] %s\nIdentity: %s\nDriver: %s\n",
               index, item->dev_node, ident, driver);

        if (item->dev_driver && (strcmp(item->dev_driver, "softdog") == 0)) {
            printf("CAUTION: Not recommended for use with sbd.\n");
        }

        item = item->next;
    }

    return 0;
}
/*
 * Interactive watchdog self-test: arm the watchdog and stop tickling it,
 * expecting the system to be reset.
 *
 * Returns 0 if the test was not run (watchdog disabled, or the user did
 * not confirm), -1 if the device selection is ambiguous, the watchdog
 * could not be initialized, or the system is still running after the
 * countdown.
 */
int watchdog_test(void)
{
    int remaining;

    if (!watchdog_use || (watchdog_set_timeout == 0)) {
        printf("\nWatchdog is disabled - aborting test!!!\n");
        return 0;
    }

    /* refuse to guess when the default device is not the only candidate */
    if (watchdogdev_is_default) {
        watchdog_populate_list();
        if (watchdog_list_items > 1) {
            printf("\nError: Multiple watchdog devices discovered.\n"
                   " Use -w <watchdog> or SBD_WATCHDOG_DEV to specify\n"
                   " which device to reset the system with\n");
            watchdog_info();
            return -1;
        }
    }

    /* ask for confirmation only when run from a terminal */
    if (isatty(fileno(stdin))) {
        char answer[16];

        printf("\nWARNING: This operation is expected to force-reboot this system\n"
               " without following any shutdown procedures.\n\n"
               "Proceed? [NO/Proceed] ");

        if ((fgets(answer, sizeof(answer), stdin) == NULL) ||
            (strcmp(answer, "Proceed\n") != 0)) {
            printf("\nAborting watchdog test!!!\n");
            return 0;
        }
        printf("\n");
    }

    printf("Initializing %s with a reset countdown of %d seconds ...\n",
           watchdogdev, (int) timeout_watchdog);
    if ((watchdog_init() < 0) || (watchdog_init_interval() < 0)) {
        printf("Failed to initialize watchdog!!!\n");
        return -1;
    }

    printf("\n");
    printf("NOTICE: The watchdog device is expected to reset the system\n"
           " in %d seconds. If system remains active beyond that time,\n"
           " watchdog may not be functional.\n\n", (int) timeout_watchdog);

    remaining = timeout_watchdog;
    while (remaining > 1) {
        printf("Reset countdown ... %d seconds\n", remaining);
        sleep(1);
        remaining--;
    }

    for (remaining = 0; remaining < 2; remaining++) {
        printf("System expected to reset any moment ...\n");
        sleep(1);
    }

    for (remaining = 0; remaining < 5; remaining++) {
        printf("System should have reset ...\n");
        sleep(1);
    }

    printf("Error: The watchdog device has failed to reboot the system,\n"
           " and it may not be suitable for usage with sbd.\n");

    /* test should trigger a reboot thus returning is actually bad */
    return -1;
}
/* This duplicates some code from linux/ioprio.h since these are not included
 * even in linux-kernel-headers. Sucks. See also
 * /usr/src/linux/Documentation/block/ioprio.txt and ioprio_set(2) */
extern int sys_ioprio_set(int, int, int);
/* Thin wrapper that issues the ioprio_set system call directly via
 * syscall(); returns whatever syscall() returns. */
int ioprio_set(int which, int who, int ioprio);
inline int ioprio_set(int which, int who, int ioprio)
{
return syscall(__NR_ioprio_set, which, who, ioprio);
}
/* I/O scheduling classes, as passed in the class part of an ioprio value */
enum {
    IOPRIO_CLASS_NONE,
    IOPRIO_CLASS_RT,
    IOPRIO_CLASS_BE,
    IOPRIO_CLASS_IDLE,
};

/* Selects what the "who" argument of ioprio_set() refers to */
enum {
    IOPRIO_WHO_PROCESS = 1,
    IOPRIO_WHO_PGRP,
    IOPRIO_WHO_USER,
};

/* An ioprio value carries the class in the bits from IOPRIO_CLASS_SHIFT
 * upwards and the priority data in the bits below. */
#define IOPRIO_BITS (16)
#define IOPRIO_CLASS_SHIFT (13)
#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1)
#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT)
#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK)
/* Both arguments are parenthesized so that expressions with operators of
 * lower precedence than '|' (e.g. a conditional) combine as intended. */
#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | (data))
/*
 * Recurse kbytes levels deep, each level holding a 1024-byte buffer on the
 * stack, so that roughly kbytes KiB of stack get touched.
 *
 * inbuf:  buffer of the calling level (copied into this level's buffer),
 *         or NULL on the outermost call, in which case the buffer is
 *         filled with HOG_CHAR.
 * kbytes: remaining recursion depth; <= 0 ends the recursion.
 *
 * Returns HOG_CHAR from the innermost level.
 */
static unsigned char
sbd_stack_hogger(unsigned char * inbuf, int kbytes)
{
unsigned char buf[1024];
if(kbytes <= 0) {
return HOG_CHAR;
}
if (inbuf == NULL) {
memset(buf, HOG_CHAR, sizeof(buf));
} else {
memcpy(buf, inbuf, sizeof(buf));
}
/* kbytes is > 0 here because of the early return above, so the else
 * branch below is never taken */
if (kbytes > 0) {
return sbd_stack_hogger(buf, kbytes-1);
} else {
return buf[sizeof(buf)-1];
}
}
/*
 * Allocate, zero and release kbytes blocks of 1024 bytes each.
 * Does nothing for kbytes <= 0; allocation failures are logged and
 * otherwise tolerated.
 */
static void
sbd_malloc_hogger(int kbytes)
{
    const int chunksize = 1024;
    void **chunks;
    int idx;

    if (kbytes <= 0) {
        return;
    }

    /*
     * mallopt(M_MMAP_MAX, 0) would disable mmap-backed allocation
     * completely, but mlockall() has already been called.
     *
     * mallopt(M_TRIM_THRESHOLD, -1) would stop malloc from handing
     * memory back to the system, but since mlockall(MCL_FUTURE) has
     * already been called there is no need.
     */
    chunks = malloc(kbytes * sizeof(void *));
    if (!chunks) {
        cl_log(LOG_WARNING, "Could not preallocate chunk array");
        return;
    }

    for (idx = 0; idx < kbytes; idx++) {
        void *chunk = malloc(chunksize);

        chunks[idx] = chunk;
        if (chunk != NULL) {
            memset(chunk, 0, chunksize);
        } else {
            cl_log(LOG_WARNING, "Could not preallocate block %d", idx);
        }
    }

    idx = 0;
    while (idx < kbytes) {
        free(chunks[idx]);
        idx++;
    }
    free(chunks);
}
/*
 * Lock all current and future pages of the process into memory and then
 * touch heapgrowK KiB of heap and stackgrowK KiB of stack.
 * Failure to lock is only logged.
 */
static void sbd_memlock(int stackgrowK, int heapgrowK)
{
#ifdef _POSIX_MEMLOCK
    /*
     * setrlimit(RLIMIT_MEMLOCK,...) with a large number would be an
     * option, but the mcp runs as root and mlock(2) says:
     *
     * Since Linux 2.6.9, no limits are placed on the amount of memory
     * that a privileged process may lock, and this limit instead
     * governs the amount of memory that an unprivileged process may
     * lock.
     */
    if (mlockall(MCL_CURRENT|MCL_FUTURE) < 0) {
        cl_perror("Unable to lock ourselves into memory");
    } else {
        cl_log(LOG_INFO, "Locked ourselves in memory");

        /* Now allocate some extra pages (MCL_FUTURE will ensure they stay around) */
        sbd_malloc_hogger(heapgrowK);
        sbd_stack_hogger(NULL, stackgrowK);
    }
#else
    cl_log(LOG_ERR, "Unable to lock ourselves into memory");
#endif
}
/*
 * Switch the process to SCHED_RR realtime scheduling and lock it into
 * memory.
 *
 * priority:   < 0 does nothing at all; 0 selects the maximum SCHED_RR
 *             priority; other values are clamped to the valid range.
 * stackgrowK: KiB of stack to pre-touch after locking memory.
 * heapgrowK:  KiB of heap to pre-touch after locking memory.
 */
void
sbd_make_realtime(int priority, int stackgrowK, int heapgrowK)
{
    if(priority < 0) {
        return;
    }

#ifdef SCHED_RR
    {
        int pcurrent = 0;
        int pmin = sched_get_priority_min(SCHED_RR);
        int pmax = sched_get_priority_max(SCHED_RR);

        if (priority == 0) {
            priority = pmax;
        } else if (priority < pmin) {
            priority = pmin;
        } else if (priority > pmax) {
            priority = pmax;
        }

        /* NOTE(review): sched_getscheduler() returns the scheduling
         * policy, which is compared against a priority below - confirm
         * that this is what is wanted. */
        pcurrent = sched_getscheduler(0);
        if (pcurrent < 0) {
            cl_perror("Unable to get scheduler priority");

        } else if(pcurrent < priority) {
            struct sched_param sp;

            memset(&sp, 0, sizeof(sp));
            sp.sched_priority = priority;

            if (sched_setscheduler(0, SCHED_RR, &sp) < 0) {
                cl_perror("Unable to set scheduler priority to %d", priority);
            } else {
                cl_log(LOG_INFO, "Scheduler priority is now %d", priority);
            }
        }
    }
#else
    cl_log(LOG_ERR, "System does not support updating the scheduler priority");
#endif

    /* argument order must match sbd_memlock(stackgrowK, heapgrowK) */
    sbd_memlock(stackgrowK, heapgrowK);
}
/*
 * Raise the process to realtime CPU scheduling (with 256 KiB of stack and
 * heap pre-touched) and to the realtime I/O priority class, unless
 * realtime was disabled with -R.
 */
void
maximize_priority(void)
{
    int rc;

    if (skip_rt) {
        cl_log(LOG_INFO, "Not elevating to realtime (-R specified).");
        return;
    }

    sbd_make_realtime(0, 256, 256);

    rc = ioprio_set(IOPRIO_WHO_PROCESS, getpid(),
                    IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 1));
    if (rc != 0) {
        cl_perror("ioprio_set() call failed.");
    }
}
/*
 * Make sure the sysrq functions sbd relies on are enabled.
 *
 * Reads /proc/sys/kernel/sysrq; a value of 1 is left alone, otherwise the
 * bits 8 and 128 are added and the result is written back.  Failures are
 * logged and otherwise ignored.
 */
void
sysrq_init(void)
{
    const char *sysrq_path = "/proc/sys/kernel/sysrq";
    FILE *fp;
    int mask;

    fp = fopen(sysrq_path, "r");
    if (fp == NULL) {
        cl_perror("cannot open /proc/sys/kernel/sysrq for read.");
        return;
    }
    if (fscanf(fp, "%d", &mask) != 1) {
        cl_perror("Parsing sysrq failed");
        mask = 0;
    }
    fclose(fp);

    if (mask == 1) {
        return;
    }

    /* 8 for debugging dumps of processes,
       128 for reboot/poweroff */
    mask |= (8 | 128);

    fp = fopen(sysrq_path, "w");
    if (fp == NULL) {
        cl_perror("cannot open /proc/sys/kernel/sysrq for writing");
        return;
    }
    fprintf(fp, "%d", mask);
    fclose(fp);
}
/*
 * Write the sysrq command character t, followed by a newline, to
 * /proc/sysrq-trigger.  Failure to open the file is logged and ignored.
 */
void
sysrq_trigger(char t)
{
    FILE *trigger = fopen("/proc/sysrq-trigger", "a");

    if (trigger == NULL) {
        cl_perror("Opening sysrq-trigger failed.");
        return;
    }

    cl_log(LOG_INFO, "sysrq-trigger: %c\n", t);
    fprintf(trigger, "%c\n", t);
    fclose(trigger);
}
/*
 * Take the node down, or do what the active debug mode asks for instead.
 *
 * kind: 'b' reboot, 'c' crashdump, 'o' power off.  The character is also
 *       what gets passed to sysrq_trigger().
 *
 * debug_mode 1 turns every request into a crashdump, 2 disarms the
 * watchdog and exits with status 0, 3 disarms the watchdog and waits 10
 * seconds before carrying on.
 *
 * Never returns to the caller: every path ends in exit().
 */
static void
do_exit(char kind)
{
/* TODO: Turn debug_mode into a bit field? Delay + kdump for example */
const char *reason = NULL;
if (kind == 'c') {
cl_log(LOG_NOTICE, "Initiating kdump");
} else if (debug_mode == 1) {
cl_log(LOG_WARNING, "Initiating kdump instead of panicing the node (debug mode)");
kind = 'c';
}
if (debug_mode == 2) {
cl_log(LOG_WARNING, "Shutting down SBD instead of panicing the node (debug mode)");
watchdog_close(true);
exit(0);
}
if (debug_mode == 3) {
/* Give the system some time to flush logs to disk before rebooting. */
cl_log(LOG_WARNING, "Delaying node panic by 10s (debug mode)");
watchdog_close(true);
sync();
sleep(10);
}
switch(kind) {
case 'b':
reason = "reboot";
break;
case 'c':
reason = "crashdump";
break;
case 'o':
reason = "off";
break;
default:
reason = "unknown";
break;
}
cl_log(LOG_EMERG, "Rebooting system: %s", reason);
sync();
if(kind == 'c') {
/* crashdump: the watchdog is disarmed before triggering sysrq */
watchdog_close(true);
sysrq_trigger(kind);
} else {
/* reboot/off: the watchdog is closed without disarming it */
watchdog_close(false);
sysrq_trigger(kind);
if (reboot((kind == 'o')?RB_POWER_OFF:RB_AUTOBOOT) < 0) {
cl_perror("%s failed", (kind == 'o')?"Poweroff":"Reboot");
}
}
exit(1);
}
/* Trigger a crashdump of the node via do_exit('c'). */
void
do_crashdump(void)
{
do_exit('c');
}
/* Reboot the node via do_exit('b'). */
void
do_reset(void)
{
do_exit('b');
}
/* Power the node off via do_exit('o'). */
void
do_off(void)
{
do_exit('o');
}
/*
 * Change the working directory to CRM_CORE_DIR, where core files are
 * meant to end up.  Call this once the user id the process runs under is
 * settled.
 *
 * Returns the result of chdir(); on failure the error is logged and errno
 * is left as chdir() set it.
 */
int
sbd_cdtocoredir(void)
{
    static const char *dir = NULL;
    int rc;

    if (!dir) {
        dir = CRM_CORE_DIR;
    }

    rc = chdir(dir);
    if (rc < 0) {
        int saved_errno = errno;

        cl_perror("Cannot chdir to [%s]", dir);
        errno = saved_errno;
    }

    return rc;
}
/*
 * Fork off a daemon process.
 *
 * In the parent: returns the child's pid (> 0).
 * In the child: disables logging to stderr, raises the priority, sets up
 * sysrq, sets the umask to 022, points stdin/stdout/stderr at /dev/null,
 * changes to the core directory and returns 0.
 * If fork() fails the process exits with status 1.
 */
pid_t
make_daemon(void)
{
pid_t pid;
const char * devnull = "/dev/null";
pid = fork();
if (pid < 0) {
cl_log(LOG_ERR, "%s: could not start daemon\n",
cmdname);
cl_perror("fork");
exit(1);
}else if (pid > 0) {
return pid;
}
qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE);
/* This is the child; ensure privileges have not been lost. */
maximize_priority();
sysrq_init();
umask(022);
/* each open() reuses the descriptor number that was just closed */
close(0);
(void)open(devnull, O_RDONLY);
close(1);
(void)open(devnull, O_WRONLY);
close(2);
(void)open(devnull, O_WRONLY);
sbd_cdtocoredir();
return 0;
}
/*
 * Store the lower-cased node name reported by uname() in local_uname.
 * The string is heap-allocated.  Exits with status 1 if the node name
 * cannot be determined or copied.
 */
void
sbd_get_uname(void)
{
    struct utsname uname_buf;
    char *c;

    if (uname(&uname_buf) < 0) {
        cl_perror("uname() failed?");
        exit(1);
    }

    local_uname = strdup(uname_buf.nodename);
    if (local_uname == NULL) {
        cl_perror("strdup() of node name failed");
        exit(1);
    }

    /* tolower() needs a value representable as unsigned char */
    for (c = local_uname; *c != '\0'; c++) {
        *c = (char) tolower((unsigned char) *c);
    }
}
#define FMT_MAX 256

/*
 * Append printf-style formatted text to buf at position *used.
 *
 * Never writes past buf[size - 1]. On truncation *used is clamped to the
 * position of the terminating NUL so later appends become no-ops instead
 * of computing a negative (i.e. huge unsigned) remaining size.
 */
static void
sbd_format_append(char *buf, size_t size, size_t *used, const char *format, ...)
{
	va_list ap;
	int written;

	if (size == 0 || *used >= size - 1) {
		return;
	}

	va_start(ap, format);
	written = vsnprintf(buf + *used, size - *used, format, ap);
	va_end(ap);

	if (written < 0) {
		/* Encoding error: drop this fragment, keep buf terminated. */
		buf[*used] = '\0';
		return;
	}

	*used += (size_t)written;
	if (*used >= size) {
		*used = size - 1;
	}
}

/*
 * Build and install the libqb log format for one log target.
 *
 * method: libqb log target (QB_LOG_STDERR, QB_LOG_SYSLOG, or a file target).
 * daemon: name shown as prefix; omitted for syslog when NULL or "sbd".
 *
 * The format is assembled in a fixed FMT_MAX buffer; over-long daemon or
 * node names are truncated rather than overflowing it.
 */
void
sbd_set_format_string(int method, const char *daemon)
{
	size_t used = 0;
	char fmt[FMT_MAX];
	struct utsname res;
	/* %s with a NULL argument is undefined; print the conventional marker. */
	const char *daemon_name = daemon ? daemon : "(null)";

	fmt[0] = '\0';

	switch(method) {
	case QB_LOG_STDERR:
		break;

	case QB_LOG_SYSLOG:
		if(daemon && strcmp(daemon, "sbd") != 0) {
			sbd_format_append(fmt, sizeof(fmt), &used, "%10s: ", daemon);
		}
		break;

	default:
		/* When logging to a file */
		if (uname(&res) == 0) {
			sbd_format_append(fmt, sizeof(fmt), &used,
					  "%%t [%d] %s %10s: ", getpid(),
					  res.nodename, daemon_name);
		} else {
			sbd_format_append(fmt, sizeof(fmt), &used,
					  "%%t [%d] %10s: ", getpid(), daemon_name);
		}
	}

	if (debug && method >= QB_LOG_STDERR) {
		sbd_format_append(fmt, sizeof(fmt), &used, "(%%-12f:%%5l %%g) %%-7p: %%n: ");
	} else {
		sbd_format_append(fmt, sizeof(fmt), &used, "%%g %%-7p: %%n: ");
	}

	if (method == QB_LOG_SYSLOG) {
		sbd_format_append(fmt, sizeof(fmt), &used, "%%b");
	} else {
		sbd_format_append(fmt, sizeof(fmt), &used, "\t%%b");
	}

	if(used > 0) {
		qb_log_format_set(method, fmt);
	}
}
/*
* Report the current servant_health to the parent process with a queued
* signal: SIG_LIVENESS when online, SIG_PCMK_UNHEALTHY for the unhealthy
* states, and nothing at all for the pending/shutdown/transient states.
* If the parent pid is 1 the original parent is considered dead and
* do_reset() is called.
*/
void
notify_parent(void)
{
pid_t ppid;
union sigval signal_value;
/* The signal payload is unused; send a zeroed value. */
memset(&signal_value, 0, sizeof(signal_value));
ppid = getppid();
if (ppid == 1) {
/* Our parent died unexpectedly. Triggering
* self-fence. */
cl_log(LOG_WARNING, "Our parent is dead.");
do_reset();
}
switch (servant_health) {
case pcmk_health_pending:
case pcmk_health_shutdown:
case pcmk_health_transient:
- DBGLOG(LOG_INFO, "Not notifying parent: state transient (%d)", servant_health);
+ DBGLOG(LOG_DEBUG, "Not notifying parent: state transient (%d)", servant_health);
break;
case pcmk_health_unknown:
case pcmk_health_unclean:
case pcmk_health_noquorum:
DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY (%d)", servant_health);
sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value);
break;
case pcmk_health_online:
- DBGLOG(LOG_INFO, "Notifying parent: healthy");
+ DBGLOG(LOG_DEBUG, "Notifying parent: healthy");
sigqueue(ppid, SIG_LIVENESS, signal_value);
break;
default:
/* Any state not listed above is reported as unhealthy. */
DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY %d", servant_health);
sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value);
break;
}
}
/*
 * Record a new servant health state and log the accompanying message.
 *
 * state:  new value for the global servant_health.
 * level:  log priority used for the message.
 * format: printf-style format followed by its arguments.
 *
 * Nothing happens (and nothing is logged) when the state is unchanged.
 */
void
set_servant_health(enum pcmk_health state, int level, char const *format, ...)
{
	va_list ap;
	char *message = NULL;
	int len;

	if (servant_health == state) {
		return;
	}
	servant_health = state;

	va_start(ap, format);
	len = vasprintf(&message, format, ap);
	va_end(ap);

	if (len < 0) {
		/* After a failed vasprintf() the pointer's contents are
		 * undefined, so it must be neither logged nor freed. */
		return;
	}

	if (len > 0) {
		cl_log(level, "%s", message);
	}
	free(message);
}
/* True when the servant has a device name starting with '/'. */
bool
sbd_is_disk(struct servants_list_item *servant)
{
	if (servant == NULL || servant->devname == NULL) {
		return false;
	}
	return servant->devname[0] == '/';
}
/* True when the servant's device name is exactly "cluster". */
bool
sbd_is_cluster(struct servants_list_item *servant)
{
	if (servant == NULL || servant->devname == NULL) {
		return false;
	}
	return strcmp("cluster", servant->devname) == 0;
}
/* True when the servant's device name is exactly "pcmk". */
bool
sbd_is_pcmk(struct servants_list_item *servant)
{
	if (servant == NULL || servant->devname == NULL) {
		return false;
	}
	return strcmp("pcmk", servant->devname) == 0;
}
diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c
index 237bf43..90c7d26 100644
--- a/src/sbd-inquisitor.c
+++ b/src/sbd-inquisitor.c
@@ -1,1164 +1,1164 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <pacemaker/crm/common/util.h>
#include "sbd.h"
/* Number of bytes of pid text (including the trailing newline) written to
* the pid lock file by sbd_lock_pidfile(). */
#define LOCKSTRLEN 11
/* Head of the singly linked list of servants; new entries are appended
* by recruit_servant(). */
static struct servants_list_item *servants_leader = NULL;
/* Cleared by the -z option; selects which health policy inquisitor_child()
* applies when both disks and cluster servants exist. */
int disk_priority = 1;
/* Whether to recruit the "pcmk" servant (SBD_PACEMAKER, toggled by -P). */
int check_pcmk = 1;
/* Whether to recruit the "cluster" servant (SBD_PACEMAKER, toggled by -c). */
int check_cluster = 1;
/* Number of recruited servants whose device name starts with '/'. */
int disk_count = 0;
/* Total number of recruited servants. */
int servant_count = 0;
/* Compared against seconds since a servant was started; once exceeded the
* servant's restart counter and block are reset (-t option). */
int servant_restart_interval = 5;
/* Restarts allowed before a servant is blocked from restarting; 0 disables
* the limit (-F option). */
int servant_restart_count = 1;
/* Passed to every servant function as its mode; set from SBD_STARTMODE
* ("clean" = 1, "always" = 0) or the -S option. */
int start_mode = 0;
/* Path of the pid lock file, or NULL for none (SBD_PIDFILE or -p). */
char* pidfile = NULL;
int parse_device_line(const char *line);
/*
* Append a new servant for devname to the servant list.
*
* devname: device path or servant name; the string is duplicated.
* pid:     initial pid recorded for the servant (callers in this file pass 0).
*
* Does nothing if a servant with that name already exists. Exits the
* process when the list node cannot be allocated. Disk servants (see
* sbd_is_disk()) are counted in disk_count; all other servants start out
* marked as outdated.
*/
void recruit_servant(const char *devname, pid_t pid)
{
struct servants_list_item *s = servants_leader;
struct servants_list_item *newbie;
if (lookup_servant_by_dev(devname)) {
cl_log(LOG_DEBUG, "Servant %s already exists", devname);
return;
}
newbie = malloc(sizeof(*newbie));
if (!newbie) {
fprintf(stderr, "malloc failed in recruit_servant.\n");
exit(1);
}
memset(newbie, 0, sizeof(*newbie));
/* NOTE(review): the strdup() result is not checked; a NULL devname would
* later reach strcasecmp() in lookup_servant_by_dev(). */
newbie->devname = strdup(devname);
newbie->pid = pid;
newbie->first_start = 1;
/* Append at the tail so servants keep the order they were recruited in. */
if (!s) {
servants_leader = newbie;
} else {
while (s->next)
s = s->next;
s->next = newbie;
}
servant_count++;
if(sbd_is_disk(newbie)) {
- cl_log(LOG_NOTICE, "Monitoring %s", devname);
+ cl_log(LOG_INFO, "Monitoring %s", devname);
disk_count++;
} else {
newbie->outdated = 1;
}
}
/*
 * Fork a servant process that runs functionp(devname, mode, argp).
 *
 * Returns the child's pid in the parent. The child never returns: it
 * exits with status 1 when the servant function returns -1 and with
 * status 0 otherwise. Exits the calling process if fork() fails.
 */
int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp)
{
	int servant_rc = 0;
	const pid_t child = fork();

	if (child == -1) {
		cl_log(LOG_ERR,"Failed to fork servant");
		exit(1);
	}

	if (child != 0) { /* parent */
		return child;
	}

	/* child */
	maximize_priority();
	sbd_set_format_string(QB_LOG_SYSLOG, devname);
	servant_rc = (*functionp)(devname, mode, argp);
	exit((servant_rc == -1) ? 1 : 0);
}
/*
 * Find the servant whose device name equals devname, ignoring case.
 * Returns the list entry, or NULL when there is none.
 */
struct servants_list_item *lookup_servant_by_dev(const char *devname)
{
	struct servants_list_item *cur = servants_leader;

	while (cur != NULL && strcasecmp(cur->devname, devname) != 0) {
		cur = cur->next;
	}
	return cur;
}
/*
 * Find the servant currently recorded with the given pid.
 * Returns the list entry, or NULL when there is none.
 */
struct servants_list_item *lookup_servant_by_pid(pid_t pid)
{
	struct servants_list_item *cur = servants_leader;

	while (cur != NULL && cur->pid != pid) {
		cur = cur->next;
	}
	return cur;
}
/*
 * Check whether every servant process is gone.
 *
 * Each servant with a non-zero pid is probed with the null signal.
 * Returns 1 when all probes report ESRCH (or no servant has a pid), and 0
 * as soon as one servant may still exist.
 */
int check_all_dead(void)
{
	struct servants_list_item *s;
	int r = 0;
	union sigval svalue;

	/* The payload is unused, but it is passed by value: give it a
	 * defined content instead of reading an uninitialized object. */
	memset(&svalue, 0, sizeof(svalue));

	for (s = servants_leader; s; s = s->next) {
		if (s->pid != 0) {
			r = sigqueue(s->pid, 0, svalue);
			if (r == -1 && errno == ESRCH)
				continue;
			return 0;
		}
	}
	return 1;
}
/*
 * Start the process for servant s unless it is still running.
 *
 * A servant with a recorded pid is probed with the null signal; only if
 * that reports ESRCH is a new process forked. Every start attempt
 * increments s->restarts and, except when shared-disk support is compiled
 * out for a disk servant, refreshes s->t_started.
 */
void servant_start(struct servants_list_item *s)
{
	int r = 0;
	union sigval svalue;

	/* The payload is unused, but it is passed by value: give it a
	 * defined content instead of reading an uninitialized object. */
	memset(&svalue, 0, sizeof(svalue));

	if (s->pid != 0) {
		r = sigqueue(s->pid, 0, svalue);
		if ((r != -1 || errno != ESRCH))
			return;
	}
	s->restarts++;
	if (sbd_is_disk(s)) {
#if SUPPORT_SHARED_DISK
		DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname);
		s->pid = assign_servant(s->devname, servant, start_mode, s);
#else
		cl_log(LOG_ERR, "Shared disk functionality not supported");
		return;
#endif
	} else if(sbd_is_pcmk(s)) {
		DBGLOG(LOG_INFO, "Starting Pacemaker servant");
		s->pid = assign_servant(s->devname, servant_pcmk, start_mode, NULL);
	} else if(sbd_is_cluster(s)) {
		DBGLOG(LOG_INFO, "Starting Cluster servant");
		s->pid = assign_servant(s->devname, servant_cluster, start_mode, NULL);
	} else {
		cl_log(LOG_ERR, "Unrecognized servant: %s", s->devname);
	}
	clock_gettime(CLOCK_MONOTONIC, &s->t_started);
	return;
}
/* Reset every servant's restart counter and (re)start it. */
void servants_start(void)
{
	struct servants_list_item *cur = servants_leader;

	while (cur != NULL) {
		cur->restarts = 0;
		servant_start(cur);
		cur = cur->next;
	}
}
/* Send SIGKILL to every servant that has a recorded pid. */
void servants_kill(void)
{
	struct servants_list_item *s;
	union sigval svalue;

	/* The payload is unused, but it is passed by value: give it a
	 * defined content instead of reading an uninitialized object. */
	memset(&svalue, 0, sizeof(svalue));

	for (s = servants_leader; s; s = s->next) {
		if (s->pid != 0)
			sigqueue(s->pid, SIGKILL, svalue);
	}
}
/*
 * Forget the pid of a servant process that has terminated.
 * Unknown pids are only mentioned in the debug log.
 */
static inline void cleanup_servant_by_pid(pid_t pid)
{
	struct servants_list_item *gone = lookup_servant_by_pid(pid);

	if (gone == NULL) {
		/* This most likely is a stray signal from somewhere, or
		 * a SIGCHLD for a process that has previously
		 * explicitly disconnected. */
		DBGLOG(LOG_INFO, "cleanup_servant: Nothing known about pid %i",
		       pid);
		return;
	}

	cl_log(LOG_WARNING, "Servant for %s (pid: %i) has terminated",
	       gone->devname, gone->pid);
	gone->pid = 0;
}
/*
 * Detach the inquisitor from the process that started it.
 *
 * Initialises the watchdog when watchdog use is enabled, then sends
 * SIG_LIVENESS to the parent (unless the parent pid is 1 or less).
 *
 * Returns 0 on success and -1 when watchdog_init() fails.
 */
int inquisitor_decouple(void)
{
	pid_t ppid = getppid();
	union sigval signal_value;

	/* The payload is unused, but it is passed by value: give it a
	 * defined content instead of reading an uninitialized object. */
	memset(&signal_value, 0, sizeof(signal_value));

	/* During start-up, we only arm the watchdog once we've got
	 * quorum at least once. */
	if (watchdog_use) {
		if (watchdog_init() < 0) {
			return -1;
		}
	}
	if (ppid > 1) {
		sigqueue(ppid, SIG_LIVENESS, signal_value);
	}
	return 0;
}
/*
 * Decide whether the process holding the pid lock is a live instance of
 * this program.
 *
 * pid: process id read from the lock file.
 *
 * Returns 0 when no such process exists, or when /proc shows it running
 * a different executable than the caller (or cannot be read). Returns 1
 * when the executables match, or - without HAVE_PROC_PID - whenever the
 * process exists.
 */
static int sbd_lock_running(long pid)
{
	ssize_t len;
	int running = 0;
	char proc_path[PATH_MAX], exe_path[PATH_MAX], myexe_path[PATH_MAX];

	/* check if pid is running */
	if (kill((pid_t) pid, 0) < 0 && errno == ESRCH) {
		goto bail;
	}
#ifndef HAVE_PROC_PID
	return 1;
#endif
	/* check to make sure pid hasn't been reused by another process */
	snprintf(proc_path, sizeof(proc_path), "/proc/%ld/exe", pid);
	len = readlink(proc_path, exe_path, sizeof(exe_path) - 1);
	if(len < 0) {
		cl_perror("Could not read from %s", proc_path);
		goto bail;
	}
	/* readlink() does not terminate the buffer. */
	exe_path[len] = 0;

	snprintf(proc_path, sizeof(proc_path), "/proc/%ld/exe", (long) getpid());
	len = readlink(proc_path, myexe_path, sizeof(myexe_path) - 1);
	if(len < 0) {
		cl_perror("Could not read from %s", proc_path);
		goto bail;
	}
	myexe_path[len] = 0;

	if(strcmp(exe_path, myexe_path) == 0) {
		running = 1;
	}
bail:
	return running;
}
/*
 * Create the pid lock file `filename` for the calling process.
 *
 * An existing lock file is honoured only when it names another live
 * instance of this program (see sbd_lock_running()); an empty, garbled or
 * stale file is removed. The lock is taken by writing the pid to a
 * temporary file and hard-linking it to the final name.
 *
 * Returns  0 on success,
 *         -1 when filename is NULL (errno = EFAULT) or the lock is held
 *            by someone else,
 *         -2 when the link did not take effect,
 *         -3 on any other failure.
 */
static int
sbd_lock_pidfile(const char *filename)
{
	char lf_name[256], tf_name[256], buf[LOCKSTRLEN+1];
	int fd;
	long pid, mypid;
	int rc;
	struct stat sbuf;

	if (filename == NULL) {
		errno = EFAULT;
		return -1;
	}

	mypid = (long) getpid();
	snprintf(lf_name, sizeof(lf_name), "%s",filename);
	snprintf(tf_name, sizeof(tf_name), "%s.%ld",
		 filename, mypid);

	if ((fd = open(lf_name, O_RDONLY)) >= 0) {
		ssize_t nread;

		if (fstat(fd, &sbuf) >= 0 && sbuf.st_size < LOCKSTRLEN) {
			sleep(1); /* if someone was about to create one,
				   * give'm a sec to do so
				   * Though if they follow our protocol,
				   * this won't happen. They should really
				   * put the pid in, then link, not the
				   * other way around.
				   */
		}

		/* Leave room for the terminator: sscanf() needs a string. */
		nread = read(fd, buf, sizeof(buf) - 1);
		if (nread < 1) {
			/* lockfile empty -> rm it and go on */;
		} else {
			buf[nread] = '\0';
			if (sscanf(buf, "%ld", &pid) < 1) {
				/* lockfile screwed up -> rm it and go on */
			} else if (pid > 1 && (getpid() != pid)
				   && sbd_lock_running(pid)) {
				/* is locked by existing process
				 * -> give up */
				close(fd);
				return -1;
			} else {
				/* stale lockfile -> rm it and go on */
			}
		}
		unlink(lf_name);
		close(fd);
	}

	if ((fd = open(tf_name, O_CREAT | O_WRONLY | O_EXCL, 0644)) < 0) {
		/* Hmmh, why did we fail? Anyway, nothing we can do about it */
		return -3;
	}

	/* Slight overkill with the %*d format ;-) */
	snprintf(buf, sizeof(buf), "%*ld\n", LOCKSTRLEN-1, mypid);
	if (write(fd, buf, LOCKSTRLEN) != LOCKSTRLEN) {
		/* Again, nothing we can do about this */
		rc = -3;
		close(fd);
		goto out;
	}
	close(fd);

	/* link() reports failure as -1 with the reason in errno; it never
	 * returns EEXIST itself. */
	if (link(tf_name, lf_name) == 0) {
		if (stat(tf_name, &sbuf) < 0) {
			/* something weird happened */
			rc = -3;
		} else if (sbuf.st_nlink < 2) {
			/* somehow, it didn't get through - NFS trouble? */
			rc = -2;
		} else {
			rc = 0;
		}
	} else if (errno == EEXIST) {
		/* somebody else created the lock file in the meantime */
		rc = -1;
	} else {
		rc = -3;
	}
out:
	unlink(tf_name);
	return rc;
}
/*
 * Release the pid lock by removing the lock file.
 *
 * No check is made that the lock is (still) ours: if someone else took
 * over our lock file, that is their fault.
 *
 * Returns 0 on success and a value < 0 if some failure occurred; a NULL
 * filename yields -1 with errno set to EFAULT.
 */
static int
sbd_unlock_pidfile(const char *filename)
{
	char lock_path[256];

	if (!filename) {
		errno = EFAULT;
		return -1;
	}

	/* Same bounded copy as used when the lock was created. */
	snprintf(lock_path, sizeof(lock_path), "%s", filename);
	return unlink(lock_path);
}
/*
 * Report whether the cluster/pacemaker servants are up to date.
 *
 * all: when true, every "cluster"/"pcmk" servant must be current;
 *      when false, a single current one is enough.
 *
 * Returns 0 when only disk servants exist (servant_count == disk_count).
 */
int cluster_alive(bool all)
{
	struct servants_list_item *cur;
	int every_one_current = 1;

	if (servant_count == disk_count) {
		return 0;
	}

	for (cur = servants_leader; cur != NULL; cur = cur->next) {
		if (!sbd_is_cluster(cur) && !sbd_is_pcmk(cur)) {
			continue;
		}
		if (cur->outdated) {
			every_one_current = 0;
			continue;
		}
		if (all == false) {
			/* One healthy servant is all that was asked for. */
			return 1;
		}
	}
	return every_one_current;
}
/*
 * Decide whether enough disk servants are healthy.
 *
 * With more than two disks a strict majority is required; otherwise one
 * healthy disk servant is sufficient. Returns non-zero when satisfied.
 */
int quorum_read(int good_servants)
{
	const int threshold = (disk_count > 2) ? (disk_count / 2) : 0;

	return (good_servants > threshold);
}
/*
* Main loop of the inquisitor process; never returns.
*
* Takes the pid lock (if configured), blocks the signals it handles
* synchronously, starts all servants and then loops: wait for a signal
* (at most timeout_loop seconds), update the servant that sent it,
* classify servants as healthy or outdated, decide whether the watchdog
* may be tickled and whether to decouple from the parent, act on
* excessive tickle latency, and restart servants that are not blocked.
*/
void inquisitor_child(void)
{
int sig, pid;
sigset_t procmask;
siginfo_t sinfo;
int status;
struct timespec timeout;
int exiting = 0;
int decoupled = 0;
int cluster_appeared = 0;
int pcmk_override = 0;
time_t latency;
struct timespec t_last_tickle, t_now;
struct servants_list_item* s;
if (debug_mode) {
cl_log(LOG_ERR, "DEBUG MODE %d IS ACTIVE - DO NOT RUN IN PRODUCTION!", debug_mode);
}
set_proc_title("sbd: inquisitor");
if (pidfile) {
if (sbd_lock_pidfile(pidfile) < 0) {
exit(1);
}
}
/* These signals are blocked and consumed via sigtimedwait() below. */
sigemptyset(&procmask);
sigaddset(&procmask, SIGCHLD);
sigaddset(&procmask, SIGTERM);
sigaddset(&procmask, SIG_LIVENESS);
sigaddset(&procmask, SIG_EXITREQ);
sigaddset(&procmask, SIG_TEST);
sigaddset(&procmask, SIG_PCMK_UNHEALTHY);
sigaddset(&procmask, SIG_RESTART);
sigaddset(&procmask, SIGUSR1);
sigaddset(&procmask, SIGUSR2);
sigprocmask(SIG_BLOCK, &procmask, NULL);
servants_start();
timeout.tv_sec = timeout_loop;
timeout.tv_nsec = 0;
clock_gettime(CLOCK_MONOTONIC, &t_last_tickle);
while (1) {
bool tickle = 0;
bool can_detach = 0;
int good_servants = 0;
sig = sigtimedwait(&procmask, &sinfo, &timeout);
clock_gettime(CLOCK_MONOTONIC, &t_now);
if (sig == SIG_EXITREQ || sig == SIGTERM) {
/* Shutdown request: kill the servants and release the watchdog. */
servants_kill();
watchdog_close(true);
exiting = 1;
} else if (sig == SIGCHLD) {
/* Reap every terminated child; disk servants may request an
* action through their exit status. */
while ((pid = waitpid(-1, &status, WNOHANG))) {
if (pid == -1 && errno == ECHILD) {
break;
} else {
s = lookup_servant_by_pid(pid);
if (sbd_is_disk(s)) {
if (WIFEXITED(status)) {
switch(WEXITSTATUS(status)) {
case EXIT_MD_IO_FAIL:
DBGLOG(LOG_INFO, "Servant for %s requests to be disowned",
s->devname);
break;
case EXIT_MD_REQUEST_RESET:
cl_log(LOG_WARNING, "%s requested a reset", s->devname);
do_reset();
break;
case EXIT_MD_REQUEST_SHUTOFF:
cl_log(LOG_WARNING, "%s requested a shutoff", s->devname);
do_off();
break;
case EXIT_MD_REQUEST_CRASHDUMP:
cl_log(LOG_WARNING, "%s requested a crashdump", s->devname);
do_crashdump();
break;
default:
break;
}
}
}
cleanup_servant_by_pid(pid);
}
}
} else if (sig == SIG_PCMK_UNHEALTHY) {
s = lookup_servant_by_pid(sinfo.si_pid);
if (sbd_is_cluster(s) || sbd_is_pcmk(s)) {
if (s->outdated == 0) {
cl_log(LOG_WARNING, "%s health check: UNHEALTHY", s->devname);
}
/* A non-zero but very old timestamp: the age check below is not
* skipped and presumably finds the servant outdated right away. */
s->t_last.tv_sec = 1;
} else {
cl_log(LOG_WARNING, "Ignoring SIG_PCMK_UNHEALTHY from unknown source");
}
} else if (sig == SIG_LIVENESS) {
/* Record when the sending servant last reported in. */
s = lookup_servant_by_pid(sinfo.si_pid);
if (s) {
s->first_start = 0;
clock_gettime(CLOCK_MONOTONIC, &s->t_last);
}
} else if (sig == SIG_TEST) {
} else if (sig == SIGUSR1) {
if (exiting)
continue;
servants_start();
}
if (exiting) {
/* Leave only once every servant process is gone. */
if (check_all_dead()) {
if (pidfile) {
sbd_unlock_pidfile(pidfile);
}
exit(0);
} else
continue;
}
/* Classify servants by the age of their last liveness report; only
* disk servants count towards good_servants. */
good_servants = 0;
for (s = servants_leader; s; s = s->next) {
int age = t_now.tv_sec - s->t_last.tv_sec;
if (!s->t_last.tv_sec)
continue;
if (age < (int)(timeout_io+timeout_loop)) {
if (sbd_is_disk(s)) {
good_servants++;
}
if (s->outdated) {
cl_log(LOG_NOTICE, "Servant %s is healthy (age: %d)", s->devname, age);
}
s->outdated = 0;
} else if (!s->outdated) {
if (!s->restart_blocked) {
cl_log(LOG_WARNING, "Servant %s is outdated (age: %d)", s->devname, age);
}
s->outdated = 1;
}
}
/* Decide whether to tickle the watchdog and whether decoupling from
* the parent is allowed. */
if(disk_count == 0) {
/* NO disks, everything is up to the cluster */
if(cluster_alive(true)) {
/* We LIVE! */
if(cluster_appeared == false) {
- cl_log(LOG_NOTICE, "Active cluster detected");
+ cl_log(LOG_INFO, "Active cluster detected");
}
tickle = 1;
can_detach = 1;
cluster_appeared = 1;
} else if(cluster_alive(false)) {
if(!decoupled) {
/* On the way up, detach and arm the watchdog */
- cl_log(LOG_NOTICE, "Partial cluster detected, detaching");
+ cl_log(LOG_INFO, "Partial cluster detected, detaching");
}
can_detach = 1;
tickle = !cluster_appeared;
} else if(!decoupled) {
/* Stay alive until the cluster comes up */
tickle = !cluster_appeared;
}
} else if(disk_priority == 1 || servant_count == disk_count) {
if (quorum_read(good_servants)) {
/* There are disks and we're connected to the majority of them */
tickle = 1;
can_detach = 1;
pcmk_override = 0;
} else if (servant_count > disk_count && cluster_alive(true)) {
tickle = 1;
if(!pcmk_override) {
cl_log(LOG_WARNING, "Majority of devices lost - surviving on pacemaker");
pcmk_override = 1; /* Only log this message once */
}
}
} else if(cluster_alive(true) && quorum_read(good_servants)) {
/* Both disk and cluster servants are healthy */
tickle = 1;
can_detach = 1;
cluster_appeared = 1;
} else if(quorum_read(good_servants)) {
/* The cluster takes priority but only once
* connected for the first time.
*
* Until then, we tickle based on disk quorum.
*/
can_detach = 1;
tickle = !cluster_appeared;
}
/* cl_log(LOG_DEBUG, "Tickle: q=%d, g=%d, p=%d, s=%d", */
/* quorum_read(good_servants), good_servants, tickle, disk_count); */
if(tickle) {
watchdog_tickle();
clock_gettime(CLOCK_MONOTONIC, &t_last_tickle);
}
if (!decoupled && can_detach) {
/* We only do this at the point either the disk or
* cluster servants become healthy
*/
cl_log(LOG_DEBUG, "Decoupling");
if (inquisitor_decouple() < 0) {
servants_kill();
exiting = 1;
continue;
} else {
decoupled = 1;
}
}
/* Note that this can actually be negative, since we set
* last_tickle after we set now. */
latency = t_now.tv_sec - t_last_tickle.tv_sec;
if (timeout_watchdog && (latency > (int)timeout_watchdog)) {
if (!decoupled) {
/* We're still being watched by our
* parent. We don't fence, but exit. */
cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up.");
servants_kill();
exiting = 1;
continue;
}
if (debug_mode < 2) {
/* At level 2 or above, we do nothing, but expect
* things to eventually return to
* normal. */
do_reset();
} else {
cl_log(LOG_ERR, "SBD: DEBUG MODE: Would have fenced due to timeout!");
}
}
if (timeout_watchdog_warn && (latency > (int)timeout_watchdog_warn)) {
cl_log(LOG_WARNING,
"Latency: No liveness for %d s exceeds threshold of %d s (healthy servants: %d)",
(int)latency, (int)timeout_watchdog_warn, good_servants);
if (debug_mode && watchdog_use) {
/* In debug mode, trigger a reset before the watchdog can panic the machine */
do_reset();
}
}
/* Restart servants, limiting restarts to servant_restart_count per
* servant_restart_interval seconds since the last start. */
for (s = servants_leader; s; s = s->next) {
int age = t_now.tv_sec - s->t_started.tv_sec;
if (age > servant_restart_interval) {
s->restarts = 0;
s->restart_blocked = 0;
}
if (servant_restart_count
&& (s->restarts >= servant_restart_count)
&& !s->restart_blocked) {
if (servant_restart_count > 1) {
cl_log(LOG_WARNING, "Max retry count (%d) reached: not restarting servant for %s",
(int)servant_restart_count, s->devname);
}
s->restart_blocked = 1;
}
if (!s->restart_blocked) {
servant_start(s);
}
}
}
/* not reached */
exit(0);
}
/*
 * Launch the inquisitor daemon and wait for its verdict.
 *
 * The forked child runs inquisitor_child(). The parent blocks until it
 * receives SIG_LIVENESS (the inquisitor started up properly -> 0) or
 * reaps a child after SIGCHLD (start-up failed -> -1).
 */
int inquisitor(void)
{
	int sig, reaped, inquisitor_pid;
	int status;
	sigset_t procmask;
	siginfo_t sinfo;

	/* Where's the best place for sysrq init ?*/
	sysrq_init();

	sigemptyset(&procmask);
	sigaddset(&procmask, SIGCHLD);
	sigaddset(&procmask, SIG_LIVENESS);
	sigprocmask(SIG_BLOCK, &procmask, NULL);

	inquisitor_pid = make_daemon();
	if (inquisitor_pid == 0) {
		inquisitor_child();
	}

	/* We're the parent. Wait for a happy signal from our child
	 * before we proceed - we either get "SIG_LIVENESS" when the
	 * inquisitor has completed the first successful round, or
	 * ECHLD when it exits with an error. */
	for (;;) {
		sig = sigwaitinfo(&procmask, &sinfo);

		if (sig == SIG_LIVENESS) {
			/* Inquisitor started up properly. */
			return 0;
		}

		if (sig != SIGCHLD) {
			fprintf(stderr, "Nobody expected the spanish inquisition!\n");
			continue;
		}

		reaped = waitpid(-1, &status, WNOHANG);
		if (reaped == 0) {
			/* No child has changed state yet. */
			continue;
		}
		if (reaped == -1 && errno == ECHILD) {
			continue;
		}

		/* We got here because the inquisitor
		 * did not succeed. */
		return -1;
	}

	/* not reached */
	return -1;
}
/*
 * Split a device list into entries and recruit a servant for each.
 *
 * line: entries separated by ';' and/or whitespace; may be NULL or empty.
 *
 * Leading, trailing and repeated separators are ignored. Each entry is
 * copied with its exact length, so an entry can never overrun its buffer
 * regardless of what follows it on the line.
 *
 * Returns the number of entries handed to recruit_servant(). Exits the
 * process if memory for an entry cannot be allocated.
 */
int
parse_device_line(const char *line)
{
	const char *cursor;
	int found = 0;

	if (line == NULL || line[0] == '\0') {
		return found;
	}

	cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", (int) strlen(line), line);

	cursor = line;
	while (*cursor != '\0') {
		const char *start;
		size_t len;
		char *entry;

		/* Skip separators in front of the next entry. The <ctype.h>
		 * functions need a value representable as unsigned char. */
		while (*cursor == ';' || isspace((unsigned char) *cursor)) {
			cursor++;
		}

		/* Collect everything up to the next separator or the end. */
		start = cursor;
		while (*cursor != '\0' && *cursor != ';'
		       && !isspace((unsigned char) *cursor)) {
			cursor++;
		}

		len = (size_t) (cursor - start);
		if (len == 0) {
			/* Only separators were left. */
			continue;
		}

		entry = calloc(1, len + 1);
		if (entry == NULL) {
			fprintf(stderr, "calloc failed in parse_device_line.\n");
			exit(1);
		}
		memcpy(entry, start, len);

		cl_log(LOG_DEBUG, "Adding '%s'", entry);
		recruit_servant(entry, 0);
		found++;

		free(entry);
	}

	return found;
}
#define SBD_SOURCE_FILES "sbd-cluster.c,sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,setproctitle.c"

/*
 * Add a file-based log filter with the given priority to both the syslog
 * and the stderr log target.
 *
 * files:    comma separated source file names; NULL selects the sbd
 *           sources listed in SBD_SOURCE_FILES.
 * priority: log priority passed on to the filter.
 */
static void
sbd_log_filter_ctl(const char *files, uint8_t priority)
{
	const char *pattern = (files != NULL) ? files : SBD_SOURCE_FILES;

	qb_log_filter_ctl(QB_LOG_SYSLOG, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, pattern, priority);
	qb_log_filter_ctl(QB_LOG_STDERR, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, pattern, priority);
}
/*
 * Interpret how often a toggle option was given on the command line:
 * the result is the remainder of the count divided by two, so an odd
 * number of occurrences yields 1 and an even number yields 0.
 */
int
arg_enabled(int arg_count)
{
	const int remainder = arg_count % 2;

	return remainder;
}
/*
 * Program entry point.
 *
 * Sets up logging, reads the SBD_* environment variables, parses the
 * command line options (which override the environment) and then runs the
 * command named by the first non-option argument.
 *
 * Returns 0 on success and 1 on failure; for usage errors (internal
 * status -2) the usage text is printed first.
 */
int main(int argc, char **argv, char **envp)
{
	int exit_status = 0;
	int c;
	int W_count = 0;
	int c_count = 0;
	int P_count = 0;
	int qb_facility;
	const char *value = NULL;
	int start_delay = 0;

	if ((cmdname = strrchr(argv[0], '/')) == NULL) {
		cmdname = argv[0];
	} else {
		++cmdname;
	}

	watchdogdev = strdup("/dev/watchdog");
	watchdogdev_is_default = true;
	qb_facility = qb_log_facility2int("daemon");
	qb_log_init(cmdname, qb_facility, LOG_WARNING);
	sbd_set_format_string(QB_LOG_SYSLOG, "sbd");

	qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE);
	qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE);
	sbd_log_filter_ctl(NULL, LOG_NOTICE);

	sbd_get_uname();

	/* --- Defaults from the environment --- */
	value = getenv("SBD_DEVICE");
	if(value) {
#if SUPPORT_SHARED_DISK
		int devices = parse_device_line(value);
		if(devices < 1) {
			fprintf(stderr, "Invalid device line: %s\n", value);
			exit_status = -2;
			goto out;
		}
#else
		fprintf(stderr, "Shared disk functionality not supported\n");
		exit_status = -2;
		goto out;
#endif
	}

	value = getenv("SBD_PACEMAKER");
	if(value) {
		check_pcmk = crm_is_true(value);
		check_cluster = crm_is_true(value);
	}
	cl_log(LOG_INFO, "Enable pacemaker checks: %d (%s)", (int)check_pcmk, value?value:"default");

	value = getenv("SBD_STARTMODE");
	if(value == NULL) {
	} else if(strcmp(value, "clean") == 0) {
		start_mode = 1;
	} else if(strcmp(value, "always") == 0) {
		start_mode = 0;
	}
	cl_log(LOG_INFO, "Start mode set to: %d (%s)", (int)start_mode, value?value:"default");

	value = getenv("SBD_WATCHDOG_DEV");
	if(value) {
		free(watchdogdev);
		watchdogdev = strdup(value);
		watchdogdev_is_default = false;
	}

	/* SBD_WATCHDOG has been dropped from sbd.sysconfig example.
	 * This is for backward compatibility. */
	value = getenv("SBD_WATCHDOG");
	if(value) {
		watchdog_use = crm_is_true(value);
	}

	value = getenv("SBD_WATCHDOG_TIMEOUT");
	if(value) {
		timeout_watchdog = crm_get_msec(value) / 1000;
		if(timeout_watchdog > 5) {
			timeout_watchdog_warn = (int)timeout_watchdog / 5 * 3;
		}
	}

	value = getenv("SBD_PIDFILE");
	if(value) {
		pidfile = strdup(value);
		cl_log(LOG_INFO, "pidfile set to %s", pidfile);
	}

	value = getenv("SBD_DELAY_START");
	if(value) {
		start_delay = crm_is_true(value);
	}
	cl_log(LOG_DEBUG, "Start delay: %d (%s)", (int)start_delay, value?value:"default");

	/* --- Command line options --- */
	while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:")) != -1) {
		switch (c) {
		case 'D':
			break;
		case 'Z':
			debug_mode++;
			cl_log(LOG_INFO, "Debug mode now at level %d", (int)debug_mode);
			break;
		case 'R':
			skip_rt = 1;
			cl_log(LOG_INFO, "Realtime mode deactivated.");
			break;
		case 'S':
			start_mode = atoi(optarg);
			cl_log(LOG_INFO, "Start mode set to: %d", (int)start_mode);
			break;
		case 's':
			timeout_startup = atoi(optarg);
			cl_log(LOG_INFO, "Start timeout set to: %d", (int)timeout_startup);
			break;
		case 'v':
			debug++;
			if(debug == 1) {
				sbd_log_filter_ctl(NULL, LOG_INFO);
				cl_log(LOG_INFO, "Verbose mode enabled.");
			} else if(debug == 2) {
				sbd_log_filter_ctl(NULL, LOG_DEBUG);
				cl_log(LOG_INFO, "Debug mode enabled.");
			} else if(debug == 3) {
				/* Go nuts, turn on pacemaker's logging too */
				sbd_log_filter_ctl("*", LOG_DEBUG);
				cl_log(LOG_INFO, "Debug library mode enabled.");
			}
			break;
		case 'T':
			watchdog_set_timeout = 0;
			cl_log(LOG_INFO, "Setting watchdog timeout disabled; using defaults.");
			break;
		case 'W':
			W_count++;
			break;
		case 'w':
			free(watchdogdev);
			watchdogdev = strdup(optarg);
			watchdogdev_is_default = false;
			/* Log after the assignment so the message names the
			 * device that will actually be used. */
			cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev);
			break;
		case 'd':
#if SUPPORT_SHARED_DISK
			recruit_servant(optarg, 0);
#else
			fprintf(stderr, "Shared disk functionality not supported\n");
			exit_status = -2;
			goto out;
#endif
			break;
		case 'c':
			c_count++;
			break;
		case 'P':
			P_count++;
			break;
		case 'z':
			disk_priority = 0;
			break;
		case 'n':
			/* Release the name obtained from uname() (or an
			 * earlier -n) before replacing it. */
			free(local_uname);
			local_uname = strdup(optarg);
			cl_log(LOG_INFO, "Overriding local hostname to %s", local_uname);
			break;
		case 'p':
			/* Release a value taken from SBD_PIDFILE (or an
			 * earlier -p) before replacing it. */
			free(pidfile);
			pidfile = strdup(optarg);
			cl_log(LOG_INFO, "pidfile set to %s", pidfile);
			break;
		case 'C':
			timeout_watchdog_crashdump = atoi(optarg);
			cl_log(LOG_INFO, "Setting crashdump watchdog timeout to %d",
			       (int)timeout_watchdog_crashdump);
			break;
		case '1':
			timeout_watchdog = atoi(optarg);
			if(timeout_watchdog > 5) {
				timeout_watchdog_warn = (int)timeout_watchdog / 5 * 3;
			}
			break;
		case '2':
			timeout_allocate = atoi(optarg);
			break;
		case '3':
			timeout_loop = atoi(optarg);
			break;
		case '4':
			timeout_msgwait = atoi(optarg);
			break;
		case '5':
			timeout_watchdog_warn = atoi(optarg);
			cl_log(LOG_INFO, "Setting latency warning to %d",
			       (int)timeout_watchdog_warn);
			break;
		case 't':
			servant_restart_interval = atoi(optarg);
			cl_log(LOG_INFO, "Setting servant restart interval to %d",
			       (int)servant_restart_interval);
			break;
		case 'I':
			timeout_io = atoi(optarg);
			cl_log(LOG_INFO, "Setting IO timeout to %d",
			       (int)timeout_io);
			break;
		case 'F':
			servant_restart_count = atoi(optarg);
			cl_log(LOG_INFO, "Servant restart count set to %d",
			       (int)servant_restart_count);
			break;
		case 'h':
			usage();
			return (0);
		default:
			exit_status = -2;
			goto out;
			break;
		}
	}

	/* --- Resolve toggles and validate the configuration --- */
	if (watchdogdev == NULL || strcmp(watchdogdev, "/dev/null") == 0) {
		watchdog_use = 0;
	} else if (W_count > 0) {
		watchdog_use = arg_enabled(W_count);
	}

	if (watchdog_use) {
		cl_log(LOG_INFO, "Watchdog enabled.");
	} else {
		cl_log(LOG_INFO, "Watchdog disabled.");
	}

	if (c_count > 0) {
		check_cluster = arg_enabled(c_count);
	}

	if (P_count > 0) {
		check_pcmk = arg_enabled(P_count);
	}

	if ((disk_count > 0) && (strlen(local_uname) > SECTOR_NAME_MAX)) {
		fprintf(stderr, "Node name mustn't be longer than %d chars.\n",
			SECTOR_NAME_MAX);
		fprintf(stderr, "If uname is longer define a name to be used by sbd.\n");
		exit_status = -1;
		goto out;
	}

	if (disk_count > 3) {
		fprintf(stderr, "You can specify up to 3 devices via the -d option.\n");
		exit_status = -1;
		goto out;
	}

	/* There must at least be one command following the options: */
	if ((argc - optind) < 1) {
		fprintf(stderr, "Not enough arguments.\n");
		exit_status = -2;
		goto out;
	}

	if (init_set_proc_title(argc, argv, envp) < 0) {
		fprintf(stderr, "Allocation of proc title failed.\n");
		exit_status = -1;
		goto out;
	}

	/* --- Command dispatch --- */
#if SUPPORT_SHARED_DISK
	if (strcmp(argv[optind], "create") == 0) {
		exit_status = init_devices(servants_leader);
	} else if (strcmp(argv[optind], "dump") == 0) {
		exit_status = dump_headers(servants_leader);
	} else if (strcmp(argv[optind], "allocate") == 0) {
		exit_status = allocate_slots(argv[optind + 1], servants_leader);
	} else if (strcmp(argv[optind], "list") == 0) {
		exit_status = list_slots(servants_leader);
	} else if (strcmp(argv[optind], "message") == 0) {
		/* argv[argc] is the terminating NULL; when nothing follows
		 * the command word, argv[optind + 2] would lie beyond it. */
		char *message_cmd = ((argc - optind) >= 2) ? argv[optind + 2] : NULL;

		exit_status = messenger(argv[optind + 1], message_cmd, servants_leader);
	} else if (strcmp(argv[optind], "ping") == 0) {
		exit_status = ping_via_slots(argv[optind + 1], servants_leader);
	} else if (strcmp(argv[optind], "watch") == 0) {
		if(disk_count > 0) {
			/* If no devices are specified, its not an error to be unable to find one */
			open_any_device(servants_leader);
		}
		if(start_delay) {
			unsigned long delay = get_first_msgwait(servants_leader);
			sleep(delay);
		}
	} else {
		exit_status = -2;
	}
#endif

	if (strcmp(argv[optind], "query-watchdog") == 0) {
		exit_status = watchdog_info();
	} else if (strcmp(argv[optind], "test-watchdog") == 0) {
		exit_status = watchdog_test();
	} else if (strcmp(argv[optind], "watch") == 0) {
		/* sleep $(sbd $SBD_DEVICE_ARGS dump | grep -m 1 msgwait | awk '{print $4}') 2>/dev/null */
		/* We only want this to have an effect during watch right now;
		 * pinging and fencing would be too confused */
		cl_log(LOG_INFO, "Turning on pacemaker checks: %d", check_pcmk);
		if (check_pcmk) {
			recruit_servant("pcmk", 0);
#if SUPPORT_PLUGIN
			check_cluster = 1;
#endif
		}
		cl_log(LOG_INFO, "Turning on cluster checks: %d", check_cluster);
		if (check_cluster) {
			recruit_servant("cluster", 0);
		}
		exit_status = inquisitor();
	}

out:
	if (exit_status < 0) {
		if (exit_status == -2) {
			usage();
		} else {
			fprintf(stderr, "sbd failed; please check the logs.\n");
		}
		return (1);
	}
	return (0);
}
diff --git a/src/sbd-md.c b/src/sbd-md.c
index 6a964dd..6f152c4 100644
--- a/src/sbd-md.c
+++ b/src/sbd-md.c
@@ -1,1235 +1,1235 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "sbd.h"
/* Message codes stored in a node's mailbox sector. */
#define SBD_MSG_EMPTY 0x00
#define SBD_MSG_TEST 0x01
#define SBD_MSG_RESET 0x02
#define SBD_MSG_OFF 0x03
#define SBD_MSG_EXIT 0x04
#define SBD_MSG_CRASHDUMP 0x05

/* Sector holding slot/mailbox number n: slots occupy the odd sectors from
 * 1 on, mailboxes the even sectors from 2 on. The argument is
 * parenthesized so that expressions such as SLOT_TO_SECTOR(i + 1) expand
 * correctly. */
#define SLOT_TO_SECTOR(slot) (1+(slot)*2)
#define MBOX_TO_SECTOR(mbox) (2+(mbox)*2)
/* Defined outside this file; declared here for use by this file. */
extern int disk_count;
/* These have to match the values in the header of the partition */
/* Exactly eight characters: the array has no room for a terminating NUL,
* so it must not be handled as a C string. */
static char sbd_magic[8] = "SBD_SBD_";
static char sbd_version = 0x02;
/* A node name paired with a message string. */
struct slot_msg_arg_t {
const char* name;
const char* msg;
};
/*
 * Translate a command name into its SBD_MSG_* code.
 * Returns -1 for an unknown name.
 */
static signed char
cmd2char(const char *cmd)
{
	static const struct {
		const char *name;
		signed char code;
	} known[] = {
		{ "clear",     SBD_MSG_EMPTY },
		{ "test",      SBD_MSG_TEST },
		{ "reset",     SBD_MSG_RESET },
		{ "off",       SBD_MSG_OFF },
		{ "exit",      SBD_MSG_EXIT },
		{ "crashdump", SBD_MSG_CRASHDUMP },
	};
	size_t i;

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		if (strcmp(known[i].name, cmd) == 0) {
			return known[i].code;
		}
	}
	return -1;
}
/*
 * Translate an SBD_MSG_* code into its command name.
 * Returns "undefined" for a code without a name.
 */
static const char*
char2cmd(const char cmd)
{
	if (cmd == SBD_MSG_EMPTY) {
		return "clear";
	}
	if (cmd == SBD_MSG_TEST) {
		return "test";
	}
	if (cmd == SBD_MSG_RESET) {
		return "reset";
	}
	if (cmd == SBD_MSG_OFF) {
		return "off";
	}
	if (cmd == SBD_MSG_EXIT) {
		return "exit";
	}
	if (cmd == SBD_MSG_CRASHDUMP) {
		return "crashdump";
	}
	return "undefined";
}
/* Close the device's file descriptor and release the context itself. */
static void
close_device(struct sbd_context *st)
{
	const int fd = st->devfd;

	close(fd);
	free(st);
}
/*
 * Open a block device for synchronous, direct I/O and set up an AIO
 * context for it.
 *
 * devname:  path of the device; NULL yields NULL.
 * loglevel: priority used to report a failing open(); LOG_DEBUG is
 *           routed through DBGLOG.
 *
 * Also stores the device's logical sector size in the global sector_size.
 *
 * Returns a newly allocated context (release it with close_device()), or
 * NULL on any failure, in which case everything acquired so far has been
 * released again.
 */
static struct sbd_context *
open_device(const char* devname, int loglevel)
{
	struct sbd_context *st;

	if (!devname)
		return NULL;

	st = malloc(sizeof(struct sbd_context));
	if (!st)
		return NULL;
	memset(st, 0, sizeof(struct sbd_context));

	if (io_setup(1, &st->ioctx) != 0) {
		cl_perror("io_setup failed");
		free(st);
		return NULL;
	}

	st->devfd = open(devname, O_SYNC|O_RDWR|O_DIRECT);
	if (st->devfd == -1) {
		if (loglevel == LOG_DEBUG) {
			DBGLOG(loglevel, "Opening device %s failed.", devname);
		} else {
			cl_log(loglevel, "Opening device %s failed.", devname);
		}
		/* Do not leak the AIO context created above. */
		io_destroy(st->ioctx);
		free(st);
		return NULL;
	}

	/* Check the ioctl itself too: on failure sector_size would keep
	 * whatever value an earlier device left behind. */
	if (ioctl(st->devfd, BLKSSZGET, &sector_size) < 0 || sector_size == 0) {
		cl_perror("Get sector size failed.\n");
		io_destroy(st->ioctx);
		close_device(st);
		return NULL;
	}
	return st;
}
/*
 * Allocate one zero-filled buffer of sector_size bytes with valloc().
 * Exits the process when the allocation fails.
 */
static void *
sector_alloc(void)
{
	void *buffer = valloc(sector_size);

	if (buffer == NULL) {
		exit(1);
	}
	return memset(buffer, 0, sector_size);
}
/*
 * Read or write exactly one sector through the device's AIO context,
 * waiting at most timeout_io seconds for completion.
 *
 * st:     open device context.
 * sector: sector number on the device.
 * data:   buffer of sector_size bytes.
 * rw:     non-zero to write, zero to read.
 *
 * Returns 0 when a full sector was transferred and -1 on submit failure,
 * timeout (the request is cancelled if possible), or short/failed I/O.
 */
static int
sector_io(struct sbd_context *st, int sector, void *data, int rw)
{
	struct timespec timeout;
	struct io_event event;
	struct iocb *ios[1] = { &st->io };
	long r;
	/* Compute the byte offset in the width the AIO request uses, not in
	 * int, so large sector numbers cannot overflow. */
	const long long offset = (long long) sector_size * sector;

	timeout.tv_sec = timeout_io;
	timeout.tv_nsec = 0;

	memset(&st->io, 0, sizeof(struct iocb));
	if (rw) {
		io_prep_pwrite(&st->io, st->devfd, data, sector_size, offset);
	} else {
		io_prep_pread(&st->io, st->devfd, data, sector_size, offset);
	}

	if (io_submit(st->ioctx, 1, ios) != 1) {
		cl_log(LOG_ERR, "Failed to submit IO request! (rw=%d)", rw);
		return -1;
	}

	errno = 0;
	r = io_getevents(st->ioctx, 1L, 1L, &event, &timeout);

	if (r < 0 ) {
		cl_log(LOG_ERR, "Failed to retrieve IO events (rw=%d)", rw);
		return -1;
	} else if (r < 1L) {
		cl_log(LOG_INFO, "Cancelling IO request due to timeout (rw=%d)", rw);
		r = io_cancel(st->ioctx, ios[0], &event);
		if (r) {
			DBGLOG(LOG_INFO, "Could not cancel IO request (rw=%d)", rw);
			/* Doesn't really matter, debugging information.
			 */
		}
		return -1;
	} else if (r > 1L) {
		cl_log(LOG_ERR, "More than one IO was returned (r=%ld)", r);
		return -1;
	}

	/* IO is happy */
	if (sector_size > 0 && event.res == (unsigned long) sector_size) {
		return 0;
	} else {
		cl_log(LOG_ERR, "Short IO (rw=%d, res=%lu, sector_size=%d)",
		       rw, event.res, sector_size);
		return -1;
	}
}
/* Write one sector from `data`: sector_io() with rw=1.
 * Returns sector_io()'s result (0 on success, -1 on failure). */
static int
sector_write(struct sbd_context *st, int sector, void *data)
{
return sector_io(st, sector, data, 1);
}
/* Read one sector into `data`: sector_io() with rw=0.
 * Returns sector_io()'s result (0 on success, -1 on failure). */
static int
sector_read(struct sbd_context *st, int sector, void *data)
{
return sector_io(st, sector, data, 0);
}
/* Read the node entry of `slot` into s_node; SLOT_TO_SECTOR() maps the slot
 * number to its sector. Returns the sector_read() result. */
static int
slot_read(struct sbd_context *st, int slot, struct sector_node_s *s_node)
{
return sector_read(st, SLOT_TO_SECTOR(slot), s_node);
}
/* Write s_node as the node entry of `slot`; SLOT_TO_SECTOR() maps the slot
 * number to its sector. Returns the sector_write() result. */
static int
slot_write(struct sbd_context *st, int slot, struct sector_node_s *s_node)
{
return sector_write(st, SLOT_TO_SECTOR(slot), s_node);
}
/* Write s_mbox as the mailbox of slot `mbox`; MBOX_TO_SECTOR() maps the slot
 * number to the mailbox sector. Returns the sector_write() result.
 * No read-back is done here; see mbox_write_verify() for that. */
static int
mbox_write(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox)
{
return sector_write(st, MBOX_TO_SECTOR(mbox), s_mbox);
}
/* Read the mailbox of slot `mbox` into s_mbox; MBOX_TO_SECTOR() maps the slot
 * number to the mailbox sector. Returns the sector_read() result. */
static int
mbox_read(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox)
{
return sector_read(st, MBOX_TO_SECTOR(mbox), s_mbox);
}
/*
 * Write a mailbox sector and confirm it by reading it back.
 *
 * The sector is written, read again into a scratch buffer and compared
 * over the full sector_size bytes.
 * Returns 0 when the read-back matches, -1 on write error, read error or
 * mismatch.
 */
static int
mbox_write_verify(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox)
{
	void *readback;
	int rc = -1;

	if (sector_write(st, MBOX_TO_SECTOR(mbox), s_mbox) < 0)
		return -1;

	readback = sector_alloc();
	if (sector_read(st, MBOX_TO_SECTOR(mbox), readback) >= 0) {
		if (memcmp(s_mbox, readback, sector_size) == 0) {
			rc = 0;
		} else {
			cl_log(LOG_ERR, "Write verification failed!");
		}
	}
	free(readback);
	return rc;
}
/*
 * Write the device header to sector 0.
 *
 * The sector_size and the four timeout fields are converted to network byte
 * order IN PLACE before writing, so after this call the caller's struct
 * holds the converted values in those fields.
 * Returns the sector_write() result.
 */
static int header_write(struct sbd_context *st, struct sector_header_s *s_header)
{
s_header->sector_size = htonl(s_header->sector_size);
s_header->timeout_watchdog = htonl(s_header->timeout_watchdog);
s_header->timeout_allocate = htonl(s_header->timeout_allocate);
s_header->timeout_loop = htonl(s_header->timeout_loop);
s_header->timeout_msgwait = htonl(s_header->timeout_msgwait);
return sector_write(st, 0, s_header);
}
/*
 * Read the device header from sector 0 into s_header.
 *
 * The sector_size and timeout fields are converted from network to host
 * byte order. As a side effect the global timeout_watchdog,
 * timeout_allocate, timeout_loop and timeout_msgwait are overwritten with
 * the values found on disk. No validation is done here (see valid_header()).
 * Returns 0 on success, -1 when the sector could not be read.
 */
static int
header_read(struct sbd_context *st, struct sector_header_s *s_header)
{
if (sector_read(st, 0, s_header) < 0)
return -1;
s_header->sector_size = ntohl(s_header->sector_size);
s_header->timeout_watchdog = ntohl(s_header->timeout_watchdog);
s_header->timeout_allocate = ntohl(s_header->timeout_allocate);
s_header->timeout_loop = ntohl(s_header->timeout_loop);
s_header->timeout_msgwait = ntohl(s_header->timeout_msgwait);
/* This sets the global defaults: */
timeout_watchdog = s_header->timeout_watchdog;
timeout_allocate = s_header->timeout_allocate;
timeout_loop = s_header->timeout_loop;
timeout_msgwait = s_header->timeout_msgwait;
return 0;
}
/*
 * Sanity-check a header that is already in host byte order.
 *
 * Checks, in this order, the magic bytes, the major version and the
 * recorded sector size against sbd_magic, sbd_version and the global
 * sector_size. The first mismatch is logged and ends the check.
 * Returns 0 when everything matches, -1 otherwise.
 */
static int
valid_header(const struct sector_header_s *s_header)
{
	int verdict = 0;

	if (memcmp(s_header->magic, sbd_magic, sizeof(s_header->magic)) != 0) {
		cl_log(LOG_ERR, "Header magic does not match.");
		verdict = -1;
	} else if (s_header->version != sbd_version) {
		cl_log(LOG_ERR, "Header version does not match.");
		verdict = -1;
	} else if (s_header->sector_size != sector_size) {
		cl_log(LOG_ERR, "Header sector size does not match.");
		verdict = -1;
	}
	return verdict;
}
/*
 * Read and validate the header of an opened device.
 *
 * Returns a freshly allocated header (host byte order) that the caller
 * must free(), or NULL when the header cannot be read or fails
 * valid_header(). The sector buffer is released on the failure paths.
 * Note that header_read() also updates the global timeout_* values.
 */
static struct sector_header_s *
header_get(struct sbd_context *st)
{
	struct sector_header_s *s_header;

	s_header = sector_alloc();
	if (header_read(st, s_header) < 0) {
		cl_log(LOG_ERR, "Unable to read header from device %d", st->devfd);
		free(s_header);
		return NULL;
	}
	if (valid_header(s_header) < 0) {
		cl_log(LOG_ERR, "header on device %d is not valid.", st->devfd);
		free(s_header);
		return NULL;
	}
	/* cl_log(LOG_INFO, "Found version %d header with %d slots",
	s_header->version, s_header->slots); */
	return s_header;
}
/*
 * Print the header of an opened device to stdout.
 *
 * The UUID line is only printed for headers whose minor_version is > 0.
 * Returns 0 on success, -1 when no valid header could be obtained.
 */
static int
header_dump(struct sbd_context *st)
{
	struct sector_header_s *s_header;
	char uuid[37];

	s_header = header_get(st);
	if (s_header == NULL)
		return -1;

	printf("Header version : %u.%u\n", s_header->version,
	       s_header->minor_version);
	if (s_header->minor_version > 0) {
		uuid_unparse_lower(s_header->uuid, uuid);
		printf("UUID : %s\n", uuid);
	}
	printf("Number of slots : %u\n", s_header->slots);
	printf("Sector size : %lu\n",
	       (unsigned long)s_header->sector_size);
	printf("Timeout (watchdog) : %lu\n",
	       (unsigned long)s_header->timeout_watchdog);
	printf("Timeout (allocate) : %lu\n",
	       (unsigned long)s_header->timeout_allocate);
	printf("Timeout (loop) : %lu\n",
	       (unsigned long)s_header->timeout_loop);
	printf("Timeout (msgwait) : %lu\n",
	       (unsigned long)s_header->timeout_msgwait);

	/* header_get() hands over ownership of the buffer. */
	free(s_header);
	return 0;
}
/*
 * Initialise an opened device: write a fresh header, then blank every
 * node slot and mailbox.
 *
 * The header carries sbd_magic, sbd_version, minor version 1, 255 slots,
 * the current global sector_size and timeout_* values and a newly
 * generated UUID.
 * Returns 0 on success, -1 as soon as any write fails.
 */
static int
init_device(struct sbd_context *st)
{
	struct sector_header_s *s_header;
	struct sector_node_s *s_node;
	struct sector_mbox_s *s_mbox;
	char uuid[37];
	int i;
	int rc = 0;

	s_header = sector_alloc();
	s_node = sector_alloc();
	s_mbox = sector_alloc();

	memcpy(s_header->magic, sbd_magic, sizeof(s_header->magic));
	s_header->version = sbd_version;
	s_header->slots = 255;
	s_header->sector_size = sector_size;
	s_header->timeout_watchdog = timeout_watchdog;
	s_header->timeout_allocate = timeout_allocate;
	s_header->timeout_loop = timeout_loop;
	s_header->timeout_msgwait = timeout_msgwait;
	s_header->minor_version = 1;
	uuid_generate(s_header->uuid);
	uuid_unparse_lower(s_header->uuid, uuid);

	cl_log(LOG_INFO, "Creating version %d.%d header on device %d (uuid: %s)",
	       s_header->version, s_header->minor_version,
	       st->devfd, uuid);
	fprintf(stdout, "Creating version %d.%d header on device %d (uuid: %s)\n",
		s_header->version, s_header->minor_version,
		st->devfd, uuid);
	/* header_write() converts the multi-byte fields in place; only the
	 * slot count is read from the header below. */
	if (header_write(st, s_header) < 0) {
		rc = -1; goto out;
	}

	cl_log(LOG_INFO, "Initializing %d slots on device %d",
	       s_header->slots,
	       st->devfd);
	fprintf(stdout, "Initializing %d slots on device %d\n",
		s_header->slots,
		st->devfd);
	/* s_node and s_mbox are still all-zero from sector_alloc(). */
	for (i=0;i < s_header->slots;i++) {
		if (slot_write(st, i, s_node) < 0) {
			rc = -1; goto out;
		}
		if (mbox_write(st, i, s_mbox) < 0) {
			rc = -1; goto out;
		}
	}

out:	free(s_node);
	free(s_header);
	free(s_mbox);
	return(rc);
}
/*
 * Find the slot already allocated to `name`.
 *
 * Every slot has to be inspected because allocated slots need not be
 * contiguous. Names are compared case-insensitively over at most
 * SECTOR_NAME_MAX characters, and only slots marked in use are considered.
 *
 * Returns the slot number, -1 when no slot carries the name (or name is
 * NULL), or -2 when a slot could not be read.
 */
static int
slot_lookup(struct sbd_context *st, const struct sector_header_s *s_header, const char *name)
{
	struct sector_node_s *entry;
	int found = -1;
	int idx;

	if (name == NULL) {
		cl_log(LOG_ERR, "slot_lookup(): No name specified.\n");
		return -1;
	}

	entry = sector_alloc();
	for (idx = 0; idx < s_header->slots; idx++) {
		if (slot_read(st, idx, entry) < 0) {
			found = -2;
			break;
		}
		if (entry->in_use == 0) {
			continue;
		}
		if (strncasecmp(entry->name, name, SECTOR_NAME_MAX) != 0) {
			continue;
		}
		DBGLOG(LOG_INFO, "%s owns slot %d", name, idx);
		found = idx;
		break;
	}
	free(entry);
	return found;
}
/*
 * Find the first slot that is not marked in use.
 *
 * Returns its slot number, or -1 when every slot is taken or a slot could
 * not be read (the two cases are not distinguished).
 */
static int
slot_unused(struct sbd_context *st, const struct sector_header_s *s_header)
{
	struct sector_node_s *entry = sector_alloc();
	int free_slot = -1;
	int idx;

	for (idx = 0; idx < s_header->slots; idx++) {
		if (slot_read(st, idx, entry) < 0) {
			break;
		}
		if (entry->in_use == 0) {
			free_slot = idx;
			break;
		}
	}
	free(entry);
	return free_slot;
}
/*
 * Make sure `name` owns a slot on the device and return its number.
 *
 * Loops: look the name up; if it is not found, claim the first unused slot
 * by writing the name into it, sleep timeout_allocate seconds and look the
 * name up again. The function therefore only returns a slot number once a
 * lookup actually finds the name on disk.
 *
 * Returns the slot number (>= 0), -2 when a read failed during lookup, or
 * -1 for a NULL name, an invalid header, a failed slot write or no free slot.
 */
static int
slot_allocate(struct sbd_context *st, const char *name)
{
struct sector_header_s *s_header = NULL;
struct sector_node_s *s_node = NULL;
struct sector_mbox_s *s_mbox = NULL;
int i;
int rc = 0;
if (!name) {
cl_log(LOG_ERR, "slot_allocate(): No name specified.\n");
fprintf(stderr, "slot_allocate(): No name specified.\n");
rc = -1; goto out;
}
s_header = header_get(st);
if (!s_header) {
rc = -1; goto out;
}
s_node = sector_alloc();
/* NOTE(review): s_mbox is allocated and freed but never otherwise used
 * in this function. */
s_mbox = sector_alloc();
while (1) {
i = slot_lookup(st, s_header, name);
if ((i >= 0) || (i == -2)) {
/* -1 is "no slot found", in which case we
* proceed to allocate a new one.
* -2 is "read error during lookup", in which
* case we error out too
* >= 0 is "slot already allocated" */
rc = i; goto out;
}
i = slot_unused(st, s_header);
if (i >= 0) {
cl_log(LOG_INFO, "slot %d is unused - trying to own", i);
fprintf(stdout, "slot %d is unused - trying to own\n", i);
memset(s_node, 0, sizeof(*s_node));
s_node->in_use = 1;
/* Copies at most SECTOR_NAME_MAX bytes into the zeroed entry.
 * NOTE(review): termination of a name of SECTOR_NAME_MAX or more
 * characters depends on the size of s_node->name - confirm. */
strncpy(s_node->name, name, SECTOR_NAME_MAX);
if (slot_write(st, i, s_node) < 0) {
rc = -1; goto out;
}
/* Wait before re-checking ownership in the next iteration. */
sleep(timeout_allocate);
} else {
cl_log(LOG_ERR, "No more free slots.");
fprintf(stderr, "No more free slots.\n");
rc = -1; goto out;
}
}
out: free(s_node);
free(s_header);
free(s_mbox);
return(rc);
}
/*
 * Print one line per allocated slot to stdout: slot number, owner name,
 * pending mailbox command (as text) and the sender of that command,
 * separated by tabs.
 *
 * Returns 0 on success, -1 when the header is invalid or a slot or
 * mailbox cannot be read (listing stops at the first failure).
 */
static int
slot_list(struct sbd_context *st)
{
	struct sector_header_s *hdr;
	struct sector_node_s *node = NULL;
	struct sector_mbox_s *inbox = NULL;
	int status = 0;
	int slot;

	hdr = header_get(st);
	if (hdr == NULL) {
		status = -1;
		goto done;
	}

	node = sector_alloc();
	inbox = sector_alloc();
	for (slot = 0; slot < hdr->slots; slot++) {
		if (slot_read(st, slot, node) < 0) {
			status = -1;
			break;
		}
		if (!(node->in_use > 0)) {
			continue;
		}
		if (mbox_read(st, slot, inbox) < 0) {
			status = -1;
			break;
		}
		printf("%d\t%s\t%s\t%s\n",
		       slot, node->name, char2cmd(inbox->cmd),
		       inbox->from);
	}

done:
	free(node);
	free(hdr);
	free(inbox);
	return status;
}
/*
 * Deliver command `cmd` to the mailbox of node `name` on one device.
 *
 * name: recipient; the literal "LOCAL" is replaced by local_uname.
 * cmd:  textual command understood by cmd2char().
 *
 * The mailbox is written and verified by read-back. For every command
 * except "exit" the function then sleeps timeout_msgwait seconds before
 * reporting success.
 * Returns 0 on success, -1 on missing arguments, invalid header, unknown
 * recipient, unknown command or a failed/unverified write.
 */
static int
slot_msg(struct sbd_context *st, const char *name, const char *cmd)
{
	struct sector_header_s *s_header = NULL;
	struct sector_mbox_s *s_mbox = NULL;
	int mbox;
	int rc = 0;
	char uuid[37];

	if (!name || !cmd) {
		cl_log(LOG_ERR, "slot_msg(): No recipient / cmd specified.\n");
		rc = -1; goto out;
	}

	s_header = header_get(st);
	if (!s_header) {
		rc = -1; goto out;
	}

	if (strcmp(name, "LOCAL") == 0) {
		name = local_uname;
	}

	if (s_header->minor_version > 0) {
		uuid_unparse_lower(s_header->uuid, uuid);
		cl_log(LOG_INFO, "Device UUID: %s", uuid);
	}

	mbox = slot_lookup(st, s_header, name);
	if (mbox < 0) {
		cl_log(LOG_ERR, "slot_msg(): No slot found for %s.", name);
		rc = -1; goto out;
	}

	s_mbox = sector_alloc();
	s_mbox->cmd = cmd2char(cmd);
	if (s_mbox->cmd < 0) {
		cl_log(LOG_ERR, "slot_msg(): Invalid command %s.", cmd);
		rc = -1; goto out;
	}
	strncpy(s_mbox->from, local_uname, SECTOR_NAME_MAX);

	cl_log(LOG_INFO, "Writing %s to node slot %s",
	       cmd, name);
	/* mbox_write_verify() returns 0 or -1, so any negative value is a
	 * failure; comparing against -1 would never trigger. */
	if (mbox_write_verify(st, mbox, s_mbox) < 0) {
		rc = -1; goto out;
	}

	if (strcasecmp(cmd, "exit") != 0) {
		cl_log(LOG_INFO, "Messaging delay: %d",
		       (int)timeout_msgwait);
		sleep(timeout_msgwait);
	}
	cl_log(LOG_INFO, "%s successfully delivered to %s",
	       cmd, name);

out:	free(s_mbox);
	free(s_header);
	return rc;
}
/*
 * Ping node `name` through its mailbox on one device.
 *
 * Writes SBD_MSG_TEST into the recipient's mailbox and then polls the
 * mailbox once per second, for up to timeout_msgwait seconds, until the
 * command has been replaced by something else.
 *
 * name: recipient; the literal "LOCAL" is replaced by local_uname.
 * Returns 0 when the test message was consumed, -1 on missing name,
 * invalid header, unknown recipient, I/O failure or timeout.
 */
static int
slot_ping(struct sbd_context *st, const char *name)
{
	struct sector_header_s *s_header = NULL;
	struct sector_mbox_s *s_mbox = NULL;
	int mbox;
	int waited = 0;
	int rc = 0;

	if (!name) {
		cl_log(LOG_ERR, "slot_ping(): No recipient specified.\n");
		rc = -1; goto out;
	}

	s_header = header_get(st);
	if (!s_header) {
		rc = -1; goto out;
	}

	if (strcmp(name, "LOCAL") == 0) {
		name = local_uname;
	}

	mbox = slot_lookup(st, s_header, name);
	if (mbox < 0) {
		cl_log(LOG_ERR, "slot_ping(): No slot found for %s.", name);
		rc = -1; goto out;
	}

	s_mbox = sector_alloc();
	s_mbox->cmd = SBD_MSG_TEST;
	strncpy(s_mbox->from, local_uname, SECTOR_NAME_MAX);

	DBGLOG(LOG_DEBUG, "Pinging node %s", name);
	/* mbox_write() returns 0 or -1, so any negative value is a failure;
	 * comparing against -1 would never trigger. */
	if (mbox_write(st, mbox, s_mbox) < 0) {
		rc = -1; goto out;
	}

	/* Assume failure until the recipient has cleared the test message. */
	rc = -1;
	while (waited <= timeout_msgwait) {
		if (mbox_read(st, mbox, s_mbox) < 0)
			break;
		if (s_mbox->cmd != SBD_MSG_TEST) {
			rc = 0;
			break;
		}
		sleep(1);
		waited++;
	}

	if (rc == 0) {
		cl_log(LOG_DEBUG, "%s successfully pinged.", name);
	} else {
		cl_log(LOG_ERR, "%s failed to ping.", name);
	}

out:	free(s_mbox);
	free(s_header);
	return rc;
}
/*
 * Initialise every device in the servants list, in list order.
 *
 * Progress is reported on stdout, failures on stderr. Processing stops at
 * the first device that cannot be opened or initialised.
 * Returns 0 when all devices were initialised, -1 otherwise.
 */
int init_devices(struct servants_list_item *servants)
{
	struct servants_list_item *item;

	for (item = servants; item != NULL; item = item->next) {
		struct sbd_context *st;
		int result;

		fprintf(stdout, "Initializing device %s\n",
			item->devname);
		st = open_device(item->devname, LOG_ERR);
		if (st == NULL) {
			return -1;
		}
		result = init_device(st);
		close_device(st);
		if (result == -1) {
			fprintf(stderr, "Failed to init device %s\n", item->devname);
			return result;
		}
		fprintf(stdout, "Device %s is initialized.\n", item->devname);
	}
	return 0;
}
/*
 * Servant entry point for message delivery on a single device.
 *
 * argp points to a struct slot_msg_arg_t carrying recipient and command;
 * `mode` is not used. Opens the device, runs slot_msg() and closes the
 * device again.
 * Returns -1 when the device cannot be opened, otherwise slot_msg()'s
 * result.
 */
static int slot_msg_wrapper(const char* devname, int mode, const void* argp)
{
	const struct slot_msg_arg_t *request = (const struct slot_msg_arg_t*)argp;
	struct sbd_context *st = open_device(devname, LOG_WARNING);
	int outcome;

	if (st == NULL)
		return -1;

	cl_log(LOG_INFO, "Delivery process handling %s",
	       devname);
	outcome = slot_msg(st, request->name, request->msg);
	close_device(st);
	return outcome;
}
/*
 * Servant entry point for pinging a node through a single device.
 *
 * argp is the recipient's name as a C string; `mode` is not used. Opens
 * the device, runs slot_ping() and closes the device again.
 * Returns -1 when the device cannot be opened, otherwise slot_ping()'s
 * result.
 */
static int slot_ping_wrapper(const char* devname, int mode, const void* argp)
{
	const char *recipient = (const char*)argp;
	struct sbd_context *st = open_device(devname, LOG_WARNING);
	int outcome;

	if (st == NULL)
		return -1;

	outcome = slot_ping(st, recipient);
	close_device(st);
	return outcome;
}
/*
 * Allocate a slot for `name` on every device in the servants list.
 *
 * Progress is reported on stdout. Processing stops at the first device
 * that cannot be opened or on which slot_allocate() fails.
 * Returns 0 when all devices were handled, -1 when a device could not be
 * opened, otherwise the negative slot_allocate() result.
 */
int allocate_slots(const char *name, struct servants_list_item *servants)
{
	struct servants_list_item *item;

	for (item = servants; item != NULL; item = item->next) {
		struct sbd_context *st;
		int slot;

		fprintf(stdout, "Trying to allocate slot for %s on device %s.\n",
			name,
			item->devname);
		st = open_device(item->devname, LOG_WARNING);
		if (st == NULL) {
			return -1;
		}
		slot = slot_allocate(st, name);
		close_device(st);
		if (slot < 0) {
			return slot;
		}
		fprintf(stdout, "Slot for %s has been allocated on %s.\n",
			name,
			item->devname);
	}
	return 0;
}
/*
 * Print the allocated slots of every device in the servants list.
 *
 * Devices that cannot be opened or listed are reported on stdout and
 * skipped; they do not affect the return value.
 * Always returns 0.
 */
int list_slots(struct servants_list_item *servants)
{
	struct servants_list_item *item;

	for (item = servants; item != NULL; item = item->next) {
		struct sbd_context *st = open_device(item->devname, LOG_WARNING);
		int listed;

		if (st == NULL) {
			fprintf(stdout, "== disk %s unreadable!\n", item->devname);
			continue;
		}
		listed = slot_list(st);
		close_device(st);
		if (listed == -1) {
			fprintf(stdout, "== Slots on disk %s NOT dumped\n", item->devname);
		}
	}
	return 0;
}
/*
 * Ping node `name` through every disk servant in parallel.
 *
 * One child per disk entry is started via assign_servant() running
 * slot_ping_wrapper(); SIGCHLD is blocked and collected synchronously with
 * sigwaitinfo(). The function returns once disk_count disk servants have
 * been reaped, or once no children are left to wait for.
 * Always returns 0; the individual ping results are not evaluated here.
 */
int ping_via_slots(const char *name, struct servants_list_item *servants)
{
	int sig = 0;
	pid_t pid = 0;
	int status = 0;
	int servants_finished = 0;
	int children_left = 1;
	sigset_t procmask;
	siginfo_t sinfo;
	struct servants_list_item *s;

	sigemptyset(&procmask);
	sigaddset(&procmask, SIGCHLD);
	sigprocmask(SIG_BLOCK, &procmask, NULL);

	for (s = servants; s; s = s->next) {
		if(sbd_is_disk(s)) {
			s->pid = assign_servant(s->devname, &slot_ping_wrapper, 0, (const void*)name);
		}
	}

	while (children_left && servants_finished < disk_count) {
		sig = sigwaitinfo(&procmask, &sinfo);
		if (sig == SIGCHLD) {
			while ((pid = wait(&status))) {
				if (pid == -1) {
					if (errno == EINTR) {
						/* Interrupted, not a reaped child: retry. */
						continue;
					}
					/* ECHILD: nothing left to reap, so no further
					 * SIGCHLD can arrive; stop waiting altogether. */
					children_left = 0;
					break;
				}
				s = lookup_servant_by_pid(pid);
				/* Only count children that map to a known disk servant. */
				if (s && sbd_is_disk(s)) {
					servants_finished++;
				}
			}
		}
	}
	return 0;
}
/* Returns non-zero when good_servants is a strict majority of disk_count
 * (integer division: more than disk_count/2), zero otherwise. */
int quorum_write(int good_servants)
{
return (good_servants > disk_count/2);
}
/*
 * Deliver command `msg` to node `name` through all devices in parallel.
 *
 * One child per list entry is started via assign_servant() running
 * slot_msg_wrapper(). SIGCHLD is blocked and collected with sigwaitinfo();
 * children are reaped without blocking. Waiting stops as soon as a
 * majority of disk_count children exited with status 0, or disk_count
 * children have finished.
 * Returns 0 when a majority delivered the message, -1 otherwise.
 */
int messenger(const char *name, const char *msg, struct servants_list_item *servants)
{
int sig = 0;
pid_t pid = 0;
int status = 0;
int servants_finished = 0;
int successful_delivery = 0;
sigset_t procmask;
siginfo_t sinfo;
struct servants_list_item *s;
struct slot_msg_arg_t slot_msg_arg = {name, msg};
sigemptyset(&procmask);
sigaddset(&procmask, SIGCHLD);
sigprocmask(SIG_BLOCK, &procmask, NULL);
/* NOTE(review): unlike ping_via_slots(), entries are not filtered with
 * sbd_is_disk() here, while the loop below counts against disk_count. */
for (s = servants; s; s = s->next) {
s->pid = assign_servant(s->devname, &slot_msg_wrapper, 0, &slot_msg_arg);
}
while (!(quorum_write(successful_delivery) ||
(servants_finished == disk_count))) {
sig = sigwaitinfo(&procmask, &sinfo);
if (sig == SIGCHLD) {
/* WNOHANG: waitpid() returns 0 when no further child has exited yet,
 * which ends this inner loop. */
while ((pid = waitpid(-1, &status, WNOHANG))) {
if (pid == -1 && errno == ECHILD) {
break;
} else {
/* NOTE(review): a -1 return with an errno other than ECHILD also
 * lands here and is counted as a finished servant. */
servants_finished++;
if (WIFEXITED(status)
&& WEXITSTATUS(status) == 0) {
DBGLOG(LOG_INFO, "Process %d succeeded.",
(int)pid);
successful_delivery++;
} else {
cl_log(LOG_WARNING, "Process %d failed to deliver!",
(int)pid);
}
}
}
}
}
if (quorum_write(successful_delivery)) {
cl_log(LOG_INFO, "Message successfully delivered.");
return 0;
} else {
cl_log(LOG_ERR, "Message is not delivered via more then a half of devices");
return -1;
}
}
/*
 * Return the msgwait timeout stored on the first device in the list that
 * can be opened and carries a valid header.
 *
 * Returns 0 when no device yields a valid header. Note that header_get()
 * also updates the global timeout_* values as a side effect.
 */
unsigned long
get_first_msgwait(struct servants_list_item *servants)
{
	unsigned long msgwait = 0;
	struct servants_list_item *s = servants;

	for (s = servants; s; s = s->next) {
		struct sbd_context *st;
		struct sector_header_s *s_header;

		st = open_device(s->devname, LOG_WARNING);
		if (!st) {
			continue;
		}

		s_header = header_get(st);
		close_device(st);
		if (s_header != NULL) {
			msgwait = (unsigned long)s_header->timeout_msgwait;
			/* header_get() hands over ownership of the buffer. */
			free(s_header);
			return msgwait;
		}
	}
	return msgwait;
}
/*
 * Print the header of every device in the servants list to stdout.
 *
 * All devices are attempted; each one gets a banner plus a line stating
 * whether its header was dumped.
 * Returns 0 when every header was dumped, -1 when at least one device was
 * unreadable or had no valid header.
 */
int dump_headers(struct servants_list_item *servants)
{
	struct servants_list_item *item;
	int overall = 0;

	for (item = servants; item != NULL; item = item->next) {
		struct sbd_context *st;
		int dumped = -1;

		fprintf(stdout, "==Dumping header on disk %s\n", item->devname);
		st = open_device(item->devname, LOG_WARNING);
		if (st == NULL) {
			fprintf(stdout, "== disk %s unreadable!\n", item->devname);
		} else {
			dumped = header_dump(st);
			close_device(st);
		}

		if (dumped == -1) {
			overall = -1;
			fprintf(stdout, "==Header on disk %s NOT dumped\n", item->devname);
		} else {
			fprintf(stdout, "==Header on disk %s is dumped\n", item->devname);
		}
	}
	return overall;
}
/*
 * Wait at start-up until any device in the list yields a valid header and
 * adopt its timeouts as the global timeout_* values.
 *
 * The list is scanned repeatedly, sleeping timeout_loop seconds between
 * scans, until a header is found or timeout_startup seconds (measured with
 * CLOCK_MONOTONIC) have passed. If no header was found the process logs an
 * error and exits with status 1.
 */
void open_any_device(struct servants_list_item *servants)
{
struct sector_header_s *hdr_cur = NULL;
struct timespec t_0;
int t_wait = 0;
clock_gettime(CLOCK_MONOTONIC, &t_0);
while (!hdr_cur && t_wait < timeout_startup) {
struct timespec t_now;
struct servants_list_item* s;
for (s = servants; s; s = s->next) {
/* LOG_DEBUG keeps repeated open failures out of the normal log. */
struct sbd_context *st = open_device(s->devname, LOG_DEBUG);
if (!st)
continue;
hdr_cur = header_get(st);
close_device(st);
if (hdr_cur)
break;
}
clock_gettime(CLOCK_MONOTONIC, &t_now);
/* Whole seconds only; the nanosecond part is ignored. */
t_wait = t_now.tv_sec - t_0.tv_sec;
if (!hdr_cur) {
sleep(timeout_loop);
}
}
if (hdr_cur) {
timeout_watchdog = hdr_cur->timeout_watchdog;
timeout_allocate = hdr_cur->timeout_allocate;
timeout_loop = hdr_cur->timeout_loop;
timeout_msgwait = hdr_cur->timeout_msgwait;
} else {
cl_log(LOG_ERR, "No devices were available at start-up within %i seconds.",
timeout_startup);
exit(1);
}
free(hdr_cur);
return;
}
/*
::-::-::-::-::-::-::-::-::-::-::-::-::
Begin disk based servant code
::-::-::-::-::-::-::-::-::-::-::-::-::
*/
/*
 * Compare the four timeouts stored in a device header with the current
 * global timeout_* values.
 *
 * The checks run in the order watchdog, allocate, loop, msgwait; the first
 * difference is logged as a warning and ends the comparison.
 * Returns 0 when all four agree, -1 on the first mismatch.
 */
static int servant_check_timeout_inconsistent(struct sector_header_s *hdr)
{
if (timeout_watchdog != hdr->timeout_watchdog) {
cl_log(LOG_WARNING, "watchdog timeout: %d versus %d on this device",
(int)timeout_watchdog, (int)hdr->timeout_watchdog);
return -1;
}
if (timeout_allocate != hdr->timeout_allocate) {
cl_log(LOG_WARNING, "allocate timeout: %d versus %d on this device",
(int)timeout_allocate, (int)hdr->timeout_allocate);
return -1;
}
if (timeout_loop != hdr->timeout_loop) {
cl_log(LOG_WARNING, "loop timeout: %d versus %d on this device",
(int)timeout_loop, (int)hdr->timeout_loop);
return -1;
}
if (timeout_msgwait != hdr->timeout_msgwait) {
cl_log(LOG_WARNING, "msgwait timeout: %d versus %d on this device",
(int)timeout_msgwait, (int)hdr->timeout_msgwait);
return -1;
}
return 0;
}
/*
 * Disk servant: watch this node's mailbox on one device and relay what it
 * finds to the parent process via signals.
 *
 * diskname: device to watch; NULL is rejected with -1.
 * mode:     when > 0 and this is the servant's first start, a pending
 *           command other than "exit"/"clear" aborts start-up by asking
 *           the parent to exit.
 * argp:     the servants_list_item for this device (first_start is read).
 *
 * After validating the header and making sure a slot is allocated, the
 * function loops forever: every timeout_loop seconds it re-checks header
 * and node entry against their start-up copies, reads the mailbox, acts on
 * any command and signals liveness to the parent. It does not return in
 * normal operation; it leaves through exit() with an EXIT_MD_* code (or 0
 * on a start-up abort).
 *
 * The lines prefixed with '-' / '+' below are the diff markers of the
 * change this listing shows and are kept as they appear.
 */
int servant(const char *diskname, int mode, const void* argp)
{
	struct sector_mbox_s *s_mbox = NULL;
	struct sector_node_s *s_node = NULL;
	struct sector_header_s *s_header = NULL;
	int mbox;
	int rc = 0;
	time_t t0, t1, latency;
	union sigval signal_value;
	sigset_t servant_masks;
	struct sbd_context *st;
	pid_t ppid;
	char uuid[37];
	const struct servants_list_item *s = argp;

	if (!diskname) {
		/* diskname is NULL here, so it must not be fed to %s. */
		cl_log(LOG_ERR, "Empty disk name.");
		return -1;
	}

	/* Zero the signal payload before ANY sigqueue() call; the start-up
	 * abort path below sends a signal before the main loop is reached. */
	memset(&signal_value, 0, sizeof(signal_value));

	cl_log(LOG_INFO, "Servant starting for device %s", diskname);

	/* Block most of the signals */
	sigfillset(&servant_masks);
	sigdelset(&servant_masks, SIGKILL);
	sigdelset(&servant_masks, SIGFPE);
	sigdelset(&servant_masks, SIGILL);
	sigdelset(&servant_masks, SIGSEGV);
	sigdelset(&servant_masks, SIGBUS);
	sigdelset(&servant_masks, SIGALRM);
	/* FIXME: check error */
	sigprocmask(SIG_SETMASK, &servant_masks, NULL);

	st = open_device(diskname, LOG_WARNING);
	if (!st) {
		exit(EXIT_MD_IO_FAIL);
	}

	s_header = header_get(st);
	if (!s_header) {
		cl_log(LOG_ERR, "Not a valid header on %s", diskname);
		exit(EXIT_MD_IO_FAIL);
	}

	if (servant_check_timeout_inconsistent(s_header) < 0) {
		cl_log(LOG_ERR, "Timeouts on %s do not match first device",
		       diskname);
		exit(EXIT_MD_IO_FAIL);
	}

	if (s_header->minor_version > 0) {
		uuid_unparse_lower(s_header->uuid, uuid);
		cl_log(LOG_INFO, "Device %s uuid: %s", diskname, uuid);
	}

	mbox = slot_allocate(st, local_uname);
	if (mbox < 0) {
		cl_log(LOG_ERR,
		       "No slot allocated, and automatic allocation failed for disk %s.",
		       diskname);
		rc = EXIT_MD_IO_FAIL;
		goto out;
	}

	/* Keep the node entry as seen at start-up for later comparison. */
	s_node = sector_alloc();
	if (slot_read(st, mbox, s_node) < 0) {
		cl_log(LOG_ERR, "Unable to read node entry on %s",
		       diskname);
		exit(EXIT_MD_IO_FAIL);
	}

-	DBGLOG(LOG_INFO, "Monitoring slot %d on disk %s", mbox, diskname);
+	cl_log(LOG_NOTICE, "Monitoring slot %d on disk %s", mbox, diskname);
	if (s_header->minor_version == 0) {
		set_proc_title("sbd: watcher: %s - slot: %d", diskname, mbox);
	} else {
		set_proc_title("sbd: watcher: %s - slot: %d - uuid: %s",
			       diskname, mbox, uuid);
	}

	s_mbox = sector_alloc();
	if (s->first_start) {
		if (mode > 0) {
			if (mbox_read(st, mbox, s_mbox) < 0) {
				cl_log(LOG_ERR, "mbox read failed during start-up in servant.");
				rc = EXIT_MD_IO_FAIL;
				goto out;
			}
			if (s_mbox->cmd != SBD_MSG_EXIT &&
			    s_mbox->cmd != SBD_MSG_EMPTY) {
				/* Not a clean stop. Abort start-up */
				cl_log(LOG_WARNING, "Found fencing message - aborting start-up. Manual intervention required!");
				ppid = getppid();
				sigqueue(ppid, SIG_EXITREQ, signal_value);
				rc = 0;
				goto out;
			}
		}
		DBGLOG(LOG_INFO, "First servant start - zeroing inbox");
		memset(s_mbox, 0, sizeof(*s_mbox));
		if (mbox_write(st, mbox, s_mbox) < 0) {
			rc = EXIT_MD_IO_FAIL;
			goto out;
		}
	}

	while (1) {
		struct sector_header_s *s_header_retry = NULL;
		struct sector_node_s *s_node_retry = NULL;

		t0 = time(NULL);
		sleep(timeout_loop);

		ppid = getppid();
		if (ppid == 1) {
			/* Our parent died unexpectedly. Triggering
			 * self-fence. */
			do_reset();
		}

		/* These attempts are, by definition, somewhat racy. If
		 * the device is wiped out or corrupted between here and
		 * us reading our mbox, there is nothing we can do about
		 * that. But at least we tried. */
		s_header_retry = header_get(st);
		if (!s_header_retry) {
			cl_log(LOG_ERR, "No longer found a valid header on %s", diskname);
			exit(EXIT_MD_IO_FAIL);
		}
		if (memcmp(s_header, s_header_retry, sizeof(*s_header)) != 0) {
			cl_log(LOG_ERR, "Header on %s changed since start-up!", diskname);
			exit(EXIT_MD_IO_FAIL);
		}
		free(s_header_retry);

		s_node_retry = sector_alloc();
		if (slot_read(st, mbox, s_node_retry) < 0) {
			cl_log(LOG_ERR, "slot read failed in servant.");
			exit(EXIT_MD_IO_FAIL);
		}
		if (memcmp(s_node, s_node_retry, sizeof(*s_node)) != 0) {
			cl_log(LOG_ERR, "Node entry on %s changed since start-up!", diskname);
			exit(EXIT_MD_IO_FAIL);
		}
		free(s_node_retry);

		if (mbox_read(st, mbox, s_mbox) < 0) {
			cl_log(LOG_ERR, "mbox read failed in servant.");
			exit(EXIT_MD_IO_FAIL);
		}

		if (s_mbox->cmd > 0) {
-			cl_log(LOG_INFO,
+			cl_log(LOG_NOTICE,
			       "Received command %s from %s on disk %s",
			       char2cmd(s_mbox->cmd), s_mbox->from, diskname);

			switch (s_mbox->cmd) {
			case SBD_MSG_TEST:
				/* Clearing the mailbox is what a pinging peer waits for. */
				memset(s_mbox, 0, sizeof(*s_mbox));
				mbox_write(st, mbox, s_mbox);
				sigqueue(ppid, SIG_TEST, signal_value);
				break;
			case SBD_MSG_RESET:
				exit(EXIT_MD_REQUEST_RESET);
			case SBD_MSG_OFF:
				exit(EXIT_MD_REQUEST_SHUTOFF);
			case SBD_MSG_EXIT:
				sigqueue(ppid, SIG_EXITREQ, signal_value);
				break;
			case SBD_MSG_CRASHDUMP:
				exit(EXIT_MD_REQUEST_CRASHDUMP);
			default:
				/* FIXME:
				   An "unknown" message might result
				   from a partial write.
				   log it and clear the slot.
				 */
				cl_log(LOG_ERR, "Unknown message on disk %s",
				       diskname);
				memset(s_mbox, 0, sizeof(*s_mbox));
				mbox_write(st, mbox, s_mbox);
				break;
			}
		}
		sigqueue(ppid, SIG_LIVENESS, signal_value);

		t1 = time(NULL);
		latency = t1 - t0;
		if (timeout_watchdog_warn && (latency > timeout_watchdog_warn)) {
			cl_log(LOG_WARNING,
			       "Latency: %d exceeded threshold %d on disk %s",
			       (int)latency, (int)timeout_watchdog_warn,
			       diskname);
		} else if (debug) {
-			DBGLOG(LOG_INFO, "Latency: %d on disk %s", (int)latency,
+			DBGLOG(LOG_DEBUG, "Latency: %d on disk %s", (int)latency,
			       diskname);
		}
	}
out:
	free(s_mbox);
	free(s_node);
	free(s_header);
	close_device(st);
	exit(rc);
}
diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
index 2f06109..a435d01 100644
--- a/src/sbd-pacemaker.c
+++ b/src/sbd-pacemaker.c
@@ -1,457 +1,457 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* Based on crm_mon.c, which was:
* Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
/* TODO list:
*
* - Trying to shutdown a node if no devices are up will fail, since SBD
* currently uses a message via the disk to achieve this.
*
* - Shutting down cluster nodes while the majority of devices is down
* will eventually take the cluster below the quorum threshold, at which
* time the remaining cluster nodes will all immediately suicide.
*
*/
#include <sys/param.h>
#include <crm/crm.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdlib.h>
#include <errno.h>
#include <fcntl.h>
#include <libgen.h>
#include <sys/utsname.h>
#include <config.h>
#include <crm_config.h>
#include <crm/msg_xml.h>
#include <crm/common/util.h>
#include <crm/common/xml.h>
#include <crm/common/ipc.h>
#include <crm/common/mainloop.h>
#include <crm/cib.h>
#include <crm/pengine/status.h>
#include "sbd.h"
extern int disk_count;
static void clean_up(int rc);
static void crm_diff_update(const char *event, xmlNode * msg);
static int cib_connect(gboolean full);
static void compute_status(pe_working_set_t * data_set);
static gboolean mon_refresh_state(gpointer user_data);
static GMainLoop *mainloop = NULL;
static guint timer_id_reconnect = 0;
static guint timer_id_notify = 0;
static int reconnect_msec = 1000;
static int cib_connected = 0;
static cib_t *cib = NULL;
static xmlNode *current_cib = NULL;
static long last_refresh = 0;
/*
 * GLib timer callback that tries to re-establish the CIB connection.
 *
 * Removes the currently registered reconnect timer source, calls
 * cib_connect(TRUE) and, if that fails, registers a new timer that fires
 * again after reconnect_msec milliseconds.
 * Always returns FALSE, so this invocation's source is not repeated.
 */
static gboolean
mon_timer_reconnect(gpointer data)
{
int rc = 0;
if (timer_id_reconnect > 0) {
g_source_remove(timer_id_reconnect);
}
rc = cib_connect(TRUE);
if (rc != 0) {
cl_log(LOG_WARNING, "CIB reconnect failed: %d", rc);
timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL);
} else {
cl_log(LOG_INFO, "CIB reconnect successful");
}
return FALSE;
}
/*
 * Callback registered in cib_connect() through set_connection_dnotify().
 *
 * If a CIB object exists it is signed off, the servant health is set to
 * pcmk_health_transient and a reconnect attempt is scheduled after
 * reconnect_msec milliseconds. cib_connected is cleared in every case.
 */
static void
mon_cib_connection_destroy(gpointer user_data)
{
if (cib) {
cib->cmds->signoff(cib);
set_servant_health(pcmk_health_transient, LOG_WARNING, "Disconnected from CIB");
timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL);
}
cib_connected = 0;
return;
}
/*
 * Replace the cached CIB (current_cib) with a fresh copy queried from the
 * CIB connection.
 *
 * The old cache is always dropped first. The query result is only adopted
 * when the call succeeds, returns something, and the root element is named
 * XML_TAG_CIB; otherwise current_cib stays NULL.
 */
static void
mon_retrieve_current_cib()
{
	xmlNode *fetched = NULL;
	int rc;

	free_xml(current_cib);
	current_cib = NULL;

	rc = cib->cmds->query(cib, NULL, &fetched, cib_scope_local | cib_sync_call);
	if (rc != pcmk_ok) {
		crm_err("Couldn't retrieve the CIB: %s (%d)", pcmk_strerror(rc), rc);
		free_xml(fetched);
		return;
	}
	if (fetched == NULL) {
		crm_err("Couldn't retrieve the CIB: empty result");
		return;
	}

	if (safe_str_eq(crm_element_name(fetched), XML_TAG_CIB)) {
		current_cib = fetched;
		return;
	}
	free_xml(fetched);
}
/*
 * Periodic GLib timer callback, re-armed every timeout_loop seconds.
 *
 * While connected to the CIB, most ticks issue a CIB noop and notify the
 * parent; once the tick counter equals timeout_watchdog / timeout_loop the
 * CIB is re-fetched and the state fully refreshed instead, and the counter
 * restarts. While disconnected nothing but re-arming happens.
 * Always returns FALSE; the timer is re-registered explicitly.
 */
static gboolean
mon_timer_notify(gpointer data)
{
static int counter = 0;
/* NOTE(review): integer division; a timeout_loop of 0 would divide by
 * zero here - confirm it is validated elsewhere. */
int counter_max = timeout_watchdog / timeout_loop;
if (timer_id_notify > 0) {
g_source_remove(timer_id_notify);
}
if (cib_connected) {
if (counter == counter_max) {
mon_retrieve_current_cib();
mon_refresh_state(NULL);
counter = 0;
} else {
cib->cmds->noop(cib, 0);
notify_parent();
counter++;
}
}
timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL);
return FALSE;
}
/*
* Mainloop signal handler.
*/
/* Calls clean_up(0), which releases the CIB connection and exits the
 * process with status 0. The signal number is not used. */
static void
mon_shutdown(int nsig)
{
clean_up(0);
}
/*
 * Sign on to the CIB (if not already connected) and load the initial state.
 *
 * full: when TRUE, also register the connection-destroy callback and the
 *       CIB diff notification callback.
 *
 * After a successful sign-on the CIB is fetched and the state refreshed
 * once. cib_connected is cleared on entry and set to 1 only when the
 * function ends with rc == 0.
 * Returns 0 on success, -EINVAL when no CIB object exists (via CRM_CHECK),
 * or the error from sign-on / callback registration. A failing callback
 * registration additionally calls clean_up(-rc).
 */
static int
cib_connect(gboolean full)
{
int rc = 0;
CRM_CHECK(cib != NULL, return -EINVAL);
cib_connected = 0;
crm_xml_init();
if (cib->state != cib_connected_query && cib->state != cib_connected_command) {
rc = cib->cmds->signon(cib, crm_system_name, cib_query);
if (rc != 0) {
return rc;
}
mon_retrieve_current_cib();
mon_refresh_state(NULL);
if (full) {
if (rc == 0) {
rc = cib->cmds->set_connection_dnotify(cib, mon_cib_connection_destroy);
if (rc == -EPROTONOSUPPORT) {
/* Notification setup failed, won't be able to reconnect after failure */
rc = 0;
}
}
if (rc == 0) {
/* Remove any previous registration first so the callback is
 * registered only once. */
cib->cmds->del_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update);
rc = cib->cmds->add_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update);
}
if (rc != 0) {
/* Notification setup failed, could not monitor CIB actions */
clean_up(-rc);
}
}
}
if (!rc) {
cib_connected = 1;
}
return rc;
}
/*
 * Derive this servant's health from a computed cluster working set and
 * report it to the parent.
 *
 * Looks up the local node (local_uname) in data_set and maps its state to
 * a pcmk_health_* value through set_servant_health(). Without a DC the
 * health is transient. Loss of quorum is reported as pcmk_health_noquorum
 * when disks are configured (disk_count > 0); without disks the result
 * depends on whether quorum was ever held and on the no-quorum policy.
 * notify_parent() is called on every path.
 */
static void
compute_status(pe_working_set_t * data_set)
{
/* NOTE(review): `updates` is incremented below but not read anywhere in
 * this function. */
static int updates = 0;
/* Remembers across calls whether quorum has been seen at least once. */
static int ever_had_quorum = FALSE;
node_t *node = pe_find_node(data_set->nodes, local_uname);
updates++;
if (data_set->dc_node == NULL) {
set_servant_health(pcmk_health_transient, LOG_INFO, "We don't have a DC right now.");
notify_parent();
return;
}
if (node == NULL) {
set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: %s is UNKNOWN", local_uname);
} else if (node->details->online == FALSE) {
set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: OFFLINE");
} else if (node->details->unclean) {
set_servant_health(pcmk_health_unclean, LOG_WARNING, "Node state: UNCLEAN");
} else if (node->details->pending) {
set_servant_health(pcmk_health_pending, LOG_WARNING, "Node state: pending");
#if 0
} else if (node->details->shutdown) {
set_servant_health(pcmk_health_shutdown, LOG_WARNING, "Node state: shutting down");
#endif
} else if (data_set->flags & pe_flag_have_quorum) {
set_servant_health(pcmk_health_online, LOG_INFO, "Node state: online");
ever_had_quorum = TRUE;
} else if(disk_count > 0) {
set_servant_health(pcmk_health_noquorum, LOG_WARNING, "Quorum lost");
} else if(ever_had_quorum == FALSE) {
set_servant_health(pcmk_health_online, LOG_INFO, "We do not have quorum yet");
} else {
/* We lost quorum, and there are no disks present
* Setting healthy > 2 here will result in us self-fencing
*/
/* NOTE(review): the switch has no default; a policy value not listed
 * here leaves the health unchanged. */
switch (data_set->no_quorum_policy) {
case no_quorum_freeze:
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Freeze resources");
break;
case no_quorum_stop:
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Stop ALL resources");
break;
case no_quorum_ignore:
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Ignore");
break;
case no_quorum_suicide:
set_servant_health(pcmk_health_unclean, LOG_INFO, "Quorum lost: Self-fence");
break;
}
}
notify_parent();
return;
}
static crm_trigger_t *refresh_trigger = NULL;
/* Callback of the "refresh" mainloop timer created in crm_diff_update():
 * sets refresh_trigger and additionally refreshes the state right away.
 * Always returns FALSE. */
static gboolean
mon_trigger_refresh(gpointer user_data)
{
mainloop_set_trigger(refresh_trigger);
mon_refresh_state(NULL);
return FALSE;
}
/*
 * CIB diff notification callback (registered in cib_connect()).
 *
 * Applies the received patch to the cached CIB; if no cached CIB exists
 * afterwards, a full copy is fetched instead. A state refresh is then run
 * immediately or deferred through the refresh timer/trigger, which are
 * created lazily on the first call.
 */
static void
crm_diff_update(const char *event, xmlNode * msg)
{
int rc = -1;
/* NOTE(review): op is never assigned in this function, yet it is passed
 * to %s in the two log calls below. */
const char *op = NULL;
long now = time(NULL);
static int updates = 0;
static mainloop_timer_t *refresh_timer = NULL;
if(refresh_timer == NULL) {
refresh_timer = mainloop_timer_add("refresh", 2000, FALSE, mon_trigger_refresh, NULL);
refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_state, refresh_timer);
}
if (current_cib != NULL) {
/* Patch from the old copy into a new one; the old copy is freed
 * whether or not the patch applies. */
xmlNode *cib_last = current_cib;
current_cib = NULL;
rc = cib_apply_patch_event(msg, cib_last, &current_cib, LOG_DEBUG);
free_xml(cib_last);
switch(rc) {
case -pcmk_err_diff_resync:
case -pcmk_err_diff_failed:
crm_warn("[%s] %s Patch aborted: %s (%d)", event, op, pcmk_strerror(rc), rc);
break;
case pcmk_ok:
updates++;
break;
default:
crm_notice("[%s] %s ABORTED: %s (%d)", event, op, pcmk_strerror(rc), rc);
break;
}
}
if (current_cib == NULL) {
mon_retrieve_current_cib();
}
/* Refresh
* - immediately if the last refresh was more than reconnect_msec / 1000
*   seconds ago (1s with the initial value of reconnect_msec in this file)
* - immediately once more than 10 updates have accumulated
* - otherwise deferred: at most 2s after the last update
*/
if (updates > 10 || (now - last_refresh) > (reconnect_msec / 1000)) {
mon_refresh_state(refresh_timer);
updates = 0;
} else {
mainloop_set_trigger(refresh_trigger);
mainloop_timer_start(refresh_timer);
}
}
/*
 * Recompute the servant health from the cached CIB.
 *
 * user_data: optional mainloop_timer_t that is stopped before the refresh;
 *            may be NULL.
 *
 * Does nothing when no CIB is cached. Returns FALSE in every case.
 */
static gboolean
mon_refresh_state(gpointer user_data)
{
    pe_working_set_t working_set;
    xmlNode *snapshot = NULL;
    mainloop_timer_t *pending_timer = user_data;

    if (current_cib == NULL) {
        return FALSE;
    }

    if (pending_timer) {
        mainloop_timer_stop(pending_timer);
    }

    /* All further processing operates on a copy of the cached CIB */
    snapshot = copy_xml(current_cib);

    if (cli_config_update(&snapshot, NULL, FALSE) == FALSE) {
        /* NOTE(review): the copy is not released on this path - confirm
         * whether cli_config_update() frees it on failure.
         */
        cl_log(LOG_WARNING, "cli_config_update() failed - forcing reconnect to CIB");
        if (cib) {
            cib->cmds->signoff(cib);
        }
        return FALSE;
    }

    last_refresh = time(NULL);

    set_working_set_defaults(&working_set);
    working_set.input = snapshot;
    working_set.flags |= pe_flag_have_stonith_resource;

    cluster_status(&working_set);
    compute_status(&working_set);
    cleanup_calculations(&working_set);

    return FALSE;
}
/*
 * Sign off from the CIB and delete the connection object, if one exists.
 *
 * rc: when >= 0 the process exits with this status after the cleanup;
 *     when negative the function returns to its caller instead.
 */
static void
clean_up(int rc)
{
    if (cib) {
        cib->cmds->signoff(cib);
        cib_delete(cib);
        cib = NULL;
    }

    if (rc < 0) {
        return;
    }

    exit(rc);
}
/*
 * Entry point of the Pacemaker watcher servant.
 *
 * Sets the process identity, connects to the CIB when no CIB is cached yet
 * (retrying for as long as the attempt yields -ENOTCONN), installs
 * SIGTERM/SIGINT handlers and a periodic notify timer, and then runs the
 * GLib main loop.
 *
 * diskname, mode, argp: not referenced in this function.
 *
 * Returns 0 nominally; clean_up(0) calls exit() before the return statement.
 */
int
servant_pcmk(const char *diskname, int mode, const void* argp)
{
int exit_code = 0;
crm_system_name = strdup("sbd:pcmk");
- cl_log(LOG_INFO, "Monitoring Pacemaker health");
+ cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
set_proc_title("sbd: watcher: Pacemaker");
setenv("PCMK_watchdog", "true", 1);
if(debug == 0) {
/* We don't want any noisy crm messages */
set_crm_log_level(LOG_CRIT);
}
if (current_cib == NULL) {
cib = cib_new();
/* Keep retrying while the connection attempt reports -ENOTCONN,
 * pausing reconnect_msec (in whole seconds) after each failed attempt.
 */
do {
exit_code = cib_connect(TRUE);
if (exit_code != 0) {
sleep(reconnect_msec / 1000);
}
} while (exit_code == -ENOTCONN);
if (exit_code != 0) {
/* NOTE(review): clean_up() only exits for arguments >= 0, so this
 * relies on cib_connect() reporting failures as negative values -
 * confirm; otherwise execution continues below with cib == NULL.
 */
clean_up(-exit_code);
}
}
mainloop = g_main_new(FALSE);
mainloop_add_signal(SIGTERM, mon_shutdown);
mainloop_add_signal(SIGINT, mon_shutdown);
/* timeout_loop is multiplied by 1000 to form the g_timeout_add() interval */
timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL);
g_main_run(mainloop);
g_main_destroy(mainloop);
clean_up(0);
return 0; /* never reached */
}

File Metadata

Mime Type
text/x-diff
Expires
Wed, Feb 26, 12:13 PM (19 h, 49 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1465336
Default Alt Text
(115 KB)

Event Timeline