Page Menu
Home
ClusterLabs Projects
Search
Configure Global Search
Log In
Files
F3154347
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
115 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/src/sbd-cluster.c b/src/sbd-cluster.c
index ae4750e..c7328af 100644
--- a/src/sbd-cluster.c
+++ b/src/sbd-cluster.c
@@ -1,552 +1,553 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* Based on crm_mon.c, which was:
* Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <config.h>
#include <crm_config.h>
#include <crm/cluster.h>
#include <crm/common/mainloop.h>
#if CHECK_TWO_NODE
#include <glib-unix.h>
#endif
#include "sbd.h"
//undef SUPPORT_PLUGIN
//define SUPPORT_PLUGIN 1
/* Set to true by find_pacemaker_remote() once a Pacemaker Remote process is found. */
static bool remote_node = false;
/* PID recorded by find_pacemaker_remote(); values <= 0 are treated as "none" by sbd_remote_check(). */
static pid_t remoted_pid = 0;
/* Delay between connection attempts in sbd_membership_connect(), in milliseconds. */
static int reconnect_msec = 1000;
/* Main loop created and run by servant_cluster(). */
static GMainLoop *mainloop = NULL;
/* Source id returned by g_timeout_add() for the periodic notify_timer_cb(). */
static guint notify_timer = 0;
/* Cluster connection state handed to crm_cluster_connect(). */
static crm_cluster_t cluster;
static gboolean sbd_remote_check(gpointer user_data);
static long unsigned int find_pacemaker_remote(void);
static void sbd_membership_destroy(gpointer user_data);
#if SUPPORT_PLUGIN
/*
 * CPG deliver callback used when plugin support is compiled in.
 *
 * Any non-empty message marks the servant as online; an empty message is
 * reported as broken and marks the servant unclean. The parent is notified
 * in both cases.
 */
static void
sbd_plugin_membership_dispatch(cpg_handle_t handle,
                               const struct cpg_name *groupName,
                               uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
{
    if (msg_len == 0) {
        set_servant_health(pcmk_health_unclean, LOG_WARNING,
                           "Broken %s message", name_for_cluster_type(get_cluster_type()));
    } else {
        set_servant_health(pcmk_health_online, LOG_INFO,
                           "Connected to %s", name_for_cluster_type(get_cluster_type()));
    }

    notify_parent();
}
#endif
#if SUPPORT_COROSYNC
/* Last known value of corosync's quorum.two_node setting. */
static bool two_node = false;
/* Becomes true once a CPG membership with more than one entry has been observed. */
static bool ever_seen_both = false;
/* Member count from the most recent CPG configuration change; -1 until the first one arrives. */
static int cpg_membership_entries = -1;
#if CHECK_TWO_NODE
#include <corosync/cmap.h>
/* CMAP connection, tracker and GLib source managed by sbd_get_two_node(). */
static cmap_handle_t cmap_handle = 0;
static cmap_track_handle_t track_handle = 0;
static GSource *cmap_source = NULL;
#endif
/*
 * Translate the cached CPG membership count (and two-node state) into a
 * servant health report.
 *
 * - no members:                              unclean
 * - two_node, both nodes seen earlier, and
 *   only one member left:                    noquorum
 * - anything else:                           online
 */
void
sbd_cpg_membership_health_update()
{
    if (cpg_membership_entries <= 0) {
        set_servant_health(pcmk_health_unclean, LOG_WARNING,
                           "Empty %s membership", name_for_cluster_type(get_cluster_type()));
        return;
    }

    if (two_node && ever_seen_both && (cpg_membership_entries == 1)) {
        /* Asking votequorum for the number of votes would be the
         * alternative. Taking the number of active nodes from pacemaker's
         * cpg avoids linking another library, needs less code, and spares
         * us from merging data from three sources (cmap, cpg & votequorum)
         * in a potentially racy environment.
         */
        set_servant_health(pcmk_health_noquorum, LOG_WARNING,
                           "Connected to %s but requires both nodes present",
                           name_for_cluster_type(get_cluster_type())
                          );
    } else {
        set_servant_health(pcmk_health_online, LOG_INFO,
                           "Connected to %s (%u members)",
                           name_for_cluster_type(get_cluster_type()),
                           cpg_membership_entries
                          );
    }

    /* Remember the peer only after reporting, so the first two-member
     * view itself is never judged against it. */
    if (cpg_membership_entries > 1) {
        ever_seen_both = true;
    }
}
/*
 * CPG configuration-change callback: cache the new member count,
 * re-evaluate servant health and notify the parent.
 */
void
sbd_cpg_membership_dispatch(cpg_handle_t handle,
                            const struct cpg_name *groupName,
                            const struct cpg_address *member_list, size_t member_list_entries,
                            const struct cpg_address *left_list, size_t left_list_entries,
                            const struct cpg_address *joined_list, size_t joined_list_entries)
{
    /* Only the size of the current membership matters here. */
    cpg_membership_entries = (int) member_list_entries;

    sbd_cpg_membership_health_update();
    notify_parent();
}
#if CHECK_TWO_NODE
/*
 * CMAP tracker callback for "quorum.two_node".
 *
 * Events whose new value is not a uint8 are ignored. Add/modify events
 * copy the new value into two_node, delete events reset it to false.
 * After a handled event the servant health is re-evaluated and the parent
 * notified.
 */
static void sbd_cmap_notify_fn(
    cmap_handle_t cmap_handle,
    cmap_track_handle_t cmap_track_handle,
    int32_t event,
    const char *key_name,
    struct cmap_notify_value new_val,
    struct cmap_notify_value old_val,
    void *user_data)
{
    if (new_val.type != CMAP_VALUETYPE_UINT8) {
        return;
    }

    if ((event == CMAP_TRACK_ADD) || (event == CMAP_TRACK_MODIFY)) {
        two_node = *((uint8_t *) new_val.data);
    } else if (event == CMAP_TRACK_DELETE) {
        two_node = false;
    } else {
        /* Unknown event type: nothing changed, nothing to report. */
        return;
    }

    sbd_cpg_membership_health_update();
    notify_parent();
}
/*
 * GLib source callback attached to the cmap file descriptor: dispatch all
 * pending CMAP events. Always returns TRUE.
 */
static gboolean
cmap_dispatch_callback (gpointer user_data)
{
    (void) user_data;

    cmap_dispatch(cmap_handle, CS_DISPATCH_ALL);

    return TRUE;
}
/*
 * Set up tracking of corosync's "quorum.two_node" key and read its value.
 *
 * While track_handle is still 0 this initializes the cmap handle, registers
 * sbd_cmap_notify_fn() for add/modify/delete events on the key and attaches
 * a GLib source for the cmap file descriptor. Afterwards the current value
 * is read; if the key exists it is stored in two_node.
 *
 * Returns TRUE when tracking is in place (whether or not the key exists),
 * FALSE after releasing whatever had been set up before a failure.
 */
static gboolean
sbd_get_two_node(void)
{
uint8_t two_node_u8 = 0;
int cmap_fd;
/* One-time setup; skipped on later calls once the tracker exists */
if (!track_handle) {
if (cmap_initialize(&cmap_handle) != CS_OK) {
cl_log(LOG_WARNING, "Cannot initialize CMAP service\n");
goto out;
}
if (cmap_track_add(cmap_handle, "quorum.two_node",
CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
cl_log(LOG_WARNING, "Failed adding CMAP tracker for 2Node-mode\n");
goto out;
}
/* add the tracker to mainloop */
if (cmap_fd_get(cmap_handle, &cmap_fd) != CS_OK) {
cl_log(LOG_WARNING, "Failed to get a file handle for cmap\n");
goto out;
}
if (!(cmap_source = g_unix_fd_source_new (cmap_fd, G_IO_IN))) {
cl_log(LOG_WARNING, "Couldn't create source for cmap\n");
goto out;
}
g_source_set_callback(cmap_source, cmap_dispatch_callback, NULL, NULL);
g_source_attach(cmap_source, NULL);
}
/* Read the current value; a missing key leaves two_node untouched */
if (cmap_get_uint8(cmap_handle, "quorum.two_node", &two_node_u8) == CS_OK) {
- cl_log(LOG_NOTICE, "Corosync is%s in 2Node-mode", two_node_u8?"":" not");
+ cl_log(two_node_u8? LOG_NOTICE : LOG_INFO,
+ "Corosync is%s in 2Node-mode", two_node_u8?"":" not");
two_node = two_node_u8;
} else {
- cl_log(LOG_NOTICE, "quorum.two_node present in cmap\n");
+ cl_log(LOG_INFO, "quorum.two_node not present in cmap\n");
}
return TRUE;
out:
/* Failure path: undo the partial setup in reverse order and reset the handles */
if (cmap_source) {
g_source_destroy(cmap_source);
cmap_source = NULL;
}
if (track_handle) {
cmap_track_delete(cmap_handle, track_handle);
track_handle = 0;
}
if (cmap_handle) {
cmap_finalize(cmap_handle);
cmap_handle = 0;
}
return FALSE;
}
#endif
#endif
/*
 * Periodic timer callback that refreshes the servant state.
 *
 * On a remote node the Pacemaker Remote process is re-checked. Otherwise
 * the action depends on the cluster stack: classic AIS (if supported)
 * gets a quorum request, corosync/cman notify the parent directly.
 * Always returns TRUE.
 */
static gboolean
notify_timer_cb(gpointer data)
{
    cl_log(LOG_DEBUG, "Refreshing %sstate", remote_node?"remote ":"");

    if (!remote_node) {
        switch (get_cluster_type()) {
#if HAVE_DECL_PCMK_CLUSTER_CLASSIC_AIS
            case pcmk_cluster_classic_ais:
                send_cluster_text(crm_class_quorum, NULL, TRUE, NULL, crm_msg_ais);
                break;
#endif

            case pcmk_cluster_corosync:
#if HAVE_DECL_PCMK_CLUSTER_CMAN
            case pcmk_cluster_cman:
#endif
                /* TODO - Make a CPG call and only call notify_parent() when we get a reply */
                notify_parent();
                break;

            default:
                break;
        }
    } else {
        sbd_remote_check(NULL);
    }

    return TRUE;
}
/*
 * Block until a connection to the cluster layer (or a local Pacemaker
 * Remote process) is available.
 *
 * While the cluster type is unknown, /proc is searched for Pacemaker
 * Remote; otherwise crm_cluster_connect() is attempted (preceded by
 * sbd_get_two_node() when two-node checking is compiled in). Failed
 * attempts are retried after reconnect_msec. Once connected, the servant
 * is reported as transient, the parent is notified and one state refresh
 * is triggered immediately.
 */
static void
sbd_membership_connect(void)
{
bool connected = false;
- cl_log(LOG_NOTICE, "Attempting cluster connection");
+ cl_log(LOG_INFO, "Attempting cluster connection");
/* Reconnect automatically when the cluster connection goes away */
cluster.destroy = sbd_membership_destroy;
#if SUPPORT_PLUGIN
cluster.cpg.cpg_deliver_fn = sbd_plugin_membership_dispatch;
#endif
#if SUPPORT_COROSYNC
cluster.cpg.cpg_confchg_fn = sbd_cpg_membership_dispatch;
#endif
while(connected == false) {
enum cluster_type_e stack = get_cluster_type();
if(get_cluster_type() == pcmk_cluster_unknown) {
crm_debug("Attempting pacemaker remote connection");
/* Nothing is up, go looking for the pacemaker remote process */
if(find_pacemaker_remote() > 0) {
connected = true;
}
} else {
cl_log(LOG_INFO, "Attempting connection to %s", name_for_cluster_type(stack));
#if SUPPORT_COROSYNC && CHECK_TWO_NODE
/* Only connect once two-node tracking has been set up successfully */
if (sbd_get_two_node()) {
#endif
if(crm_cluster_connect(&cluster)) {
connected = true;
}
#if SUPPORT_COROSYNC && CHECK_TWO_NODE
}
#endif
}
if(connected == false) {
cl_log(LOG_INFO, "Failed, retrying in %ds", reconnect_msec / 1000);
sleep(reconnect_msec / 1000);
}
}
- set_servant_health(pcmk_health_transient, LOG_NOTICE, "Connected, waiting for initial membership");
+ set_servant_health(pcmk_health_transient, LOG_INFO, "Connected, waiting for initial membership");
notify_parent();
/* Refresh the state right away rather than waiting for the first timer tick */
notify_timer_cb(NULL);
}
/*
 * Called when the cluster connection is torn down (installed as
 * cluster.destroy): report the loss, mark the servant unclean, notify the
 * parent, then try to connect again.
 */
static void
sbd_membership_destroy(gpointer user_data)
{
    cl_log(LOG_WARNING, "Lost connection to %s",
           name_for_cluster_type(get_cluster_type()));

    set_servant_health(pcmk_health_unclean, LOG_ERR,
                       "Cluster connection terminated");
    notify_parent();

    /* Reconnect; if the outage is not transient the watchdog takes the
     * node down. */
    sbd_membership_connect();
}
/*
* \internal
* \brief Get process ID and name associated with a /proc directory entry
*
* \param[in] entry Directory entry (must be result of readdir() on /proc)
* \param[out] name If not NULL, a char[64] to hold the process name
* \param[out] pid If not NULL, will be set to process ID of entry
*
* \return 0 on success, -1 if entry is not for a process or info not found
*
* \note This should be called only on Linux systems, as not all systems that
* support /proc store process names and IDs in the same way.
* Copied from the Pacemaker implementation.
*/
int
sbd_procfs_process_info(struct dirent *entry, char *name, int *pid)
{
    char procpath[128] = { 0 };
    char key[16] = { 0 };
    struct stat statbuf;
    FILE *status_file;
    int parsed_pid;
    int matched;
    int fd;

    /* Only entries named after a PID are of interest: reject anything that
     * does not parse as a positive number or whose name would not fit.
     *
     * 114 = 128 - strlen("/proc/") - strlen("/status") - 1
     */
    parsed_pid = atoi(entry->d_name);
    if ((parsed_pid <= 0) || (strlen(entry->d_name) > 114)) {
        return -1;
    }

    if (pid != NULL) {
        *pid = parsed_pid;
    }

    /* The entry must be a directory; the length was validated above, so
     * the path always fits. */
    snprintf(procpath, sizeof(procpath), "/proc/%s", entry->d_name);

    fd = open(procpath, O_RDONLY);
    if (fd < 0) {
        return -1;
    }
    if (fstat(fd, &statbuf) < 0) {
        close(fd);
        return -1;
    }
    close(fd);

    if (!S_ISDIR(statbuf.st_mode)) {
        return -1;
    }

    /* Caller does not want the name: nothing more to look up. */
    if (name == NULL) {
        return 0;
    }

    /* The process name is the first entry ("Name:") of the status file.
     * Parsing cmdline instead would cover the valgrind case, but is more
     * trouble than it is worth.
     */
    strcat(procpath, "/status");
    status_file = fopen(procpath, "r");
    if (status_file == NULL) {
        return -1;
    }

    matched = fscanf(status_file, "%15s%63s", key, name);
    fclose(status_file);

    if ((matched != 2) || safe_str_neq(key, "Name:")) {
        return -1;
    }
    return 0;
}
/*
 * Check whether the Pacemaker Remote process recorded in remoted_pid is
 * still alive and report the result as servant health.
 *
 * - No PID known: health is set to transient.
 * - kill(pid, 0) reports ESRCH: the process is gone.
 * - /proc/<pid>/exe is not readable on this system: a live PID is taken
 *   as sufficient evidence.
 * - Otherwise the executable behind the PID must be
 *   SBINDIR/pacemaker_remoted, which guards against PID reuse.
 *
 * The parent is always notified; when the process is not running a
 * reconnect is started. Always returns TRUE.
 */
static gboolean
sbd_remote_check(gpointer user_data)
{
    static int have_proc_pid = 0; /* 0 = not probed yet, 1 = usable, -1 = unusable */
    int running = 0;

    cl_log(LOG_DEBUG, "Checking pacemaker remote connection: %d/%d",
           have_proc_pid, (int) remoted_pid);

    if(have_proc_pid == 0) {
        char proc_path[PATH_MAX], exe_path[PATH_MAX];

        /* Probe once, using our own PID, whether /proc/<pid>/exe links
         * can be read at all on this system */
        snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", (long unsigned int)getpid());

        have_proc_pid = 1;
        if(readlink(proc_path, exe_path, sizeof(exe_path) - 1) < 0) {
            have_proc_pid = -1;
        }
    }

    if (remoted_pid <= 0) {
        set_servant_health(pcmk_health_transient, LOG_WARNING, "No Pacemaker Remote connection");
        goto notify;

    } else if (kill(remoted_pid, 0) < 0 && errno == ESRCH) {
        /* Not running */

    } else if(have_proc_pid == -1) {
        running = 1;
        cl_log(LOG_DEBUG, "Process %ld is active", (long)remoted_pid);

    } else {
        ssize_t len;
        int rc;
        char proc_path[PATH_MAX], exe_path[PATH_MAX], expected_path[PATH_MAX];

        /* check to make sure pid hasn't been reused by another process */
        snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", (long unsigned int)remoted_pid);

        len = readlink(proc_path, exe_path, sizeof(exe_path) - 1);
        if (len < 0) {
            crm_perror(LOG_ERR, "Could not read from %s", proc_path);
            goto done;
        }
        /* readlink() does not NUL-terminate */
        exe_path[len] = 0;

        /* snprintf() terminates the buffer itself; a truncated path can
         * never be the right one, so treat it as a mismatch instead of
         * indexing with the (too large) return value */
        rc = snprintf(expected_path, sizeof(expected_path), "%s/pacemaker_remoted", SBINDIR);
        if ((rc >= 0) && ((size_t) rc < sizeof(expected_path))
            && (strcmp(exe_path, expected_path) == 0)) {
            cl_log(LOG_DEBUG, "Process %s (%ld) is active",
                   exe_path, (long)remoted_pid);
            running = 1;
        }
    }

  done:
    if(running) {
        set_servant_health(pcmk_health_online, LOG_INFO,
                           "Connected to Pacemaker Remote %lu", (long unsigned int)remoted_pid);
    } else {
        set_servant_health(pcmk_health_unclean, LOG_WARNING,
                           "Connection to Pacemaker Remote %lu lost", (long unsigned int)remoted_pid);
    }

  notify:
    notify_parent();
    if(running == 0) {
        sbd_membership_connect();
    }

    return TRUE;
}
/*
 * Scan /proc for a running Pacemaker Remote process.
 *
 * On a match, remoted_pid is updated and remote_node is set.
 *
 * Returns the PID found by THIS scan, or 0 when /proc cannot be read or
 * no matching process exists. The previously recorded remoted_pid is
 * deliberately not returned: after the daemon has died it is stale, and
 * reporting it as a hit would make sbd_membership_connect() believe it is
 * connected, only for sbd_remote_check() to fail and reconnect again
 * immediately, recursing without ever sleeping.
 */
static long unsigned int
find_pacemaker_remote(void)
{
    DIR *dp;
    char entry_name[64];
    struct dirent *entry;
    long unsigned int found_pid = 0;

    dp = opendir("/proc");
    if (!dp) {
        /* no proc directory to search through */
        cl_log(LOG_NOTICE, "Can not read /proc directory to track existing components");
        return 0;
    }

    while ((entry = readdir(dp)) != NULL) {
        int pid;

        if (sbd_procfs_process_info(entry, entry_name, &pid) < 0) {
            continue;
        }

        /* entry_name is truncated to 16 characters including the nul terminator */
        cl_log(LOG_DEBUG, "Found %s at %d", entry_name, pid);
        if (strcmp(entry_name, "pacemaker_remot") == 0) {
            cl_log(LOG_NOTICE, "Found Pacemaker Remote at PID %d", pid);
            remoted_pid = pid;
            remote_node = true;
            found_pid = (long unsigned int) pid;
            break;
        }
    }

    closedir(dp);

    return found_pid;
}
/* Shutdown hook of the cluster servant; currently there is nothing to release. */
static void
clean_up(int rc)
{
    (void) rc;
}
/* Signal handler registered for SIGTERM and SIGINT: run the cleanup hook with status 0. */
static void
cluster_shutdown(int nsig)
{
    (void) nsig;

    clean_up(0);
}
/*
 * Entry point of the cluster-monitoring servant.
 *
 * Connects to the cluster (blocking until that succeeds), then runs a GLib
 * main loop in which notify_timer_cb() fires every timeout_loop seconds.
 * SIGTERM and SIGINT are routed to cluster_shutdown().
 *
 * The diskname, mode and argp parameters are not used in this function.
 */
int
servant_cluster(const char *diskname, int mode, const void* argp)
{
enum cluster_type_e cluster_stack = get_cluster_type();
crm_system_name = strdup("sbd:cluster");
- cl_log(LOG_INFO, "Monitoring %s cluster health", name_for_cluster_type(cluster_stack));
+ cl_log(LOG_NOTICE, "Monitoring %s cluster health", name_for_cluster_type(cluster_stack));
set_proc_title("sbd: watcher: Cluster");
sbd_membership_connect();
/* stonith_our_uname = cluster.uname; */
/* stonith_our_uuid = cluster.uuid; */
mainloop = g_main_new(FALSE);
/* timeout_loop is in seconds, g_timeout_add() takes milliseconds */
notify_timer = g_timeout_add(timeout_loop * 1000, notify_timer_cb, NULL);
mainloop_add_signal(SIGTERM, cluster_shutdown);
mainloop_add_signal(SIGINT, cluster_shutdown);
g_main_run(mainloop);
g_main_destroy(mainloop);
clean_up(0);
return 0; /* never reached */
}
diff --git a/src/sbd-common.c b/src/sbd-common.c
index f22c4f2..0ce6478 100644
--- a/src/sbd-common.c
+++ b/src/sbd-common.c
@@ -1,971 +1,971 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "sbd.h"
#include <sys/reboot.h>
#include <sys/types.h>
#ifdef __GLIBC__
#include <sys/sysmacros.h>
#endif
#include <sys/stat.h>
#include <pwd.h>
#include <unistd.h>
#include <dirent.h>
#ifdef _POSIX_MEMLOCK
# include <sys/mman.h>
#endif
/* Tunable defaults: */
/* s390 builds use longer watchdog and msgwait defaults than other architectures. */
#if defined(__s390__) || defined(__s390x__)
unsigned long timeout_watchdog = 15;
int timeout_msgwait = 30;
#else
unsigned long timeout_watchdog = 5;
int timeout_msgwait = 10;
#endif
/* Loop latency warning threshold (option -5; 0 disables). */
unsigned long timeout_watchdog_warn = 3;
/* Slot allocation timeout in seconds (option -2). */
int timeout_allocate = 2;
/* Daemon loop timeout in seconds (option -3). */
int timeout_loop = 1;
/* Async IO read timeout (option -I). */
int timeout_io = 3;
/* Seconds to wait for devices to become available (option -s). */
int timeout_startup = 120;
/* Non-zero when the watchdog is to be used; checked by watchdog_test(). */
int watchdog_use = 1;
/* Zero when the watchdog timeout must not be programmed (option -T). */
int watchdog_set_timeout = 1;
/* Watchdog timeout to set before crashdumping, in seconds (option -C). */
unsigned long timeout_watchdog_crashdump = 240;
/* Non-zero when realtime priority must not be enabled (option -R). */
int skip_rt = 0;
/* Non-zero selects the more detailed log format in sbd_set_format_string(). */
int debug = 0;
/* Alters what do_exit() does instead of plainly resetting the node (values 1, 2, 3). */
int debug_mode = 0;
/* Path of the watchdog device; NULL means watchdog_init() does nothing. */
char *watchdogdev = NULL;
/* True when watchdogdev was not chosen explicitly; makes watchdog_test() refuse ambiguous setups. */
bool watchdogdev_is_default = false;
/* Lower-cased node name, filled in by sbd_get_uname(). */
char * local_uname;
/* Global, non-tunable variables: */
int sector_size = 0;
/* Open watchdog file descriptor, -1 while closed. */
int watchdogfd = -1;
int servant_health = 0;
/*const char *devname;*/
/* Program name printed by usage(). */
const char *cmdname;
/*
 * Print the command-line help text to stderr.
 *
 * The program name is taken from the global cmdname. The commands that
 * operate on a shared disk are only listed when SUPPORT_SHARED_DISK is
 * enabled at build time.
 */
void
usage(void)
{
fprintf(stderr,
"Shared storage fencing tool.\n"
"Syntax:\n"
" %s <options> <command> <cmdarguments>\n"
"Options:\n"
"-d <devname> Block device to use (mandatory; can be specified up to 3 times)\n"
"-h Display this help.\n"
"-n <node> Set local node name; defaults to uname -n (optional)\n"
"\n"
"-R Do NOT enable realtime priority (debugging only)\n"
"-W Use watchdog (recommended) (watch only)\n"
"-w <dev> Specify watchdog device (optional) (watch only)\n"
"-T Do NOT initialize the watchdog timeout (watch only)\n"
"-S <0|1> Set start mode if the node was previously fenced (watch only)\n"
"-p <path> Write pidfile to the specified path (watch only)\n"
"-v Enable some verbose debug logging (optional)\n"
"\n"
"-1 <N> Set watchdog timeout to N seconds (optional, create only)\n"
"-2 <N> Set slot allocation timeout to N seconds (optional, create only)\n"
"-3 <N> Set daemon loop timeout to N seconds (optional, create only)\n"
"-4 <N> Set msgwait timeout to N seconds (optional, create only)\n"
"-5 <N> Warn if loop latency exceeds threshold (optional, watch only)\n"
" (default is 3, set to 0 to disable)\n"
"-C <N> Watchdog timeout to set before crashdumping (def: 240s, optional)\n"
"-I <N> Async IO read timeout (defaults to 3 * loop timeout, optional)\n"
"-s <N> Timeout to wait for devices to become available (def: 120s)\n"
"-t <N> Dampening delay before faulty servants are restarted (optional)\n"
" (default is 5, set to 0 to disable)\n"
"-F <N> # of failures before a servant is considered faulty (optional)\n"
" (default is 1, set to 0 to disable)\n"
"-P Check Pacemaker quorum and node health (optional, watch only)\n"
"-Z Enable trace mode. WARNING: UNSAFE FOR PRODUCTION!\n"
"Commands:\n"
#if SUPPORT_SHARED_DISK
"create initialize N slots on <dev> - OVERWRITES DEVICE!\n"
"list List all allocated slots on device, and messages.\n"
"dump Dump meta-data header from device.\n"
"allocate <node>\n"
" Allocate a slot for node (optional)\n"
"message <node> (test|reset|off|clear|exit)\n"
" Writes the specified message to node's slot.\n"
#endif
"watch Loop forever, monitoring own slot\n"
"query-watchdog Check for available watchdog-devices and print some info\n"
"test-watchdog Test the watchdog-device selected.\n"
" Attention: This will arm the watchdog and have your system reset\n"
" in case your watchdog is working properly!\n"
, cmdname);
}
/*
 * Program the timeout of an already opened watchdog device.
 *
 * Returns 0 on success, -1 (after logging advice) when the
 * WDIOC_SETTIMEOUT ioctl fails.
 */
static int
watchdog_init_interval_fd(int wdfd, int timeout)
{
    int rc = ioctl(wdfd, WDIOC_SETTIMEOUT, &timeout);

    if (rc >= 0) {
        return 0;
    }

    cl_perror( "WDIOC_SETTIMEOUT"
               ": Failed to set watchdog timer to %u seconds.",
               timeout);
    cl_log(LOG_CRIT, "Please validate your watchdog configuration!");
    cl_log(LOG_CRIT, "Choose a different watchdog driver or specify -T to skip this if you are completely sure.");
    return -1;
}
/*
 * Apply timeout_watchdog to the globally opened watchdog device.
 *
 * Does nothing (and returns 0) when no watchdog is open or when the user
 * asked not to set the timeout. Returns -1 if programming it fails.
 */
int
watchdog_init_interval(void)
{
    if (watchdogfd < 0) {
        return 0;
    }

    if (!watchdog_set_timeout) {
        cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!");
        return 0;
    }

    if (watchdog_init_interval_fd(watchdogfd, timeout_watchdog) >= 0) {
        cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", (int) timeout_watchdog);
        return 0;
    }

    return -1;
}
/*
 * Pet the watchdog behind wdfd by writing a single byte to it.
 *
 * Returns 0 on success, -1 if the write did not transfer exactly one byte.
 */
static int
watchdog_tickle_fd(int wdfd, char *wddev)
{
    ssize_t written = write(wdfd, "", 1);

    if (written == 1) {
        return 0;
    }

    cl_perror("Watchdog write failure: %s!", wddev);
    return -1;
}
/*
 * Pet the globally opened watchdog, if there is one.
 *
 * Returns 0 when no watchdog is open, otherwise the result of
 * watchdog_tickle_fd().
 */
int
watchdog_tickle(void)
{
    if (watchdogfd < 0) {
        return 0;
    }

    return watchdog_tickle_fd(watchdogfd, watchdogdev);
}
/*
 * Open a watchdog device, optionally program its timeout, and pet it once.
 *
 * A negative timeout skips programming the timeout.
 *
 * Returns the open file descriptor, or -1 when opening, programming or
 * the initial pet fails (the descriptor is closed again in that case).
 */
static int
watchdog_init_fd(char *wddev, int timeout)
{
    int wdfd = open(wddev, O_WRONLY);

    if (wdfd < 0) {
        cl_perror("Cannot open watchdog device '%s'", wddev);
        return -1;
    }

    if ((timeout >= 0) && (watchdog_init_interval_fd(wdfd, timeout) < 0)) {
        close(wdfd);
        return -1;
    }

    if (watchdog_tickle_fd(wdfd, wddev) < 0) {
        close(wdfd);
        return -1;
    }

    return wdfd;
}
/*
 * Open the configured watchdog device unless it is already open.
 *
 * Returns 0 on success or when there is nothing to do (already open, or
 * no device configured), -1 when the device cannot be initialized.
 */
int
watchdog_init(void)
{
    int timeout;

    if ((watchdogfd >= 0) || (watchdogdev == NULL)) {
        return 0;
    }

    timeout = timeout_watchdog;
    if (watchdog_set_timeout == 0) {
        cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!");
        /* negative timeout: open and pet only */
        timeout = -1;
    }

    watchdogfd = watchdog_init_fd(watchdogdev, timeout);
    if (watchdogfd < 0) {
        return -1;
    }

    cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev);
    if (watchdog_set_timeout) {
        cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", (int) timeout_watchdog);
    }
    return 0;
}
/*
 * Close a watchdog file descriptor.
 *
 * With disarm set, the device is first disabled through the
 * WDIOC_SETOPTIONS ioctl and then sent the magic 'V' character; the write
 * is retried until it succeeds. Failures are logged.
 */
static void
watchdog_close_fd(int wdfd, char *wddev, bool disarm)
{
    if (disarm) {
        int flags = WDIOS_DISABLECARD;

        /* Explicitly disarm it */
        if (ioctl(wdfd, WDIOC_SETOPTIONS, &flags) < 0) {
            cl_perror("Failed to disable hardware watchdog %s", wddev);
        }

        /* To be sure, use magic close logic, too */
        while (write(wdfd, "V", 1) <= 0) {
            cl_perror("Cannot disable watchdog device %s", wddev);
        }
    }

    if (close(wdfd) < 0) {
        cl_perror("Watchdog close(%d) failed", wdfd);
    }
}
/*
 * Close the globally opened watchdog (optionally disarming it) and mark
 * it as closed. Does nothing when no watchdog is open.
 */
void
watchdog_close(bool disarm)
{
    if (watchdogfd >= 0) {
        watchdog_close_fd(watchdogfd, watchdogdev, disarm);
        watchdogfd = -1;
    }
}
/* Upper bound on the number of device numbers collected by watchdog_populate_list(). */
#define MAX_WATCHDOGS 64
/* sysfs directory scanned for watchdog class devices. */
#define SYS_CLASS_WATCHDOG "/sys/class/watchdog"
/* sysfs directory used to look up the driver of a character device. */
#define SYS_CHAR_DEV_DIR "/sys/dev/char"
/* Directory searched for watchdog device nodes. */
#define WATCHDOG_NODEDIR "/dev"
/* One discovered watchdog device; the items form a singly linked list. */
struct watchdog_list_item {
/* device number (major/minor) */
dev_t dev;
/* path of the device node, strdup()ed */
char *dev_node;
/* identity string reported by the device; NULL if it could not be queried */
char *dev_ident;
/* driver name; NULL if it could not be determined */
char *dev_driver;
struct watchdog_list_item *next;
};
/* Head of the list and number of items in it; filled once by watchdog_populate_list(). */
static struct watchdog_list_item *watchdog_list = NULL;
static int watchdog_list_items = 0;
static void
watchdog_populate_list(void)
{
dev_t watchdogs[MAX_WATCHDOGS + 1] =
{makedev(10,130), 0};
int num_watchdogs = 1;
struct dirent *entry;
char entry_name[280];
DIR *dp;
char buf[256] = "";
if (watchdog_list != NULL) {
return;
}
/* get additional devices from /sys/class/watchdog */
dp = opendir(SYS_CLASS_WATCHDOG);
if (dp) {
while ((entry = readdir(dp))) {
if (entry->d_type == DT_LNK) {
FILE *file;
snprintf(entry_name, sizeof(entry_name),
SYS_CLASS_WATCHDOG "/%s/dev", entry->d_name);
file = fopen(entry_name, "r");
if (file) {
int major, minor;
if (fscanf(file, "%d:%d", &major, &minor) == 2) {
watchdogs[num_watchdogs++] = makedev(major, minor);
}
fclose(file);
if (num_watchdogs == MAX_WATCHDOGS) {
break;
}
}
}
}
closedir(dp);
}
/* search for watchdog nodes in /dev */
dp = opendir(WATCHDOG_NODEDIR);
if (dp) {
while ((entry = readdir(dp))) {
if ((entry->d_type == DT_CHR) || (entry->d_type == DT_LNK)) {
struct stat statbuf;
snprintf(entry_name, sizeof(entry_name),
WATCHDOG_NODEDIR "/%s", entry->d_name);
if(!stat(entry_name, &statbuf) && S_ISCHR(statbuf.st_mode)) {
int i;
for (i=0; i<num_watchdogs; i++) {
if (statbuf.st_rdev == watchdogs[i]) {
int wdfd = watchdog_init_fd(entry_name, -1);
struct watchdog_list_item *wdg =
calloc(1, sizeof(struct watchdog_list_item));
wdg->dev = watchdogs[i];
wdg->dev_node = strdup(entry_name);
wdg->next = watchdog_list;
watchdog_list = wdg;
watchdog_list_items++;
if (wdfd >= 0) {
struct watchdog_info ident;
ident.identity[0] = '\0';
ioctl(wdfd, WDIOC_GETSUPPORT, &ident);
watchdog_close_fd(wdfd, entry_name, true);
if (ident.identity[0]) {
wdg->dev_ident = strdup((char *) ident.identity);
}
}
snprintf(entry_name, sizeof(entry_name),
SYS_CHAR_DEV_DIR "/%d:%d/device/driver",
major(watchdogs[i]), minor(watchdogs[i]));
if (readlink(entry_name, buf, sizeof(buf)) > 0) {
wdg->dev_driver = strdup(basename(buf));
} else if ((wdg->dev_ident) &&
(strcmp(wdg->dev_ident,
"Software Watchdog") == 0)) {
wdg->dev_driver = strdup("softdog");
}
break;
}
}
}
}
}
closedir(dp);
}
}
/*
 * Print every discovered watchdog device with its identity and driver,
 * plus a caution for devices driven by softdog. Always returns 0.
 */
int watchdog_info(void)
{
    struct watchdog_list_item *item;
    int index = 0;

    watchdog_populate_list();
    printf("\nDiscovered %d watchdog devices:\n", watchdog_list_items);

    item = watchdog_list;
    while (item != NULL) {
        const char *ident = item->dev_ident;
        const char *driver = item->dev_driver;

        /* Substitute placeholders for anything that could not be determined */
        if (ident == NULL) {
            ident = "Error: Check if hogged by e.g. sbd-daemon!";
        }
        if (driver == NULL) {
            driver = "<unknown>";
        }

        index++;
        printf("\n[%d] %s\nIdentity: %s\nDriver: %s\n",
               index, item->dev_node, ident, driver);

        if ((item->dev_driver != NULL) && (strcmp(item->dev_driver, "softdog") == 0)) {
            printf("CAUTION: Not recommended for use with sbd.\n");
        }

        item = item->next;
    }

    return 0;
}
/*
 * Interactive watchdog self-test: arm the watchdog and wait for it to
 * reset the machine.
 *
 * Returns 0 if the test was not run (watchdog disabled, or the user did
 * not confirm), -1 on an ambiguous device selection, on initialization
 * failure, or when the system is still alive after the countdown.
 */
int watchdog_test(void)
{
    int remaining;

    if (!watchdog_use || (watchdog_set_timeout == 0)) {
        printf("\nWatchdog is disabled - aborting test!!!\n");
        return 0;
    }

    /* Without an explicit device choice, refuse to guess among several */
    if (watchdogdev_is_default) {
        watchdog_populate_list();
        if (watchdog_list_items > 1) {
            printf("\nError: Multiple watchdog devices discovered.\n"
                   " Use -w <watchdog> or SBD_WATCHDOG_DEV to specify\n"
                   " which device to reset the system with\n");
            watchdog_info();
            return -1;
        }
    }

    /* Ask for confirmation only when run from a terminal */
    if (isatty(fileno(stdin))) {
        char answer[16];

        printf("\nWARNING: This operation is expected to force-reboot this system\n"
               " without following any shutdown procedures.\n\n"
               "Proceed? [NO/Proceed] ");

        if ((fgets(answer, sizeof(answer), stdin) == NULL) ||
            (strcmp(answer, "Proceed\n") != 0)) {
            printf("\nAborting watchdog test!!!\n");
            return 0;
        }
        printf("\n");
    }

    printf("Initializing %s with a reset countdown of %d seconds ...\n",
           watchdogdev, (int) timeout_watchdog);
    if ((watchdog_init() < 0) || (watchdog_init_interval() < 0)) {
        printf("Failed to initialize watchdog!!!\n");
        return -1;
    }

    printf("\n");
    printf("NOTICE: The watchdog device is expected to reset the system\n"
           " in %d seconds. If system remains active beyond that time,\n"
           " watchdog may not be functional.\n\n", (int) timeout_watchdog);

    remaining = timeout_watchdog;
    while (remaining > 1) {
        printf("Reset countdown ... %d seconds\n", remaining);
        sleep(1);
        remaining--;
    }

    for (remaining = 2; remaining > 0; remaining--) {
        printf("System expected to reset any moment ...\n");
        sleep(1);
    }

    for (remaining = 5; remaining > 0; remaining--) {
        printf("System should have reset ...\n");
        sleep(1);
    }

    printf("Error: The watchdog device has failed to reboot the system,\n"
           " and it may not be suitable for usage with sbd.\n");

    /* test should trigger a reboot thus returning is actually bad */
    return -1;
}
/* This duplicates some code from linux/ioprio.h since these are not included
* even in linux-kernel-headers. Sucks. See also
* /usr/src/linux/Documentation/block/ioprio.txt and ioprio_set(2) */
extern int sys_ioprio_set(int, int, int);
int ioprio_set(int which, int who, int ioprio);

/* Thin wrapper that issues the ioprio_set system call directly and hands back its result. */
inline int ioprio_set(int which, int who, int ioprio)
{
    int rc = syscall(__NR_ioprio_set, which, who, ioprio);

    return rc;
}
enum {
IOPRIO_CLASS_NONE,
IOPRIO_CLASS_RT,
IOPRIO_CLASS_BE,
IOPRIO_CLASS_IDLE,
};
enum {
IOPRIO_WHO_PROCESS = 1,
IOPRIO_WHO_PGRP,
IOPRIO_WHO_USER,
};
/*
 * Layout of an I/O priority value: the class lives in the bits from
 * IOPRIO_CLASS_SHIFT upwards, the per-class priority data in the bits
 * below it.
 */
#define IOPRIO_BITS (16)
#define IOPRIO_CLASS_SHIFT (13)
#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1)
/* Extract the class / data part of a combined priority value. */
#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT)
#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK)
/* Combine class and data; both arguments are parenthesized so that
 * expressions with low-precedence operators expand as intended. */
#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | (data))
/*
 * Touch roughly `kbytes` KiB of stack by recursing with a 1 KiB buffer
 * per frame.
 *
 * The first frame fills its buffer with HOG_CHAR; every deeper frame
 * copies the buffer of its caller. Returns HOG_CHAR once the requested
 * depth is reached.
 */
static unsigned char
sbd_stack_hogger(unsigned char * inbuf, int kbytes)
{
    unsigned char frame[1024];

    if (kbytes <= 0) {
        return HOG_CHAR;
    }

    if (inbuf != NULL) {
        memcpy(frame, inbuf, sizeof(frame));
    } else {
        memset(frame, HOG_CHAR, sizeof(frame));
    }

    /* kbytes is known to be positive here, so always descend one level */
    return sbd_stack_hogger(frame, kbytes - 1);
}
/*
 * Allocate, zero and free `kbytes` blocks of 1 KiB each so that the heap
 * has been grown and touched once.
 *
 * Allocation failures are logged and otherwise ignored.
 */
static void
sbd_malloc_hogger(int kbytes)
{
    const int block_size = 1024;
    void **blocks;
    int idx;

    if (kbytes <= 0) {
        return;
    }

    /*
     * mallopt(M_MMAP_MAX, 0) would disable mmap-backed allocations
     * entirely, and mallopt(M_TRIM_THRESHOLD, -1) would stop malloc from
     * returning memory to the system; neither is needed because
     * mlockall(MCL_FUTURE) has already been called.
     */
    blocks = malloc(kbytes * sizeof(void *));
    if (blocks == NULL) {
        cl_log(LOG_WARNING, "Could not preallocate chunk array");
        return;
    }

    for (idx = 0; idx < kbytes; idx++) {
        blocks[idx] = malloc(block_size);
        if (blocks[idx] != NULL) {
            memset(blocks[idx], 0, block_size);
        } else {
            cl_log(LOG_WARNING, "Could not preallocate block %d", idx);
        }
    }

    /* free(NULL) is a no-op, so failed slots need no special casing */
    for (idx = 0; idx < kbytes; idx++) {
        free(blocks[idx]);
    }
    free(blocks);
}
/*
 * Lock all current and future pages of the process into memory and then
 * pre-grow heap and stack by the requested number of KiB.
 *
 * Without _POSIX_MEMLOCK only an error is logged.
 */
static void sbd_memlock(int stackgrowK, int heapgrowK)
{
#ifdef _POSIX_MEMLOCK
    /*
     * Raising RLIMIT_MEMLOCK via setrlimit() is unnecessary: the mcp runs
     * as root, and per mlock(2), since Linux 2.6.9 no limits are placed on
     * the amount of memory a privileged process may lock; the limit only
     * governs unprivileged processes.
     */
    if (mlockall(MCL_CURRENT|MCL_FUTURE) < 0) {
        cl_perror("Unable to lock ourselves into memory");
        return;
    }

    cl_log(LOG_INFO, "Locked ourselves in memory");

    /* Now allocate some extra pages (MCL_FUTURE will ensure they stay around) */
    sbd_malloc_hogger(heapgrowK);
    sbd_stack_hogger(NULL, stackgrowK);
#else
    cl_log(LOG_ERR, "Unable to lock ourselves into memory");
#endif
}
/*
 * Switch the process to SCHED_RR scheduling and lock it into memory.
 *
 * priority   < 0: do nothing at all.
 *           == 0: use the maximum SCHED_RR priority.
 *            > 0: clamped into the valid SCHED_RR priority range.
 * stackgrowK / heapgrowK: KiB of stack / heap to pre-grow after locking;
 *            forwarded to sbd_memlock() in that order.
 */
void
sbd_make_realtime(int priority, int stackgrowK, int heapgrowK)
{
    if(priority < 0) {
        return;
    }

#ifdef SCHED_RR
    {
        int pcurrent = 0;
        int pmin = sched_get_priority_min(SCHED_RR);
        int pmax = sched_get_priority_max(SCHED_RR);

        if (priority == 0) {
            priority = pmax;
        } else if (priority < pmin) {
            priority = pmin;
        } else if (priority > pmax) {
            priority = pmax;
        }

        /* NOTE(review): sched_getscheduler() yields the scheduling policy,
         * not a priority; the comparison below is kept as-is - confirm
         * whether that is intended. */
        pcurrent = sched_getscheduler(0);
        if (pcurrent < 0) {
            cl_perror("Unable to get scheduler priority");

        } else if(pcurrent < priority) {
            struct sched_param sp;

            memset(&sp, 0, sizeof(sp));
            sp.sched_priority = priority;

            if (sched_setscheduler(0, SCHED_RR, &sp) < 0) {
                cl_perror("Unable to set scheduler priority to %d", priority);
            } else {
                cl_log(LOG_INFO, "Scheduler priority is now %d", priority);
            }
        }
    }
#else
    cl_log(LOG_ERR, "System does not support updating the scheduler priority");
#endif

    /* sbd_memlock() takes (stackgrowK, heapgrowK); passing them in the
     * opposite order would silently swap the two sizes */
    sbd_memlock(stackgrowK, heapgrowK);
}
/*
 * Raise the process to realtime CPU and I/O priority unless that has been
 * disabled via skip_rt.
 */
void
maximize_priority(void)
{
    int io_prio;

    if (skip_rt) {
        cl_log(LOG_INFO, "Not elevating to realtime (-R specified).");
        return;
    }

    sbd_make_realtime(0, 256, 256);

    io_prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 1);
    if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), io_prio) != 0) {
        cl_perror("ioprio_set() call failed.");
    }
}
/*
 * Make sure the sysrq functions needed by sbd are enabled.
 *
 * Reads the current mask from /proc/sys/kernel/sysrq; a value of 1 is
 * left untouched, anything else gets the required bits added and is
 * written back. Failures are logged and otherwise ignored.
 */
void
sysrq_init(void)
{
    FILE *fp;
    int mask = 0;

    fp = fopen("/proc/sys/kernel/sysrq", "r");
    if (fp == NULL) {
        cl_perror("cannot open /proc/sys/kernel/sysrq for read.");
        return;
    }
    if (fscanf(fp, "%d", &mask) != 1) {
        cl_perror("Parsing sysrq failed");
        mask = 0;
    }
    fclose(fp);

    if (mask == 1) {
        return;
    }

    /* 8 for debugging dumps of processes,
       128 for reboot/poweroff */
    mask |= 136;

    fp = fopen("/proc/sys/kernel/sysrq", "w");
    if (fp == NULL) {
        cl_perror("cannot open /proc/sys/kernel/sysrq for writing");
        return;
    }
    fprintf(fp, "%d", mask);
    fclose(fp);
}
/*
 * Write the sysrq command character `t` to /proc/sysrq-trigger.
 * A failure to open the file is logged and otherwise ignored.
 */
void
sysrq_trigger(char t)
{
    FILE *trigger = fopen("/proc/sysrq-trigger", "a");

    if (trigger == NULL) {
        cl_perror("Opening sysrq-trigger failed.");
        return;
    }

    cl_log(LOG_INFO, "sysrq-trigger: %c\n", t);
    fprintf(trigger, "%c\n", t);
    fclose(trigger);
}
/*
 * Take the node down (or emulate that, depending on debug_mode).
 *
 * kind selects the action: 'b' reboot, 'c' crashdump, 'o' power off.
 *
 * debug_mode alters the behaviour:
 *   1 - trigger a crashdump instead of the requested action
 *   2 - disarm the watchdog and exit(0) without touching the node
 *   3 - disarm the watchdog and delay the action by 10 seconds
 *
 * Does not return to the caller: if the node is still running afterwards
 * the process ends with exit(1).
 */
static void
do_exit(char kind)
{
/* TODO: Turn debug_mode into a bit field? Delay + kdump for example */
const char *reason = NULL;
if (kind == 'c') {
cl_log(LOG_NOTICE, "Initiating kdump");
} else if (debug_mode == 1) {
cl_log(LOG_WARNING, "Initiating kdump instead of panicing the node (debug mode)");
kind = 'c';
}
if (debug_mode == 2) {
cl_log(LOG_WARNING, "Shutting down SBD instead of panicing the node (debug mode)");
watchdog_close(true);
exit(0);
}
if (debug_mode == 3) {
/* Give the system some time to flush logs to disk before rebooting. */
cl_log(LOG_WARNING, "Delaying node panic by 10s (debug mode)");
watchdog_close(true);
sync();
sleep(10);
}
/* Human-readable name of the action, for the log message below */
switch(kind) {
case 'b':
reason = "reboot";
break;
case 'c':
reason = "crashdump";
break;
case 'o':
reason = "off";
break;
default:
reason = "unknown";
break;
}
cl_log(LOG_EMERG, "Rebooting system: %s", reason);
sync();
if(kind == 'c') {
/* Crashdump: the watchdog is disarmed before the sysrq is triggered */
watchdog_close(true);
sysrq_trigger(kind);
} else {
/* Reboot/off: the watchdog is closed WITHOUT disarming, then sysrq and reboot() are tried */
watchdog_close(false);
sysrq_trigger(kind);
if (reboot((kind == 'o')?RB_POWER_OFF:RB_AUTOBOOT) < 0) {
cl_perror("%s failed", (kind == 'o')?"Poweroff":"Reboot");
}
}
exit(1);
}
/* Trigger a crashdump of the node via do_exit('c'). */
void
do_crashdump(void)
{
do_exit('c');
}
/* Reboot the node via do_exit('b'). */
void
do_reset(void)
{
do_exit('b');
}
/* Power the node off via do_exit('o'). */
void
do_off(void)
{
do_exit('o');
}
/*
* Change directory to the directory our core file needs to go in
* Call after you establish the userid you're running under.
*/
int
sbd_cdtocoredir(void)
{
    static const char *dir = NULL;
    int rc;

    if (dir == NULL) {
        dir = CRM_CORE_DIR;
    }

    rc = chdir(dir);
    if (rc < 0) {
        /* Logging may clobber errno; hand the chdir() error to the caller */
        int saved_errno = errno;

        cl_perror("Cannot chdir to [%s]", dir);
        errno = saved_errno;
    }

    return rc;
}
/*
 * Fork a daemon child.
 *
 * In the parent the child's PID is returned. The child disables logging
 * to stderr, raises its priority, enables the needed sysrq functions,
 * redirects stdin/stdout/stderr to /dev/null, changes to the core
 * directory and returns 0. If fork() fails the process exits with
 * status 1.
 */
pid_t
make_daemon(void)
{
    const char * devnull = "/dev/null";
    pid_t pid = fork();

    if (pid > 0) {
        /* parent */
        return pid;
    }

    if (pid < 0) {
        cl_log(LOG_ERR, "%s: could not start daemon\n",
               cmdname);
        cl_perror("fork");
        exit(1);
    }

    qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE);

    /* This is the child; ensure privileges have not been lost. */
    maximize_priority();
    sysrq_init();

    umask(022);

    /* Re-open the three standard descriptors on /dev/null; each open()
     * picks up the descriptor number that was just released */
    close(0);
    (void)open(devnull, O_RDONLY);
    close(1);
    (void)open(devnull, O_WRONLY);
    close(2);
    (void)open(devnull, O_WRONLY);

    sbd_cdtocoredir();
    return 0;
}
/*
 * Store the lower-cased node name reported by uname() in local_uname.
 * Exits with status 1 if uname() fails or the copy cannot be allocated.
 */
void
sbd_get_uname(void)
{
	struct utsname uname_buf;
	char *p;

	if (uname(&uname_buf) < 0) {
		cl_perror("uname() failed?");
		exit(1);
	}

	local_uname = strdup(uname_buf.nodename);
	if (local_uname == NULL) {
		cl_perror("strdup() of node name failed");
		exit(1);
	}

	/* <ctype.h> functions require a value representable as unsigned char. */
	for (p = local_uname; *p != '\0'; p++) {
		*p = (char)tolower((unsigned char)*p);
	}
}
#define FMT_MAX 256
/*
 * Build the libqb log format for the given log target and install it with
 * qb_log_format_set().
 *
 * method: QB_LOG_STDERR, QB_LOG_SYSLOG, or any other target (treated as a
 *         log file, which gets a timestamp / pid / node name prefix).
 * daemon: name shown in front of each message; for syslog it is omitted
 *         when it is NULL or "sbd".
 *
 * Every piece is written with the full destination size, so an over-long
 * daemon name is truncated instead of pushing the write position past the
 * end of the buffer.
 */
void
sbd_set_format_string(int method, const char *daemon)
{
	char prefix[FMT_MAX] = "";
	char fmt[FMT_MAX];
	const char *location;
	const char *body;
	struct utsname res;

	switch(method) {
	case QB_LOG_STDERR:
		break;

	case QB_LOG_SYSLOG:
		if(daemon && strcmp(daemon, "sbd") != 0) {
			snprintf(prefix, sizeof(prefix), "%10s: ", daemon);
		}
		break;

	default:
		/* When logging to a file */
		if (uname(&res) == 0) {
			snprintf(prefix, sizeof(prefix), "%%t [%d] %s %10s: ", getpid(),
				 res.nodename, daemon ? daemon : "(null)");
		} else {
			snprintf(prefix, sizeof(prefix), "%%t [%d] %10s: ", getpid(),
				 daemon ? daemon : "(null)");
		}
	}

	/* The strings below are libqb format directives, passed through as data. */
	if (debug && method >= QB_LOG_STDERR) {
		location = "(%-12f:%5l %g) %-7p: %n: ";
	} else {
		location = "%g %-7p: %n: ";
	}

	if (method == QB_LOG_SYSLOG) {
		body = "%b";
	} else {
		body = "\t%b";
	}

	snprintf(fmt, sizeof(fmt), "%s%s%s", prefix, location, body);
	qb_log_format_set(method, fmt);
}
/*
 * Report the current servant_health to the parent process with a queued
 * signal: SIG_LIVENESS when online, SIG_PCMK_UNHEALTHY for the unhealthy
 * states, nothing for the transient ones.
 * If the parent pid is 1 the parent is considered dead and do_reset() is
 * called instead.
 */
void
notify_parent(void)
{
pid_t ppid;
union sigval signal_value;
/* The signal carries no payload; send an all-zero value. */
memset(&signal_value, 0, sizeof(signal_value));
ppid = getppid();
if (ppid == 1) {
/* Our parent died unexpectedly. Triggering
* self-fence. */
cl_log(LOG_WARNING, "Our parent is dead.");
do_reset();
}
switch (servant_health) {
/* Transient states: stay silent, no signal is sent. */
case pcmk_health_pending:
case pcmk_health_shutdown:
case pcmk_health_transient:
- DBGLOG(LOG_INFO, "Not notifying parent: state transient (%d)", servant_health);
+ DBGLOG(LOG_DEBUG, "Not notifying parent: state transient (%d)", servant_health);
break;
case pcmk_health_unknown:
case pcmk_health_unclean:
case pcmk_health_noquorum:
DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY (%d)", servant_health);
sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value);
break;
case pcmk_health_online:
- DBGLOG(LOG_INFO, "Notifying parent: healthy");
+ DBGLOG(LOG_DEBUG, "Notifying parent: healthy");
sigqueue(ppid, SIG_LIVENESS, signal_value);
break;
/* Any state not listed above is reported as unhealthy as well. */
default:
DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY %d", servant_health);
sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value);
break;
}
}
/*
 * Record a new servant health state and log the transition.
 *
 * Nothing happens when `state` equals the current servant_health.
 * Otherwise the state is stored and the printf-style message built from
 * `format` and the variable arguments is logged at `level`, provided
 * formatting produced at least one character.
 */
void
set_servant_health(enum pcmk_health state, int level, char const *format, ...)
{
	va_list args;
	char *message = NULL;
	int written;

	if (servant_health == state) {
		return;
	}
	servant_health = state;

	va_start(args, format);
	written = vasprintf(&message, format, args);
	if (written > 0) {
		cl_log(level, "%s", message);
	}
	va_end(args);

	free(message);
}
/*
 * True when the servant has a device name that starts with '/'.
 * A NULL servant or a NULL devname yields false.
 */
bool
sbd_is_disk(struct servants_list_item *servant)
{
	if (servant == NULL || servant->devname == NULL) {
		return false;
	}
	return servant->devname[0] == '/';
}
/*
 * True when the servant's device name is exactly "cluster".
 * A NULL servant or a NULL devname yields false.
 */
bool
sbd_is_cluster(struct servants_list_item *servant)
{
	if (servant == NULL || servant->devname == NULL) {
		return false;
	}
	return strcmp("cluster", servant->devname) == 0;
}
/*
 * True when the servant's device name is exactly "pcmk".
 * A NULL servant or a NULL devname yields false.
 */
bool
sbd_is_pcmk(struct servants_list_item *servant)
{
	if (servant == NULL || servant->devname == NULL) {
		return false;
	}
	return strcmp("pcmk", servant->devname) == 0;
}
diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c
index 237bf43..90c7d26 100644
--- a/src/sbd-inquisitor.c
+++ b/src/sbd-inquisitor.c
@@ -1,1164 +1,1164 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <pacemaker/crm/common/util.h>
#include "sbd.h"
/* Length of the pid record written to the pidfile: LOCKSTRLEN-1 characters
 * of right-aligned pid followed by a newline. */
#define LOCKSTRLEN 11
/* Head of the singly linked list of servants; recruit_servant() appends. */
static struct servants_list_item *servants_leader = NULL;
/* Cleared by the -z option; selects the disk-first branch in inquisitor_child(). */
int disk_priority = 1;
/* Whether the "pcmk" / "cluster" servants get recruited for "watch"
 * (SBD_PACEMAKER environment variable, -P / -c options). */
int check_pcmk = 1;
int check_cluster = 1;
/* Number of recruited servants that are disks / number of all servants. */
int disk_count = 0;
int servant_count = 0;
/* Restart throttling, set by the -t and -F options. */
int servant_restart_interval = 5;
int servant_restart_count = 1;
/* Set by SBD_STARTMODE ("clean" = 1, "always" = 0) or the -S option. */
int start_mode = 0;
/* Path of the pidfile (SBD_PIDFILE or -p); NULL means no pidfile. */
char* pidfile = NULL;
int parse_device_line(const char *line);
/*
 * Append a new servant for `devname` to the servant list unless one with
 * that name already exists.
 *
 * devname: name of the servant; it is copied with strdup().
 * pid:     initial pid stored in the entry.
 *
 * Exits with status 1 if the list entry cannot be allocated.
 */
void recruit_servant(const char *devname, pid_t pid)
{
struct servants_list_item *s = servants_leader;
struct servants_list_item *newbie;
if (lookup_servant_by_dev(devname)) {
cl_log(LOG_DEBUG, "Servant %s already exists", devname);
return;
}
newbie = malloc(sizeof(*newbie));
if (!newbie) {
fprintf(stderr, "malloc failed in recruit_servant.\n");
exit(1);
}
memset(newbie, 0, sizeof(*newbie));
/* NOTE(review): the strdup() result is not checked for NULL. */
newbie->devname = strdup(devname);
newbie->pid = pid;
newbie->first_start = 1;
/* Append at the tail so the list keeps recruitment order. */
if (!s) {
servants_leader = newbie;
} else {
while (s->next)
s = s->next;
s->next = newbie;
}
servant_count++;
/* Disks are counted separately; every other servant starts out as outdated. */
if(sbd_is_disk(newbie)) {
- cl_log(LOG_NOTICE, "Monitoring %s", devname);
+ cl_log(LOG_INFO, "Monitoring %s", devname);
disk_count++;
} else {
newbie->outdated = 1;
}
}
/*
 * Fork a servant process.
 *
 * The child raises its priority, switches its syslog prefix to `devname`,
 * runs functionp(devname, mode, argp) and exits with status 1 when that
 * returns -1, 0 otherwise. The parent gets the child's pid back.
 * A failing fork() is logged and terminates the caller with status 1.
 */
int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp)
{
	pid_t child = fork();

	if (child == 0) {
		/* child */
		int result;

		maximize_priority();
		sbd_set_format_string(QB_LOG_SYSLOG, devname);
		result = (*functionp)(devname, mode, argp);
		exit((result == -1) ? 1 : 0);
	}

	if (child == -1) {
		cl_log(LOG_ERR,"Failed to fork servant");
		exit(1);
	}

	/* parent */
	return child;
}
/*
 * Find the servant whose device name equals `devname`, ignoring case.
 * Returns the list entry, or NULL when there is none.
 */
struct servants_list_item *lookup_servant_by_dev(const char *devname)
{
	struct servants_list_item *cur = servants_leader;

	while (cur != NULL && strcasecmp(cur->devname, devname) != 0) {
		cur = cur->next;
	}
	return cur;
}
/*
 * Find the servant whose recorded pid equals `pid`.
 * Returns the list entry, or NULL when there is none.
 */
struct servants_list_item *lookup_servant_by_pid(pid_t pid)
{
	struct servants_list_item *cur = servants_leader;

	while (cur != NULL && cur->pid != pid) {
		cur = cur->next;
	}
	return cur;
}
/*
 * Check whether any servant process still exists.
 *
 * Each servant with a non-zero pid is probed with signal 0. Returns 1 when
 * every probe fails with ESRCH (or no servant has a pid), 0 as soon as one
 * probe gives any other result.
 */
int check_all_dead(void)
{
	struct servants_list_item *s;
	int r = 0;
	union sigval svalue;

	/* sigqueue() takes the value by copy; never pass it uninitialised. */
	memset(&svalue, 0, sizeof(svalue));

	for (s = servants_leader; s; s = s->next) {
		if (s->pid != 0) {
			r = sigqueue(s->pid, 0, svalue);
			if (r == -1 && errno == ESRCH)
				continue;
			return 0;
		}
	}
	return 1;
}
/*
 * Start the process for servant `s` unless one is already running.
 *
 * A recorded pid is probed with signal 0; the servant is only (re)started
 * when there is no pid or the probe fails with ESRCH. The restart counter
 * is incremented, the matching servant function is forked through
 * assign_servant() and the start time is recorded in s->t_started.
 */
void servant_start(struct servants_list_item *s)
{
	int r = 0;
	union sigval svalue;

	/* sigqueue() takes the value by copy; never pass it uninitialised. */
	memset(&svalue, 0, sizeof(svalue));

	if (s->pid != 0) {
		r = sigqueue(s->pid, 0, svalue);
		if ((r != -1 || errno != ESRCH))
			return;
	}
	s->restarts++;
	if (sbd_is_disk(s)) {
#if SUPPORT_SHARED_DISK
		DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname);
		s->pid = assign_servant(s->devname, servant, start_mode, s);
#else
		cl_log(LOG_ERR, "Shared disk functionality not supported");
		return;
#endif
	} else if(sbd_is_pcmk(s)) {
		DBGLOG(LOG_INFO, "Starting Pacemaker servant");
		s->pid = assign_servant(s->devname, servant_pcmk, start_mode, NULL);
	} else if(sbd_is_cluster(s)) {
		DBGLOG(LOG_INFO, "Starting Cluster servant");
		s->pid = assign_servant(s->devname, servant_cluster, start_mode, NULL);
	} else {
		cl_log(LOG_ERR, "Unrecognized servant: %s", s->devname);
	}
	clock_gettime(CLOCK_MONOTONIC, &s->t_started);
	return;
}
/*
 * Reset the restart counter of every servant and start each one that is
 * not already running.
 */
void servants_start(void)
{
	struct servants_list_item *cur = servants_leader;

	while (cur != NULL) {
		cur->restarts = 0;
		servant_start(cur);
		cur = cur->next;
	}
}
/* Send SIGKILL to every servant that has a recorded pid. */
void servants_kill(void)
{
	struct servants_list_item *s;
	union sigval svalue;

	/* sigqueue() takes the value by copy; never pass it uninitialised. */
	memset(&svalue, 0, sizeof(svalue));

	for (s = servants_leader; s; s = s->next) {
		if (s->pid != 0)
			sigqueue(s->pid, SIGKILL, svalue);
	}
}
/*
 * Forget the pid of a servant whose process has terminated.
 * A pid that belongs to no servant is only mentioned in the debug log.
 */
static inline void cleanup_servant_by_pid(pid_t pid)
{
	struct servants_list_item *owner = lookup_servant_by_pid(pid);

	if (owner == NULL) {
		/* This most likely is a stray signal from somewhere, or
		 * a SIGCHLD for a process that has previously
		 * explicitly disconnected. */
		DBGLOG(LOG_INFO, "cleanup_servant: Nothing known about pid %i",
		       pid);
		return;
	}

	cl_log(LOG_WARNING, "Servant for %s (pid: %i) has terminated",
	       owner->devname, owner->pid);
	owner->pid = 0;
}
/*
 * Arm the watchdog (when watchdog_use is set) and tell the parent process
 * that start-up succeeded by sending it SIG_LIVENESS.
 *
 * Returns 0 on success, -1 when watchdog_init() fails; in that case the
 * parent is not signalled.
 */
int inquisitor_decouple(void)
{
	pid_t ppid = getppid();
	union sigval signal_value;

	/* sigqueue() takes the value by copy; never pass it uninitialised. */
	memset(&signal_value, 0, sizeof(signal_value));

	/* During start-up, we only arm the watchdog once we've got
	 * quorum at least once. */
	if (watchdog_use) {
		if (watchdog_init() < 0) {
			return -1;
		}
	}
	if (ppid > 1) {
		sigqueue(ppid, SIG_LIVENESS, signal_value);
	}
	return 0;
}
/*
 * Decide whether `pid` is a live process running the same executable as
 * the caller.
 *
 * Returns 0 when kill(pid, 0) reports ESRCH, when either /proc/<pid>/exe
 * link cannot be read, or when the two executables differ; 1 when they
 * match. Without HAVE_PROC_PID any pid that passes the kill() probe counts
 * as running.
 */
static int sbd_lock_running(long pid)
{
	ssize_t rc = 0;
	long mypid;
	int running = 0;
	char proc_path[PATH_MAX], exe_path[PATH_MAX], myexe_path[PATH_MAX];

	/* check if pid is running */
	if (kill(pid, 0) < 0 && errno == ESRCH) {
		goto bail;
	}
#ifndef HAVE_PROC_PID
	return 1;
#endif
	/* check to make sure pid hasn't been reused by another process */
	snprintf(proc_path, sizeof(proc_path), "/proc/%ld/exe", pid);
	/* readlink() does not terminate the result; keep one byte for the NUL. */
	rc = readlink(proc_path, exe_path, sizeof(exe_path) - 1);
	if(rc < 0) {
		cl_perror("Could not read from %s", proc_path);
		goto bail;
	}
	exe_path[rc] = 0;

	mypid = (long) getpid();
	snprintf(proc_path, sizeof(proc_path), "/proc/%ld/exe", mypid);
	rc = readlink(proc_path, myexe_path, sizeof(myexe_path) - 1);
	if(rc < 0) {
		cl_perror("Could not read from %s", proc_path);
		goto bail;
	}
	myexe_path[rc] = 0;

	if(strcmp(exe_path, myexe_path) == 0) {
		running = 1;
	}
bail:
	return running;
}
/*
 * Take the pidfile lock `filename` for the calling process.
 *
 * An existing lockfile is honoured only when it names another live sbd
 * process (see sbd_lock_running()); an empty, unparsable or stale file is
 * removed. The new lock is written to "<filename>.<pid>" and then hard
 * linked to `filename`, after which the temporary name is removed.
 *
 * Returns  0 on success,
 *         -1 if filename is NULL (errno = EFAULT) or the lock is held by
 *            another process,
 *         -2 if the link did not show up in the temporary file's link count,
 *         -3 on any other failure.
 */
static int
sbd_lock_pidfile(const char *filename)
{
	char lf_name[256], tf_name[256], buf[LOCKSTRLEN+1];
	int fd;
	long pid, mypid;
	int rc;
	struct stat sbuf;

	if (filename == NULL) {
		errno = EFAULT;
		return -1;
	}

	mypid = (long) getpid();
	snprintf(lf_name, sizeof(lf_name), "%s",filename);
	snprintf(tf_name, sizeof(tf_name), "%s.%ld",
		 filename, mypid);

	if ((fd = open(lf_name, O_RDONLY)) >= 0) {
		ssize_t got;

		if (fstat(fd, &sbuf) >= 0 && sbuf.st_size < LOCKSTRLEN) {
			sleep(1); /* if someone was about to create one,
				   * give'm a sec to do so
				   * Though if they follow our protocol,
				   * this won't happen. They should really
				   * put the pid in, then link, not the
				   * other way around.
				   */
		}
		/* Leave room for the terminator sscanf() relies on. */
		got = read(fd, buf, sizeof(buf) - 1);
		if (got < 1) {
			/* lockfile empty -> rm it and go on */;
		} else {
			buf[got] = '\0';
			if (sscanf(buf, "%ld", &pid) < 1) {
				/* lockfile screwed up -> rm it and go on */
			} else {
				if (pid > 1 && (getpid() != pid)
				    && sbd_lock_running(pid)) {
					/* is locked by existing process
					 * -> give up */
					close(fd);
					return -1;
				} else {
					/* stale lockfile -> rm it and go on */
				}
			}
		}
		unlink(lf_name);
		close(fd);
	}

	if ((fd = open(tf_name, O_CREAT | O_WRONLY | O_EXCL, 0644)) < 0) {
		/* Hmmh, why did we fail? Anyway, nothing we can do about it */
		return -3;
	}

	/* Slight overkill with the %*d format ;-) */
	snprintf(buf, sizeof(buf), "%*ld\n", LOCKSTRLEN-1, mypid);
	if (write(fd, buf, LOCKSTRLEN) != LOCKSTRLEN) {
		/* Again, nothing we can do about this */
		rc = -3;
		close(fd);
		goto out;
	}
	close(fd);

	/* link() reports failure as -1 with the reason in errno. */
	if (link(tf_name, lf_name) < 0) {
		rc = (errno == EEXIST) ? -1 : -3;
	} else if (stat(tf_name, &sbuf) < 0) {
		/* something weird happened */
		rc = -3;
	} else if (sbuf.st_nlink < 2) {
		/* somehow, it didn't get through - NFS trouble? */
		rc = -2;
	} else {
		rc = 0;
	}
out:
	unlink(tf_name);
	return rc;
}
/*
 * Release the pidfile lock by removing the lockfile.
 *
 * No check is made that the lock still belongs to the caller.
 * Returns 0 on success and a negative value on failure; a NULL filename
 * sets errno to EFAULT and returns -1.
 */
static int
sbd_unlock_pidfile(const char *filename)
{
	char lock_path[256];

	if (!filename) {
		errno = EFAULT;
		return -1;
	}

	snprintf(lock_path, sizeof(lock_path), "%s", filename);
	return unlink(lock_path);
}
/*
 * Report whether the cluster-type servants ("cluster" / "pcmk") are fresh.
 *
 * all == true:  returns 1 only if none of them is outdated.
 * all == false: returns 1 as soon as one of them is not outdated.
 * Returns 0 when every servant is a disk (servant_count == disk_count).
 */
int cluster_alive(bool all)
{
	struct servants_list_item *cur;
	int none_outdated = 1;

	if (servant_count == disk_count) {
		return 0;
	}

	for (cur = servants_leader; cur != NULL; cur = cur->next) {
		if (!sbd_is_cluster(cur) && !sbd_is_pcmk(cur)) {
			continue;
		}
		if (cur->outdated) {
			none_outdated = 0;
			continue;
		}
		if (all == false) {
			return 1;
		}
	}
	return none_outdated;
}
/*
 * Decide whether `good_servants` healthy disks are enough.
 * With more than two disks a strict majority is required; with two or
 * fewer a single healthy disk suffices.
 */
int quorum_read(int good_servants)
{
	const int threshold = (disk_count > 2) ? disk_count/2 : 0;

	return (good_servants > threshold);
}
/*
 * Body of the inquisitor process.
 *
 * Takes the pidfile lock (when configured), blocks the control signals,
 * starts all servants and then loops on sigtimedwait(): each pass handles
 * the received signal, re-evaluates how fresh every servant's last report
 * is, decides whether to tickle the watchdog, decouples from the parent
 * once possible, checks the tickle latency against the watchdog timeouts
 * and restarts servants. The function only leaves through exit().
 */
void inquisitor_child(void)
{
int sig, pid;
sigset_t procmask;
siginfo_t sinfo;
int status;
struct timespec timeout;
int exiting = 0;
int decoupled = 0;
int cluster_appeared = 0;
int pcmk_override = 0;
time_t latency;
struct timespec t_last_tickle, t_now;
struct servants_list_item* s;
if (debug_mode) {
cl_log(LOG_ERR, "DEBUG MODE %d IS ACTIVE - DO NOT RUN IN PRODUCTION!", debug_mode);
}
set_proc_title("sbd: inquisitor");
if (pidfile) {
if (sbd_lock_pidfile(pidfile) < 0) {
exit(1);
}
}
/* All of these are collected synchronously with sigtimedwait() below. */
sigemptyset(&procmask);
sigaddset(&procmask, SIGCHLD);
sigaddset(&procmask, SIGTERM);
sigaddset(&procmask, SIG_LIVENESS);
sigaddset(&procmask, SIG_EXITREQ);
sigaddset(&procmask, SIG_TEST);
sigaddset(&procmask, SIG_PCMK_UNHEALTHY);
sigaddset(&procmask, SIG_RESTART);
sigaddset(&procmask, SIGUSR1);
sigaddset(&procmask, SIGUSR2);
sigprocmask(SIG_BLOCK, &procmask, NULL);
servants_start();
timeout.tv_sec = timeout_loop;
timeout.tv_nsec = 0;
clock_gettime(CLOCK_MONOTONIC, &t_last_tickle);
while (1) {
bool tickle = 0;
bool can_detach = 0;
int good_servants = 0;
sig = sigtimedwait(&procmask, &sinfo, &timeout);
clock_gettime(CLOCK_MONOTONIC, &t_now);
/* --- Step 1: react to the signal that woke us up (if any). --- */
if (sig == SIG_EXITREQ || sig == SIGTERM) {
servants_kill();
watchdog_close(true);
exiting = 1;
} else if (sig == SIGCHLD) {
/* Reap every terminated child; a disk servant's exit status may
 * carry a request (reset / shutoff / crashdump). */
while ((pid = waitpid(-1, &status, WNOHANG))) {
if (pid == -1 && errno == ECHILD) {
break;
} else {
s = lookup_servant_by_pid(pid);
if (sbd_is_disk(s)) {
if (WIFEXITED(status)) {
switch(WEXITSTATUS(status)) {
case EXIT_MD_IO_FAIL:
DBGLOG(LOG_INFO, "Servant for %s requests to be disowned",
s->devname);
break;
case EXIT_MD_REQUEST_RESET:
cl_log(LOG_WARNING, "%s requested a reset", s->devname);
do_reset();
break;
case EXIT_MD_REQUEST_SHUTOFF:
cl_log(LOG_WARNING, "%s requested a shutoff", s->devname);
do_off();
break;
case EXIT_MD_REQUEST_CRASHDUMP:
cl_log(LOG_WARNING, "%s requested a crashdump", s->devname);
do_crashdump();
break;
default:
break;
}
}
}
cleanup_servant_by_pid(pid);
}
}
} else if (sig == SIG_PCMK_UNHEALTHY) {
s = lookup_servant_by_pid(sinfo.si_pid);
if (sbd_is_cluster(s) || sbd_is_pcmk(s)) {
if (s->outdated == 0) {
cl_log(LOG_WARNING, "%s health check: UNHEALTHY", s->devname);
}
/* A non-zero but tiny timestamp: the age computed below becomes large. */
s->t_last.tv_sec = 1;
} else {
cl_log(LOG_WARNING, "Ignoring SIG_PCMK_UNHEALTHY from unknown source");
}
} else if (sig == SIG_LIVENESS) {
/* A servant reported in: remember when. */
s = lookup_servant_by_pid(sinfo.si_pid);
if (s) {
s->first_start = 0;
clock_gettime(CLOCK_MONOTONIC, &s->t_last);
}
} else if (sig == SIG_TEST) {
} else if (sig == SIGUSR1) {
if (exiting)
continue;
servants_start();
}
/* While shutting down, just wait until every servant is gone. */
if (exiting) {
if (check_all_dead()) {
if (pidfile) {
sbd_unlock_pidfile(pidfile);
}
exit(0);
} else
continue;
}
/* --- Step 2: classify each servant as fresh or outdated. --- */
good_servants = 0;
for (s = servants_leader; s; s = s->next) {
int age = t_now.tv_sec - s->t_last.tv_sec;
/* Servants that never reported are skipped. */
if (!s->t_last.tv_sec)
continue;
if (age < (int)(timeout_io+timeout_loop)) {
/* Only disks count towards good_servants. */
if (sbd_is_disk(s)) {
good_servants++;
}
if (s->outdated) {
cl_log(LOG_NOTICE, "Servant %s is healthy (age: %d)", s->devname, age);
}
s->outdated = 0;
} else if (!s->outdated) {
if (!s->restart_blocked) {
cl_log(LOG_WARNING, "Servant %s is outdated (age: %d)", s->devname, age);
}
s->outdated = 1;
}
}
/* --- Step 3: decide whether to tickle the watchdog / detach. --- */
if(disk_count == 0) {
/* NO disks, everything is up to the cluster */
if(cluster_alive(true)) {
/* We LIVE! */
if(cluster_appeared == false) {
- cl_log(LOG_NOTICE, "Active cluster detected");
+ cl_log(LOG_INFO, "Active cluster detected");
}
tickle = 1;
can_detach = 1;
cluster_appeared = 1;
} else if(cluster_alive(false)) {
if(!decoupled) {
/* On the way up, detach and arm the watchdog */
- cl_log(LOG_NOTICE, "Partial cluster detected, detaching");
+ cl_log(LOG_INFO, "Partial cluster detected, detaching");
}
can_detach = 1;
tickle = !cluster_appeared;
} else if(!decoupled) {
/* Stay alive until the cluster comes up */
tickle = !cluster_appeared;
}
} else if(disk_priority == 1 || servant_count == disk_count) {
if (quorum_read(good_servants)) {
/* There are disks and we're connected to the majority of them */
tickle = 1;
can_detach = 1;
pcmk_override = 0;
} else if (servant_count > disk_count && cluster_alive(true)) {
tickle = 1;
if(!pcmk_override) {
cl_log(LOG_WARNING, "Majority of devices lost - surviving on pacemaker");
pcmk_override = 1; /* Only log this message once */
}
}
} else if(cluster_alive(true) && quorum_read(good_servants)) {
/* Both disk and cluster servants are healthy */
tickle = 1;
can_detach = 1;
cluster_appeared = 1;
} else if(quorum_read(good_servants)) {
/* The cluster takes priority but only once
* connected for the first time.
*
* Until then, we tickle based on disk quorum.
*/
can_detach = 1;
tickle = !cluster_appeared;
}
/* cl_log(LOG_DEBUG, "Tickle: q=%d, g=%d, p=%d, s=%d", */
/* quorum_read(good_servants), good_servants, tickle, disk_count); */
if(tickle) {
watchdog_tickle();
clock_gettime(CLOCK_MONOTONIC, &t_last_tickle);
}
if (!decoupled && can_detach) {
/* We only do this at the point either the disk or
* cluster servants become healthy
*/
cl_log(LOG_DEBUG, "Decoupling");
if (inquisitor_decouple() < 0) {
servants_kill();
exiting = 1;
continue;
} else {
decoupled = 1;
}
}
/* --- Step 4: compare the time since the last tickle with the limits. --- */
/* Note that this can actually be negative, since we set
* last_tickle after we set now. */
latency = t_now.tv_sec - t_last_tickle.tv_sec;
if (timeout_watchdog && (latency > (int)timeout_watchdog)) {
if (!decoupled) {
/* We're still being watched by our
* parent. We don't fence, but exit. */
cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up.");
servants_kill();
exiting = 1;
continue;
}
if (debug_mode < 2) {
/* At level 2 or above, we do nothing, but expect
* things to eventually return to
* normal. */
do_reset();
} else {
cl_log(LOG_ERR, "SBD: DEBUG MODE: Would have fenced due to timeout!");
}
}
if (timeout_watchdog_warn && (latency > (int)timeout_watchdog_warn)) {
cl_log(LOG_WARNING,
"Latency: No liveness for %d s exceeds threshold of %d s (healthy servants: %d)",
(int)latency, (int)timeout_watchdog_warn, good_servants);
if (debug_mode && watchdog_use) {
/* In debug mode, trigger a reset before the watchdog can panic the machine */
do_reset();
}
}
/* --- Step 5: restart servants, throttled by the restart count/interval. --- */
for (s = servants_leader; s; s = s->next) {
int age = t_now.tv_sec - s->t_started.tv_sec;
/* Enough time since the last start: allow restarts again. */
if (age > servant_restart_interval) {
s->restarts = 0;
s->restart_blocked = 0;
}
if (servant_restart_count
&& (s->restarts >= servant_restart_count)
&& !s->restart_blocked) {
if (servant_restart_count > 1) {
cl_log(LOG_WARNING, "Max retry count (%d) reached: not restarting servant for %s",
(int)servant_restart_count, s->devname);
}
s->restart_blocked = 1;
}
if (!s->restart_blocked) {
servant_start(s);
}
}
}
/* not reached */
exit(0);
}
/*
 * Launch the inquisitor daemon and wait for its verdict.
 *
 * The daemon child runs inquisitor_child(). The parent blocks SIGCHLD and
 * SIG_LIVENESS and waits for one of them: SIG_LIVENESS means the child
 * came up properly (returns 0), a reaped child means it did not
 * (returns -1).
 */
int inquisitor(void)
{
	sigset_t procmask;
	siginfo_t sinfo;
	int status;
	int sig, pid, inquisitor_pid;

	/* Where's the best place for sysrq init ?*/
	sysrq_init();

	sigemptyset(&procmask);
	sigaddset(&procmask, SIGCHLD);
	sigaddset(&procmask, SIG_LIVENESS);
	sigprocmask(SIG_BLOCK, &procmask, NULL);

	inquisitor_pid = make_daemon();
	if (inquisitor_pid == 0) {
		inquisitor_child();
	}

	/* We're the parent. Wait for a happy signal from our child
	 * before we proceed - we either get "SIG_LIVENESS" when the
	 * inquisitor has completed the first successful round, or
	 * ECHLD when it exits with an error. */
	for (;;) {
		sig = sigwaitinfo(&procmask, &sinfo);

		if (sig == SIGCHLD) {
			while ((pid = waitpid(-1, &status, WNOHANG)) != 0) {
				if (pid != -1 || errno != ECHILD) {
					/* We got here because the inquisitor
					 * did not succeed. */
					return -1;
				}
				/* No children left to reap. */
				break;
			}
			continue;
		}

		if (sig == SIG_LIVENESS) {
			/* Inquisitor started up properly. */
			return 0;
		}

		fprintf(stderr, "Nobody expected the spanish inquisition!\n");
	}

	/* not reached */
	return -1;
}
/*
 * Split a device list into entries and recruit a servant for each.
 *
 * Entries are separated by ';' and/or whitespace; empty entries are
 * ignored. Each entry is copied with exactly its own length, so a
 * whitespace-separated list can no longer write past the entry buffer.
 *
 * Returns the number of entries handed to recruit_servant(); 0 for a NULL
 * or empty line.
 */
int
parse_device_line(const char *line)
{
	const char *cursor;
	int found = 0;

	if (line == NULL || line[0] == '\0') {
		return found;
	}

	cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", (int)strlen(line), line);

	cursor = line;
	while (*cursor != '\0') {
		const char *start;
		size_t len;
		char *entry;

		/* Skip separators in front of the next entry. */
		while (*cursor == ';' || isspace((unsigned char)*cursor)) {
			cursor++;
		}

		/* The entry runs up to the next separator or the end of line. */
		start = cursor;
		while (*cursor != '\0' && *cursor != ';'
		       && !isspace((unsigned char)*cursor)) {
			cursor++;
		}

		len = (size_t)(cursor - start);
		if (len == 0) {
			/* Only trailing separators were left. */
			continue;
		}

		entry = calloc(1, len + 1);
		if (entry == NULL) {
			cl_log(LOG_WARNING, "Could not parse (%d %d): %s",
			       (int)(start - line), (int)(cursor - line), start);
			continue;
		}
		memcpy(entry, start, len);

		cl_log(LOG_DEBUG, "Adding '%s'", entry);
		recruit_servant(entry, 0);
		found++;

		free(entry);
	}
	return found;
}
#define SBD_SOURCE_FILES "sbd-cluster.c,sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,setproctitle.c"
/*
 * Add a per-file log filter at `priority` to both the syslog and the
 * stderr target. A NULL `files` selects sbd's own source files.
 */
static void
sbd_log_filter_ctl(const char *files, uint8_t priority)
{
	const char *selection = (files != NULL) ? files : SBD_SOURCE_FILES;

	qb_log_filter_ctl(QB_LOG_SYSLOG, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, selection, priority);
	qb_log_filter_ctl(QB_LOG_STDERR, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, selection, priority);
}
/*
 * Interpret how often a toggle option was given: the remainder of the
 * count divided by two, i.e. 1 for an odd positive count and 0 for an
 * even one.
 */
int arg_enabled(int arg_count)
{
	const int remainder = arg_count % 2;

	return remainder;
}
/*
 * Entry point of sbd.
 *
 * Sets up logging, reads defaults from the SBD_* environment variables,
 * lets command line options override them, validates the configuration
 * and finally dispatches on the command word that follows the options.
 *
 * Returns 0 on success and 1 on failure; a usage error (internal status
 * -2) additionally prints the usage text.
 */
int main(int argc, char **argv, char **envp)
{
	int exit_status = 0;
	int c;
	int W_count = 0;
	int c_count = 0;
	int P_count = 0;
	int qb_facility;
	const char *value = NULL;
	int start_delay = 0;

	if ((cmdname = strrchr(argv[0], '/')) == NULL) {
		cmdname = argv[0];
	} else {
		++cmdname;
	}

	watchdogdev = strdup("/dev/watchdog");
	watchdogdev_is_default = true;

	qb_facility = qb_log_facility2int("daemon");
	qb_log_init(cmdname, qb_facility, LOG_WARNING);
	sbd_set_format_string(QB_LOG_SYSLOG, "sbd");
	qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE);
	qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE);
	sbd_log_filter_ctl(NULL, LOG_NOTICE);

	sbd_get_uname();

	/* --- Defaults from the environment --- */
	value = getenv("SBD_DEVICE");
	if(value) {
#if SUPPORT_SHARED_DISK
		int devices = parse_device_line(value);
		if(devices < 1) {
			fprintf(stderr, "Invalid device line: %s\n", value);
			exit_status = -2;
			goto out;
		}
#else
		fprintf(stderr, "Shared disk functionality not supported\n");
		exit_status = -2;
		goto out;
#endif
	}

	value = getenv("SBD_PACEMAKER");
	if(value) {
		check_pcmk = crm_is_true(value);
		check_cluster = crm_is_true(value);
	}
	cl_log(LOG_INFO, "Enable pacemaker checks: %d (%s)", (int)check_pcmk, value?value:"default");

	value = getenv("SBD_STARTMODE");
	if(value == NULL) {
	} else if(strcmp(value, "clean") == 0) {
		start_mode = 1;
	} else if(strcmp(value, "always") == 0) {
		start_mode = 0;
	}
	cl_log(LOG_INFO, "Start mode set to: %d (%s)", (int)start_mode, value?value:"default");

	value = getenv("SBD_WATCHDOG_DEV");
	if(value) {
		free(watchdogdev);
		watchdogdev = strdup(value);
		watchdogdev_is_default = false;
	}

	/* SBD_WATCHDOG has been dropped from sbd.sysconfig example.
	 * This is for backward compatibility. */
	value = getenv("SBD_WATCHDOG");
	if(value) {
		watchdog_use = crm_is_true(value);
	}

	value = getenv("SBD_WATCHDOG_TIMEOUT");
	if(value) {
		timeout_watchdog = crm_get_msec(value) / 1000;
		if(timeout_watchdog > 5) {
			timeout_watchdog_warn = (int)timeout_watchdog / 5 * 3;
		}
	}

	value = getenv("SBD_PIDFILE");
	if(value) {
		pidfile = strdup(value);
		cl_log(LOG_INFO, "pidfile set to %s", pidfile);
	}

	value = getenv("SBD_DELAY_START");
	if(value) {
		start_delay = crm_is_true(value);
	}
	cl_log(LOG_DEBUG, "Start delay: %d (%s)", (int)start_delay, value?value:"default");

	/* --- Command line options override the environment --- */
	while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:")) != -1) {
		switch (c) {
		case 'D':
			break;
		case 'Z':
			debug_mode++;
			cl_log(LOG_INFO, "Debug mode now at level %d", (int)debug_mode);
			break;
		case 'R':
			skip_rt = 1;
			cl_log(LOG_INFO, "Realtime mode deactivated.");
			break;
		case 'S':
			start_mode = atoi(optarg);
			cl_log(LOG_INFO, "Start mode set to: %d", (int)start_mode);
			break;
		case 's':
			timeout_startup = atoi(optarg);
			cl_log(LOG_INFO, "Start timeout set to: %d", (int)timeout_startup);
			break;
		case 'v':
			debug++;
			if(debug == 1) {
				sbd_log_filter_ctl(NULL, LOG_INFO);
				cl_log(LOG_INFO, "Verbose mode enabled.");
			} else if(debug == 2) {
				sbd_log_filter_ctl(NULL, LOG_DEBUG);
				cl_log(LOG_INFO, "Debug mode enabled.");
			} else if(debug == 3) {
				/* Go nuts, turn on pacemaker's logging too */
				sbd_log_filter_ctl("*", LOG_DEBUG);
				cl_log(LOG_INFO, "Debug library mode enabled.");
			}
			break;
		case 'T':
			watchdog_set_timeout = 0;
			cl_log(LOG_INFO, "Setting watchdog timeout disabled; using defaults.");
			break;
		case 'W':
			W_count++;
			break;
		case 'w':
			/* Report the device that was selected, not the one it replaces. */
			cl_log(LOG_NOTICE, "Using watchdog device '%s'", optarg);
			free(watchdogdev);
			watchdogdev = strdup(optarg);
			watchdogdev_is_default = false;
			break;
		case 'd':
#if SUPPORT_SHARED_DISK
			recruit_servant(optarg, 0);
#else
			fprintf(stderr, "Shared disk functionality not supported\n");
			exit_status = -2;
			goto out;
#endif
			break;
		case 'c':
			c_count++;
			break;
		case 'P':
			P_count++;
			break;
		case 'z':
			disk_priority = 0;
			break;
		case 'n':
			/* Replaces the name obtained by sbd_get_uname(); drop the old copy. */
			free(local_uname);
			local_uname = strdup(optarg);
			cl_log(LOG_INFO, "Overriding local hostname to %s", local_uname);
			break;
		case 'p':
			/* May replace a value taken from SBD_PIDFILE; drop the old copy. */
			free(pidfile);
			pidfile = strdup(optarg);
			cl_log(LOG_INFO, "pidfile set to %s", pidfile);
			break;
		case 'C':
			timeout_watchdog_crashdump = atoi(optarg);
			cl_log(LOG_INFO, "Setting crashdump watchdog timeout to %d",
			       (int)timeout_watchdog_crashdump);
			break;
		case '1':
			timeout_watchdog = atoi(optarg);
			if(timeout_watchdog > 5) {
				timeout_watchdog_warn = (int)timeout_watchdog / 5 * 3;
			}
			break;
		case '2':
			timeout_allocate = atoi(optarg);
			break;
		case '3':
			timeout_loop = atoi(optarg);
			break;
		case '4':
			timeout_msgwait = atoi(optarg);
			break;
		case '5':
			timeout_watchdog_warn = atoi(optarg);
			cl_log(LOG_INFO, "Setting latency warning to %d",
			       (int)timeout_watchdog_warn);
			break;
		case 't':
			servant_restart_interval = atoi(optarg);
			cl_log(LOG_INFO, "Setting servant restart interval to %d",
			       (int)servant_restart_interval);
			break;
		case 'I':
			timeout_io = atoi(optarg);
			cl_log(LOG_INFO, "Setting IO timeout to %d",
			       (int)timeout_io);
			break;
		case 'F':
			servant_restart_count = atoi(optarg);
			cl_log(LOG_INFO, "Servant restart count set to %d",
			       (int)servant_restart_count);
			break;
		case 'h':
			usage();
			return (0);
		default:
			exit_status = -2;
			goto out;
			break;
		}
	}

	/* --- Resolve toggles and validate --- */
	if (watchdogdev == NULL || strcmp(watchdogdev, "/dev/null") == 0) {
		watchdog_use = 0;
	} else if (W_count > 0) {
		watchdog_use = arg_enabled(W_count);
	}
	if (watchdog_use) {
		cl_log(LOG_INFO, "Watchdog enabled.");
	} else {
		cl_log(LOG_INFO, "Watchdog disabled.");
	}

	if (c_count > 0) {
		check_cluster = arg_enabled(c_count);
	}
	if (P_count > 0) {
		check_pcmk = arg_enabled(P_count);
	}

	if ((disk_count > 0) && (strlen(local_uname) > SECTOR_NAME_MAX)) {
		fprintf(stderr, "Node name mustn't be longer than %d chars.\n",
			SECTOR_NAME_MAX);
		fprintf(stderr, "If uname is longer define a name to be used by sbd.\n");
		exit_status = -1;
		goto out;
	}

	if (disk_count > 3) {
		fprintf(stderr, "You can specify up to 3 devices via the -d option.\n");
		exit_status = -1;
		goto out;
	}

	/* There must at least be one command following the options: */
	if ((argc - optind) < 1) {
		fprintf(stderr, "Not enough arguments.\n");
		exit_status = -2;
		goto out;
	}

	if (init_set_proc_title(argc, argv, envp) < 0) {
		fprintf(stderr, "Allocation of proc title failed.\n");
		exit_status = -1;
		goto out;
	}

	/* --- Command dispatch --- */
	/* NOTE(review): when SUPPORT_SHARED_DISK is off, an unknown command
	 * leaves exit_status at 0 and is reported as success - confirm whether
	 * that is intended. */
#if SUPPORT_SHARED_DISK
	if (strcmp(argv[optind], "create") == 0) {
		exit_status = init_devices(servants_leader);
	} else if (strcmp(argv[optind], "dump") == 0) {
		exit_status = dump_headers(servants_leader);
	} else if (strcmp(argv[optind], "allocate") == 0) {
		exit_status = allocate_slots(argv[optind + 1], servants_leader);
	} else if (strcmp(argv[optind], "list") == 0) {
		exit_status = list_slots(servants_leader);
	} else if (strcmp(argv[optind], "message") == 0) {
		/* argv[optind + 2] lies beyond argv's NULL terminator when the
		 * command word is the last argument; pass NULL in that case. */
		const char *msg_cmd = ((argc - optind) > 2) ? argv[optind + 2] : NULL;

		exit_status = messenger(argv[optind + 1], msg_cmd, servants_leader);
	} else if (strcmp(argv[optind], "ping") == 0) {
		exit_status = ping_via_slots(argv[optind + 1], servants_leader);
	} else if (strcmp(argv[optind], "watch") == 0) {
		if(disk_count > 0) {
			/* If no devices are specified, its not an error to be unable to find one */
			open_any_device(servants_leader);
		}
		if(start_delay) {
			unsigned long delay = get_first_msgwait(servants_leader);
			sleep(delay);
		}
	} else {
		exit_status = -2;
	}
#endif

	if (strcmp(argv[optind], "query-watchdog") == 0) {
		exit_status = watchdog_info();
	} else if (strcmp(argv[optind], "test-watchdog") == 0) {
		exit_status = watchdog_test();
	} else if (strcmp(argv[optind], "watch") == 0) {
		/* sleep $(sbd $SBD_DEVICE_ARGS dump | grep -m 1 msgwait | awk '{print $4}') 2>/dev/null */
		/* We only want this to have an effect during watch right now;
		 * pinging and fencing would be too confused */
		cl_log(LOG_INFO, "Turning on pacemaker checks: %d", check_pcmk);
		if (check_pcmk) {
			recruit_servant("pcmk", 0);
#if SUPPORT_PLUGIN
			check_cluster = 1;
#endif
		}
		cl_log(LOG_INFO, "Turning on cluster checks: %d", check_cluster);
		if (check_cluster) {
			recruit_servant("cluster", 0);
		}
		exit_status = inquisitor();
	}

out:
	if (exit_status < 0) {
		if (exit_status == -2) {
			usage();
		} else {
			fprintf(stderr, "sbd failed; please check the logs.\n");
		}
		return (1);
	}
	return (0);
}
diff --git a/src/sbd-md.c b/src/sbd-md.c
index 6a964dd..6f152c4 100644
--- a/src/sbd-md.c
+++ b/src/sbd-md.c
@@ -1,1235 +1,1235 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "sbd.h"
/* Message codes exchanged through a mailbox; cmd2char() and char2cmd()
 * translate between these and their command names. */
#define SBD_MSG_EMPTY 0x00
#define SBD_MSG_TEST 0x01
#define SBD_MSG_RESET 0x02
#define SBD_MSG_OFF 0x03
#define SBD_MSG_EXIT 0x04
#define SBD_MSG_CRASHDUMP 0x05
/* Sector index of slot / mailbox number n: slots occupy the odd sectors
 * starting at 1, mailboxes the even sectors starting at 2. The argument is
 * parenthesised so that expressions such as `i + 1` expand correctly. */
#define SLOT_TO_SECTOR(slot) (1 + (slot) * 2)
#define MBOX_TO_SECTOR(mbox) (2 + (mbox) * 2)
extern int disk_count;
/* These have to match the values in the header of the partition */
static char sbd_magic[8] = "SBD_SBD_";
static char sbd_version = 0x02;
/* Pair of strings: a name and a message. */
struct slot_msg_arg_t {
	const char* name;
	const char* msg;
};
/*
 * Translate a command name into its SBD_MSG_* code.
 * Returns -1 for a name that is not known.
 */
static signed char
cmd2char(const char *cmd)
{
	static const struct {
		const char *name;
		signed char code;
	} commands[] = {
		{ "clear",     SBD_MSG_EMPTY },
		{ "test",      SBD_MSG_TEST },
		{ "reset",     SBD_MSG_RESET },
		{ "off",       SBD_MSG_OFF },
		{ "exit",      SBD_MSG_EXIT },
		{ "crashdump", SBD_MSG_CRASHDUMP },
	};
	size_t i;

	for (i = 0; i < sizeof(commands) / sizeof(commands[0]); i++) {
		if (strcmp(commands[i].name, cmd) == 0) {
			return commands[i].code;
		}
	}
	return -1;
}
/*
 * Translate an SBD_MSG_* code into its command name.
 * Returns "undefined" for a code that is not known.
 */
static const char*
char2cmd(const char cmd)
{
	const char *name = "undefined";

	switch (cmd) {
	case SBD_MSG_EMPTY:
		name = "clear";
		break;
	case SBD_MSG_TEST:
		name = "test";
		break;
	case SBD_MSG_RESET:
		name = "reset";
		break;
	case SBD_MSG_OFF:
		name = "off";
		break;
	case SBD_MSG_EXIT:
		name = "exit";
		break;
	case SBD_MSG_CRASHDUMP:
		name = "crashdump";
		break;
	default:
		break;
	}
	return name;
}
/*
 * Release everything open_device() set up for `st`: the AIO context, the
 * device file descriptor and the context structure itself.
 * A NULL `st` is ignored.
 */
static void
close_device(struct sbd_context *st)
{
	if (st == NULL) {
		return;
	}
	/* Created by io_setup() in open_device(); it is not released by
	 * closing the device descriptor. */
	io_destroy(st->ioctx);
	close(st->devfd);
	free(st);
}
/*
 * Open `devname` for synchronous direct I/O and set up an AIO context
 * for it.
 *
 * devname:  path of the device; NULL yields NULL.
 * loglevel: level used to report a failing open(); LOG_DEBUG goes through
 *           DBGLOG, anything else through cl_log.
 *
 * Also queries the device's logical sector size into the global
 * sector_size. Returns the new context, or NULL on any failure; release
 * it with close_device().
 */
static struct sbd_context *
open_device(const char* devname, int loglevel)
{
	struct sbd_context *st;

	if (!devname)
		return NULL;

	st = malloc(sizeof(struct sbd_context));
	if (!st)
		return NULL;
	memset(st, 0, sizeof(struct sbd_context));

	if (io_setup(1, &st->ioctx) != 0) {
		cl_perror("io_setup failed");
		free(st);
		return NULL;
	}

	st->devfd = open(devname, O_SYNC|O_RDWR|O_DIRECT);
	if (st->devfd == -1) {
		if (loglevel == LOG_DEBUG) {
			DBGLOG(loglevel, "Opening device %s failed.", devname);
		} else {
			cl_log(loglevel, "Opening device %s failed.", devname);
		}
		/* The AIO context already exists at this point; do not leak it. */
		io_destroy(st->ioctx);
		free(st);
		return NULL;
	}

	ioctl(st->devfd, BLKSSZGET, &sector_size);
	if (sector_size == 0) {
		cl_perror("Get sector size failed.\n");
		close_device(st);
		return NULL;
	}
	return st;
}
/* Allocate one zero-filled buffer of sector_size bytes with valloc().
 * Never returns NULL: the process exits with status 1 when the
 * allocation fails. The caller releases the buffer with free(). */
static void *
sector_alloc(void)
{
	void *buf = valloc(sector_size);

	if (buf == NULL) {
		exit(1);
	}
	return memset(buf, 0, sector_size);
}
static int
sector_io(struct sbd_context *st, int sector, void *data, int rw)
{
struct timespec timeout;
struct io_event event;
struct iocb *ios[1] = { &st->io };
long r;
timeout.tv_sec = timeout_io;
timeout.tv_nsec = 0;
memset(&st->io, 0, sizeof(struct iocb));
if (rw) {
io_prep_pwrite(&st->io, st->devfd, data, sector_size, sector_size * sector);
} else {
io_prep_pread(&st->io, st->devfd, data, sector_size, sector_size * sector);
}
if (io_submit(st->ioctx, 1, ios) != 1) {
cl_log(LOG_ERR, "Failed to submit IO request! (rw=%d)", rw);
return -1;
}
errno = 0;
r = io_getevents(st->ioctx, 1L, 1L, &event, &timeout);
if (r < 0 ) {
cl_log(LOG_ERR, "Failed to retrieve IO events (rw=%d)", rw);
return -1;
} else if (r < 1L) {
cl_log(LOG_INFO, "Cancelling IO request due to timeout (rw=%d)", rw);
r = io_cancel(st->ioctx, ios[0], &event);
if (r) {
DBGLOG(LOG_INFO, "Could not cancel IO request (rw=%d)", rw);
/* Doesn't really matter, debugging information.
*/
}
return -1;
} else if (r > 1L) {
cl_log(LOG_ERR, "More than one IO was returned (r=%ld)", r);
return -1;
}
/* IO is happy */
if (event.res == sector_size) {
return 0;
} else {
cl_log(LOG_ERR, "Short IO (rw=%d, res=%lu, sector_size=%d)",
rw, event.res, sector_size);
return -1;
}
}
/* Write one sector from data; thin wrapper around sector_io() with rw=1.
 * Returns 0 on success, -1 on failure. */
static int
sector_write(struct sbd_context *st, int sector, void *data)
{
	const int writing = 1;

	return sector_io(st, sector, data, writing);
}
/* Read one sector into data; thin wrapper around sector_io() with rw=0.
 * Returns 0 on success, -1 on failure. */
static int
sector_read(struct sbd_context *st, int sector, void *data)
{
	const int writing = 0;

	return sector_io(st, sector, data, writing);
}
/* Read the node entry of the given slot into s_node.
 * Returns the result of sector_read(). */
static int
slot_read(struct sbd_context *st, int slot, struct sector_node_s *s_node)
{
	int sector = SLOT_TO_SECTOR(slot);

	return sector_read(st, sector, s_node);
}
/* Write s_node as the node entry of the given slot.
 * Returns the result of sector_write(). */
static int
slot_write(struct sbd_context *st, int slot, struct sector_node_s *s_node)
{
	int sector = SLOT_TO_SECTOR(slot);

	return sector_write(st, sector, s_node);
}
/* Write s_mbox into the mailbox sector of the given slot, without
 * reading it back. Returns the result of sector_write(). */
static int
mbox_write(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox)
{
	int sector = MBOX_TO_SECTOR(mbox);

	return sector_write(st, sector, s_mbox);
}
/* Read the mailbox sector of the given slot into s_mbox.
 * Returns the result of sector_read(). */
static int
mbox_read(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox)
{
	int sector = MBOX_TO_SECTOR(mbox);

	return sector_read(st, sector, s_mbox);
}
/* Write s_mbox to the mailbox of the given slot, then read the sector
 * back and compare all sector_size bytes with what was written.
 * Returns 0 when the read-back matches, -1 when the write, the read or
 * the comparison fails. */
static int
mbox_write_verify(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox)
{
	void *readback;
	int rc = -1;

	if (sector_write(st, MBOX_TO_SECTOR(mbox), s_mbox) < 0)
		return -1;

	readback = sector_alloc();
	if (sector_read(st, MBOX_TO_SECTOR(mbox), readback) >= 0) {
		if (memcmp(s_mbox, readback, sector_size) == 0) {
			rc = 0;
		} else {
			cl_log(LOG_ERR, "Write verification failed!");
		}
	}
	free(readback);
	return rc;
}
/* Write the device header to sector 0.
 * The 32-bit fields are converted to network byte order IN PLACE before
 * the write, so *s_header holds network-order values when this returns.
 * Returns the result of sector_write(). */
static int header_write(struct sbd_context *st, struct sector_header_s *s_header)
{
	struct sector_header_s *hdr = s_header;

	hdr->sector_size      = htonl(hdr->sector_size);
	hdr->timeout_watchdog = htonl(hdr->timeout_watchdog);
	hdr->timeout_allocate = htonl(hdr->timeout_allocate);
	hdr->timeout_loop     = htonl(hdr->timeout_loop);
	hdr->timeout_msgwait  = htonl(hdr->timeout_msgwait);

	return sector_write(st, 0, hdr);
}
/* Read the device header from sector 0 into s_header and convert its
 * 32-bit fields from network to host byte order.
 * Side effect: the four timeouts found on disk are copied into the
 * global timeout_* defaults.
 * Returns 0 on success, -1 if the sector could not be read (the globals
 * are left untouched in that case). */
static int
header_read(struct sbd_context *st, struct sector_header_s *s_header)
{
	if (sector_read(st, 0, s_header) < 0)
		return -1;

	s_header->sector_size = ntohl(s_header->sector_size);

	/* Convert each timeout, then publish it as the global default. */
	s_header->timeout_watchdog = ntohl(s_header->timeout_watchdog);
	timeout_watchdog = s_header->timeout_watchdog;

	s_header->timeout_allocate = ntohl(s_header->timeout_allocate);
	timeout_allocate = s_header->timeout_allocate;

	s_header->timeout_loop = ntohl(s_header->timeout_loop);
	timeout_loop = s_header->timeout_loop;

	s_header->timeout_msgwait = ntohl(s_header->timeout_msgwait);
	timeout_msgwait = s_header->timeout_msgwait;

	return 0;
}
/* Check that a header (already in host byte order) carries the expected
 * magic, major version and the sector size of the current device.
 * Only the first mismatch is logged.
 * Returns 0 when all three match, -1 otherwise. */
static int
valid_header(const struct sector_header_s *s_header)
{
	int rc = -1;

	if (memcmp(s_header->magic, sbd_magic, sizeof(s_header->magic)) != 0) {
		cl_log(LOG_ERR, "Header magic does not match.");
	} else if (s_header->version != sbd_version) {
		cl_log(LOG_ERR, "Header version does not match.");
	} else if (s_header->sector_size != sector_size) {
		cl_log(LOG_ERR, "Header sector size does not match.");
	} else {
		rc = 0;
	}
	return rc;
}
/* Read and validate the header of an open device.
 * Returns a freshly allocated header in host byte order that the caller
 * must free(), or NULL when the header cannot be read or is not valid.
 * Side effect (via header_read): the global timeout_* values are
 * overwritten with the values found on disk. */
static struct sector_header_s *
header_get(struct sbd_context *st)
{
	struct sector_header_s *s_header;

	s_header = sector_alloc();

	if (header_read(st, s_header) < 0) {
		cl_log(LOG_ERR, "Unable to read header from device %d", st->devfd);
		/* Callers retry in loops; do not leak the sector buffer. */
		free(s_header);
		return NULL;
	}

	if (valid_header(s_header) < 0) {
		cl_log(LOG_ERR, "header on device %d is not valid.", st->devfd);
		free(s_header);
		return NULL;
	}

	return s_header;
}
/* Print the header of an open device to stdout in human-readable form.
 * The UUID line is only printed for headers with minor_version > 0.
 * Returns 0 on success, -1 when no valid header could be obtained. */
static int
header_dump(struct sbd_context *st)
{
	struct sector_header_s *s_header;
	char uuid[37];

	s_header = header_get(st);
	if (s_header == NULL)
		return -1;

	printf("Header version : %u.%u\n", s_header->version,
	       s_header->minor_version);
	if (s_header->minor_version > 0) {
		uuid_unparse_lower(s_header->uuid, uuid);
		printf("UUID : %s\n", uuid);
	}
	printf("Number of slots : %u\n", s_header->slots);
	printf("Sector size : %lu\n",
	       (unsigned long)s_header->sector_size);
	printf("Timeout (watchdog) : %lu\n",
	       (unsigned long)s_header->timeout_watchdog);
	printf("Timeout (allocate) : %lu\n",
	       (unsigned long)s_header->timeout_allocate);
	printf("Timeout (loop) : %lu\n",
	       (unsigned long)s_header->timeout_loop);
	printf("Timeout (msgwait) : %lu\n",
	       (unsigned long)s_header->timeout_msgwait);

	/* header_get() hands over ownership of the buffer. */
	free(s_header);
	return 0;
}
/* Initialize an open device: write a fresh version-2.1 header (current
 * global timeouts, newly generated UUID, 255 slots) to sector 0, then
 * zero the node entry and the mailbox of every slot.
 * Progress is reported both through cl_log and on stdout.
 * Returns 0 on success, -1 as soon as any sector write fails. */
static int
init_device(struct sbd_context *st)
{
	struct sector_header_s *s_header;
	struct sector_node_s *s_node;
	struct sector_mbox_s *s_mbox;
	char uuid[37];
	int i;
	int rc = 0;

	/* All three buffers come back zero-filled from sector_alloc(). */
	s_header = sector_alloc();
	s_node = sector_alloc();
	s_mbox = sector_alloc();

	memcpy(s_header->magic, sbd_magic, sizeof(s_header->magic));
	s_header->version = sbd_version;
	s_header->slots = 255;
	s_header->sector_size = sector_size;
	s_header->timeout_watchdog = timeout_watchdog;
	s_header->timeout_allocate = timeout_allocate;
	s_header->timeout_loop = timeout_loop;
	s_header->timeout_msgwait = timeout_msgwait;
	s_header->minor_version = 1;
	uuid_generate(s_header->uuid);
	uuid_unparse_lower(s_header->uuid, uuid);

	cl_log(LOG_INFO, "Creating version %d.%d header on device %d (uuid: %s)",
	       s_header->version, s_header->minor_version,
	       st->devfd, uuid);
	fprintf(stdout, "Creating version %d.%d header on device %d (uuid: %s)\n",
		s_header->version, s_header->minor_version,
		st->devfd, uuid);
	/* header_write() converts the 32-bit fields to network order in
	 * place; only the single-byte fields are used below. */
	if (header_write(st, s_header) < 0) {
		rc = -1; goto out;
	}

	cl_log(LOG_INFO, "Initializing %d slots on device %d",
	       s_header->slots,
	       st->devfd);
	fprintf(stdout, "Initializing %d slots on device %d\n",
		s_header->slots,
		st->devfd);
	for (i=0;i < s_header->slots;i++) {
		if (slot_write(st, i, s_node) < 0) {
			rc = -1; goto out;
		}
		if (mbox_write(st, i, s_mbox) < 0) {
			rc = -1; goto out;
		}
	}

out:	free(s_node);
	free(s_header);
	free(s_mbox);
	return(rc);
}
/* Search the slots announced by s_header for an in-use node entry whose
 * name matches (case-insensitively, up to SECTOR_NAME_MAX characters).
 * A linear scan is required because slots might not be continuous.
 * Returns the slot number, -1 if no slot carries the name (or name is
 * NULL), or -2 if a slot could not be read. */
static int
slot_lookup(struct sbd_context *st, const struct sector_header_s *s_header, const char *name)
{
	struct sector_node_s *entry;
	int found = -1;
	int slot;

	if (name == NULL) {
		cl_log(LOG_ERR, "slot_lookup(): No name specified.\n");
		return -1;
	}

	entry = sector_alloc();
	for (slot = 0; slot < s_header->slots; slot++) {
		if (slot_read(st, slot, entry) < 0) {
			found = -2;
			break;
		}
		if (entry->in_use == 0) {
			continue;
		}
		if (strncasecmp(entry->name, name, SECTOR_NAME_MAX) != 0) {
			continue;
		}
		DBGLOG(LOG_INFO, "%s owns slot %d", name, slot);
		found = slot;
		break;
	}
	free(entry);
	return found;
}
/* Find the first slot whose node entry is not marked in use.
 * Returns the slot number, or -1 when every slot is taken or a slot
 * could not be read. */
static int
slot_unused(struct sbd_context *st, const struct sector_header_s *s_header)
{
	struct sector_node_s *probe = sector_alloc();
	int found = -1;
	int slot;

	for (slot = 0; slot < s_header->slots; slot++) {
		if (slot_read(st, slot, probe) < 0) {
			break;
		}
		if (probe->in_use == 0) {
			found = slot;
			break;
		}
	}
	free(probe);
	return found;
}
/* Return the slot owned by name on this device, claiming a free one if
 * the name has no slot yet.
 *
 * Claiming is optimistic: the first unused slot is written with the
 * name, the function sleeps for timeout_allocate seconds and then looks
 * the name up again; the loop repeats until the lookup finds the name.
 *
 * Returns the slot number (>= 0), -1 when name is NULL, the header is
 * unusable, no free slot is left or a write fails, and -2 when a slot
 * could not be read during lookup. */
static int
slot_allocate(struct sbd_context *st, const char *name)
{
	struct sector_header_s *s_header = NULL;
	struct sector_node_s *s_node = NULL;
	int i;
	int rc = 0;

	if (!name) {
		cl_log(LOG_ERR, "slot_allocate(): No name specified.\n");
		fprintf(stderr, "slot_allocate(): No name specified.\n");
		rc = -1; goto out;
	}

	s_header = header_get(st);
	if (!s_header) {
		rc = -1; goto out;
	}

	s_node = sector_alloc();

	while (1) {
		i = slot_lookup(st, s_header, name);
		if ((i >= 0) || (i == -2)) {
			/* -1 is "no slot found", in which case we
			 * proceed to allocate a new one.
			 * -2 is "read error during lookup", in which
			 * case we error out too
			 * >= 0 is "slot already allocated" */
			rc = i; goto out;
		}

		i = slot_unused(st, s_header);
		if (i >= 0) {
			cl_log(LOG_INFO, "slot %d is unused - trying to own", i);
			fprintf(stdout, "slot %d is unused - trying to own\n", i);
			memset(s_node, 0, sizeof(*s_node));
			s_node->in_use = 1;
			strncpy(s_node->name, name, SECTOR_NAME_MAX);
			if (slot_write(st, i, s_node) < 0) {
				rc = -1; goto out;
			}
			/* Wait before re-checking who owns the slot. */
			sleep(timeout_allocate);
		} else {
			cl_log(LOG_ERR, "No more free slots.");
			fprintf(stderr, "No more free slots.\n");
			rc = -1; goto out;
		}
	}

out:	free(s_node);
	free(s_header);
	return(rc);
}
/* Print one tab-separated line per in-use slot to stdout: slot number,
 * node name, pending mailbox command and the sender of that command.
 * Returns 0 on success, -1 when the header is unusable or a node entry
 * or mailbox cannot be read (listing stops at that point). */
static int
slot_list(struct sbd_context *st)
{
	struct sector_header_s *hdr;
	struct sector_node_s *entry;
	struct sector_mbox_s *box;
	int slot;
	int rc = 0;

	hdr = header_get(st);
	if (hdr == NULL) {
		return -1;
	}

	entry = sector_alloc();
	box = sector_alloc();

	for (slot = 0; slot < hdr->slots; slot++) {
		if (slot_read(st, slot, entry) < 0) {
			rc = -1;
			break;
		}
		if (!(entry->in_use > 0)) {
			continue;
		}
		if (mbox_read(st, slot, box) < 0) {
			rc = -1;
			break;
		}
		printf("%d\t%s\t%s\t%s\n",
		       slot, entry->name, char2cmd(box->cmd),
		       box->from);
	}

	free(entry);
	free(hdr);
	free(box);
	return rc;
}
/* Deliver the command cmd to the mailbox of node name on this device.
 *
 * name: recipient; the literal "LOCAL" is replaced by local_uname.
 * cmd:  command text understood by cmd2char().
 *
 * The message is written and verified by reading it back. For every
 * command except "exit" the function then sleeps timeout_msgwait
 * seconds before reporting delivery.
 * Returns 0 on success, -1 on a missing argument, unusable header,
 * unknown recipient, invalid command or failed/unverified write. */
static int
slot_msg(struct sbd_context *st, const char *name, const char *cmd)
{
	struct sector_header_s *s_header = NULL;
	struct sector_mbox_s *s_mbox = NULL;
	signed char code;
	int mbox;
	int rc = 0;
	char uuid[37];

	if (!name || !cmd) {
		cl_log(LOG_ERR, "slot_msg(): No recipient / cmd specified.\n");
		rc = -1; goto out;
	}

	s_header = header_get(st);
	if (!s_header) {
		rc = -1; goto out;
	}

	if (strcmp(name, "LOCAL") == 0) {
		name = local_uname;
	}

	if (s_header->minor_version > 0) {
		uuid_unparse_lower(s_header->uuid, uuid);
		cl_log(LOG_INFO, "Device UUID: %s", uuid);
	}

	mbox = slot_lookup(st, s_header, name);
	if (mbox < 0) {
		cl_log(LOG_ERR, "slot_msg(): No slot found for %s.", name);
		rc = -1; goto out;
	}

	s_mbox = sector_alloc();

	/* Validate in a variable that is signed for certain, so the check
	 * does not depend on the signedness of the mailbox field. */
	code = cmd2char(cmd);
	if (code < 0) {
		cl_log(LOG_ERR, "slot_msg(): Invalid command %s.", cmd);
		rc = -1; goto out;
	}
	s_mbox->cmd = code;

	strncpy(s_mbox->from, local_uname, SECTOR_NAME_MAX);

	cl_log(LOG_INFO, "Writing %s to node slot %s",
	       cmd, name);
	/* mbox_write_verify() reports failure as -1; the previous "< -1"
	 * comparison could never be true and hid write errors. */
	if (mbox_write_verify(st, mbox, s_mbox) < 0) {
		rc = -1; goto out;
	}
	if (strcasecmp(cmd, "exit") != 0) {
		cl_log(LOG_INFO, "Messaging delay: %d",
		       (int)timeout_msgwait);
		sleep(timeout_msgwait);
	}
	cl_log(LOG_INFO, "%s successfully delivered to %s",
	       cmd, name);

out:	free(s_mbox);
	free(s_header);
	return rc;
}
/* Ping node name through this device: write a "test" message into its
 * mailbox, then poll the mailbox once per second (for up to
 * timeout_msgwait seconds) until the command field no longer holds the
 * test message.
 *
 * name: recipient; the literal "LOCAL" is replaced by local_uname.
 *
 * Returns 0 when the mailbox changed in time, -1 on a missing
 * argument, unusable header, unknown recipient, write/read failure or
 * timeout. */
static int
slot_ping(struct sbd_context *st, const char *name)
{
	struct sector_header_s *s_header = NULL;
	struct sector_mbox_s *s_mbox = NULL;
	int mbox;
	int waited = 0;
	int rc = 0;

	if (!name) {
		cl_log(LOG_ERR, "slot_ping(): No recipient specified.\n");
		rc = -1; goto out;
	}

	s_header = header_get(st);
	if (!s_header) {
		rc = -1; goto out;
	}

	if (strcmp(name, "LOCAL") == 0) {
		name = local_uname;
	}

	mbox = slot_lookup(st, s_header, name);
	if (mbox < 0) {
		/* The message used to name slot_msg() by mistake. */
		cl_log(LOG_ERR, "slot_ping(): No slot found for %s.", name);
		rc = -1; goto out;
	}

	s_mbox = sector_alloc();
	s_mbox->cmd = SBD_MSG_TEST;

	strncpy(s_mbox->from, local_uname, SECTOR_NAME_MAX);

	DBGLOG(LOG_DEBUG, "Pinging node %s", name);
	/* mbox_write() reports failure as -1; the previous "< -1"
	 * comparison could never be true and hid write errors. */
	if (mbox_write(st, mbox, s_mbox) < 0) {
		rc = -1; goto out;
	}

	/* Assume failure until the mailbox is seen to change. */
	rc = -1;
	while (waited <= timeout_msgwait) {
		if (mbox_read(st, mbox, s_mbox) < 0)
			break;
		if (s_mbox->cmd != SBD_MSG_TEST) {
			rc = 0;
			break;
		}
		sleep(1);
		waited++;
	}

	if (rc == 0) {
		cl_log(LOG_DEBUG, "%s successfully pinged.", name);
	} else {
		cl_log(LOG_ERR, "%s failed to ping.", name);
	}

out:	free(s_mbox);
	free(s_header);
	return rc;
}
/* Initialize every device in the servants list, in list order, printing
 * progress to stdout.
 * Stops at the first device that cannot be opened or initialized.
 * Returns 0 when all devices were initialized, -1 otherwise. */
int init_devices(struct servants_list_item *servants)
{
	struct servants_list_item *item = servants;

	while (item != NULL) {
		struct sbd_context *dev;
		int result;

		fprintf(stdout, "Initializing device %s\n",
			item->devname);
		dev = open_device(item->devname, LOG_ERR);
		if (dev == NULL) {
			return -1;
		}
		result = init_device(dev);
		close_device(dev);
		if (result == -1) {
			fprintf(stderr, "Failed to init device %s\n", item->devname);
			return result;
		}
		fprintf(stdout, "Device %s is initialized.\n", item->devname);
		item = item->next;
	}
	return 0;
}
static int slot_msg_wrapper(const char* devname, int mode, const void* argp)
{
int rc = 0;
struct sbd_context *st;
const struct slot_msg_arg_t* arg = (const struct slot_msg_arg_t*)argp;
st = open_device(devname, LOG_WARNING);
if (!st)
return -1;
cl_log(LOG_INFO, "Delivery process handling %s",
devname);
rc = slot_msg(st, arg->name, arg->msg);
close_device(st);
return rc;
}
static int slot_ping_wrapper(const char* devname, int mode, const void* argp)
{
int rc = 0;
const char* name = (const char*)argp;
struct sbd_context *st;
st = open_device(devname, LOG_WARNING);
if (!st)
return -1;
rc = slot_ping(st, name);
close_device(st);
return rc;
}
/* Make sure node name owns a slot on every device in the servants list,
 * printing progress to stdout.
 * Stops at the first device that cannot be opened (returns -1) or on
 * which slot_allocate() fails (returns its negative result).
 * Returns 0 when a slot exists on all devices. */
int allocate_slots(const char *name, struct servants_list_item *servants)
{
	struct servants_list_item *item = servants;

	while (item != NULL) {
		struct sbd_context *dev;
		int result;

		fprintf(stdout, "Trying to allocate slot for %s on device %s.\n",
			name,
			item->devname);
		dev = open_device(item->devname, LOG_WARNING);
		if (dev == NULL) {
			return -1;
		}
		result = slot_allocate(dev, name);
		close_device(dev);
		if (result < 0)
			return result;
		fprintf(stdout, "Slot for %s has been allocated on %s.\n",
			name,
			item->devname);
		item = item->next;
	}
	return 0;
}
/* Print the in-use slots of every device in the servants list.
 * Devices that cannot be opened or listed are reported on stdout and
 * skipped; they do not affect the result.
 * Always returns 0. */
int list_slots(struct servants_list_item *servants)
{
	struct servants_list_item *item;

	for (item = servants; item != NULL; item = item->next) {
		struct sbd_context *dev = open_device(item->devname, LOG_WARNING);

		if (dev == NULL) {
			fprintf(stdout, "== disk %s unreadable!\n", item->devname);
		} else {
			int result = slot_list(dev);

			close_device(dev);
			if (result == -1) {
				fprintf(stdout, "== Slots on disk %s NOT dumped\n", item->devname);
			}
		}
	}
	return 0;
}
/* Ping node `name` through every disk servant: one child per disk entry is
 * started with slot_ping_wrapper, then this function blocks until disk_count
 * disk children have been reaped.
 * The children's exit statuses are collected but not evaluated.
 * Always returns 0. */
int ping_via_slots(const char *name, struct servants_list_item *servants)
{
int sig = 0;
pid_t pid = 0;
int status = 0;
int servants_finished = 0;
sigset_t procmask;
siginfo_t sinfo;
struct servants_list_item *s;
/* Block SIGCHLD so it can be consumed synchronously with sigwaitinfo(). */
sigemptyset(&procmask);
sigaddset(&procmask, SIGCHLD);
sigprocmask(SIG_BLOCK, &procmask, NULL);
for (s = servants; s; s = s->next) {
if(sbd_is_disk(s)) {
s->pid = assign_servant(s->devname, &slot_ping_wrapper, 0, (const void*)name);
}
}
while (servants_finished < disk_count) {
sig = sigwaitinfo(&procmask, &sinfo);
if (sig == SIGCHLD) {
/* One SIGCHLD may stand for several exited children: keep reaping
 * until wait() reports that no child is left (ECHILD). Note that
 * wait() blocks while children are still running. */
while ((pid = wait(&status))) {
if (pid == -1 && errno == ECHILD) {
break;
} else {
/* NOTE(review): a wait() failure other than ECHILD reaches this
 * lookup with pid == -1 - confirm lookup_servant_by_pid() and
 * sbd_is_disk() tolerate that. */
s = lookup_servant_by_pid(pid);
if (sbd_is_disk(s)) {
servants_finished++;
}
}
}
}
}
return 0;
}
/* Tell whether good_servants is a strict majority of the configured
 * devices, i.e. greater than disk_count / 2 (integer division).
 * Returns 1 if so, 0 otherwise. */
int quorum_write(int good_servants)
{
	const int half = disk_count / 2;

	return good_servants > half;
}
/* Deliver command `msg` to node `name` through all devices in parallel.
 * One child per list entry is started with slot_msg_wrapper; the function
 * then reaps children until either a majority of devices (quorum_write())
 * reported success or disk_count children have finished.
 * Returns 0 when a majority succeeded, -1 otherwise. */
int messenger(const char *name, const char *msg, struct servants_list_item *servants)
{
int sig = 0;
pid_t pid = 0;
int status = 0;
int servants_finished = 0;
int successful_delivery = 0;
sigset_t procmask;
siginfo_t sinfo;
struct servants_list_item *s;
/* Shared, read-only argument for all delivery children. */
struct slot_msg_arg_t slot_msg_arg = {name, msg};
/* Block SIGCHLD so it can be consumed synchronously with sigwaitinfo(). */
sigemptyset(&procmask);
sigaddset(&procmask, SIGCHLD);
sigprocmask(SIG_BLOCK, &procmask, NULL);
for (s = servants; s; s = s->next) {
s->pid = assign_servant(s->devname, &slot_msg_wrapper, 0, &slot_msg_arg);
}
while (!(quorum_write(successful_delivery) ||
(servants_finished == disk_count))) {
sig = sigwaitinfo(&procmask, &sinfo);
if (sig == SIGCHLD) {
/* Non-blocking reap: waitpid() returns 0 (ending the loop) once no
 * further child has exited yet, or -1/ECHILD when none is left. */
while ((pid = waitpid(-1, &status, WNOHANG))) {
if (pid == -1 && errno == ECHILD) {
break;
} else {
servants_finished++;
/* Only a normal exit with status 0 counts as a delivery. */
if (WIFEXITED(status)
&& WEXITSTATUS(status) == 0) {
DBGLOG(LOG_INFO, "Process %d succeeded.",
(int)pid);
successful_delivery++;
} else {
cl_log(LOG_WARNING, "Process %d failed to deliver!",
(int)pid);
}
}
}
}
}
if (quorum_write(successful_delivery)) {
cl_log(LOG_INFO, "Message successfully delivered.");
return 0;
} else {
cl_log(LOG_ERR, "Message is not delivered via more then a half of devices");
return -1;
}
}
/* Return the msgwait timeout stored in the header of the first device
 * in the list that can be opened and carries a valid header.
 * Returns 0 when no device yields a valid header. */
unsigned long
get_first_msgwait(struct servants_list_item *servants)
{
	unsigned long msgwait = 0;
	struct servants_list_item *s;

	for (s = servants; s; s = s->next) {
		struct sbd_context *st;
		struct sector_header_s *s_header;

		st = open_device(s->devname, LOG_WARNING);
		if (!st) {
			continue;
		}

		s_header = header_get(st);
		close_device(st);
		if (s_header != NULL) {
			msgwait = (unsigned long)s_header->timeout_msgwait;
			/* header_get() hands over ownership of the buffer. */
			free(s_header);
			return msgwait;
		}
	}
	return msgwait;
}
/* Print the header of every device in the servants list to stdout,
 * framed by status lines for each disk.
 * All devices are attempted even after a failure.
 * Returns 0 when every header was dumped, -1 if any device was
 * unreadable or had no valid header. */
int dump_headers(struct servants_list_item *servants)
{
	struct servants_list_item *item;
	int overall = 0;

	for (item = servants; item != NULL; item = item->next) {
		struct sbd_context *dev;
		int failed = 1;

		fprintf(stdout, "==Dumping header on disk %s\n", item->devname);
		dev = open_device(item->devname, LOG_WARNING);
		if (dev == NULL) {
			fprintf(stdout, "== disk %s unreadable!\n", item->devname);
		} else {
			failed = (header_dump(dev) == -1);
			close_device(dev);
		}

		if (failed) {
			overall = -1;
			fprintf(stdout, "==Header on disk %s NOT dumped\n", item->devname);
		} else {
			fprintf(stdout, "==Header on disk %s is dumped\n", item->devname);
		}
	}
	return overall;
}
/* Wait at start-up until one device in the list yields a valid header
 * and adopt its timeouts as the global timeout_* values.
 * The list is rescanned every timeout_loop seconds until a header is
 * found or timeout_startup seconds (monotonic clock) have elapsed.
 * Exits the process with status 1 if no device became available. */
void open_any_device(struct servants_list_item *servants)
{
	struct sector_header_s *hdr = NULL;
	struct timespec started;
	int elapsed = 0;

	clock_gettime(CLOCK_MONOTONIC, &started);

	while (hdr == NULL && elapsed < timeout_startup) {
		struct servants_list_item *item;
		struct timespec now;

		/* Stop scanning as soon as one device provides a header. */
		for (item = servants; item != NULL && hdr == NULL; item = item->next) {
			struct sbd_context *dev = open_device(item->devname, LOG_DEBUG);

			if (dev != NULL) {
				hdr = header_get(dev);
				close_device(dev);
			}
		}

		clock_gettime(CLOCK_MONOTONIC, &now);
		elapsed = now.tv_sec - started.tv_sec;
		if (hdr == NULL) {
			sleep(timeout_loop);
		}
	}

	if (hdr == NULL) {
		cl_log(LOG_ERR, "No devices were available at start-up within %i seconds.",
		       timeout_startup);
		exit(1);
	}

	timeout_watchdog = hdr->timeout_watchdog;
	timeout_allocate = hdr->timeout_allocate;
	timeout_loop = hdr->timeout_loop;
	timeout_msgwait = hdr->timeout_msgwait;

	free(hdr);
}
/*
::-::-::-::-::-::-::-::-::-::-::-::-::
Begin disk based servant code
::-::-::-::-::-::-::-::-::-::-::-::-::
*/
/* Compare the four timeouts in hdr with the global timeout_* values.
 * Only the first difference found (order: watchdog, allocate, loop,
 * msgwait) is logged as a warning.
 * Returns 0 when all four agree, -1 otherwise. */
static int servant_check_timeout_inconsistent(struct sector_header_s *hdr)
{
	int rc = -1;

	if (timeout_watchdog != hdr->timeout_watchdog) {
		cl_log(LOG_WARNING, "watchdog timeout: %d versus %d on this device",
		       (int)timeout_watchdog, (int)hdr->timeout_watchdog);
	} else if (timeout_allocate != hdr->timeout_allocate) {
		cl_log(LOG_WARNING, "allocate timeout: %d versus %d on this device",
		       (int)timeout_allocate, (int)hdr->timeout_allocate);
	} else if (timeout_loop != hdr->timeout_loop) {
		cl_log(LOG_WARNING, "loop timeout: %d versus %d on this device",
		       (int)timeout_loop, (int)hdr->timeout_loop);
	} else if (timeout_msgwait != hdr->timeout_msgwait) {
		cl_log(LOG_WARNING, "msgwait timeout: %d versus %d on this device",
		       (int)timeout_msgwait, (int)hdr->timeout_msgwait);
	} else {
		rc = 0;
	}
	return rc;
}
/* Disk watcher ("servant") body for one SBD device.
 *
 * diskname: device to watch; NULL makes the function return -1.
 * mode:     when > 0 on a first start, a mailbox message other than
 *           "exit"/"clear" aborts start-up and asks the parent to exit.
 * argp:     the servants_list_item of this servant; its first_start
 *           flag decides whether the mailbox is checked and zeroed.
 *
 * After the diskname check the function only leaves through exit():
 * EXIT_MD_IO_FAIL on any device problem, EXIT_MD_REQUEST_* when the
 * matching command is received, or rc via the "out" label.
 * Each loop iteration sleeps timeout_loop seconds, verifies that header
 * and node entry are unchanged, processes the mailbox and signals
 * SIG_LIVENESS to the parent. */
int servant(const char *diskname, int mode, const void* argp)
{
	struct sector_mbox_s *s_mbox = NULL;
	struct sector_node_s *s_node = NULL;
	struct sector_header_s *s_header = NULL;
	int mbox;
	int rc = 0;
	time_t t0, t1, latency;
	union sigval signal_value;
	sigset_t servant_masks;
	struct sbd_context *st;
	pid_t ppid;
	char uuid[37];
	const struct servants_list_item *s = argp;

	if (!diskname) {
		/* Do not hand the NULL pointer to a %s conversion. */
		cl_log(LOG_ERR, "Empty disk name.");
		return -1;
	}

	/* Zero the signal payload before its first use: the start-up abort
	 * path below already passes it to sigqueue(). */
	memset(&signal_value, 0, sizeof(signal_value));

	cl_log(LOG_INFO, "Servant starting for device %s", diskname);

	/* Block most of the signals */
	sigfillset(&servant_masks);
	sigdelset(&servant_masks, SIGKILL);
	sigdelset(&servant_masks, SIGFPE);
	sigdelset(&servant_masks, SIGILL);
	sigdelset(&servant_masks, SIGSEGV);
	sigdelset(&servant_masks, SIGBUS);
	sigdelset(&servant_masks, SIGALRM);
	/* FIXME: check error */
	sigprocmask(SIG_SETMASK, &servant_masks, NULL);

	st = open_device(diskname, LOG_WARNING);
	if (!st) {
		exit(EXIT_MD_IO_FAIL);
	}

	s_header = header_get(st);
	if (!s_header) {
		cl_log(LOG_ERR, "Not a valid header on %s", diskname);
		exit(EXIT_MD_IO_FAIL);
	}

	if (servant_check_timeout_inconsistent(s_header) < 0) {
		cl_log(LOG_ERR, "Timeouts on %s do not match first device",
		       diskname);
		exit(EXIT_MD_IO_FAIL);
	}

	if (s_header->minor_version > 0) {
		uuid_unparse_lower(s_header->uuid, uuid);
		cl_log(LOG_INFO, "Device %s uuid: %s", diskname, uuid);
	}

	mbox = slot_allocate(st, local_uname);
	if (mbox < 0) {
		cl_log(LOG_ERR,
		       "No slot allocated, and automatic allocation failed for disk %s.",
		       diskname);
		rc = EXIT_MD_IO_FAIL;
		goto out;
	}
	/* Snapshot of our node entry; compared on every loop iteration. */
	s_node = sector_alloc();
	if (slot_read(st, mbox, s_node) < 0) {
		cl_log(LOG_ERR, "Unable to read node entry on %s",
		       diskname);
		exit(EXIT_MD_IO_FAIL);
	}

- DBGLOG(LOG_INFO, "Monitoring slot %d on disk %s", mbox, diskname);
+ cl_log(LOG_NOTICE, "Monitoring slot %d on disk %s", mbox, diskname);
	if (s_header->minor_version == 0) {
		set_proc_title("sbd: watcher: %s - slot: %d", diskname, mbox);
	} else {
		set_proc_title("sbd: watcher: %s - slot: %d - uuid: %s",
			       diskname, mbox, uuid);
	}

	s_mbox = sector_alloc();
	if (s->first_start) {
		if (mode > 0) {
			if (mbox_read(st, mbox, s_mbox) < 0) {
				cl_log(LOG_ERR, "mbox read failed during start-up in servant.");
				rc = EXIT_MD_IO_FAIL;
				goto out;
			}
			if (s_mbox->cmd != SBD_MSG_EXIT &&
			    s_mbox->cmd != SBD_MSG_EMPTY) {
				/* Not a clean stop. Abort start-up */
				cl_log(LOG_WARNING, "Found fencing message - aborting start-up. Manual intervention required!");
				ppid = getppid();
				sigqueue(ppid, SIG_EXITREQ, signal_value);
				rc = 0;
				goto out;
			}
		}
		DBGLOG(LOG_INFO, "First servant start - zeroing inbox");
		memset(s_mbox, 0, sizeof(*s_mbox));
		if (mbox_write(st, mbox, s_mbox) < 0) {
			rc = EXIT_MD_IO_FAIL;
			goto out;
		}
	}

	while (1) {
		struct sector_header_s *s_header_retry = NULL;
		struct sector_node_s *s_node_retry = NULL;

		t0 = time(NULL);
		sleep(timeout_loop);

		ppid = getppid();

		if (ppid == 1) {
			/* Our parent died unexpectedly. Triggering
			 * self-fence. */
			do_reset();
		}

		/* These attempts are, by definition, somewhat racy. If
		 * the device is wiped out or corrupted between here and
		 * us reading our mbox, there is nothing we can do about
		 * that. But at least we tried. */
		s_header_retry = header_get(st);
		if (!s_header_retry) {
			cl_log(LOG_ERR, "No longer found a valid header on %s", diskname);
			exit(EXIT_MD_IO_FAIL);
		}
		if (memcmp(s_header, s_header_retry, sizeof(*s_header)) != 0) {
			cl_log(LOG_ERR, "Header on %s changed since start-up!", diskname);
			exit(EXIT_MD_IO_FAIL);
		}
		free(s_header_retry);

		s_node_retry = sector_alloc();
		if (slot_read(st, mbox, s_node_retry) < 0) {
			cl_log(LOG_ERR, "slot read failed in servant.");
			exit(EXIT_MD_IO_FAIL);
		}
		if (memcmp(s_node, s_node_retry, sizeof(*s_node)) != 0) {
			cl_log(LOG_ERR, "Node entry on %s changed since start-up!", diskname);
			exit(EXIT_MD_IO_FAIL);
		}
		free(s_node_retry);

		if (mbox_read(st, mbox, s_mbox) < 0) {
			cl_log(LOG_ERR, "mbox read failed in servant.");
			exit(EXIT_MD_IO_FAIL);
		}

		if (s_mbox->cmd > 0) {
- cl_log(LOG_INFO,
+ cl_log(LOG_NOTICE,
			       "Received command %s from %s on disk %s",
			       char2cmd(s_mbox->cmd), s_mbox->from, diskname);

			switch (s_mbox->cmd) {
			case SBD_MSG_TEST:
				/* Clearing the mailbox is what the pinging
				 * side polls for. */
				memset(s_mbox, 0, sizeof(*s_mbox));
				mbox_write(st, mbox, s_mbox);
				sigqueue(ppid, SIG_TEST, signal_value);
				break;
			case SBD_MSG_RESET:
				exit(EXIT_MD_REQUEST_RESET);
			case SBD_MSG_OFF:
				exit(EXIT_MD_REQUEST_SHUTOFF);
			case SBD_MSG_EXIT:
				sigqueue(ppid, SIG_EXITREQ, signal_value);
				break;
			case SBD_MSG_CRASHDUMP:
				exit(EXIT_MD_REQUEST_CRASHDUMP);
			default:
				/* FIXME:
				   An "unknown" message might result
				   from a partial write.
				   log it and clear the slot.
				 */
				cl_log(LOG_ERR, "Unknown message on disk %s",
				       diskname);
				memset(s_mbox, 0, sizeof(*s_mbox));
				mbox_write(st, mbox, s_mbox);
				break;
			}
		}
		sigqueue(ppid, SIG_LIVENESS, signal_value);

		t1 = time(NULL);
		latency = t1 - t0;
		if (timeout_watchdog_warn && (latency > timeout_watchdog_warn)) {
			cl_log(LOG_WARNING,
			       "Latency: %d exceeded threshold %d on disk %s",
			       (int)latency, (int)timeout_watchdog_warn,
			       diskname);
		} else if (debug) {
- DBGLOG(LOG_INFO, "Latency: %d on disk %s", (int)latency,
+ DBGLOG(LOG_DEBUG, "Latency: %d on disk %s", (int)latency,
			       diskname);
		}
	}
out:
	free(s_mbox);
	close_device(st);
	exit(rc);
}
diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
index 2f06109..a435d01 100644
--- a/src/sbd-pacemaker.c
+++ b/src/sbd-pacemaker.c
@@ -1,457 +1,457 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* Based on crm_mon.c, which was:
* Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
/* TODO list:
*
* - Trying to shutdown a node if no devices are up will fail, since SBD
* currently uses a message via the disk to achieve this.
*
* - Shutting down cluster nodes while the majority of devices is down
* will eventually take the cluster below the quorum threshold, at which
* time the remaining cluster nodes will all immediately suicide.
*
*/
#include <sys/param.h>
#include <crm/crm.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdlib.h>
#include <errno.h>
#include <fcntl.h>
#include <libgen.h>
#include <sys/utsname.h>
#include <config.h>
#include <crm_config.h>
#include <crm/msg_xml.h>
#include <crm/common/util.h>
#include <crm/common/xml.h>
#include <crm/common/ipc.h>
#include <crm/common/mainloop.h>
#include <crm/cib.h>
#include <crm/pengine/status.h>
#include "sbd.h"
extern int disk_count;
static void clean_up(int rc);
static void crm_diff_update(const char *event, xmlNode * msg);
static int cib_connect(gboolean full);
static void compute_status(pe_working_set_t * data_set);
static gboolean mon_refresh_state(gpointer user_data);
static GMainLoop *mainloop = NULL;
static guint timer_id_reconnect = 0;
static guint timer_id_notify = 0;
static int reconnect_msec = 1000;
static int cib_connected = 0;
static cib_t *cib = NULL;
static xmlNode *current_cib = NULL;
static long last_refresh = 0;
/* Timer callback that tries to reconnect to the CIB.
 * Removes the current reconnect timer (if any), attempts a full
 * cib_connect(), and schedules itself again after reconnect_msec
 * milliseconds when the attempt fails.
 * Always returns FALSE. */
static gboolean
mon_timer_reconnect(gpointer data)
{
    int rc;

    if (timer_id_reconnect > 0) {
        g_source_remove(timer_id_reconnect);
    }

    rc = cib_connect(TRUE);
    if (rc == 0) {
        cl_log(LOG_INFO, "CIB reconnect successful");
        return FALSE;
    }

    cl_log(LOG_WARNING, "CIB reconnect failed: %d", rc);
    timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL);
    return FALSE;
}
/* Connection-loss callback for the CIB.
 * If a CIB object exists: signs off, reports transient health
 * ("Disconnected from CIB") and arms the reconnect timer.
 * In every case the connection is marked as down. */
static void
mon_cib_connection_destroy(gpointer user_data)
{
    cib_connected = 0;

    if (cib == NULL) {
        return;
    }

    cib->cmds->signoff(cib);
    set_servant_health(pcmk_health_transient, LOG_WARNING, "Disconnected from CIB");
    timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL);
}
/* Replace the cached CIB copy (current_cib) with a fresh synchronous,
 * local-scope query result.
 * The old copy is always freed first; current_cib stays NULL when the
 * query fails, returns nothing, or returns a document whose root
 * element is not XML_TAG_CIB. */
static void
mon_retrieve_current_cib()
{
    xmlNode *fetched = NULL;
    int rc;

    free_xml(current_cib);
    current_cib = NULL;

    rc = cib->cmds->query(cib, NULL, &fetched, cib_scope_local | cib_sync_call);

    if (rc != pcmk_ok) {
        crm_err("Couldn't retrieve the CIB: %s (%d)", pcmk_strerror(rc), rc);
        free_xml(fetched);
        return;
    }
    if (fetched == NULL) {
        crm_err("Couldn't retrieve the CIB: empty result");
        return;
    }
    if (!safe_str_eq(crm_element_name(fetched), XML_TAG_CIB)) {
        /* Not a CIB document: discard it. */
        free_xml(fetched);
        return;
    }

    current_cib = fetched;
}
/* Periodic timer callback, re-armed every timeout_loop seconds.
 * While connected to the CIB it normally issues a no-op request and
 * notifies the parent; once the tick counter equals
 * timeout_watchdog / timeout_loop it instead re-reads the CIB,
 * recomputes the state and resets the counter.
 * Always returns FALSE (the timer is re-created explicitly). */
static gboolean
mon_timer_notify(gpointer data)
{
    static int counter = 0;
    const int counter_max = timeout_watchdog / timeout_loop;

    if (timer_id_notify > 0) {
        g_source_remove(timer_id_notify);
    }

    if (cib_connected && counter == counter_max) {
        mon_retrieve_current_cib();
        mon_refresh_state(NULL);
        counter = 0;
    } else if (cib_connected) {
        cib->cmds->noop(cib, 0);
        notify_parent();
        counter++;
    }

    timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL);
    return FALSE;
}
/*
 * Mainloop signal handler: tears down the CIB connection and exits
 * with status 0 through clean_up(). The signal number is not used.
 */
static void
mon_shutdown(int nsig)
{
    const int exit_status = 0;

    clean_up(exit_status);
}
/* Connect to the CIB (if not already connected), load the current CIB and
 * compute the initial state.
 * full: when TRUE, also register the connection-loss callback and the
 *       diff-notification callback.
 * Sets cib_connected to 1 on success, leaves it 0 otherwise.
 * Returns 0 on success, -EINVAL when no cib object exists (via CRM_CHECK),
 * or the error from signon. */
static int
cib_connect(gboolean full)
{
int rc = 0;
CRM_CHECK(cib != NULL, return -EINVAL);
cib_connected = 0;
crm_xml_init();
/* Nothing to do below when the connection is already established. */
if (cib->state != cib_connected_query && cib->state != cib_connected_command) {
rc = cib->cmds->signon(cib, crm_system_name, cib_query);
if (rc != 0) {
return rc;
}
mon_retrieve_current_cib();
mon_refresh_state(NULL);
if (full) {
/* rc is necessarily 0 here: a failed signon returned above. */
if (rc == 0) {
rc = cib->cmds->set_connection_dnotify(cib, mon_cib_connection_destroy);
if (rc == -EPROTONOSUPPORT) {
/* Notification setup failed, won't be able to reconnect after failure */
rc = 0;
}
}
if (rc == 0) {
/* Drop a possibly stale registration before adding the callback again. */
cib->cmds->del_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update);
rc = cib->cmds->add_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update);
}
if (rc != 0) {
/* Notification setup failed, could not monitor CIB actions */
/* clean_up() exits for arguments >= 0; rc is negated before the call. */
clean_up(-rc);
}
}
}
if (!rc) {
cib_connected = 1;
}
return rc;
}
/* Derive this node's health from a computed cluster status and report it
 * with set_servant_health(), then notify the parent.
 * The checks are evaluated in order; the first matching one wins:
 * no DC -> transient; node unknown/offline -> unknown; unclean; pending;
 * quorum present -> online; quorum missing with disks -> noquorum;
 * quorum never seen -> online; otherwise by the no-quorum policy. */
static void
compute_status(pe_working_set_t * data_set)
{
/* Incremented on every call but not read anywhere in this function. */
static int updates = 0;
/* Remembers across calls whether quorum was ever observed. */
static int ever_had_quorum = FALSE;
node_t *node = pe_find_node(data_set->nodes, local_uname);
updates++;
if (data_set->dc_node == NULL) {
set_servant_health(pcmk_health_transient, LOG_INFO, "We don't have a DC right now.");
notify_parent();
return;
}
if (node == NULL) {
set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: %s is UNKNOWN", local_uname);
} else if (node->details->online == FALSE) {
set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: OFFLINE");
} else if (node->details->unclean) {
set_servant_health(pcmk_health_unclean, LOG_WARNING, "Node state: UNCLEAN");
} else if (node->details->pending) {
set_servant_health(pcmk_health_pending, LOG_WARNING, "Node state: pending");
/* The shutdown branch below is compiled out. */
#if 0
} else if (node->details->shutdown) {
set_servant_health(pcmk_health_shutdown, LOG_WARNING, "Node state: shutting down");
#endif
} else if (data_set->flags & pe_flag_have_quorum) {
set_servant_health(pcmk_health_online, LOG_INFO, "Node state: online");
ever_had_quorum = TRUE;
} else if(disk_count > 0) {
set_servant_health(pcmk_health_noquorum, LOG_WARNING, "Quorum lost");
} else if(ever_had_quorum == FALSE) {
set_servant_health(pcmk_health_online, LOG_INFO, "We do not have quorum yet");
} else {
/* We lost quorum, and there are no disks present
* Setting healthy > 2 here will result in us self-fencing
*/
/* No default case: a policy value not listed here reports nothing. */
switch (data_set->no_quorum_policy) {
case no_quorum_freeze:
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Freeze resources");
break;
case no_quorum_stop:
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Stop ALL resources");
break;
case no_quorum_ignore:
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Ignore");
break;
case no_quorum_suicide:
set_servant_health(pcmk_health_unclean, LOG_INFO, "Quorum lost: Self-fence");
break;
}
}
notify_parent();
return;
}
static crm_trigger_t *refresh_trigger = NULL;
/* Timer callback for the delayed refresh: marks refresh_trigger as
 * pending and recomputes the state right away.
 * Always returns FALSE. */
static gboolean
mon_trigger_refresh(gpointer user_data)
{
    gboolean keep_running = FALSE;

    mainloop_set_trigger(refresh_trigger);
    mon_refresh_state(NULL);

    return keep_running;
}
/* CIB diff-notification callback: applies the received patch to the
 * cached CIB copy and decides when to recompute the node state.
 *
 * event: notification name, used only for logging.
 * msg:   the patch event.
 *
 * If the patch cannot be applied (current_cib ends up NULL) the whole
 * CIB is fetched again. On first use the function lazily creates the
 * 2000 ms refresh timer and the refresh trigger. */
static void
crm_diff_update(const char *event, xmlNode * msg)
{
    int rc = -1;
    /* NOTE(review): op is never assigned, so the %s conversions below
     * receive NULL - confirm what the logging backend prints for it. */
    const char *op = NULL;
    long now = time(NULL);
    static int updates = 0;
    static mainloop_timer_t *refresh_timer = NULL;

    if(refresh_timer == NULL) {
        refresh_timer = mainloop_timer_add("refresh", 2000, FALSE, mon_trigger_refresh, NULL);
        refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_state, refresh_timer);
    }

    if (current_cib != NULL) {
        xmlNode *cib_last = current_cib;
        current_cib = NULL;

        /* The patched document is returned through &current_cib. */
        rc = cib_apply_patch_event(msg, cib_last, &current_cib, LOG_DEBUG);
        free_xml(cib_last);

        switch(rc) {
            case -pcmk_err_diff_resync:
            case -pcmk_err_diff_failed:
                crm_warn("[%s] %s Patch aborted: %s (%d)", event, op, pcmk_strerror(rc), rc);
                break;
            case pcmk_ok:
                updates++;
                break;
            default:
                crm_notice("[%s] %s ABORTED: %s (%d)", event, op, pcmk_strerror(rc), rc);
                break;
        }
    }

    if (current_cib == NULL) {
        mon_retrieve_current_cib();
    }

    /* Refresh
     * - immediately if the last refresh was more than
     *   reconnect_msec / 1000 seconds ago
     * - immediately once more than 10 updates have accumulated
     * - otherwise when the refresh timer (2s) fires
     */
    if (updates > 10 || (now - last_refresh) > (reconnect_msec / 1000)) {
        mon_refresh_state(refresh_timer);
        updates = 0;

    } else {
        mainloop_set_trigger(refresh_trigger);
        mainloop_timer_start(refresh_timer);
    }
}
/*
 * Re-evaluate the cluster state from the cached CIB.
 *
 * Works on a private copy of current_cib: the copy is passed through
 * cli_config_update() and, on success, through cluster_status() and
 * compute_status(). If cli_config_update() fails, the CIB connection is
 * signed off instead.
 *
 * user_data: optional mainloop_timer_t; when non-NULL it is stopped before
 *            the refresh.
 *
 * Always returns FALSE. Does nothing when there is no cached CIB.
 */
static gboolean
mon_refresh_state(gpointer user_data)
{
    xmlNode *cib_copy = NULL;
    pe_working_set_t data_set;

    if (current_cib == NULL) {
        return FALSE;
    }

    if (user_data) {
        mainloop_timer_t *timer = user_data;

        mainloop_timer_stop(timer);
    }

    cib_copy = copy_xml(current_cib);
    if (cli_config_update(&cib_copy, NULL, FALSE) == FALSE) {
        cl_log(LOG_WARNING, "cli_config_update() failed - forcing reconnect to CIB");
        if (cib) {
            cib->cmds->signoff(cib);
        }
        /* The copy is never handed to a working set on this path, so it
         * has to be released here or it leaks on every failed refresh.
         * NOTE(review): assumes cli_config_update() leaves ownership of
         * the document with the caller on failure - confirm. */
        free_xml(cib_copy);
        cib_copy = NULL;

    } else {
        last_refresh = time(NULL);
        set_working_set_defaults(&data_set);
        data_set.input = cib_copy;
        data_set.flags |= pe_flag_have_stonith_resource;
        cluster_status(&data_set);
        compute_status(&data_set);
        /* presumably also releases data_set.input (the copy) - verify */
        cleanup_calculations(&data_set);
    }

    return FALSE;
}
/*
 * Tear down the CIB connection, if there is one, and optionally exit.
 *
 * rc: when >= 0 the process exits with this status; when negative the
 *     function returns to its caller after the teardown.
 */
static void
clean_up(int rc)
{
    if (cib) {
        cib->cmds->signoff(cib);
        cib_delete(cib);
        cib = NULL;
    }

    if (rc < 0) {
        return;
    }

    exit(rc);
}
/*
 * Entry point of the Pacemaker watcher servant.
 *
 * Sets up process identity and logging, connects to the CIB if no cached
 * copy exists yet, then runs a GLib main loop with SIGTERM/SIGINT handlers
 * and a periodic notify timer.
 *
 * diskname, mode, argp: not referenced in this function body.
 *
 * The function ends in clean_up(0), which exits the process, so the final
 * return statement is not expected to execute.
 */
int
servant_pcmk(const char *diskname, int mode, const void* argp)
{
int exit_code = 0;
crm_system_name = strdup("sbd:pcmk");
- cl_log(LOG_INFO, "Monitoring Pacemaker health");
+ cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
set_proc_title("sbd: watcher: Pacemaker");
setenv("PCMK_watchdog", "true", 1);
if(debug == 0) {
/* We don't want any noisy crm messages */
set_crm_log_level(LOG_CRIT);
}
if (current_cib == NULL) {
cib = cib_new();
/* Keep retrying for as long as the connect attempt reports -ENOTCONN.
 * Every failed attempt sleeps reconnect_msec / 1000 seconds, including
 * a final failure with a different error code. */
do {
exit_code = cib_connect(TRUE);
if (exit_code != 0) {
sleep(reconnect_msec / 1000);
}
} while (exit_code == -ENOTCONN);
if (exit_code != 0) {
/* clean_up() only exits for an argument >= 0, i.e. when exit_code is
 * negative here; a positive exit_code falls through to the main loop. */
clean_up(-exit_code);
}
}
mainloop = g_main_new(FALSE);
mainloop_add_signal(SIGTERM, mon_shutdown);
mainloop_add_signal(SIGINT, mon_shutdown);
/* Call mon_timer_notify every timeout_loop * 1000 (presumably ms, i.e.
 * timeout_loop seconds - confirm against g_timeout_add). */
timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL);
g_main_run(mainloop);
g_main_destroy(mainloop);
clean_up(0);
return 0; /* never reached */
}
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Wed, Feb 26, 12:13 PM (19 h, 49 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1465336
Default Alt Text
(115 KB)
Attached To
Mode
rS SBD
Attached
Detach File
Event Timeline
Log In to Comment