Page Menu
Home
ClusterLabs Projects
Search
Configure Global Search
Log In
Files
F3687223
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
162 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/src/Makefile.am b/src/Makefile.am
index db10c71..69535cf 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,13 +1,13 @@
AM_CFLAGS = -D_GNU_SOURCE -DCHECK_AIS -DSBINDIR=\"$(sbindir)\"
AM_CPPFLAGS = -I$(includedir)/pacemaker \
-I$(includedir)/heartbeat \
$(glib_CFLAGS)
sbin_PROGRAMS = sbd
-sbd_SOURCES = sbd-common.c sbd-inquisitor.c sbd-pacemaker.c sbd-cluster.c setproctitle.c sbd.h sbd.sysconfig
+sbd_SOURCES = sbd-common.c sbd-watchdog.c sbd-inquisitor.c sbd-pacemaker.c sbd-cluster.c setproctitle.c sbd.h sbd.sysconfig
if SUPPORT_SHARED_DISK
sbd_SOURCES += sbd-md.c
endif
diff --git a/src/sbd-cluster.c b/src/sbd-cluster.c
index b6c5512..48c6071 100644
--- a/src/sbd-cluster.c
+++ b/src/sbd-cluster.c
@@ -1,770 +1,770 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* Based on crm_mon.c, which was:
* Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <config.h>
#include <crm_config.h>
#include <crm/cluster.h>
#include <crm/common/mainloop.h>
#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
#include <glib-unix.h>
#endif
#include "sbd.h"
//undef SUPPORT_PLUGIN
//define SUPPORT_PLUGIN 1
/* binary for pacemaker-remote has changed with pacemaker 2 */
#ifdef CRM_SCORE_INFINITY
#define PACEMAKER_REMOTE_BINARY "pacemaker-remoted"
#else
#define PACEMAKER_REMOTE_BINARY "pacemaker_remoted"
#endif
/* Set to true by find_pacemaker_remote() once a pacemaker remote process
 * has been found; notify_timer_cb() then only runs sbd_remote_check().
 */
static bool remote_node = false;
/* PID recorded by find_pacemaker_remote(); probed by sbd_remote_check(). */
static pid_t remoted_pid = 0;
/* Delay between connection attempts in sbd_membership_connect(), in
 * milliseconds (it is divided by 1000 before being passed to sleep()).
 */
static int reconnect_msec = 1000;
/* Main loop created and run by servant_cluster(). */
static GMainLoop *mainloop = NULL;
/* Source id returned by g_timeout_add() for notify_timer_cb(). */
static guint notify_timer = 0;
/* Cluster connection state handed to crm_cluster_connect(). */
static crm_cluster_t cluster;
/* Forward declarations for functions used before their definition. */
static gboolean sbd_remote_check(gpointer user_data);
static long unsigned int find_pacemaker_remote(void);
static void sbd_membership_destroy(gpointer user_data);
#if SUPPORT_PLUGIN
/*
 * CPG delivery callback installed as cluster.cpg.cpg_deliver_fn when
 * SUPPORT_PLUGIN is enabled.
 *
 * An empty message is reported as an unclean servant state, any other
 * message as online. The parent is notified in both cases.
 */
static void
sbd_plugin_membership_dispatch(cpg_handle_t handle,
                               const struct cpg_name *groupName,
                               uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
{
    if (msg_len == 0) {
        set_servant_health(pcmk_health_unclean, LOG_WARNING,
                           "Broken %s message", name_for_cluster_type(get_cluster_type()));
    } else {
        set_servant_health(pcmk_health_online, LOG_INFO,
                           "Connected to %s", name_for_cluster_type(get_cluster_type()));
    }

    notify_parent();
}
#endif
#if SUPPORT_COROSYNC
#if CHECK_VOTEQUORUM_HANDLE
#include <corosync/votequorum.h>
/* votequorum connection probed and re-initialized by notify_timer_cb(). */
static votequorum_handle_t votequorum_handle = 0;
#endif
#if CHECK_TWO_NODE
/* Mirrors the cmap key "quorum.two_node". */
static bool two_node = false;
#endif
/* Latched to true once more than one CPG member has been seen. */
static bool ever_seen_both = false;
/* Member count from the last CPG configuration change; -1 until the
 * first one has been received.
 */
static int cpg_membership_entries = -1;
#if CHECK_QDEVICE_SYNC_TIMEOUT
#include <corosync/votequorum.h>
/* True while the cmap key "quorum.device.model" holds a non-empty string. */
static bool using_qdevice = false;
static uint32_t qdevice_sync_timeout = /* in seconds */
VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
#endif
#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
#include <corosync/cmap.h>
/* cmap connection, tracker handle and main-loop source; set up by
 * verify_against_cmap_config() and torn down by cmap_destroy().
 */
static cmap_handle_t cmap_handle = 0;
static cmap_track_handle_t track_handle = 0;
static GSource *cmap_source = NULL;
#endif
/*
 * Re-evaluate the servant health from the current CPG member count and,
 * where compiled in, the two_node and qdevice settings.
 *
 * With at least one member the result is:
 *  - pcmk_health_noquorum if two_node is set, both nodes have been seen
 *    before and only one member is left (CHECK_TWO_NODE);
 *  - pcmk_health_noquorum if a qdevice is in use and its sync timeout
 *    exceeds timeout_watchdog (CHECK_QDEVICE_SYNC_TIMEOUT);
 *  - pcmk_health_online otherwise.
 * Without members the health is set to pcmk_health_unclean.
 *
 * This function does not call notify_parent(); the callers do.
 */
void
sbd_cpg_membership_health_update()
{
if(cpg_membership_entries > 0) {
#if CHECK_TWO_NODE
bool quorum_is_suspect_two_node =
(two_node && ever_seen_both && cpg_membership_entries == 1);
#endif
#if CHECK_QDEVICE_SYNC_TIMEOUT
bool quorum_is_suspect_qdevice_timing =
using_qdevice && (qdevice_sync_timeout > timeout_watchdog);
#endif
/* do { } while (false) lets each distrust case "break" out so that only
 * the first matching verdict is recorded.
 */
do {
#if CHECK_TWO_NODE
if (quorum_is_suspect_two_node) {
/* Alternative would be asking votequorum for number of votes.
* Using pacemaker's cpg as source for number of active nodes
* avoids binding to an additional library, is definitely
* less code to write and we wouldn't have to combine data
* from 3 sources (cmap, cpg & votequorum) in a potentially
* racy environment.
*/
set_servant_health(pcmk_health_noquorum, LOG_WARNING,
"Connected to %s but requires both nodes present",
name_for_cluster_type(get_cluster_type())
);
break;
}
#endif
#if CHECK_QDEVICE_SYNC_TIMEOUT
if (quorum_is_suspect_qdevice_timing) {
/* We can't really trust quorum info as qdevice-sync_timeout
* makes reaction of quorum too sluggish for our
* watchdog-timeout.
*/
set_servant_health(pcmk_health_noquorum, LOG_WARNING,
"Connected to %s but quorum using qdevice is distrusted "
"for SBD as qdevice-sync_timeout (%ds) > watchdog-timeout "
- "(%lus).",
+ "(%us).",
name_for_cluster_type(get_cluster_type()),
qdevice_sync_timeout, timeout_watchdog
);
break;
}
#endif
set_servant_health(pcmk_health_online, LOG_INFO,
- "Connected to %s (%u members)%s",
+ "Connected to %s (%d members)%s",
name_for_cluster_type(get_cluster_type()),
cpg_membership_entries,
#if CHECK_QDEVICE_SYNC_TIMEOUT
using_qdevice?" using qdevice for quorum":""
#else
""
#endif
);
} while (false);
/* Latch: remember that more than one member has been present. */
if (cpg_membership_entries > 1) {
ever_seen_both = true;
}
} else {
set_servant_health(pcmk_health_unclean, LOG_WARNING,
"Empty %s membership", name_for_cluster_type(get_cluster_type()));
}
}
/*
 * CPG configuration-change callback (installed as
 * cluster.cpg.cpg_confchg_fn by sbd_membership_connect()).
 *
 * Records the new member count, re-evaluates the servant health and
 * notifies the parent. Only member_list_entries is used; the remaining
 * arguments are ignored.
 */
void
sbd_cpg_membership_dispatch(cpg_handle_t handle,
                            const struct cpg_name *groupName,
                            const struct cpg_address *member_list, size_t member_list_entries,
                            const struct cpg_address *left_list, size_t left_list_entries,
                            const struct cpg_address *joined_list, size_t joined_list_entries)
{
    cpg_membership_entries = (int) member_list_entries;

    sbd_cpg_membership_health_update();
    notify_parent();
}
#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
/*
 * cmap tracker callback registered by verify_against_cmap_config().
 *
 * Mirrors changes of the tracked keys into file-scope state:
 *   "quorum.two_node"            -> two_node             (CHECK_TWO_NODE)
 *   "quorum.device.model"        -> using_qdevice        (CHECK_QDEVICE_SYNC_TIMEOUT)
 *   "quorum.device.sync_timeout" -> qdevice_sync_timeout (CHECK_QDEVICE_SYNC_TIMEOUT)
 *
 * Any event, value type or key that is not handled returns early. Only
 * when one of the variables above was updated does control reach the
 * bottom, where the health is re-evaluated and the parent notified.
 */
static void sbd_cmap_notify_fn(
cmap_handle_t cmap_handle,
cmap_track_handle_t cmap_track_handle,
int32_t event,
const char *key_name,
struct cmap_notify_value new_val,
struct cmap_notify_value old_val,
void *user_data)
{
switch (event) {
case CMAP_TRACK_ADD:
case CMAP_TRACK_MODIFY:
switch (new_val.type) {
case CMAP_VALUETYPE_UINT8:
#if CHECK_TWO_NODE
if (!strcmp(key_name, "quorum.two_node")) {
two_node = *((uint8_t *) new_val.data);
} else {
return;
}
break;
#else
return;
#endif
case CMAP_VALUETYPE_STRING:
#if CHECK_QDEVICE_SYNC_TIMEOUT
if (!strcmp(key_name, "quorum.device.model")) {
/* A qdevice counts as in use only for a non-empty model string. */
using_qdevice =
((new_val.data) && strlen((char *) new_val.data));
} else {
return;
}
break;
#else
return;
#endif
case CMAP_VALUETYPE_UINT32:
#if CHECK_QDEVICE_SYNC_TIMEOUT
if (!strcmp(key_name, "quorum.device.sync_timeout")) {
/* The value is divided by 1000 because qdevice_sync_timeout is
 * kept in seconds.
 */
if (new_val.data) {
qdevice_sync_timeout =
*((uint32_t *) new_val.data) / 1000;
} else {
qdevice_sync_timeout =
VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
}
} else {
return;
}
break;
#else
return;
#endif
default:
return;
}
break;
case CMAP_TRACK_DELETE:
/* NOTE(review): this switches on new_val.type for a delete event;
 * confirm against the corosync cmap documentation whether the type of
 * the removed key is reported in new_val or only in old_val.
 */
switch (new_val.type) {
case CMAP_VALUETYPE_UINT8:
#if CHECK_TWO_NODE
if (!strcmp(key_name, "quorum.two_node")) {
two_node = false;
} else {
return;
}
break;
#else
return;
#endif
case CMAP_VALUETYPE_STRING:
#if CHECK_QDEVICE_SYNC_TIMEOUT
if (!strcmp(key_name, "quorum.device.model")) {
using_qdevice = false;
} else {
return;
}
break;
#else
return;
#endif
case CMAP_VALUETYPE_UINT32:
#if CHECK_QDEVICE_SYNC_TIMEOUT
if (!strcmp(key_name, "quorum.device.sync_timeout")) {
/* Key removed: fall back to the compiled-in default. */
qdevice_sync_timeout =
VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
} else {
return;
}
break;
#else
return;
#endif
default:
return;
}
break;
default:
return;
}
/* Reached only when one of the mirrored settings was updated. */
sbd_cpg_membership_health_update();
notify_parent();
}
/*
 * GSource callback attached to the cmap file descriptor: dispatches all
 * pending cmap events. Always returns TRUE so the source stays installed.
 */
static gboolean
cmap_dispatch_callback(gpointer user_data)
{
    (void) user_data;

    cmap_dispatch(cmap_handle, CS_DISPATCH_ALL);

    return TRUE;
}
/*
 * Tear down everything verify_against_cmap_config() set up: the main-loop
 * source watching the cmap file descriptor, the tracker and the cmap
 * connection itself. Each piece is released only if present and then
 * reset, so the function is safe to call repeatedly and on partially
 * initialized state.
 */
static void
cmap_destroy(void)
{
    if (cmap_source) {
        /* g_source_destroy() only detaches the source from its context;
         * the reference obtained from g_unix_fd_source_new() has to be
         * dropped explicitly or the GSource leaks on every reconnect.
         */
        g_source_destroy(cmap_source);
        g_source_unref(cmap_source);
        cmap_source = NULL;
    }

    if (track_handle) {
        cmap_track_delete(cmap_handle, track_handle);
        track_handle = 0;
    }

    if (cmap_handle) {
        cmap_finalize(cmap_handle);
        cmap_handle = 0;
    }
}
/*
 * Connect to cmap (once) and read the quorum settings sbd cares about.
 *
 * On the first call, or after cmap_destroy(), this initializes the cmap
 * connection, registers sbd_cmap_notify_fn() as tracker for the relevant
 * keys and attaches the cmap file descriptor to the default main context.
 * On every call the current values of "quorum.two_node",
 * "quorum.device.model" and "quorum.device.sync_timeout" are read into
 * two_node, using_qdevice and qdevice_sync_timeout (as far as compiled in).
 *
 * Returns TRUE on success. If any step of the one-time setup fails, all
 * cmap state is torn down again and FALSE is returned. A key that is
 * merely absent from cmap is not an error.
 */
static gboolean
verify_against_cmap_config(void)
{
#if CHECK_TWO_NODE
    uint8_t two_node_u8 = 0;
#endif
#if CHECK_QDEVICE_SYNC_TIMEOUT
    char *qdevice_model = NULL;
#endif
    int cmap_fd;

    if (!track_handle) {
        if (cmap_initialize(&cmap_handle) != CS_OK) {
            cl_log(LOG_WARNING, "Cannot initialize CMAP service\n");
            goto out;
        }

        /* NOTE(review): every cmap_track_add() below stores its handle in
         * the same variable, so only the last one is kept for
         * cmap_track_delete(); presumably cmap_finalize() releases the
         * others - confirm.
         */
#if CHECK_TWO_NODE
        if (cmap_track_add(cmap_handle, "quorum.two_node",
                           CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
                           sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
            cl_log(LOG_WARNING, "Failed adding CMAP tracker for 2Node-mode\n");
            goto out;
        }
#endif

#if CHECK_QDEVICE_SYNC_TIMEOUT
        if (cmap_track_add(cmap_handle, "quorum.device.model",
                           CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
                           sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
            cl_log(LOG_WARNING, "Failed adding CMAP tracker for qdevice-model\n");
            goto out;
        }

        if (cmap_track_add(cmap_handle, "quorum.device.sync_timeout",
                           CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
                           sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
            cl_log(LOG_WARNING,
                   "Failed adding CMAP tracker for qdevice-sync_timeout\n");
            goto out;
        }
#endif

        /* add the tracker to mainloop */
        if (cmap_fd_get(cmap_handle, &cmap_fd) != CS_OK) {
            cl_log(LOG_WARNING, "Failed to get a file handle for cmap\n");
            goto out;
        }

        if (!(cmap_source = g_unix_fd_source_new (cmap_fd, G_IO_IN))) {
            cl_log(LOG_WARNING, "Couldn't create source for cmap\n");
            goto out;
        }
        g_source_set_callback(cmap_source, cmap_dispatch_callback, NULL, NULL);
        g_source_attach(cmap_source, NULL);
    }

#if CHECK_TWO_NODE
    if (cmap_get_uint8(cmap_handle, "quorum.two_node", &two_node_u8)
        == CS_OK) {
        cl_log(two_node_u8? LOG_NOTICE : LOG_INFO,
               "Corosync is%s in 2Node-mode", two_node_u8?"":" not");
        two_node = two_node_u8;
    } else {
        cl_log(LOG_INFO, "quorum.two_node not present in cmap\n");
    }
#endif

#if CHECK_QDEVICE_SYNC_TIMEOUT
    if (cmap_get_string(cmap_handle, "quorum.device.model",
                        &qdevice_model) == CS_OK) {
        using_qdevice = qdevice_model && strlen(qdevice_model);
        cl_log(using_qdevice? LOG_NOTICE : LOG_INFO,
               "Corosync is%s using qdevice", using_qdevice?"":" not");
        /* cmap_get_string() hands back a freshly allocated copy that the
         * caller owns; release it so repeated reconnects do not leak.
         */
        free(qdevice_model);
        qdevice_model = NULL;
    } else {
        cl_log(LOG_INFO, "quorum.device.model not present in cmap\n");
    }

    if (cmap_get_uint32(cmap_handle, "quorum.device.sync_timeout",
                        &qdevice_sync_timeout) == CS_OK) {
        /* The value is divided by 1000 because qdevice_sync_timeout is
         * kept in seconds.
         */
        qdevice_sync_timeout /= 1000;
        /* qdevice_sync_timeout is unsigned: print it with %u, not %d */
        cl_log(LOG_INFO,
               "Corosync is using qdevice-sync_timeout=%us",
               (unsigned int) qdevice_sync_timeout);
    } else {
        cl_log(LOG_INFO,
               "quorum.device.sync_timeout not present in cmap\n");
    }
#endif

    return TRUE;

out:
    cmap_destroy();
    return FALSE;
}
#endif
#endif
/*
 * Periodic timer callback (also invoked once directly after connecting).
 *
 * On a remote node it re-checks the pacemaker remote process. Otherwise
 * it acts according to the cluster type: for classic AIS a quorum request
 * is sent, for corosync and CMAN the parent is notified. When the
 * votequorum check is compiled in, corosync must first answer
 * votequorum_getinfo(), re-initializing the handle once if needed.
 *
 * Always returns TRUE so the timer keeps firing.
 */
static gboolean
notify_timer_cb(gpointer data)
{
    cl_log(LOG_DEBUG, "Refreshing %sstate", remote_node?"remote ":"");

    if (!remote_node) {
        switch (get_cluster_type()) {
#if HAVE_DECL_PCMK_CLUSTER_CLASSIC_AIS
            case pcmk_cluster_classic_ais:
                send_cluster_text(crm_class_quorum, NULL, TRUE, NULL, crm_msg_ais);
                break;
#endif

            case pcmk_cluster_corosync:
                /* single-pass loop: "break" skips the notification */
                do {
#if SUPPORT_COROSYNC && CHECK_VOTEQUORUM_HANDLE
                    struct votequorum_info info;

                    if (votequorum_getinfo(votequorum_handle, 0, &info) != CS_OK) {
                        /* handle went stale - try a fresh one exactly once */
                        votequorum_finalize(votequorum_handle);
                        if (votequorum_initialize(&votequorum_handle, NULL) != CS_OK) {
                            votequorum_handle = 0;
                            break;
                        }
                        if (votequorum_getinfo(votequorum_handle, 0, &info) != CS_OK) {
                            break;
                        }
                    }
#endif
                    notify_parent();
                } while (0);
                break;

#if HAVE_DECL_PCMK_CLUSTER_CMAN
            case pcmk_cluster_cman:
                notify_parent();
                break;
#endif

            default:
                break;
        }
        return TRUE;
    }

    sbd_remote_check(NULL);
    return TRUE;
}
/*
 * Block until a cluster connection exists.
 *
 * If no cluster type is known, look for a running pacemaker remote
 * process instead. Otherwise connect to the cluster layer, after
 * verify_against_cmap_config() has succeeded where that check is
 * compiled in. Failed attempts are retried after reconnect_msec.
 * Once connected, the health is set to "transient" until the first
 * membership arrives, and one refresh is triggered immediately.
 */
static void
sbd_membership_connect(void)
{
    bool connected = false;

    cl_log(LOG_INFO, "Attempting cluster connection");

    cluster.destroy = sbd_membership_destroy;
#if SUPPORT_PLUGIN
    cluster.cpg.cpg_deliver_fn = sbd_plugin_membership_dispatch;
#endif
#if SUPPORT_COROSYNC
    cluster.cpg.cpg_confchg_fn = sbd_cpg_membership_dispatch;
#endif

    for (;;) {
        enum cluster_type_e stack = get_cluster_type();

        if (get_cluster_type() != pcmk_cluster_unknown) {
            cl_log(LOG_INFO, "Attempting connection to %s", name_for_cluster_type(stack));
#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
            if (verify_against_cmap_config()) {
#endif
                if (crm_cluster_connect(&cluster)) {
                    connected = true;
                }
#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
            }
#endif
        } else {
            crm_debug("Attempting pacemaker remote connection");
            /* No cluster stack detected: search for pacemaker remote */
            if (find_pacemaker_remote() > 0) {
                connected = true;
            }
        }

        if (connected) {
            break;
        }

        cl_log(LOG_INFO, "Failed, retrying in %ds", reconnect_msec / 1000);
        sleep(reconnect_msec / 1000);
    }

    set_servant_health(pcmk_health_transient, LOG_INFO, "Connected, waiting for initial membership");
    notify_parent();
    notify_timer_cb(NULL);
}
/*
 * Callback for a lost cluster connection (installed as cluster.destroy).
 *
 * Releases the cmap resources where applicable, marks the servant as
 * unclean, informs the parent and then reconnects - which blocks until a
 * connection is available again.
 */
static void
sbd_membership_destroy(gpointer user_data)
{
    (void) user_data;

    cl_log(LOG_WARNING, "Lost connection to %s", name_for_cluster_type(get_cluster_type()));

    if (get_cluster_type() != pcmk_cluster_unknown) {
#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
        cmap_destroy();
#endif
    }

    set_servant_health(pcmk_health_unclean, LOG_ERR, "Cluster connection terminated");
    notify_parent();

    /* If reconnecting takes too long, the watchdog takes the node down. */
    sbd_membership_connect();
}
/*
 * \internal
 * \brief Look up the PID and process name behind a /proc directory entry
 *
 * \param[in]  entry  Directory entry (must come from readdir() on /proc)
 * \param[out] name   If not NULL, a char[16] receiving the process name
 * \param[out] pid    If not NULL, receives the process ID of the entry
 *
 * \return 0 on success, -1 if the entry is not a process or the
 *         information could not be read
 *
 * \note \p pid is written as soon as the entry name parses as a positive
 *       number, i.e. also when a later step fails.
 * \note Linux only: other systems providing /proc may lay out process
 *       names and IDs differently.
 *       Copied from the Pacemaker implementation.
 */
int
sbd_procfs_process_info(struct dirent *entry, char *name, int *pid)
{
    char path[128] = { 0 };
    struct stat info;
    FILE *status = NULL;
    int dir_fd;
    int stat_rc;
    int matched;
    int numeric_pid = atoi(entry->d_name);

    /* Only entries named after a PID are of interest, so reject anything
     * non-numeric or too long for the path buffer.
     *
     * 114 = 128 - strlen("/proc/") - strlen("/status") - 1
     */
    if (numeric_pid <= 0 || strlen(entry->d_name) > 114) {
        return -1;
    }
    if (pid != NULL) {
        *pid = numeric_pid;
    }

    /* The entry must exist and be a directory */
    snprintf(path, sizeof(path), "/proc/%s", entry->d_name);
    dir_fd = open(path, O_RDONLY);
    if (dir_fd < 0) {
        return -1;
    }
    stat_rc = fstat(dir_fd, &info);
    close(dir_fd);
    if (stat_rc < 0) {
        return -1;
    }
    if (!S_ISDIR(info.st_mode)) {
        return -1;
    }

    if (name == NULL) {
        return 0;
    }

    /* The process name is the first field ("Name:") of the status file.
     * Parsing cmdline instead would cover the valgrind case, but is more
     * trouble than it is worth.
     */
    strcat(path, "/status");
    status = fopen(path, "r");
    if (status == NULL) {
        return -1;
    }
    matched = fscanf(status, "Name:\t%15[a-zA-Z0-9 _-]", name);
    fclose(status);

    return (matched == 1) ? 0 : -1;
}
/*
 * Check whether the pacemaker remote process recorded in remoted_pid is
 * still alive and update the servant health accordingly.
 *
 * The process counts as running if it can be signalled and - provided
 * /proc/<pid>/exe links are readable on this system - its executable is
 * SBINDIR "/" PACEMAKER_REMOTE_BINARY, which guards against the PID
 * having been reused by an unrelated process.
 *
 * The parent is always notified. If the process is not running (or none
 * was ever found) sbd_membership_connect() is called to search again.
 *
 * Always returns true; user_data is unused.
 */
static gboolean
sbd_remote_check(gpointer user_data)
{
    /* 0 = not probed yet, 1 = exe links are readable, -1 = they are not */
    static int have_proc_pid = 0;
    int running = 0;

    cl_log(LOG_DEBUG, "Checking pacemaker remote connection: %d/%d", have_proc_pid, remoted_pid);

    if(have_proc_pid == 0) {
        char proc_path[PATH_MAX], exe_path[PATH_MAX];

        /* probe once, using our own pid, whether exe links can be read */
        snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", (long unsigned int)getpid());

        have_proc_pid = 1;
        if(readlink(proc_path, exe_path, PATH_MAX - 1) < 0) {
            have_proc_pid = -1;
        }
    }

    if (remoted_pid <= 0) {
        set_servant_health(pcmk_health_transient, LOG_WARNING, "No Pacemaker Remote connection");
        goto notify;

    } else if (kill(remoted_pid, 0) < 0 && errno == ESRCH) {
        /* Not running */

    } else if(have_proc_pid == -1) {
        /* the executable cannot be verified - existence has to suffice */
        running = 1;
        cl_log(LOG_DEBUG, "Process %ld is active", (long)remoted_pid);

    } else {
        int rc = 0;
        char proc_path[PATH_MAX], exe_path[PATH_MAX];

        /* check to make sure pid hasn't been reused by another process */
        snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", (long unsigned int)remoted_pid);

        rc = readlink(proc_path, exe_path, PATH_MAX - 1);
        if (rc < 0) {
            crm_perror(LOG_ERR, "Could not read from %s", proc_path);
            goto done;
        }
        /* readlink() does not terminate the string */
        exe_path[rc] = 0;

        if (strcmp(exe_path, SBINDIR "/" PACEMAKER_REMOTE_BINARY) == 0) {
            cl_log(LOG_DEBUG, "Process %s (%ld) is active",
                   exe_path, (long)remoted_pid);
            running = 1;
        }
    }

done:
    if(running) {
        set_servant_health(pcmk_health_online, LOG_INFO,
                           "Connected to Pacemaker Remote %lu", (long unsigned int)remoted_pid);
    } else {
        set_servant_health(pcmk_health_unclean, LOG_WARNING,
                           "Connection to Pacemaker Remote %lu lost", (long unsigned int)remoted_pid);
    }

notify:
    notify_parent();
    if(running == 0) {
        sbd_membership_connect();
    }

    return true;
}
/*
 * Scan /proc for a process whose name matches PACEMAKER_REMOTE_BINARY
 * (compared over the first 15 characters, the length to which process
 * names read from /proc are truncated).
 *
 * On a match the PID is stored in remoted_pid, remote_node is set and the
 * scan stops.
 *
 * Returns the current value of remoted_pid, which is whatever it held
 * before if nothing was found in this scan, or FALSE (0) if /proc
 * cannot be opened.
 */
static long unsigned int
find_pacemaker_remote(void)
{
DIR *dp;
char entry_name[16];
struct dirent *entry;
dp = opendir("/proc");
if (!dp) {
/* no proc directory to search through */
cl_log(LOG_NOTICE, "Can not read /proc directory to track existing components");
return FALSE;
}
while ((entry = readdir(dp)) != NULL) {
int pid;
/* skip entries that are not processes or cannot be read */
if (sbd_procfs_process_info(entry, entry_name, &pid) < 0) {
continue;
}
/* entry_name is truncated to 16 characters including the nul terminator */
- cl_log(LOG_DEBUG, "Found %s at %u", entry_name, pid);
+ cl_log(LOG_DEBUG, "Found %s at %d", entry_name, pid);
if (strncmp(entry_name, PACEMAKER_REMOTE_BINARY, 15) == 0) {
- cl_log(LOG_NOTICE, "Found Pacemaker Remote at PID %u", pid);
+ cl_log(LOG_NOTICE, "Found Pacemaker Remote at PID %d", pid);
remoted_pid = pid;
remote_node = true;
break;
}
}
closedir(dp);
return remoted_pid;
}
/*
 * Release resources held by the cluster servant.
 *
 * Currently this only finalizes the votequorum handle, where that is
 * compiled in. The rc argument is unused and the function returns to the
 * caller; it does not exit the process.
 */
static void
clean_up(int rc)
{
    /* Use the same guard as the code that uses votequorum_handle: the
     * variable is only declared when SUPPORT_COROSYNC is enabled as well.
     */
#if SUPPORT_COROSYNC && CHECK_VOTEQUORUM_HANDLE
    votequorum_finalize(votequorum_handle);
    votequorum_handle = 0; /* there isn't really an invalid handle value
                            * just to be back where we started
                            */
#endif
    (void) rc;
    return;
}
/*
 * Signal handler registered for SIGTERM and SIGINT by servant_cluster():
 * releases the servant's resources via clean_up(). The signal number is
 * not used.
 */
static void
cluster_shutdown(int nsig)
{
    (void) nsig;

    clean_up(0);
}
int
servant_cluster(const char *diskname, int mode, const void* argp)
{
enum cluster_type_e cluster_stack = get_cluster_type();
crm_system_name = strdup("sbd:cluster");
cl_log(LOG_NOTICE, "Monitoring %s cluster health", name_for_cluster_type(cluster_stack));
set_proc_title("sbd: watcher: Cluster");
sbd_membership_connect();
/* stonith_our_uname = cluster.uname; */
/* stonith_our_uuid = cluster.uuid; */
mainloop = g_main_loop_new(NULL, FALSE);
notify_timer = g_timeout_add(timeout_loop * 1000, notify_timer_cb, NULL);
mainloop_add_signal(SIGTERM, cluster_shutdown);
mainloop_add_signal(SIGINT, cluster_shutdown);
g_main_loop_run(mainloop);
g_main_loop_unref(mainloop);
clean_up(0);
return 0; /* never reached */
}
diff --git a/src/sbd-common.c b/src/sbd-common.c
index 3abf75f..7ebf4a7 100644
--- a/src/sbd-common.c
+++ b/src/sbd-common.c
@@ -1,1355 +1,832 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "sbd.h"
#include <sys/reboot.h>
#include <sys/types.h>
-#ifdef __GLIBC__
-#include <sys/sysmacros.h>
-#endif
#include <sys/stat.h>
#include <pwd.h>
#include <unistd.h>
-#include <dirent.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <limits.h>
#ifdef _POSIX_MEMLOCK
# include <sys/mman.h>
#endif
/* Tunable defaults: */
-unsigned long timeout_watchdog = SBD_WATCHDOG_TIMEOUT_DEFAULT;
-int timeout_msgwait = 2 * SBD_WATCHDOG_TIMEOUT_DEFAULT;
-unsigned long timeout_watchdog_warn = calculate_timeout_watchdog_warn(SBD_WATCHDOG_TIMEOUT_DEFAULT);
-bool do_calculate_timeout_watchdog_warn = true;
-int timeout_allocate = 2;
-int timeout_loop = 1;
-int timeout_io = 3;
-int timeout_startup = 120;
-
-int watchdog_use = 1;
-int watchdog_set_timeout = 1;
-unsigned long timeout_watchdog_crashdump = 0;
-int skip_rt = 0;
-int debug = 0;
-int debug_mode = 0;
-char *watchdogdev = NULL;
-bool watchdogdev_is_default = false;
-char * local_uname;
+int timeout_watchdog = SBD_WATCHDOG_TIMEOUT_DEFAULT;
+int timeout_msgwait = 2 * SBD_WATCHDOG_TIMEOUT_DEFAULT;
+
+int timeout_allocate = 2;
+int timeout_loop = 1;
+int timeout_io = 3;
+int timeout_startup = 120;
+
+int watchdog_use = 1;
+int watchdog_set_timeout = 1;
+int timeout_watchdog_crashdump = 0;
+int skip_rt = 0;
+int debug = 0;
+int debug_mode = 0;
/* Global, non-tunable variables: */
-int sector_size = 0;
-int watchdogfd = -1;
-int servant_health = 0;
+int sector_size = 0;
+int servant_health = 0;
-/*const char *devname;*/
-const char *cmdname;
+const char *cmdname;
+char *local_uname;
/*
 * Print the command line help (syntax, options and commands) to stderr.
 * cmdname is inserted as the program name; the commands that operate on a
 * shared disk are only listed when SUPPORT_SHARED_DISK is enabled.
 */
void
usage(void)
{
fprintf(stderr,
"Shared storage fencing tool.\n"
"Syntax:\n"
" %s <options> <command> <cmdarguments>\n"
"Options:\n"
"-d <devname> Block device to use (mandatory; can be specified up to 3 times)\n"
"-h Display this help.\n"
"-n <node> Set local node name; defaults to uname -n (optional)\n"
"\n"
"-R Do NOT enable realtime priority (debugging only)\n"
"-W Use watchdog (recommended) (watch only)\n"
"-w <dev> Specify watchdog device (optional) (watch only)\n"
"-T Do NOT initialize the watchdog timeout (watch only)\n"
"-S <0|1> Set start mode if the node was previously fenced (watch only)\n"
"-p <path> Write pidfile to the specified path (watch only)\n"
"-v|-vv|-vvv Enable verbose|debug|debug-library logging (optional)\n"
"\n"
"-1 <N> Set watchdog timeout to N seconds (optional, create only)\n"
"-2 <N> Set slot allocation timeout to N seconds (optional, create only)\n"
"-3 <N> Set daemon loop timeout to N seconds (optional, create only)\n"
"-4 <N> Set msgwait timeout to N seconds (optional, create only)\n"
"-5 <N> Warn if loop latency exceeds threshold (optional, watch only)\n"
" (default is 3, set to 0 to disable)\n"
"-C <N> Watchdog timeout to set before crashdumping\n"
" (def: 0s = disable gracefully, optional)\n"
"-I <N> Async IO read timeout (defaults to 3 * loop timeout, optional)\n"
"-s <N> Timeout to wait for devices to become available (def: 120s)\n"
"-t <N> Dampening delay before faulty servants are restarted (optional)\n"
" (default is 5, set to 0 to disable)\n"
"-F <N> # of failures before a servant is considered faulty (optional)\n"
" (default is 1, set to 0 to disable)\n"
"-P Check Pacemaker quorum and node health (optional, watch only)\n"
"-Z Enable trace mode. WARNING: UNSAFE FOR PRODUCTION!\n"
"-r Set timeout-action to comma-separated combination of\n"
" noflush|flush plus reboot|crashdump|off (default is flush,reboot)\n"
"Commands:\n"
/* commands operating on the shared disk */
#if SUPPORT_SHARED_DISK
"create initialize N slots on <dev> - OVERWRITES DEVICE!\n"
"list List all allocated slots on device, and messages.\n"
"dump Dump meta-data header from device.\n"
"allocate <node>\n"
" Allocate a slot for node (optional)\n"
"message <node> (test|reset|off|crashdump|clear|exit)\n"
" Writes the specified message to node's slot.\n"
#endif
"watch Loop forever, monitoring own slot\n"
"query-watchdog Check for available watchdog-devices and print some info\n"
"test-watchdog Test the watchdog-device selected.\n"
" Attention: This will arm the watchdog and have your system reset\n"
" in case your watchdog is working properly!\n"
, cmdname);
}
-#define MAX_WATCHDOGS 64
-#define SYS_CLASS_WATCHDOG "/sys/class/watchdog"
-#define SYS_CHAR_DEV_DIR "/sys/dev/char"
-#define WATCHDOG_NODEDIR "/dev/"
-
-static bool
-is_watchdog(dev_t device)
-{
- static int num_watchdog_devs = 0;
- static dev_t watchdog_devs[MAX_WATCHDOGS];
- struct dirent *entry;
- int i;
-
- /* populate on first call */
- if (num_watchdog_devs == 0) {
- DIR *dp;
-
- watchdog_devs[0] = makedev(10,130);
- num_watchdog_devs = 1;
-
- /* get additional devices from /sys/class/watchdog */
- dp = opendir(SYS_CLASS_WATCHDOG);
- if (dp) {
- while ((entry = readdir(dp))) {
- if (entry->d_type == DT_LNK) {
- FILE *file;
- char entry_name[NAME_MAX+sizeof(SYS_CLASS_WATCHDOG)+5];
-
- snprintf(entry_name, sizeof(entry_name),
- SYS_CLASS_WATCHDOG "/%s/dev", entry->d_name);
- file = fopen(entry_name, "r");
- if (file) {
- int major, minor;
-
- if (fscanf(file, "%d:%d", &major, &minor) == 2) {
- watchdog_devs[num_watchdog_devs++] = makedev(major, minor);
- }
- fclose(file);
- if (num_watchdog_devs == MAX_WATCHDOGS) {
- break;
- }
- }
- }
- }
- closedir(dp);
- }
- }
-
- for (i=0; i < num_watchdog_devs; i++) {
- if (device == watchdog_devs[i]) {
- return true;
- }
- }
- return false;
-}
-
-static int
-watchdog_init_interval_fd(int wdfd, int timeout)
-{
- if (ioctl(wdfd, WDIOC_SETTIMEOUT, &timeout) < 0) {
- cl_perror( "WDIOC_SETTIMEOUT"
- ": Failed to set watchdog timer to %u seconds.",
- timeout);
- cl_log(LOG_CRIT, "Please validate your watchdog configuration!");
- cl_log(LOG_CRIT, "Choose a different watchdog driver or specify -T to skip this if you are completely sure.");
- return -1;
- }
- return 0;
-}
-
-int
-watchdog_init_interval(void)
-{
- if (watchdogfd < 0) {
- return 0;
- }
-
- if (watchdog_set_timeout == 0) {
- cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!");
- return 0;
- }
-
- if (watchdog_init_interval_fd(watchdogfd, timeout_watchdog) < 0) {
- return -1;
- }
- cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", (int) timeout_watchdog);
- return 0;
-}
-
-static int
-watchdog_tickle_fd(int wdfd, char *wddev)
-{
- if (write(wdfd, "", 1) != 1) {
- cl_perror("Watchdog write failure: %s!", wddev);
- return -1;
- }
- return 0;
-}
-
-int
-watchdog_tickle(void)
-{
- if (watchdogfd >= 0) {
- return watchdog_tickle_fd(watchdogfd, watchdogdev);
- }
- return 0;
-}
-
-static int
-watchdog_init_fd(char *wddev, int timeout)
-{
- int wdfd;
-
- wdfd = open(wddev, O_WRONLY);
- if (wdfd >= 0) {
- if (((timeout >= 0) && (watchdog_init_interval_fd(wdfd, timeout) < 0)) ||
- (watchdog_tickle_fd(wdfd, wddev) < 0)) {
- close(wdfd);
- return -1;
- }
- } else {
- struct stat statbuf;
-
- if(!stat(wddev, &statbuf) && S_ISCHR(statbuf.st_mode) &&
- is_watchdog(statbuf.st_rdev)) {
- cl_perror("Cannot open watchdog device '%s'", wddev);
- } else {
- cl_perror("Seems as if '%s' isn't a valid watchdog-device", wddev);
- }
- return -1;
- }
- return wdfd;
-}
-
-int
-watchdog_init(void)
-{
- if (watchdogfd < 0 && watchdogdev != NULL) {
- int timeout = timeout_watchdog;
-
- if (watchdog_set_timeout == 0) {
- cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!");
- timeout = -1;
- }
- watchdogfd = watchdog_init_fd(watchdogdev, timeout);
- if (watchdogfd >= 0) {
- cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev);
- if (watchdog_set_timeout) {
- cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", (int) timeout_watchdog);
- }
- } else {
- return -1;
- }
- }
- return 0;
-}
-
-static void
-watchdog_close_fd(int wdfd, char *wddev, bool disarm)
-{
- if (disarm) {
- int r;
- int flags = WDIOS_DISABLECARD;;
-
- /* Explicitly disarm it */
- r = ioctl(wdfd, WDIOC_SETOPTIONS, &flags);
- if (r < 0) {
- cl_perror("Failed to disable hardware watchdog %s", wddev);
- }
-
- /* To be sure, use magic close logic, too */
- for (;;) {
- if (write(wdfd, "V", 1) > 0) {
- break;
- }
- cl_perror("Cannot disable watchdog device %s", wddev);
- }
- }
-
- if (close(wdfd) < 0) {
- cl_perror("Watchdog close(%d) failed", wdfd);
- }
-}
-
-void
-watchdog_close(bool disarm)
-{
- if (watchdogfd < 0) {
- return;
- }
-
- watchdog_close_fd(watchdogfd, watchdogdev, disarm);
- watchdogfd = -1;
-}
-
-struct watchdog_list_item {
- dev_t dev;
- char *dev_node;
- char *dev_ident;
- char *dev_driver;
- pid_t busy_pid;
- char *busy_name;
- struct watchdog_list_item *next;
-};
-
-struct link_list_item {
- char *dev_node;
- char *link_name;
- struct link_list_item *next;
-};
-
-static struct watchdog_list_item *watchdog_list = NULL;
-static int watchdog_list_items = 0;
-
-static void
-watchdog_populate_list(void)
-{
- struct dirent *entry;
- char entry_name[sizeof(WATCHDOG_NODEDIR)+NAME_MAX];
- DIR *dp;
- char buf[NAME_MAX+sizeof(WATCHDOG_NODEDIR)] = "";
- struct link_list_item *link_list = NULL;
-
- if (watchdog_list != NULL) {
- return;
- }
-
- /* search for watchdog nodes in /dev */
- dp = opendir(WATCHDOG_NODEDIR);
- if (dp) {
- /* first go for links and memorize them */
- while ((entry = readdir(dp))) {
- if (entry->d_type == DT_LNK) {
- int len;
-
- snprintf(entry_name, sizeof(entry_name),
- WATCHDOG_NODEDIR "%s", entry->d_name);
-
- /* realpath(entry_name, buf) unfortunately does a stat on
- * target so we can't really use it to check if links stay
- * within /dev without triggering e.g. AVC-logs (with
- * SELinux policy that just allows stat within /dev).
- * Without canonicalization that doesn't actually touch the
- * filesystem easily available introduce some limitations
- * for simplicity:
- * - just simple path without '..'
- * - just one level of symlinks (avoid e.g. loop-checking)
- */
- len = readlink(entry_name, buf, sizeof(buf) - 1);
- if ((len < 1) ||
- (len > sizeof(buf) - sizeof(WATCHDOG_NODEDIR) -1 - 1)) {
- continue;
- }
- buf[len] = '\0';
- if (buf[0] != '/') {
- memmove(&buf[sizeof(WATCHDOG_NODEDIR)-1], buf, len+1);
- memcpy(buf, WATCHDOG_NODEDIR, sizeof(WATCHDOG_NODEDIR)-1);
- len += sizeof(WATCHDOG_NODEDIR)-1;
- }
- if (strstr(buf, "/../") ||
- strncmp(WATCHDOG_NODEDIR, buf, sizeof(WATCHDOG_NODEDIR)-1)) {
- continue;
- } else {
- /* just memorize to avoid statting the target - SELinux */
- struct link_list_item *lli =
- calloc(1, sizeof(struct link_list_item));
-
- if (lli == NULL) {
- break;
- }
- lli->dev_node = strdup(buf);
- lli->link_name = strdup(entry_name);
- if ((lli->dev_node == NULL) || (lli->link_name == NULL)) {
- free(lli->dev_node);
- free(lli->link_name);
- free(lli);
- break;
- }
- lli->next = link_list;
- link_list = lli;
- }
- }
- }
-
- rewinddir(dp);
-
- while ((entry = readdir(dp))) {
- if (entry->d_type == DT_CHR) {
- struct stat statbuf;
-
- snprintf(entry_name, sizeof(entry_name),
- WATCHDOG_NODEDIR "%s", entry->d_name);
- if(!stat(entry_name, &statbuf) && S_ISCHR(statbuf.st_mode) &&
- is_watchdog(statbuf.st_rdev)) {
-
- int wdfd;
- struct watchdog_list_item *wdg =
- calloc(1, sizeof(struct watchdog_list_item));
- int len;
- struct link_list_item *tmp_list = NULL;
-
- if (wdg == NULL) {
- break;
- }
-
- wdg->dev = statbuf.st_rdev;
- wdg->dev_node = strdup(entry_name);
- if (wdg->dev_node == NULL) {
- free(wdg);
- break;
- }
- wdg->next = watchdog_list;
- watchdog_list = wdg;
- watchdog_list_items++;
-
- wdfd = watchdog_init_fd(entry_name, -1);
- if (wdfd >= 0) {
- struct watchdog_info ident;
-
- ident.identity[0] = '\0';
- ioctl(wdfd, WDIOC_GETSUPPORT, &ident);
- watchdog_close_fd(wdfd, entry_name, true);
- if (ident.identity[0]) {
- wdg->dev_ident = strdup((char *) ident.identity);
- }
- }
-
- snprintf(entry_name, sizeof(entry_name),
- SYS_CHAR_DEV_DIR "/%d:%d/device/driver",
- major(wdg->dev), minor(wdg->dev));
- len = readlink(entry_name, buf, sizeof(buf) - 1);
- if (len > 0) {
- buf[len] = '\0';
- wdg->dev_driver = strdup(basename(buf));
- } else if ((wdg->dev_ident) &&
- (strcmp(wdg->dev_ident,
- "Software Watchdog") == 0)) {
- wdg->dev_driver = strdup("softdog");
- }
-
- /* create dupes if we have memorized links
- * to this node
- */
- for (tmp_list = link_list; tmp_list;
- tmp_list = tmp_list->next) {
- if (!strcmp(tmp_list->dev_node,
- wdg->dev_node)) {
- struct watchdog_list_item *dupe_wdg =
- calloc(1, sizeof(struct watchdog_list_item));
-
- if (dupe_wdg == NULL) {
- break;
- }
- /* as long as we never purge watchdog_list
- * there is no need to dupe strings
- */
- *dupe_wdg = *wdg;
- dupe_wdg->dev_node = strdup(tmp_list->link_name);
- if (dupe_wdg->dev_node == NULL) {
- free(dupe_wdg);
- break;
- }
- dupe_wdg->next = watchdog_list;
- watchdog_list = dupe_wdg;
- watchdog_list_items++;
- }
- /* for performance reasons we could remove
- * the link_list entry
- */
- }
- }
- }
- }
-
- closedir(dp);
- }
-
- /* cleanup link list */
- while (link_list) {
- struct link_list_item *tmp_list = link_list;
-
- link_list = link_list->next;
- free(tmp_list->dev_node);
- free(tmp_list->link_name);
- free(tmp_list);
- }
-}
-
-static void
-watchdog_checkbusy()
-{
- DIR *dproc;
- struct dirent *entry;
-
- dproc = opendir("/proc");
- if (!dproc) {
- /* no proc directory to search through */
- return;
- }
-
- while ((entry = readdir(dproc)) != NULL) {
- pid_t local_pid;
- char *leftover;
- DIR *dpid;
- char procpath[NAME_MAX+10] = { 0 };
-
- if (entry->d_name[0] == '.') {
- continue;
- }
-
- local_pid = strtol(entry->d_name, &leftover, 10);
- if (leftover[0] != '\0')
- continue;
-
- snprintf(procpath, sizeof(procpath), "/proc/%s/fd", entry->d_name);
- dpid = opendir(procpath);
- if (!dpid) {
- /* silently continue - might be just a race */
- continue;
- }
- while ((entry = readdir(dpid)) != NULL) {
- struct watchdog_list_item *wdg;
- char entry_name[sizeof(procpath)+NAME_MAX+1] = { 0 };
- char buf[NAME_MAX+1] = { 0 };
- int len;
-
- if (entry->d_type != DT_LNK) {
- continue;
- }
- snprintf(entry_name, sizeof(entry_name),
- "%s/%s", procpath, entry->d_name);
- len = readlink(entry_name, buf, sizeof(buf) - 1);
- if (len < 1) {
- continue;
- }
- buf[len] = '\0';
- for (wdg = watchdog_list; wdg != NULL; wdg = wdg->next) {
- if (!strcmp(buf, wdg->dev_node)) {
- char name[16];
- FILE *file;
-
- wdg->busy_pid = local_pid;
- snprintf(procpath, sizeof(procpath), "/proc/%d/status", local_pid);
- file = fopen(procpath, "r");
- if (file) {
- if (fscanf(file, "Name:\t%15[a-zA-Z0-9 _-]", name) == 1) {
- wdg->busy_name = strdup(name);
- }
- fclose(file);
- }
- }
- }
- }
- closedir(dpid);
- }
-
- closedir(dproc);
-
- return;
-}
-
-int watchdog_info(void)
-{
- struct watchdog_list_item *wdg;
- int wdg_cnt = 0;
-
- watchdog_populate_list();
- watchdog_checkbusy();
- printf("\nDiscovered %d watchdog devices:\n", watchdog_list_items);
- for (wdg = watchdog_list; wdg != NULL; wdg = wdg->next) {
- wdg_cnt++;
- if (wdg->busy_pid) {
- printf("\n[%d] %s\nIdentity: Busy: PID %d (%s)\nDriver: %s\n",
- wdg_cnt, wdg->dev_node,
- wdg->busy_pid,
- wdg->busy_name?wdg->busy_name:"<unknown>",
- wdg->dev_driver?wdg->dev_driver:"<unknown>");
- } else {
- printf("\n[%d] %s\nIdentity: %s\nDriver: %s\n",
- wdg_cnt, wdg->dev_node,
- wdg->dev_ident?wdg->dev_ident:
- "Error: device hogged via alias major/minor?",
- wdg->dev_driver?wdg->dev_driver:"<unknown>");
- }
- if ((wdg->dev_driver) && (strcmp(wdg->dev_driver, "softdog") == 0)) {
- printf("CAUTION: Not recommended for use with sbd.\n");
- }
- }
-
- return 0;
-}
-
-int watchdog_test(void)
-{
- int i;
-
- if ((watchdog_set_timeout == 0) || !watchdog_use) {
- printf("\nWatchdog is disabled - aborting test!!!\n");
- return 0;
- }
- if (watchdogdev_is_default) {
- watchdog_populate_list();
- if (watchdog_list_items > 1) {
- printf("\nError: Multiple watchdog devices discovered.\n"
- " Use -w <watchdog> or SBD_WATCHDOG_DEV to specify\n"
- " which device to reset the system with\n");
- watchdog_info();
- return -1;
- }
- }
- if ((isatty(fileno(stdin)))) {
- char buffer[16];
- printf("\nWARNING: This operation is expected to force-reboot this system\n"
- " without following any shutdown procedures.\n\n"
- "Proceed? [NO/Proceed] ");
-
- if ((fgets(buffer, 16, stdin) == NULL) ||
- strcmp(buffer, "Proceed\n")) {
- printf("\nAborting watchdog test!!!\n");
- return 0;
- }
- printf("\n");
- }
- printf("Initializing %s with a reset countdown of %d seconds ...\n",
- watchdogdev, (int) timeout_watchdog);
- if ((watchdog_init() < 0) || (watchdog_init_interval() < 0)) {
- printf("Failed to initialize watchdog!!!\n");
- watchdog_info();
- return -1;
- }
- printf("\n");
- printf("NOTICE: The watchdog device is expected to reset the system\n"
- " in %d seconds. If system remains active beyond that time,\n"
- " watchdog may not be functional.\n\n", (int) timeout_watchdog);
- for (i=timeout_watchdog; i>1; i--) {
- printf("Reset countdown ... %d seconds\n", i);
- sleep(1);
- }
- for (i=2; i>0; i--) {
- printf("System expected to reset any moment ...\n");
- sleep(1);
- }
- for (i=5; i>0; i--) {
- printf("System should have reset ...\n");
- sleep(1);
- }
- printf("Error: The watchdog device has failed to reboot the system,\n"
- " and it may not be suitable for usage with sbd.\n");
-
- /* test should trigger a reboot thus returning is actually bad */
- return -1;
-}
-
/* This duplicates some code from linux/ioprio.h since these are not included
* even in linux-kernel-headers. Sucks. See also
* /usr/src/linux/Documentation/block/ioprio.txt and ioprio_set(2) */
extern int sys_ioprio_set(int, int, int);
/* Thin wrapper around the ioprio_set system call: the three arguments are
* handed to syscall(2) unchanged and the syscall's result is returned. */
int ioprio_set(int which, int who, int ioprio);
inline int ioprio_set(int which, int who, int ioprio)
{
return syscall(__NR_ioprio_set, which, who, ioprio);
}
/* I/O priority classes; the values 0..3 follow from declaration order. */
enum {
IOPRIO_CLASS_NONE,
IOPRIO_CLASS_RT,
IOPRIO_CLASS_BE,
IOPRIO_CLASS_IDLE,
};
/* Values for the first ("which") argument of ioprio_set(); numbering
* starts at 1. */
enum {
IOPRIO_WHO_PROCESS = 1,
IOPRIO_WHO_PGRP,
IOPRIO_WHO_USER,
};
/*
 * Helpers for packing and unpacking an I/O priority value: the class lives
 * in the bits at and above IOPRIO_CLASS_SHIFT, the per-class data in the
 * bits below it.
 *
 * Every macro argument is parenthesized so that an argument expression
 * containing low-precedence operators (e.g. ?:) expands as intended.
 */
#define IOPRIO_BITS (16)
#define IOPRIO_CLASS_SHIFT (13)
#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1)
#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT)
#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK)
#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | (data))
/*
* Recursively touch stack memory. Each call level owns a 1024-byte local
* buffer which is filled with HOG_CHAR (when inbuf is NULL) or with a copy
* of the caller's buffer, and then recurses with kbytes-1.
*
* inbuf:  1024-byte buffer of the calling level, or NULL on the first call
* kbytes: remaining recursion depth; a value <= 0 returns immediately
*/
static void
sbd_stack_hogger(unsigned char * inbuf, int kbytes)
{
unsigned char buf[1024];
if(kbytes <= 0) {
return;
}
if (inbuf == NULL) {
memset(buf, HOG_CHAR, sizeof(buf));
} else {
memcpy(buf, inbuf, sizeof(buf));
}
/* always true at this point because of the guard at the top */
if (kbytes > 0) {
sbd_stack_hogger(buf, kbytes-1);
}
return;
}
/*
 * Touch heap memory: allocate kbytes blocks of 1024 bytes each, zero every
 * block that could be allocated, then free all of them again.
 *
 * kbytes: number of 1024-byte blocks; a value <= 0 is a no-op
 */
static void
sbd_malloc_hogger(int kbytes)
{
    const int block_size = 1024;
    void **blocks;
    int idx;

    if (kbytes <= 0) {
        return;
    }

    /*
     * We could call mallopt(M_MMAP_MAX, 0) to disable it completely,
     * but we've already called mlockall()
     *
     * We could also call mallopt(M_TRIM_THRESHOLD, -1) to prevent malloc
     * from giving memory back to the system, but we've already called
     * mlockall(MCL_FUTURE), so there's no need.
     */
    blocks = malloc(kbytes * sizeof(void *));
    if (blocks == NULL) {
        cl_log(LOG_WARNING, "Could not preallocate chunk array");
        return;
    }

    for (idx = 0; idx < kbytes; ++idx) {
        void *block = malloc(block_size);

        blocks[idx] = block;
        if (block != NULL) {
            memset(block, 0, block_size);
        } else {
            cl_log(LOG_WARNING, "Could not preallocate block %d", idx);
        }
    }

    /* free(NULL) is a no-op, so failed slots need no special casing */
    idx = 0;
    while (idx < kbytes) {
        free(blocks[idx]);
        idx++;
    }
    free(blocks);
}
/*
 * Lock all current and future pages of the process into memory and, when
 * that worked, touch some extra heap and stack.
 *
 * stackgrowK: depth handed to sbd_stack_hogger()
 * heapgrowK:  block count handed to sbd_malloc_hogger()
 *
 * Without _POSIX_MEMLOCK only an error is logged.
 */
static void sbd_memlock(int stackgrowK, int heapgrowK)
{
#ifdef _POSIX_MEMLOCK
    /*
     * We could call setrlimit(RLIMIT_MEMLOCK,...) with a large
     * number, but the mcp runs as root and mlock(2) says:
     *
     * Since Linux 2.6.9, no limits are placed on the amount of memory
     * that a privileged process may lock, and this limit instead
     * governs the amount of memory that an unprivileged process may
     * lock.
     */
    if (mlockall(MCL_CURRENT | MCL_FUTURE) < 0) {
        cl_perror("Unable to lock ourselves into memory");
        return;
    }

    cl_log(LOG_INFO, "Locked ourselves in memory");

    /* Now allocate some extra pages (MCL_FUTURE will ensure they stay around) */
    sbd_malloc_hogger(heapgrowK);
    sbd_stack_hogger(NULL, stackgrowK);
#else
    cl_log(LOG_ERR, "Unable to lock ourselves into memory");
#endif
}
/*
* Look up the realtime budget of the cgroup this process lives in.
*
* Scans /proc/<pid>/cgroup for the first line whose controller list contains
* "cpuacct", takes the cgroup path from that line and reads
* /sys/fs/cgroup/cpu<path>/cpu.rt_runtime_us.
*
* Returns the integer read from that file; res stays at its initial -1 when
* any file cannot be opened or no matching cgroup line is found.
*/
static int get_realtime_budget(void)
{
FILE *f;
char fname[PATH_MAX];
int res = -1, lnum = 0, num;
char *cgroup = NULL, *namespecs = NULL;
snprintf(fname, PATH_MAX, "/proc/%jd/cgroup", (intmax_t)getpid());
f = fopen(fname, "rt");
if (f == NULL) {
cl_log(LOG_WARNING, "Can't open cgroup file for pid=%jd",
(intmax_t)getpid());
goto exit_res;
}
/* each line is parsed as <number>:<namespecs>:<cgroup>; the two %m
* conversions allocate the strings, which therefore must be freed here */
while( (num = fscanf(f, "%d:%m[^:]:%m[^\n]\n", &lnum,
&namespecs, &cgroup)) !=EOF ) {
if (namespecs && strstr(namespecs, "cpuacct")) {
/* match: keep cgroup for use after the loop */
free(namespecs);
break;
}
if (cgroup) {
free(cgroup);
cgroup = NULL;
}
if (namespecs) {
free(namespecs);
namespecs = NULL;
}
/* not to get stuck if format changes */
if ((num < 3) && ((fscanf(f, "%*[^\n]") == EOF) ||
(fscanf(f, "\n") == EOF))) {
break;
}
}
fclose(f);
if (cgroup == NULL) {
cl_log(LOG_WARNING, "Failed getting cgroup for pid=%jd",
(intmax_t)getpid());
goto exit_res;
}
snprintf(fname, PATH_MAX, "/sys/fs/cgroup/cpu%s/cpu.rt_runtime_us",
cgroup);
f = fopen(fname, "rt");
if (f == NULL) {
cl_log(LOG_WARNING, "cpu.rt_runtime_us existed for root-slice but "
"doesn't for '%s'", cgroup);
goto exit_res;
}
if (fscanf(f, "%d", &res) != 1) {
cl_log(LOG_WARNING, "failed reading rt-budget from %s", fname);
} else {
cl_log(LOG_INFO, "slice='%s' has rt-budget=%d", cgroup, res);
}
fclose(f);
exit_res:
/* single exit point so the cgroup string is released on every path */
if (cgroup) {
free(cgroup);
}
return res;
}
/* stolen from corosync */
/*
 * Move this process into the root cpu cgroup by writing its pid to
 * /sys/fs/cgroup/cpu/tasks.
 *
 * enforce_root_cgroup: when false, the move is skipped if
 *                      get_realtime_budget() reports a budget > 0 for the
 *                      cgroup we are currently in.
 *
 * Returns 0 when nothing had to be done or the pid was written and the
 * file closed successfully, -1 when the tasks file could not be opened,
 * written or closed.
 */
static int sbd_move_to_root_cgroup(bool enforce_root_cgroup) {
    FILE *f;
    int res = -1;

    /*
     * /sys/fs/cgroup is hardcoded, because most of Linux distributions are now
     * using systemd and systemd uses hardcoded path of cgroup mount point.
     *
     * This feature is expected to be removed as soon as systemd gets support
     * for managing RT configuration.
     */
    f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt");
    if (f == NULL) {
        cl_log(LOG_DEBUG, "cpu.rt_runtime_us doesn't exist -> "
               "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED");
        return 0;
    }
    fclose(f);

    if ((!enforce_root_cgroup) && (get_realtime_budget() > 0)) {
        cl_log(LOG_DEBUG, "looks as if we have rt-budget in the slice we are "
               "-> skip moving to root-slice");
        return 0;
    }

    f = fopen("/sys/fs/cgroup/cpu/tasks", "w");
    if (f == NULL) {
        cl_log(LOG_WARNING, "Can't open cgroups tasks file for writing");
        return -1;
    }

    if (fprintf(f, "%jd\n", (intmax_t)getpid()) <= 0) {
        cl_log(LOG_WARNING, "Can't write sbd pid into cgroups tasks file");
    } else {
        res = 0;
    }

    /* the buffered write only reaches the file on fclose(), so a failure
     * here means the move did not happen either */
    if (fclose(f) != 0) {
        cl_log(LOG_WARNING, "Can't close cgroups tasks file");
        res = -1;
    }

    return res;
}
/*
 * Raise the scheduling priority of this process and lock it into memory.
 *
 * priority:   < 0 makes the whole function a no-op; 0 selects the maximum
 *             SCHED_RR priority; other values are clamped to the valid
 *             SCHED_RR range.
 * stackgrowK: forwarded to sbd_memlock() as its stack argument
 * heapgrowK:  forwarded to sbd_memlock() as its heap argument
 *
 * When SCHED_RR is already active with at least the requested priority, or
 * switching to SCHED_RR succeeds, the setpriority() fallback is skipped.
 */
void
sbd_make_realtime(int priority, int stackgrowK, int heapgrowK)
{
    if (priority < 0) {
        return;
    }

    /* do/while(0) only exists so that "break" can skip the fallback below */
    do {
#ifdef SCHED_RR
        if (move_to_root_cgroup) {
            sbd_move_to_root_cgroup(enforce_moving_to_root_cgroup);
        }

        {
            int pmin = sched_get_priority_min(SCHED_RR);
            int pmax = sched_get_priority_max(SCHED_RR);
            struct sched_param sp;
            int pcurrent;

            if (priority == 0) {
                priority = pmax;
            } else if (priority < pmin) {
                priority = pmin;
            } else if (priority > pmax) {
                priority = pmax;
            }

            if (sched_getparam(0, &sp) < 0) {
                cl_perror("Unable to get scheduler priority");
            } else if ((pcurrent = sched_getscheduler(0)) < 0) {
                cl_perror("Unable to get scheduler policy");
            } else if ((pcurrent == SCHED_RR) &&
                       (sp.sched_priority >= priority)) {
                cl_log(LOG_INFO,
                       "Stay with priority (%d) for policy SCHED_RR",
                       sp.sched_priority);
                break;
            } else {
                memset(&sp, 0, sizeof(sp));
                sp.sched_priority = priority;
                if (sched_setscheduler(0, SCHED_RR, &sp) < 0) {
                    cl_perror(
                        "Unable to set scheduler policy to SCHED_RR priority %d",
                        priority);
                } else {
                    cl_log(LOG_INFO,
                           "Scheduler policy is now SCHED_RR priority %d",
                           priority);
                    break;
                }
            }
        }
#else
        cl_log(LOG_ERR, "System does not support updating the scheduler policy");
#endif

#ifdef PRIO_PGRP
        if (setpriority(PRIO_PGRP, 0, INT_MIN) < 0) {
            cl_perror("Unable to raise the scheduler priority");
        } else {
            cl_log(LOG_INFO, "Scheduler priority raised to the maximum");
        }
#else
        cl_perror("System does not support setting the scheduler priority");
#endif
    } while (0);

    /* sbd_memlock() takes (stack, heap) in that order */
    sbd_memlock(stackgrowK, heapgrowK);
}
/*
 * Elevate CPU and I/O priority of the calling process unless skip_rt is set:
 * sbd_make_realtime(0, 256, 256) followed by an ioprio_set() request for the
 * realtime I/O class with data value 1.
 */
void
maximize_priority(void)
{
    int rt_ioprio;

    if (skip_rt) {
        cl_log(LOG_INFO, "Not elevating to realtime (-R specified).");
        return;
    }

    sbd_make_realtime(0, 256, 256);

    rt_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 1);
    if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), rt_ioprio) != 0) {
        cl_perror("ioprio_set() call failed.");
    }
}
/*
 * Make sure the sysrq functions needed later are enabled.
 *
 * Reads the current value of /proc/sys/kernel/sysrq; a value of exactly 1
 * is left alone, otherwise bits 8 and 128 are OR-ed in and the result is
 * written back. An unparsable value is treated as 0.
 */
void
sysrq_init(void)
{
    FILE *fp;
    int mask;

    fp = fopen("/proc/sys/kernel/sysrq", "r");
    if (fp == NULL) {
        cl_perror("cannot open /proc/sys/kernel/sysrq for read.");
        return;
    }
    if (fscanf(fp, "%d", &mask) != 1) {
        cl_perror("Parsing sysrq failed");
        mask = 0;
    }
    fclose(fp);

    if (mask != 1) {
        /* 8 for debugging dumps of processes,
           128 for reboot/poweroff */
        mask |= (8 | 128);

        fp = fopen("/proc/sys/kernel/sysrq", "w");
        if (fp == NULL) {
            cl_perror("cannot open /proc/sys/kernel/sysrq for writing");
            return;
        }
        fprintf(fp, "%d", mask);
        fclose(fp);
    }
}
/*
 * Write the single command character t, followed by a newline, to
 * /proc/sysrq-trigger. The request is logged first; if the file cannot be
 * opened only an error is logged.
 */
void
sysrq_trigger(char t)
{
    FILE *trigger = fopen("/proc/sysrq-trigger", "a");

    if (trigger == NULL) {
        cl_perror("Opening sysrq-trigger failed.");
        return;
    }

    cl_log(LOG_INFO, "sysrq-trigger: %c\n", t);
    fprintf(trigger, "%c\n", t);
    fclose(trigger);
}
/*
* Take the node down (or, depending on debug_mode, merely pretend to).
*
* kind:     'b' reboot, 'c' crashdump, 'o' power off; anything else is
*           logged as "unknown" and handled like a non-crashdump request
* do_flush: call sync() before acting
*
* Never returns: every path ends in exit().
*/
static void
do_exit(char kind, bool do_flush)
{
/* TODO: Turn debug_mode into a bit field? Delay + kdump for example */
const char *reason = NULL;
if (kind == 'c') {
cl_log(LOG_NOTICE, "Initiating kdump");
} else if (debug_mode == 1) {
/* debug mode 1: turn any request into a crashdump */
cl_log(LOG_WARNING, "Initiating kdump instead of panicking the node (debug mode)");
kind = 'c';
}
if (debug_mode == 2) {
/* debug mode 2: close the watchdog and leave without touching the node */
cl_log(LOG_WARNING, "Shutting down SBD instead of panicking the node (debug mode)");
watchdog_close(true);
exit(0);
}
if (debug_mode == 3) {
/* Give the system some time to flush logs to disk before rebooting. */
cl_log(LOG_WARNING, "Delaying node panic by 10s (debug mode)");
watchdog_close(true);
sync();
sleep(10);
}
switch(kind) {
case 'b':
reason = "reboot";
break;
case 'c':
reason = "crashdump";
break;
case 'o':
reason = "off";
break;
default:
reason = "unknown";
break;
}
cl_log(LOG_EMERG, "Rebooting system: %s", reason);
if (do_flush) {
sync();
}
if (kind == 'c') {
if (timeout_watchdog_crashdump) {
/* a dedicated crashdump timeout is configured: apply it if it differs
* from the current one, then close the watchdog with "false" */
if (timeout_watchdog != timeout_watchdog_crashdump) {
timeout_watchdog = timeout_watchdog_crashdump;
watchdog_init_interval();
}
watchdog_close(false);
} else {
watchdog_close(true);
}
sysrq_trigger(kind);
} else {
watchdog_close(false);
sysrq_trigger(kind);
/* reached only if the sysrq trigger did not take the node down */
if (reboot((kind == 'o')?RB_POWER_OFF:RB_AUTOBOOT) < 0) {
cl_perror("%s failed", (kind == 'o')?"Poweroff":"Reboot");
}
}
exit(1);
}
/* Request a crashdump: do_exit() with kind 'c' and flushing enabled. */
void
do_crashdump(void)
{
do_exit('c', true);
}
/* Request a reboot: do_exit() with kind 'b' and flushing enabled. */
void
do_reset(void)
{
do_exit('b', true);
}
/* Request a power-off: do_exit() with kind 'o' and flushing enabled. */
void
do_off(void)
{
do_exit('o', true);
}
/* Run do_exit() with the globally configured action character
* (timeout_sysrq_char) and flush setting (do_flush). */
void
do_timeout_action(void)
{
do_exit(timeout_sysrq_char, do_flush);
}
/*
 * Change the working directory to CRM_CORE_DIR, the place core files are
 * supposed to end up in. Call after the final user id has been established.
 *
 * Returns the result of chdir(); on failure the error is logged and errno
 * is restored to the value chdir() left behind.
 */
int
sbd_cdtocoredir(void)
{
    const char *core_dir = CRM_CORE_DIR;
    int rc = chdir(core_dir);

    if (rc < 0) {
        int saved_errno = errno;

        cl_perror("Cannot chdir to [%s]", core_dir);
        errno = saved_errno;
    }
    return rc;
}
/*
* Fork a background child.
*
* Parent: returns the child's pid. If fork() fails the error is logged and
* the process exits with status 1.
* Child:  turns off stderr logging, raises its priority, initialises sysrq,
* re-points stdin/stdout/stderr at /dev/null, changes to the core
* directory and returns 0.
*/
pid_t
make_daemon(void)
{
pid_t pid;
const char * devnull = "/dev/null";
pid = fork();
if (pid < 0) {
cl_log(LOG_ERR, "%s: could not start daemon\n",
cmdname);
cl_perror("fork");
exit(1);
}else if (pid > 0) {
return pid;
}
qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE);
/* This is the child; ensure privileges have not been lost. */
maximize_priority();
sysrq_init();
umask(022);
/* each close() is immediately followed by an open() so that the freed
* descriptor number is the one that gets reused for /dev/null */
close(0);
(void)open(devnull, O_RDONLY);
close(1);
(void)open(devnull, O_WRONLY);
close(2);
(void)open(devnull, O_WRONLY);
sbd_cdtocoredir();
return 0;
}
/*
 * Store a lower-cased, heap-allocated copy of the local node name (as
 * reported by uname()) in local_uname.
 *
 * Exits with status 1 when uname() fails or the copy cannot be allocated.
 */
void
sbd_get_uname(void)
{
    struct utsname uname_buf;
    char *p;

    if (uname(&uname_buf) < 0) {
        cl_perror("uname() failed?");
        exit(1);
    }

    local_uname = strdup(uname_buf.nodename);
    if (local_uname == NULL) {
        cl_perror("Could not duplicate node name");
        exit(1);
    }

    /* <ctype.h> functions need a value representable as unsigned char */
    for (p = local_uname; *p != '\0'; p++) {
        *p = (char) tolower((unsigned char) *p);
    }
}
#define FMT_MAX 256

/*
 * Append formatted text to fmt (a buffer of FMT_MAX bytes) at position
 * offset and return the new end position. The result never exceeds
 * FMT_MAX - 1: if the text does not fit it is truncated, and once the
 * buffer is full further calls leave it unchanged.
 */
static int
sbd_format_append(char *fmt, int offset, const char *format, ...)
{
    va_list ap;
    int room = FMT_MAX - offset;
    int len;

    if (room <= 1) {
        return offset;
    }

    va_start(ap, format);
    len = vsnprintf(fmt + offset, (size_t) room, format, ap);
    va_end(ap);

    if (len < 0) {
        fmt[offset] = '\0';
        return offset;
    }
    if (len >= room) {
        /* truncated: vsnprintf() still NUL-terminated at the last byte */
        return FMT_MAX - 1;
    }
    return offset + len;
}

/*
 * Build the libqb log format string for one log target and install it.
 *
 * method: QB_LOG_STDERR, QB_LOG_SYSLOG, or anything else (treated as
 *         logging to a file, which gets a timestamp/pid/node prefix)
 * daemon: name to put into the prefix; for syslog it is omitted when it
 *         is NULL or equal to "sbd"
 */
void
sbd_set_format_string(int method, const char *daemon)
{
    int offset = 0;
    char fmt[FMT_MAX] = "";
    struct utsname res;
    /* keep what glibc printed for a NULL %s argument, minus the UB */
    const char *name = (daemon != NULL) ? daemon : "(null)";

    switch (method) {
    case QB_LOG_STDERR:
        break;
    case QB_LOG_SYSLOG:
        if (daemon && strcmp(daemon, "sbd") != 0) {
            offset = sbd_format_append(fmt, offset, "%10s: ", daemon);
        }
        break;
    default:
        /* When logging to a file */
        if (uname(&res) == 0) {
            offset = sbd_format_append(fmt, offset, "%%t [%d] %s %10s: ",
                                       getpid(), res.nodename, name);
        } else {
            offset = sbd_format_append(fmt, offset, "%%t [%d] %10s: ",
                                       getpid(), name);
        }
    }

    if (debug && method >= QB_LOG_STDERR) {
        offset = sbd_format_append(fmt, offset,
                                   "(%%-12f:%%5l %%g) %%-7p: %%n: ");
    } else {
        offset = sbd_format_append(fmt, offset, "%%g %%-7p: %%n: ");
    }

    if (method == QB_LOG_SYSLOG) {
        offset = sbd_format_append(fmt, offset, "%%b");
    } else {
        offset = sbd_format_append(fmt, offset, "\t%%b");
    }

    if (offset > 0) {
        qb_log_format_set(method, fmt);
    }
}
/*
* Queue signal sig to process pid with an all-zero sigval payload.
* Returns whatever sigqueue() returns.
*/
+int sigqueue_zero(pid_t pid, int sig)
+{
+union sigval signal_value;
+
+ memset(&signal_value, 0, sizeof(signal_value));
+
+ return sigqueue(pid, sig, signal_value);
+}
+
/*
* Report the current servant_health to the parent process via a queued
* signal: SIG_LIVENESS for pcmk_health_online, SIG_PCMK_UNHEALTHY for the
* unknown/unclean/noquorum states and for any unlisted value, and no signal
* at all for the pending/shutdown/transient states.
*
* If getppid() returns 1 the parent is considered dead and
* do_timeout_action() is invoked.
*/
void
notify_parent(void)
{
pid_t ppid;
- union sigval signal_value;
- memset(&signal_value, 0, sizeof(signal_value));
ppid = getppid();
if (ppid == 1) {
/* Our parent died unexpectedly. Triggering
* self-fence. */
cl_log(LOG_WARNING, "Our parent is dead.");
do_timeout_action();
}
switch (servant_health) {
case pcmk_health_pending:
case pcmk_health_shutdown:
case pcmk_health_transient:
DBGLOG(LOG_DEBUG, "Not notifying parent: state transient (%d)", servant_health);
break;
case pcmk_health_unknown:
case pcmk_health_unclean:
case pcmk_health_noquorum:
DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY (%d)", servant_health);
- sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value);
+ sigqueue_zero(ppid, SIG_PCMK_UNHEALTHY);
break;
case pcmk_health_online:
DBGLOG(LOG_DEBUG, "Notifying parent: healthy");
- sigqueue(ppid, SIG_LIVENESS, signal_value);
+ sigqueue_zero(ppid, SIG_LIVENESS);
break;
default:
DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY %d", servant_health);
- sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value);
+ sigqueue_zero(ppid, SIG_PCMK_UNHEALTHY);
break;
}
}
/*
 * Record a new servant health state and log the transition.
 *
 * state:  new value for servant_health
 * level:  log level used for the message
 * format: printf-style format (plus arguments) of the message
 *
 * Nothing happens when state equals the current servant_health; the
 * message is only logged when formatting yields a non-empty string.
 */
void
set_servant_health(enum pcmk_health state, int level, char const *format, ...)
{
    va_list args;
    char *msg = NULL;
    int msg_len;

    if (servant_health == state) {
        return;
    }
    servant_health = state;

    va_start(args, format);
    msg_len = vasprintf(&msg, format, args);
    va_end(args);

    if (msg_len > 0) {
        cl_log(level, "%s", msg);
    }
    free(msg);
}
/* True when servant has a device name that starts with '/'. */
bool
sbd_is_disk(struct servants_list_item *servant)
{
    if (servant == NULL || servant->devname == NULL) {
        return false;
    }
    return servant->devname[0] == '/';
}
/* True when servant has the device name "cluster". */
bool
sbd_is_cluster(struct servants_list_item *servant)
{
    if (servant == NULL || servant->devname == NULL) {
        return false;
    }
    return strcmp("cluster", servant->devname) == 0;
}
/* True when servant has the device name "pcmk". */
bool
sbd_is_pcmk(struct servants_list_item *servant)
{
    if (servant == NULL || servant->devname == NULL) {
        return false;
    }
    return strcmp("pcmk", servant->devname) == 0;
}
+
/*
* seconds_diff_time_t: return a - b in seconds as an int.
* Differences that are not strictly between -MAX_LEGITIMATE_AGE and
* +MAX_LEGITIMATE_AGE are logged and reported as MAX_LEGITIMATE_AGE.
*/
+#define MAX_LEGITIMATE_AGE 3600 /* 1h should be plenty */
+
+int
+seconds_diff_time_t(time_t a, time_t b)
+{
+ long long diff;
+
+ diff = a - b;
+
+ if ((diff > -MAX_LEGITIMATE_AGE) && (diff < MAX_LEGITIMATE_AGE)) {
+ return (int) diff;
+ }
+
+ DBGLOG(LOG_WARNING, "Detected unreasonable age (%lld)", diff);
+ return MAX_LEGITIMATE_AGE; /* something is fishy - provoke timeout */
+}
+
/*
* seconds_diff_timespec: difference a - b of two timespecs in whole seconds.
* Both values are converted to struct timeval, subtracted with timersub(),
* and the tv_sec part of the result is passed through
* seconds_diff_time_t(), which bounds the value that is returned.
*/
+int
+seconds_diff_timespec(struct timespec *a, struct timespec *b)
+{
+ struct timeval diff;
+ struct timeval a_tv;
+ struct timeval b_tv;
+
+ TIMESPEC_TO_TIMEVAL(&a_tv, a);
+ TIMESPEC_TO_TIMEVAL(&b_tv, b);
+
+ timersub(&a_tv, &b_tv, &diff);
+
+ return seconds_diff_time_t(diff.tv_sec, 0);
+}
diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c
index d917cd1..56fd8a8 100644
--- a/src/sbd-inquisitor.c
+++ b/src/sbd-inquisitor.c
@@ -1,1408 +1,1405 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <crm/common/util.h>
#include "sbd.h"
/* number of bytes written to / expected in the pid lock file */
#define LOCKSTRLEN 11
/* head of the singly linked list of servants */
static struct servants_list_item *servants_leader = NULL;
int disk_priority = 1;
int check_pcmk = 1;
int check_cluster = 1;
int has_check_pcmk_env = false;
/* number of recruited servants that are disks */
int disk_count = 0;
/* number of recruited servants of any kind */
int servant_count = 0;
int servant_restart_interval = 5;
int servant_restart_count = 1;
/* mode value handed to every servant entry function on start */
int start_mode = 0;
/* path of the pid lock file; NULL means no pid file is used */
char* pidfile = NULL;
/* flush and action character used by do_timeout_action() */
bool do_flush = true;
char timeout_sysrq_char = 'b';
/* consulted by sbd_make_realtime() before switching to SCHED_RR */
bool move_to_root_cgroup = true;
bool enforce_moving_to_root_cgroup = false;
bool sync_resource_startup = false;
int parse_device_line(const char *line);
/*
 * Parse a decimal integer option value.
 *
 * value: string to parse; may be NULL
 *
 * Returns the parsed number, or -1 when value is NULL, contains no digits
 * (e.g. the empty string), has trailing characters after the number, or is
 * not strictly between INT_MIN and INT_MAX.
 */
static int
sanitize_numeric_option_value(const char *value)
{
    char *end = NULL;
    long int result;

    if (value == NULL) {
        return -1;
    }

    errno = 0;
    result = strtol(value, &end, 10);

    /* end == value means strtol() did not consume a single digit */
    if (errno != 0 || end == value || *end != '\0') {
        return -1;
    }
    if (result <= INT_MIN || result >= INT_MAX) {
        return -1;
    }

    return (int) result;
}
/*
 * Strip leading whitespace from an option value.
 *
 * value: string to inspect; may be NULL
 *
 * Returns a pointer into value at its first non-whitespace character, or
 * NULL when value is NULL, empty, or consists of whitespace only.
 */
static const char *
sanitize_option_value(const char *value)
{
    if (value == NULL) {
        return NULL;
    }

    /* <ctype.h> functions need a value representable as unsigned char */
    while (isspace((unsigned char) *value)) {
        value++;
    }

    return (*value != '\0') ? value : NULL;
}
/*
 * Fetch environment variable option and run it through
 * sanitize_option_value(); the result of that call is returned as is.
 */
static const char *
get_env_option(const char *option)
{
    return sanitize_option_value(getenv(option));
}
/*
 * Add a servant for devname to the end of the servant list.
 *
 * devname: device path, "pcmk" or "cluster"
 * pid:     pid to record for the new servant
 *
 * Returns 0 when the servant was added or already existed, -1 when devname
 * is neither a disk nor the pcmk/cluster servant. Exits with status 1 when
 * memory cannot be allocated.
 */
static int
recruit_servant(const char *devname, pid_t pid)
{
    struct servants_list_item **tail;
    struct servants_list_item *newbie;

    if (lookup_servant_by_dev(devname)) {
        cl_log(LOG_DEBUG, "Servant %s already exists", devname);
        return 0;
    }

    newbie = calloc(1, sizeof(*newbie));
    if (newbie != NULL) {
        newbie->devname = strdup(devname);
        newbie->pid = pid;
        newbie->first_start = 1;
    }
    if (newbie == NULL || newbie->devname == NULL) {
        fprintf(stderr, "heap allocation failed in recruit_servant.\n");
        exit(1);
    }

    /* some sanity-check on our newbie */
    if (sbd_is_disk(newbie)) {
        cl_log(LOG_INFO, "Monitoring %s", devname);
        disk_count++;
    } else if (sbd_is_pcmk(newbie) || sbd_is_cluster(newbie)) {
        /* alive just after pcmk and cluster servants have shown up */
        newbie->outdated = 1;
    } else {
        /* toss our newbie */
        cl_log(LOG_ERR, "Refusing to recruit unrecognized servant %s", devname);
        free((void *) newbie->devname);
        free(newbie);
        return -1;
    }

    /* walk to the link that terminates the list and hook the newbie in */
    for (tail = &servants_leader; *tail != NULL; tail = &(*tail)->next) {
        /* nothing */
    }
    *tail = newbie;

    servant_count++;
    return 0;
}
/*
 * Fork a servant process.
 *
 * devname:   name of the servant, also used as log prefix in the child
 * functionp: servant entry point, called as functionp(devname, mode, argp)
 * mode:      passed through to functionp
 * argp:      passed through to functionp
 *
 * Parent: returns the pid of the child; exits with status 1 if fork() fails.
 * Child:  never returns; exits with 1 when functionp returns -1, else 0.
 */
int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp)
{
    pid_t child = fork();

    if (child == -1) {
        cl_log(LOG_ERR,"Failed to fork servant");
        exit(1);
    }
    if (child != 0) {
        /* parent */
        return child;
    }

    /* child */
    maximize_priority();
    sbd_set_format_string(QB_LOG_SYSLOG, devname);
    exit(((*functionp)(devname, mode, argp) == -1) ? 1 : 0);
}
/*
 * Find the first servant whose device name equals devname, ignoring case.
 * Returns NULL when there is none.
 */
struct servants_list_item *lookup_servant_by_dev(const char *devname)
{
    struct servants_list_item *cur = servants_leader;

    while (cur != NULL) {
        if (strcasecmp(cur->devname, devname) == 0) {
            return cur;
        }
        cur = cur->next;
    }
    return NULL;
}
/*
 * Find the first servant whose recorded pid equals pid.
 * Returns NULL when there is none.
 */
struct servants_list_item *lookup_servant_by_pid(pid_t pid)
{
    struct servants_list_item *cur = servants_leader;

    while (cur != NULL) {
        if (cur->pid == pid) {
            return cur;
        }
        cur = cur->next;
    }
    return NULL;
}
/*
* Probe every servant that has a non-zero pid with signal 0.
* Returns 1 when each probe fails with ESRCH (or no servant has a pid),
* and 0 as soon as one probe gives any other result.
*/
int check_all_dead(void)
{
struct servants_list_item *s;
int r = 0;
- union sigval svalue;
for (s = servants_leader; s; s = s->next) {
if (s->pid != 0) {
- r = sigqueue(s->pid, 0, svalue);
+ r = sigqueue_zero(s->pid, 0);
if (r == -1 && errno == ESRCH)
continue;
return 0;
}
}
return 1;
}
/*
* Start the process for servant s unless one is still around.
*
* A non-zero s->pid is probed with signal 0; unless that probe fails with
* ESRCH the function returns without doing anything. Otherwise the restart
* counter is incremented, the servant matching the device name is forked
* via assign_servant(), and the monotonic start time is stored in
* s->t_started.
*/
void servant_start(struct servants_list_item *s)
{
int r = 0;
- union sigval svalue;
if (s->pid != 0) {
- r = sigqueue(s->pid, 0, svalue);
+ r = sigqueue_zero(s->pid, 0);
if ((r != -1 || errno != ESRCH))
return;
}
s->restarts++;
if (sbd_is_disk(s)) {
#if SUPPORT_SHARED_DISK
DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname);
s->pid = assign_servant(s->devname, servant_md, start_mode, s);
#else
/* built without disk support: bail out before t_started is updated */
cl_log(LOG_ERR, "Shared disk functionality not supported");
return;
#endif
} else if(sbd_is_pcmk(s)) {
DBGLOG(LOG_INFO, "Starting Pacemaker servant");
s->pid = assign_servant(s->devname, servant_pcmk, start_mode, NULL);
} else if(sbd_is_cluster(s)) {
DBGLOG(LOG_INFO, "Starting Cluster servant");
s->pid = assign_servant(s->devname, servant_cluster, start_mode, NULL);
} else {
cl_log(LOG_ERR, "Unrecognized servant: %s", s->devname);
}
clock_gettime(CLOCK_MONOTONIC, &s->t_started);
return;
}
/*
 * Reset the restart counter of every servant in the list and (re)start it
 * via servant_start().
 */
void servants_start(void)
{
    struct servants_list_item *cur = servants_leader;

    while (cur != NULL) {
        cur->restarts = 0;
        servant_start(cur);
        cur = cur->next;
    }
}
/* Queue SIGKILL to every servant in the list that has a non-zero pid. */
void servants_kill(void)
{
struct servants_list_item *s;
- union sigval svalue;
for (s = servants_leader; s; s = s->next) {
- if (s->pid != 0)
- sigqueue(s->pid, SIGKILL, svalue);
+ if (s->pid != 0) {
+ sigqueue_zero(s->pid, SIGKILL);
+ }
}
}
/*
 * Mark the servant that ran as pid as terminated by clearing its pid.
 * A pid that belongs to no known servant is only mentioned in the debug log.
 */
static inline void cleanup_servant_by_pid(pid_t pid)
{
    struct servants_list_item *gone = lookup_servant_by_pid(pid);

    if (gone == NULL) {
        /* This most likely is a stray signal from somewhere, or
         * a SIGCHLD for a process that has previously
         * explicitly disconnected. */
        DBGLOG(LOG_INFO, "cleanup_servant: Nothing known about pid %i",
               pid);
        return;
    }

    cl_log(LOG_WARNING, "Servant for %s (pid: %i) has terminated",
           gone->devname, gone->pid);
    gone->pid = 0;
}
/*
* Initialise the watchdog (when watchdog_use is set) and then send
* SIG_LIVENESS to the parent process, provided the parent pid is > 1.
*
* Returns -1 when watchdog_init() fails (no signal is sent then), else 0.
*/
int inquisitor_decouple(void)
{
pid_t ppid = getppid();
- union sigval signal_value;
/* During start-up, we only arm the watchdog once we've got
* quorum at least once. */
if (watchdog_use) {
if (watchdog_init() < 0) {
return -1;
}
}
if (ppid > 1) {
- sigqueue(ppid, SIG_LIVENESS, signal_value);
+ sigqueue_zero(ppid, SIG_LIVENESS);
}
return 0;
}
/*
 * Decide whether the process holding the pid lock is a live instance of
 * this program.
 *
 * pid: pid read from the lock file
 *
 * Returns 0 when no such process exists. Without HAVE_PROC_PID any existing
 * process counts as running (1). With HAVE_PROC_PID, 1 is returned only
 * when /proc/<pid>/exe and /proc/<own pid>/exe point at the same path, and
 * 0 when they differ or either link cannot be read.
 */
static int sbd_lock_running(long pid)
{
    /* check if pid is running */
    if (kill(pid, 0) < 0 && errno == ESRCH) {
        return 0;
    }

#ifndef HAVE_PROC_PID
    return 1;
#else
    {
        char proc_path[PATH_MAX], exe_path[PATH_MAX], myexe_path[PATH_MAX];
        ssize_t len;

        /* check to make sure pid hasn't been reused by another process */
        snprintf(proc_path, sizeof(proc_path), "/proc/%ld/exe", pid);
        len = readlink(proc_path, exe_path, sizeof(exe_path) - 1);
        if (len < 0) {
            cl_perror("Could not read from %s", proc_path);
            return 0;
        }
        exe_path[len] = '\0';

        snprintf(proc_path, sizeof(proc_path), "/proc/%ld/exe",
                 (long) getpid());
        len = readlink(proc_path, myexe_path, sizeof(myexe_path) - 1);
        if (len < 0) {
            cl_perror("Could not read from %s", proc_path);
            return 0;
        }
        myexe_path[len] = '\0';

        return (strcmp(exe_path, myexe_path) == 0) ? 1 : 0;
    }
#endif
}
/*
 * Take the pid lock file `filename` for this process.
 *
 * An existing lock file is honoured only if it names a different pid for
 * which sbd_lock_running() reports a live instance; otherwise it is
 * removed as stale. The lock is then created by writing our pid to a
 * temporary file "<filename>.<pid>" and hard-linking it to filename.
 *
 * Returns  0 on success,
 *         -1 when filename is NULL (errno = EFAULT), the lock is held by a
 *            running process, or the lock file appeared while linking,
 *         -2 when the link count shows the link did not take effect,
 *         -3 on any other failure.
 */
static int
sbd_lock_pidfile(const char *filename)
{
    char lf_name[256], tf_name[256], buf[LOCKSTRLEN+1];
    int fd;
    long pid, mypid;
    int rc;
    struct stat sbuf;

    if (filename == NULL) {
        errno = EFAULT;
        return -1;
    }

    mypid = (long) getpid();
    snprintf(lf_name, sizeof(lf_name), "%s", filename);
    snprintf(tf_name, sizeof(tf_name), "%s.%lu",
             filename, (unsigned long) mypid);

    if ((fd = open(lf_name, O_RDONLY)) >= 0) {
        ssize_t nread;

        if (fstat(fd, &sbuf) >= 0 && sbuf.st_size < LOCKSTRLEN) {
            sleep(1); /* if someone was about to create one,
                       * give'm a sec to do so
                       * Though if they follow our protocol,
                       * this won't happen. They should really
                       * put the pid in, then link, not the
                       * other way around.
                       */
        }

        /* leave room for the terminator sscanf() relies on */
        nread = read(fd, buf, sizeof(buf) - 1);
        if (nread < 1) {
            /* lockfile empty -> rm it and go on */
        } else {
            buf[nread] = '\0';
            if (sscanf(buf, "%ld", &pid) < 1) {
                /* lockfile screwed up -> rm it and go on */
            } else if (pid > 1 && (getpid() != pid)
                       && sbd_lock_running(pid)) {
                /* is locked by existing process
                 * -> give up */
                close(fd);
                return -1;
            } else {
                /* stale lockfile -> rm it and go on */
            }
        }
        unlink(lf_name);
        close(fd);
    }

    if ((fd = open(tf_name, O_CREAT | O_WRONLY | O_EXCL, 0644)) < 0) {
        /* Hmmh, why did we fail? Anyway, nothing we can do about it */
        return -3;
    }

    /* Slight overkill with the %*d format ;-) */
    snprintf(buf, sizeof(buf), "%*lu\n", LOCKSTRLEN-1, (unsigned long) mypid);

    if (write(fd, buf, LOCKSTRLEN) != LOCKSTRLEN) {
        /* Again, nothing we can do about this */
        rc = -3;
        close(fd);
        goto out;
    }
    close(fd);

    /* link() reports failure as -1 plus errno, never as an errno value */
    if (link(tf_name, lf_name) == 0) {
        if (stat(tf_name, &sbuf) < 0) {
            /* something weird happened */
            rc = -3;
        } else if (sbuf.st_nlink < 2) {
            /* somehow, it didn't get through - NFS trouble? */
            rc = -2;
        } else {
            rc = 0;
        }
    } else if (errno == EEXIST) {
        rc = -1;
    } else {
        rc = -3;
    }

out:
    unlink(tf_name);
    return rc;
}
/*
 * Release the pid lock by removing the lock file.
 * No check is made that the lock is (still) ours: if someone else
 * locked our line, that is their fault.
 *
 * Returns 0 on success,
 *        <0 if some failure occurred (errno = EFAULT for a NULL filename,
 *           otherwise as set by unlink()).
 */
static int
sbd_unlock_pidfile(const char *filename)
{
    char lock_path[256];

    if (filename == NULL) {
        errno = EFAULT;
        return -1;
    }

    snprintf(lock_path, sizeof(lock_path), "%s", filename);
    return unlink(lock_path);
}
/*
 * Report whether the cluster/pcmk servants are up to date.
 *
 * all: true  -> 1 only if none of the cluster/pcmk servants is outdated
 *      false -> 1 as soon as one cluster/pcmk servant is not outdated
 *
 * Returns 0 right away when every servant is a disk servant
 * (servant_count == disk_count).
 */
int cluster_alive(bool all)
{
    struct servants_list_item *cur;
    int alive = 1;

    if (servant_count == disk_count) {
        return 0;
    }

    for (cur = servants_leader; cur != NULL; cur = cur->next) {
        if (!sbd_is_cluster(cur) && !sbd_is_pcmk(cur)) {
            continue;
        }
        if (cur->outdated) {
            alive = 0;
        } else if (all == false) {
            return 1;
        }
    }

    return alive;
}
/*
 * Decide whether enough servants are healthy.
 *
 * With more than two disks, good_servants must exceed half of disk_count
 * (integer division); with two or fewer, a single good servant suffices.
 * Returns 1 when that holds, 0 otherwise.
 */
int quorum_read(int good_servants)
{
    int threshold = (disk_count > 2) ? (disk_count / 2) : 0;

    return good_servants > threshold;
}
/*
 * Main loop of the inquisitor process.
 *
 * Blocks the signals it handles, starts the servants and then, once per
 * pass, waits for a signal with sigtimedwait(), updates the servants'
 * state, decides whether the watchdog may be tickled and restarts
 * servants where permitted.
 *
 * Does not return to the caller: the outer loop is only left via exit().
 */
void inquisitor_child(void)
{
int sig, pid;
sigset_t procmask;
siginfo_t sinfo;
int status;
struct timespec timeout;
int exiting = 0;
int decoupled = 0;
int cluster_appeared = 0;
int pcmk_override = 0;
- time_t latency;
+ int latency;
struct timespec t_last_tickle, t_now;
struct servants_list_item* s;
if (debug_mode) {
cl_log(LOG_ERR, "DEBUG MODE %d IS ACTIVE - DO NOT RUN IN PRODUCTION!", debug_mode);
}
set_proc_title("sbd: inquisitor");
if (pidfile) {
if (sbd_lock_pidfile(pidfile) < 0) {
exit(1);
}
}
/* Block every signal handled below so that it is only consumed
 * synchronously by sigtimedwait() in the loop. */
sigemptyset(&procmask);
sigaddset(&procmask, SIGCHLD);
sigaddset(&procmask, SIGTERM);
sigaddset(&procmask, SIG_LIVENESS);
sigaddset(&procmask, SIG_EXITREQ);
sigaddset(&procmask, SIG_TEST);
sigaddset(&procmask, SIG_PCMK_UNHEALTHY);
sigaddset(&procmask, SIG_RESTART);
sigaddset(&procmask, SIGUSR1);
sigaddset(&procmask, SIGUSR2);
sigprocmask(SIG_BLOCK, &procmask, NULL);
servants_start();
/* Each pass waits at most timeout_loop seconds for a signal. */
timeout.tv_sec = timeout_loop;
timeout.tv_nsec = 0;
clock_gettime(CLOCK_MONOTONIC, &t_last_tickle);
while (1) {
bool tickle = 0;
bool can_detach = 0;
int good_servants = 0;
sig = sigtimedwait(&procmask, &sinfo, &timeout);
clock_gettime(CLOCK_MONOTONIC, &t_now);
if (sig == SIG_EXITREQ || sig == SIGTERM) {
/* Shutdown requested: stop the servants and close the watchdog.
 * The loop keeps running until check_all_dead() succeeds below. */
servants_kill();
watchdog_close(true);
exiting = 1;
} else if (sig == SIGCHLD) {
/* Reap every terminated child without blocking. */
while ((pid = waitpid(-1, &status, WNOHANG))) {
if (pid == -1 && errno == ECHILD) {
break;
} else {
s = lookup_servant_by_pid(pid);
if (sbd_is_disk(s)) {
/* A disk servant's exit status may carry a request for the
 * inquisitor to act on. */
if (WIFEXITED(status)) {
switch(WEXITSTATUS(status)) {
case EXIT_MD_SERVANT_IO_FAIL:
DBGLOG(LOG_INFO, "Servant for %s requests to be disowned",
s->devname);
break;
case EXIT_MD_SERVANT_REQUEST_RESET:
cl_log(LOG_WARNING, "%s requested a reset", s->devname);
do_reset();
break;
case EXIT_MD_SERVANT_REQUEST_SHUTOFF:
cl_log(LOG_WARNING, "%s requested a shutoff", s->devname);
do_off();
break;
case EXIT_MD_SERVANT_REQUEST_CRASHDUMP:
cl_log(LOG_WARNING, "%s requested a crashdump", s->devname);
do_crashdump();
break;
default:
break;
}
}
} else if (sbd_is_pcmk(s)) {
if (WIFEXITED(status)) {
switch(WEXITSTATUS(status)) {
case EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN:
DBGLOG(LOG_INFO, "PCMK-Servant has exited gracefully");
/* revert to state prior to pacemaker-detection */
s->restarts = 0;
s->restart_blocked = 0;
cluster_appeared = 0;
s->outdated = 1;
s->t_last.tv_sec = 0;
break;
default:
break;
}
}
}
cleanup_servant_by_pid(pid);
}
}
} else if (sig == SIG_PCMK_UNHEALTHY) {
s = lookup_servant_by_pid(sinfo.si_pid);
if (sbd_is_cluster(s) || sbd_is_pcmk(s)) {
if (s->outdated == 0) {
cl_log(LOG_WARNING, "%s health check: UNHEALTHY", s->devname);
}
/* Non-zero, so the age check below does not skip this servant;
 * presumably old enough that it is then classified as outdated. */
s->t_last.tv_sec = 1;
} else {
cl_log(LOG_WARNING, "Ignoring SIG_PCMK_UNHEALTHY from unknown source");
}
} else if (sig == SIG_LIVENESS) {
/* A servant reported in: record the time of its last sign of life. */
s = lookup_servant_by_pid(sinfo.si_pid);
if (s) {
s->first_start = 0;
clock_gettime(CLOCK_MONOTONIC, &s->t_last);
}
} else if (sig == SIG_TEST) {
/* Nothing to do for SIG_TEST. */
} else if (sig == SIGUSR1) {
if (exiting)
continue;
servants_start();
}
if (exiting) {
/* While shutting down, only wait for all servants to be gone,
 * then release the pidfile and exit. */
if (check_all_dead()) {
if (pidfile) {
sbd_unlock_pidfile(pidfile);
}
exit(0);
} else
continue;
}
/* Classify each servant as healthy or outdated by the age of its last
 * liveness report. Servants with t_last.tv_sec == 0 are skipped, and
 * only disk servants count towards good_servants. */
good_servants = 0;
for (s = servants_leader; s; s = s->next) {
- int age = t_now.tv_sec - s->t_last.tv_sec;
+ int age = seconds_diff_timespec(&t_now, &(s->t_last));
if (!s->t_last.tv_sec)
continue;
- if (age < (int)(timeout_io+timeout_loop)) {
+ if (age < timeout_io+timeout_loop) {
if (sbd_is_disk(s)) {
good_servants++;
}
if (s->outdated) {
cl_log(LOG_NOTICE, "Servant %s is healthy (age: %d)", s->devname, age);
}
s->outdated = 0;
} else if (!s->outdated) {
if (!s->restart_blocked) {
cl_log(LOG_WARNING, "Servant %s is outdated (age: %d)", s->devname, age);
}
s->outdated = 1;
}
}
/* Decide whether to tickle the watchdog (tickle) and whether the
 * process may decouple via inquisitor_decouple() (can_detach). */
if(disk_count == 0) {
/* NO disks, everything is up to the cluster */
if(cluster_alive(true)) {
/* We LIVE! */
if(cluster_appeared == false) {
cl_log(LOG_INFO, "Active cluster detected");
}
tickle = 1;
can_detach = 1;
cluster_appeared = 1;
} else if(cluster_alive(false)) {
if(!decoupled) {
/* On the way up, detach and arm the watchdog */
cl_log(LOG_INFO, "Partial cluster detected, detaching");
}
can_detach = 1;
tickle = !cluster_appeared;
} else if(!decoupled) {
/* Stay alive until the cluster comes up */
tickle = !cluster_appeared;
}
} else if(disk_priority == 1 || servant_count == disk_count) {
if (quorum_read(good_servants)) {
/* There are disks and we're connected to the majority of them */
tickle = 1;
can_detach = 1;
pcmk_override = 0;
} else if (servant_count > disk_count && cluster_alive(true)) {
tickle = 1;
if(!pcmk_override) {
cl_log(LOG_WARNING, "Majority of devices lost - surviving on pacemaker");
pcmk_override = 1; /* Only log this message once */
}
}
} else if(cluster_alive(true) && quorum_read(good_servants)) {
/* Both disk and cluster servants are healthy */
tickle = 1;
can_detach = 1;
cluster_appeared = 1;
} else if(quorum_read(good_servants)) {
/* The cluster takes priority but only once
* connected for the first time.
*
* Until then, we tickle based on disk quorum.
*/
can_detach = 1;
tickle = !cluster_appeared;
}
/* cl_log(LOG_DEBUG, "Tickle: q=%d, g=%d, p=%d, s=%d", */
/* quorum_read(good_servants), good_servants, tickle, disk_count); */
if(tickle) {
watchdog_tickle();
clock_gettime(CLOCK_MONOTONIC, &t_last_tickle);
}
if (!decoupled && can_detach) {
/* We only do this at the point either the disk or
* cluster servants become healthy
*/
cl_log(LOG_DEBUG, "Decoupling");
if (inquisitor_decouple() < 0) {
servants_kill();
exiting = 1;
continue;
} else {
decoupled = 1;
}
}
/* Note that this can actually be negative, since we set
* last_tickle after we set now. */
- latency = t_now.tv_sec - t_last_tickle.tv_sec;
- if (timeout_watchdog && (latency > (int)timeout_watchdog)) {
+ latency = seconds_diff_timespec(&t_now, &t_last_tickle);
+ if (timeout_watchdog && (latency > timeout_watchdog)) {
if (!decoupled) {
/* We're still being watched by our
* parent. We don't fence, but exit. */
cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up.");
servants_kill();
exiting = 1;
continue;
}
if (debug_mode < 2) {
/* At level 2 or above, we do nothing, but expect
* things to eventually return to
* normal. */
do_timeout_action();
} else {
cl_log(LOG_ERR, "SBD: DEBUG MODE: Would have fenced due to timeout!");
}
}
- if (timeout_watchdog_warn && (latency > (int)timeout_watchdog_warn)) {
+ if (timeout_watchdog_warn && (latency > timeout_watchdog_warn)) {
cl_log(LOG_WARNING,
"Latency: No liveness for %ds exceeds watchdog warning timeout of %ds (healthy servants: %d)",
- (int)latency, (int)timeout_watchdog_warn, good_servants);
+ latency, timeout_watchdog_warn, good_servants);
if (debug_mode && watchdog_use) {
/* In debug mode, trigger a reset before the watchdog can panic the machine */
do_timeout_action();
}
}
/* Restart bookkeeping: the restart counter and block are cleared once
 * more than servant_restart_interval seconds have passed since
 * t_started; restarts are blocked after servant_restart_count attempts;
 * servant_start() is called for every servant that is not blocked. */
for (s = servants_leader; s; s = s->next) {
- int age = t_now.tv_sec - s->t_started.tv_sec;
+ int age = seconds_diff_timespec(&t_now, &(s->t_started));
if (age > servant_restart_interval) {
s->restarts = 0;
s->restart_blocked = 0;
}
if (servant_restart_count
&& (s->restarts >= servant_restart_count)
&& !s->restart_blocked) {
if (servant_restart_count > 1) {
cl_log(LOG_WARNING, "Max retry count (%d) reached: not restarting servant for %s",
(int)servant_restart_count, s->devname);
}
s->restart_blocked = 1;
}
if (!s->restart_blocked) {
servant_start(s);
}
}
}
/* not reached */
exit(0);
}
/*
 * Start the inquisitor as a daemon and wait for its verdict.
 *
 * The child created by make_daemon() runs inquisitor_child(). The parent
 * blocks until it either receives SIG_LIVENESS (the child completed its
 * first successful round) or SIGCHLD (the child exited).
 *
 * Returns 0 when the inquisitor started up properly, -1 otherwise.
 */
int inquisitor(void)
{
    sigset_t wait_set;
    siginfo_t info;
    int child_status;
    int daemon_pid;
    int reaped;
    int signo;

    /* Where's the best place for sysrq init ?*/
    sysrq_init();

    sigemptyset(&wait_set);
    sigaddset(&wait_set, SIGCHLD);
    sigaddset(&wait_set, SIG_LIVENESS);
    sigprocmask(SIG_BLOCK, &wait_set, NULL);

    daemon_pid = make_daemon();
    if (daemon_pid == 0) {
        inquisitor_child();
    }

    /* Parent side: wait for a happy (or unhappy) signal from the child. */
    for (;;) {
        signo = sigwaitinfo(&wait_set, &info);

        if (signo == SIG_LIVENESS) {
            /* Inquisitor started up properly. */
            return 0;
        }

        if (signo != SIGCHLD) {
            fprintf(stderr, "Nobody expected the spanish inquisition!\n");
            continue;
        }

        reaped = waitpid(-1, &child_status, WNOHANG);
        if (reaped == 0) {
            /* Nothing to collect yet; keep waiting. */
            continue;
        }
        if (reaped == -1 && errno == ECHILD) {
            continue;
        }
        /* We got here because the inquisitor did not succeed. */
        return -1;
    }

    /* not reached */
    return -1;
}
/*
 * Parse a ';'-separated list of device names and recruit a servant for
 * each entry. Whitespace before an entry and directly before its
 * terminating ';' (or the end of the string) is dropped.
 *
 * line: the device list; NULL yields 0.
 *
 * Returns the number of entries recruited, or -1 as soon as
 * recruit_servant() rejects an entry. Exits the process if memory for
 * an entry cannot be allocated.
 */
int
parse_device_line(const char *line)
{
    size_t lpc = 0;
    size_t last = 0;
    size_t max = 0;
    int found = 0;
    bool skip_space = true;
    int space_run = 0;

    if (!line) {
        return 0;
    }

    max = strlen(line);

    cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", (int) max, line);
    /* lpc deliberately runs up to and including the terminating NUL so
     * that the last entry is flushed like one followed by ';'. */
    for (lpc = 0; lpc <= max; lpc++) {
        /* <ctype.h> functions require a value representable as
         * unsigned char; a plain (possibly negative) char is undefined. */
        if (isspace((unsigned char) line[lpc])) {
            if (skip_space) {
                /* Leading whitespace: move the start of the entry. */
                last = lpc + 1;
            } else {
                /* Possibly trailing whitespace: count it. */
                space_run++;
            }
            continue;
        }
        skip_space = false;
        if (line[lpc] == ';' || line[lpc] == 0) {
            int rc = 0;
            /* The entry spans line[last..lpc-1]; +1 for the NUL. */
            char *entry = calloc(1, 1 + lpc - last);

            if (entry) {
                rc = sscanf(line + last, "%[^;]", entry);
            } else {
                fprintf(stderr, "Heap allocation failed parsing device-line.\n");
                exit(1);
            }

            if (rc != 1) {
                cl_log(LOG_WARNING, "Could not parse: '%s'", line + last);
            } else {
                /* Cut off the whitespace run directly before the separator. */
                entry[strlen(entry)-space_run] = '\0';
                cl_log(LOG_DEBUG, "Adding '%s'", entry);
                if (recruit_servant(entry, 0) != 0) {
                    free(entry);
                    // sbd should refuse to start if any of the configured device names is invalid.
                    return -1;
                }
                found++;
            }

            free(entry);
            skip_space = true;
            last = lpc + 1;
        }
        /* Any non-space character ends the current whitespace run. */
        space_run = 0;
    }
    return found;
}
-#define SBD_SOURCE_FILES "sbd-cluster.c,sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,setproctitle.c"
+#define SBD_SOURCE_FILES "sbd-cluster.c,sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,sbd-watchdog.c,setproctitle.c"
/*
 * Add a file-based log filter with the given priority to both the
 * syslog and the stderr log targets.
 *
 * files:    file list to filter on; NULL selects SBD_SOURCE_FILES.
 * priority: priority passed on to qb_log_filter_ctl().
 */
static void
sbd_log_filter_ctl(const char *files, uint8_t priority)
{
    const char *selection = (files != NULL) ? files : SBD_SOURCE_FILES;

    qb_log_filter_ctl(QB_LOG_SYSLOG, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, selection, priority);
    qb_log_filter_ctl(QB_LOG_STDERR, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, selection, priority);
}
/*
 * Turn the number of times a toggle option was given into its state:
 * the remainder of arg_count divided by two (1 for an odd positive
 * count, 0 for an even one).
 */
int
arg_enabled(int arg_count)
{
    int pairs = arg_count / 2;

    return arg_count - pairs * 2;
}
/*
 * Program entry point.
 *
 * Reads defaults from the SBD_* environment options, lets command-line
 * options override them, validates the resulting configuration and then
 * dispatches to the sub-command named by argv[optind].
 *
 * Returns 0 on success and 1 when exit_status ended up negative
 * (-2 additionally prints the usage text).
 */
int main(int argc, char **argv, char **envp)
{
int exit_status = 0;
int c;
int W_count = 0;
int c_count = 0;
int P_count = 0;
int qb_facility;
const char *value = NULL;
bool delay_start = false;
long delay = 0;
char *timeout_action = NULL;
/* cmdname is the basename of argv[0]. */
if ((cmdname = strrchr(argv[0], '/')) == NULL) {
cmdname = argv[0];
} else {
++cmdname;
}
watchdogdev = strdup("/dev/watchdog");
watchdogdev_is_default = true;
/* Logging: syslog enabled, stderr disabled. */
qb_facility = qb_log_facility2int("daemon");
qb_log_init(cmdname, qb_facility, LOG_WARNING);
sbd_set_format_string(QB_LOG_SYSLOG, "sbd");
qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE);
qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE);
sbd_log_filter_ctl(NULL, LOG_NOTICE);
sbd_get_uname();
/* Defaults from the environment; the command-line options parsed
 * further down may override them. */
value = get_env_option("SBD_PACEMAKER");
if(value) {
check_pcmk = crm_is_true(value);
check_cluster = crm_is_true(value);
has_check_pcmk_env = true;
}
cl_log(LOG_INFO, "SBD_PACEMAKER set to: %d (%s)", (int)check_pcmk, value?value:"default");
value = get_env_option("SBD_STARTMODE");
if(value == NULL) {
} else if(strcmp(value, "clean") == 0) {
start_mode = 1;
} else if(strcmp(value, "always") == 0) {
start_mode = 0;
}
cl_log(LOG_INFO, "Start mode set to: %d (%s)", (int)start_mode, value?value:"default");
value = get_env_option("SBD_WATCHDOG_DEV");
if(value) {
free(watchdogdev);
watchdogdev = strdup(value);
watchdogdev_is_default = false;
}
/* SBD_WATCHDOG has been dropped from sbd.sysconfig example.
* This is for backward compatibility. */
value = get_env_option("SBD_WATCHDOG");
if(value) {
watchdog_use = crm_is_true(value);
}
value = get_env_option("SBD_WATCHDOG_TIMEOUT");
if(value) {
timeout_watchdog = crm_get_msec(value) / 1000;
}
value = get_env_option("SBD_PIDFILE");
if(value) {
pidfile = strdup(value);
cl_log(LOG_INFO, "pidfile set to %s", pidfile);
}
/* SBD_DELAY_START is either a boolean or a time span: if it is not a
 * true value it is parsed as a duration, and a positive duration
 * enables the delay. */
value = get_env_option("SBD_DELAY_START");
if(value) {
delay_start = crm_is_true(value);
if (!delay_start) {
delay = crm_get_msec(value) / 1000;
if (delay > 0) {
delay_start = true;
}
}
}
value = get_env_option("SBD_TIMEOUT_ACTION");
if(value) {
timeout_action = strdup(value);
}
/* A true value both enables and enforces the move; "auto" enables it
 * without enforcing. */
value = get_env_option("SBD_MOVE_TO_ROOT_CGROUP");
if(value) {
move_to_root_cgroup = crm_is_true(value);
if (move_to_root_cgroup) {
enforce_moving_to_root_cgroup = true;
} else {
if (strcmp(value, "auto") == 0) {
move_to_root_cgroup = true;
}
}
}
while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) {
int sanitized_num_optarg = 0;
/* Call it before checking optarg for NULL to make coverity happy */
const char *sanitized_optarg = sanitize_option_value(optarg);
/* Reject an argument that fails sanitizing, or - for the numeric
 * options listed in the strchr() set - one that does not convert to
 * a non-negative number. */
if (optarg && ((sanitized_optarg == NULL) ||
(strchr("SsC12345tIF", c) &&
(sanitized_num_optarg = sanitize_numeric_option_value(sanitized_optarg)) < 0))) {
fprintf(stderr, "Invalid value \"%s\" for option -%c\n", optarg, c);
exit_status = -2;
goto out;
}
switch (c) {
case 'D':
/* Accepted, but has no effect here. */
break;
case 'Z':
debug_mode++;
cl_log(LOG_INFO, "Debug mode now at level %d", (int)debug_mode);
break;
case 'R':
skip_rt = 1;
cl_log(LOG_INFO, "Realtime mode deactivated.");
break;
case 'S':
start_mode = sanitized_num_optarg;
cl_log(LOG_INFO, "Start mode set to: %d", (int)start_mode);
break;
case 's':
timeout_startup = sanitized_num_optarg;
cl_log(LOG_INFO, "Start timeout set to: %d", (int)timeout_startup);
break;
case 'v':
/* Each -v raises the verbosity by one step. */
debug++;
if(debug == 1) {
sbd_log_filter_ctl(NULL, LOG_INFO);
cl_log(LOG_INFO, "Verbose mode enabled.");
} else if(debug == 2) {
sbd_log_filter_ctl(NULL, LOG_DEBUG);
cl_log(LOG_INFO, "Debug mode enabled.");
} else if(debug == 3) {
/* Go nuts, turn on pacemaker's logging too */
sbd_log_filter_ctl("*", LOG_DEBUG);
cl_log(LOG_INFO, "Debug library mode enabled.");
}
break;
case 'T':
watchdog_set_timeout = 0;
cl_log(LOG_INFO, "Setting watchdog timeout disabled; using defaults.");
break;
case 'W':
/* -W, -c and -P are only counted here; the counts are evaluated
 * with arg_enabled() after option parsing. */
W_count++;
break;
case 'w':
free(watchdogdev);
watchdogdev = strdup(sanitized_optarg);
watchdogdev_is_default = false;
cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev);
break;
case 'd':
#if SUPPORT_SHARED_DISK
if (recruit_servant(sanitized_optarg, 0) != 0) {
fprintf(stderr, "Invalid device: %s\n", optarg);
exit_status = -1;
goto out;
}
#else
fprintf(stderr, "Shared disk functionality not supported\n");
exit_status = -2;
goto out;
#endif
break;
case 'c':
c_count++;
break;
case 'P':
P_count++;
break;
case 'z':
disk_priority = 0;
break;
case 'n':
local_uname = strdup(sanitized_optarg);
cl_log(LOG_INFO, "Overriding local hostname to %s", local_uname);
break;
case 'p':
pidfile = strdup(sanitized_optarg);
cl_log(LOG_INFO, "pidfile set to %s", pidfile);
break;
case 'C':
timeout_watchdog_crashdump = sanitized_num_optarg;
cl_log(LOG_INFO, "Setting crashdump watchdog timeout to %d",
- (int)timeout_watchdog_crashdump);
+ timeout_watchdog_crashdump);
break;
case '1':
timeout_watchdog = sanitized_num_optarg;
break;
case '2':
timeout_allocate = sanitized_num_optarg;
break;
case '3':
timeout_loop = sanitized_num_optarg;
break;
case '4':
timeout_msgwait = sanitized_num_optarg;
break;
case '5':
/* An explicit warning timeout disables its later recalculation. */
timeout_watchdog_warn = sanitized_num_optarg;
do_calculate_timeout_watchdog_warn = false;
cl_log(LOG_INFO, "Setting latency warning to %d",
- (int)timeout_watchdog_warn);
+ timeout_watchdog_warn);
break;
case 't':
servant_restart_interval = sanitized_num_optarg;
cl_log(LOG_INFO, "Setting servant restart interval to %d",
(int)servant_restart_interval);
break;
case 'I':
timeout_io = sanitized_num_optarg;
cl_log(LOG_INFO, "Setting IO timeout to %d",
(int)timeout_io);
break;
case 'F':
servant_restart_count = sanitized_num_optarg;
cl_log(LOG_INFO, "Servant restart count set to %d",
(int)servant_restart_count);
break;
case 'r':
/* Replaces any value taken from SBD_TIMEOUT_ACTION. */
if (timeout_action) {
free(timeout_action);
}
timeout_action = strdup(sanitized_optarg);
break;
case 'h':
/* exit_status is still 0 here, so the process exits with 0. */
usage();
goto out;
break;
default:
exit_status = -2;
goto out;
break;
}
}
if (disk_count == 0) {
/* if we already have disks from commandline
then it is probably undesirable to add those
from environment (general rule cmdline has precedence)
*/
value = get_env_option("SBD_DEVICE");
if ((value) && strlen(value)) {
#if SUPPORT_SHARED_DISK
int devices = parse_device_line(value);
if(devices < 1) {
fprintf(stderr, "Invalid device line: %s\n", value);
exit_status = -1;
goto out;
}
#else
fprintf(stderr, "Shared disk functionality not supported\n");
exit_status = -2;
goto out;
#endif
}
}
/* A watchdog device of "/dev/null" (or none) disables the watchdog;
 * otherwise the -W count, if any, decides. */
if (watchdogdev == NULL || strcmp(watchdogdev, "/dev/null") == 0) {
watchdog_use = 0;
} else if (W_count > 0) {
watchdog_use = arg_enabled(W_count);
}
if (watchdog_use) {
cl_log(LOG_INFO, "Watchdog enabled.");
} else {
cl_log(LOG_INFO, "Watchdog disabled.");
}
if (c_count > 0) {
check_cluster = arg_enabled(c_count);
}
if (P_count > 0) {
int check_pcmk_arg = arg_enabled(P_count);
/* Warn when the command line contradicts SBD_PACEMAKER. */
if (has_check_pcmk_env && check_pcmk_arg != check_pcmk) {
cl_log(LOG_WARNING, "Pacemaker integration is %s: "
"SBD_PACEMAKER=%s is overridden by %s option. "
"It's recommended to only use SBD_PACEMAKER.",
check_pcmk_arg? "enabled" : "disabled",
check_pcmk? "yes" : "no",
check_pcmk_arg? "-P" : "-PP");
}
check_pcmk = check_pcmk_arg;
}
if ((disk_count > 0) && (strlen(local_uname) > SECTOR_NAME_MAX)) {
fprintf(stderr, "Node name mustn't be longer than %d chars.\n",
SECTOR_NAME_MAX);
fprintf(stderr, "If uname is longer define a name to be used by sbd.\n");
exit_status = -1;
goto out;
}
if (disk_count > 3) {
fprintf(stderr, "You can specify up to 3 devices via the -d option.\n");
exit_status = -1;
goto out;
}
/* There must at least be one command following the options: */
if ((argc - optind) < 1) {
fprintf(stderr, "Not enough arguments.\n");
exit_status = -2;
goto out;
}
if (init_set_proc_title(argc, argv, envp) < 0) {
fprintf(stderr, "Allocation of proc title failed.\n");
exit_status = -1;
goto out;
}
if (timeout_action) {
/* Accepts one or two comma-separated lower-case words. The trailing
 * %c only matches when something follows the second word, which
 * makes sscanf() return 3 and is treated as a parse error. */
char *p[2];
int i;
char c;
int nrflags = sscanf(timeout_action, "%m[a-z],%m[a-z]%c", &p[0], &p[1], &c);
bool parse_error = (nrflags < 1) || (nrflags > 2);
for (i = 0; (i < nrflags) && (i < 2); i++) {
if (!strcmp(p[i], "reboot")) {
timeout_sysrq_char = 'b';
} else if (!strcmp(p[i], "crashdump")) {
timeout_sysrq_char = 'c';
} else if (!strcmp(p[i], "off")) {
timeout_sysrq_char = 'o';
} else if (!strcmp(p[i], "flush")) {
do_flush = true;
} else if (!strcmp(p[i], "noflush")) {
do_flush = false;
} else {
parse_error = true;
}
/* %m made sscanf() allocate the word; release it. */
free(p[i]);
}
if (parse_error) {
fprintf(stderr, "Failed to parse timeout-action \"%s\".\n",
timeout_action);
exit_status = -1;
goto out;
}
}
if (strcmp(argv[optind], "watch") == 0) {
value = get_env_option("SBD_SYNC_RESOURCE_STARTUP");
sync_resource_startup =
crm_is_true(value?value:SBD_SYNC_RESOURCE_STARTUP_DEFAULT);
#if !USE_PACEMAKERD_API
if (sync_resource_startup) {
fprintf(stderr, "Failed to sync resource-startup as "
"SBD was built against pacemaker not supporting pacemakerd-API.\n");
exit_status = -1;
goto out;
}
#else
if (check_pcmk && !sync_resource_startup) {
cl_log(LOG_WARNING, "SBD built against pacemaker supporting "
"pacemakerd-API. Should think about enabling "
"SBD_SYNC_RESOURCE_STARTUP.");
} else if (!check_pcmk && sync_resource_startup) {
fprintf(stderr, "Set SBD_PACEMAKER=yes to allow resource startup syncing. "
"Otherwise explicitly set SBD_SYNC_RESOURCE_STARTUP=no if to intentionally disable.\n");
exit_status = -1;
goto out;
}
#endif
}
/* Dispatch on the sub-command. The disk-related commands only exist
 * when built with SUPPORT_SHARED_DISK. */
#if SUPPORT_SHARED_DISK
if (strcmp(argv[optind], "create") == 0) {
exit_status = init_devices(servants_leader);
} else if (strcmp(argv[optind], "dump") == 0) {
exit_status = dump_headers(servants_leader);
} else if (strcmp(argv[optind], "allocate") == 0) {
exit_status = allocate_slots(argv[optind + 1], servants_leader);
} else if (strcmp(argv[optind], "list") == 0) {
exit_status = list_slots(servants_leader);
} else if (strcmp(argv[optind], "message") == 0) {
exit_status = messenger(argv[optind + 1], argv[optind + 2], servants_leader);
} else if (strcmp(argv[optind], "ping") == 0) {
exit_status = ping_via_slots(argv[optind + 1], servants_leader);
} else
#endif
if (strcmp(argv[optind], "query-watchdog") == 0) {
exit_status = watchdog_info();
} else if (strcmp(argv[optind], "test-watchdog") == 0) {
exit_status = watchdog_test();
} else if (strcmp(argv[optind], "watch") == 0) {
/* sleep $(sbd $SBD_DEVICE_ARGS dump | grep -m 1 msgwait | awk '{print $4}') 2>/dev/null */
const char *delay_source = delay ? "SBD_DELAY_START" : "";
#if SUPPORT_SHARED_DISK
if(disk_count > 0) {
/* If no devices are specified, its not an error to be unable to find one */
open_any_device(servants_leader);
if (delay_start && delay <= 0) {
delay = get_first_msgwait(servants_leader);
if (delay > 0) {
delay_source = "msgwait";
} else {
cl_log(LOG_WARNING, "No 'msgwait' value from disk, using '2 * watchdog-timeout' for 'delay' starting");
}
}
}
#endif
/* Re-calculate timeout_watchdog_warn based on any timeout_watchdog from:
* SBD_WATCHDOG_TIMEOUT, -1 option or on-disk setting read with open_any_device() */
if (do_calculate_timeout_watchdog_warn) {
timeout_watchdog_warn = calculate_timeout_watchdog_warn(timeout_watchdog);
}
if (delay_start) {
/* diskless mode or disk read issues causing get_first_msgwait() to return a 0 for delay */
if (delay <= 0) {
delay = 2 * timeout_watchdog;
delay_source = "watchdog-timeout * 2";
}
cl_log(LOG_DEBUG, "Delay start (yes), (delay: %ld), (delay source: %s)", delay, delay_source);
sleep((unsigned long) delay);
} else {
cl_log(LOG_DEBUG, "Delay start (no)");
}
/* We only want this to have an effect during watch right now;
* pinging and fencing would be too confused */
cl_log(LOG_INFO, "Turning on pacemaker checks: %d", check_pcmk);
if (check_pcmk) {
recruit_servant("pcmk", 0);
#if SUPPORT_PLUGIN
check_cluster = 1;
#endif
}
cl_log(LOG_INFO, "Turning on cluster checks: %d", check_cluster);
if (check_cluster) {
recruit_servant("cluster", 0);
}
cl_log(LOG_NOTICE, "%s flush + write \'%c\' to sysrq in case of timeout",
do_flush?"Do":"Skip", timeout_sysrq_char);
exit_status = inquisitor();
} else {
/* Unknown sub-command: report usage. */
exit_status = -2;
}
out:
/* Common exit path: any negative exit_status becomes exit code 1. */
if (timeout_action) {
free(timeout_action);
}
if (exit_status < 0) {
if (exit_status == -2) {
usage();
} else {
fprintf(stderr, "sbd failed; please check the logs.\n");
}
return (1);
}
return (0);
}
diff --git a/src/sbd-md.c b/src/sbd-md.c
index 2a237ad..3a2e82d 100644
--- a/src/sbd-md.c
+++ b/src/sbd-md.c
@@ -1,1288 +1,1286 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "sbd.h"
/* Numeric codes of the mailbox commands; cmd2char() and char2cmd()
 * translate between these codes and their textual names
 * ("clear", "test", "reset", "off", "exit", "crashdump"). */
#define SBD_MSG_EMPTY 0x00
#define SBD_MSG_TEST 0x01
#define SBD_MSG_RESET 0x02
#define SBD_MSG_OFF 0x03
#define SBD_MSG_EXIT 0x04
#define SBD_MSG_CRASHDUMP 0x05
/* Sector layout: sector 0 holds the header; from there on slot n
 * occupies sector 1 + 2n and its mailbox sector 2 + 2n.
 * The argument is parenthesized so that expressions such as
 * SLOT_TO_SECTOR(i + 1) expand with the intended precedence. */
#define SLOT_TO_SECTOR(slot) (1 + (slot) * 2)
#define MBOX_TO_SECTOR(mbox) (2 + (mbox) * 2)
/* Defined in another translation unit. */
extern int disk_count;
/* These have to match the values in the header of the partition */
/* Exactly 8 bytes with no terminating NUL; compared with memcmp() in
 * valid_header() and copied with memcpy() in init_device(). */
static char sbd_magic[8] = "SBD_SBD_";
static char sbd_version = 0x02;
/* Pairs a name with a message string.
 * NOTE(review): presumably the slot/node name and the command to send;
 * the users of this struct are not visible here - confirm. */
struct slot_msg_arg_t {
const char* name;
const char* msg;
};
/*
 * Translate a textual command into its SBD_MSG_* code.
 *
 * Returns the code, or -1 when the command is not known.
 */
static signed char
cmd2char(const char *cmd)
{
    static const struct {
        const char *text;
        signed char code;
    } known[] = {
        { "clear",     SBD_MSG_EMPTY },
        { "test",      SBD_MSG_TEST },
        { "reset",     SBD_MSG_RESET },
        { "off",       SBD_MSG_OFF },
        { "exit",      SBD_MSG_EXIT },
        { "crashdump", SBD_MSG_CRASHDUMP },
    };
    unsigned int idx;

    for (idx = 0; idx < sizeof(known) / sizeof(known[0]); idx++) {
        if (strcmp(known[idx].text, cmd) == 0) {
            return known[idx].code;
        }
    }
    return -1;
}
/*
 * Translate an SBD_MSG_* code into its textual command name.
 *
 * Returns a string literal; "undefined" for any unknown code.
 */
static const char*
char2cmd(const char cmd)
{
    const char *label = "undefined";

    if (cmd == SBD_MSG_EMPTY) {
        label = "clear";
    } else if (cmd == SBD_MSG_TEST) {
        label = "test";
    } else if (cmd == SBD_MSG_RESET) {
        label = "reset";
    } else if (cmd == SBD_MSG_OFF) {
        label = "off";
    } else if (cmd == SBD_MSG_EXIT) {
        label = "exit";
    } else if (cmd == SBD_MSG_CRASHDUMP) {
        label = "crashdump";
    }

    return label;
}
/*
 * Release everything held by a device context: the I/O context (if one
 * was set up), the file descriptor (if open), the sector buffer and the
 * context itself. A NULL context is ignored.
 */
static void
close_device(struct sbd_context *st)
{
    if (st != NULL) {
        if (st->ioctx) {
            io_destroy(st->ioctx);
        }

        if (st->devfd >= 0) {
            close(st->devfd);
        }

        free(st->buffer);
        free(st);
    }
}
/*
 * Open a device and prepare a context for sector I/O on it.
 *
 * Sets up an I/O context, opens the device with
 * O_SYNC|O_RDWR|O_DIRECT, queries its logical sector size into the
 * global sector_size and allocates a sector-aligned buffer of that size.
 *
 * devname:  path of the device; NULL yields NULL.
 * loglevel: level used to report a failing open().
 *
 * Returns the new context (to be released with close_device()), or NULL
 * on any failure.
 */
static struct sbd_context *
open_device(const char* devname, int loglevel)
{
    struct sbd_context *st;

    if (!devname)
        return NULL;

    st = calloc(1, sizeof(struct sbd_context));
    if (!st) {
        return NULL;
    }
    /* Mark the descriptor as "not open" so that close_device() does not
     * close fd 0 on the early error paths. */
    st->devfd = -1;

    if (io_setup(1, &st->ioctx) != 0) {
        cl_perror("io_setup failed");
        goto out;
    }

    st->devfd = open(devname, O_SYNC|O_RDWR|O_DIRECT);

    if (st->devfd == -1) {
        if (loglevel == LOG_DEBUG) {
            DBGLOG(loglevel, "Opening device %s failed.", devname);
        } else {
            cl_log(loglevel, "Opening device %s failed.", devname);
        }
        goto out;
    }

    /* Check the ioctl result as well as the value: on failure the global
     * sector_size would otherwise keep whatever an earlier call stored. */
    if (ioctl(st->devfd, BLKSSZGET, &sector_size) < 0 || sector_size == 0) {
        cl_perror("Get sector size failed.\n");
        goto out;
    }

    /* The sector buffer is aligned to the sector size. */
    if (posix_memalign(&st->buffer, sector_size, sector_size)) {
        cl_perror("Couldn't allocate sector-buffer.");
        goto out;
    }

    return st;

out:
    close_device(st);
    return NULL;
}
/*
 * Allocate one zero-filled buffer of sector_size bytes.
 *
 * Never returns NULL: the process exits with status 1 when the
 * allocation fails. The caller releases the buffer with free().
 */
static void *
sector_alloc(void)
{
    void *sector = calloc(1, sector_size);

    if (sector == NULL) {
        exit(1);
    }
    return sector;
}
/*
 * Transfer one sector between the device and `data`.
 *
 * Submits a single request with io_submit() - a write of `data` when
 * rw is non-zero, a read into `data` otherwise - covering sector_size
 * bytes at byte offset sector_size * sector, then waits up to
 * timeout_io seconds for it with io_getevents().
 *
 * Returns 0 when exactly sector_size bytes were transferred, -1 on a
 * submit/wait failure, a timeout (the request is then cancelled) or a
 * short transfer.
 */
static int
sector_io(struct sbd_context *st, int sector, void *data, int rw)
{
struct timespec timeout;
struct io_event event;
struct iocb *ios[1] = { &st->io };
- long r;
+ int r;
timeout.tv_sec = timeout_io;
timeout.tv_nsec = 0;
memset(&st->io, 0, sizeof(struct iocb));
/* All transfers go through st->buffer rather than `data` itself. */
if (rw) {
memcpy(st->buffer, data, sector_size);
io_prep_pwrite(&st->io, st->devfd, st->buffer, sector_size, (long long) sector_size * sector);
} else {
memset(st->buffer, 0, sector_size);
io_prep_pread(&st->io, st->devfd, st->buffer, sector_size, (long long) sector_size * sector);
}
if (io_submit(st->ioctx, 1, ios) != 1) {
cl_log(LOG_ERR, "Failed to submit IO request! (rw=%d)", rw);
return -1;
}
errno = 0;
/* Wait for exactly one completion event, bounded by `timeout`. */
r = io_getevents(st->ioctx, 1L, 1L, &event, &timeout);
if (r < 0 ) {
cl_log(LOG_ERR, "Failed to retrieve IO events (rw=%d)", rw);
return -1;
- } else if (r < 1L) {
- cl_log(LOG_INFO, "Cancelling IO request due to timeout (rw=%d, r=%ld)", rw, r);
+ } else if (r < 1) {
+ cl_log(LOG_INFO, "Cancelling IO request due to timeout (rw=%d, r=%d)", rw, r);
r = io_cancel(st->ioctx, ios[0], &event);
if (r) {
DBGLOG(LOG_INFO, "Could not cancel IO request (rw=%d)", rw);
/* Doesn't really matter, debugging information.
*/
}
return -1;
- } else if (r > 1L) {
- cl_log(LOG_ERR, "More than one IO was returned (r=%ld)", r);
+ } else if (r > 1) {
+ cl_log(LOG_ERR, "More than one IO was returned (r=%d)", r);
return -1;
}
/* IO is happy */
if (event.res == sector_size) {
/* For a read, hand the data from the internal buffer to the caller. */
if (!rw) {
memcpy(data, st->buffer, sector_size);
}
return 0;
} else {
cl_log(LOG_ERR, "Short IO (rw=%d, res=%lu, sector_size=%d)",
rw, event.res, sector_size);
return -1;
}
}
/*
 * Write one sector from `data` to the device.
 * Returns whatever sector_io() returns for a write request.
 */
static int
sector_write(struct sbd_context *st, int sector, void *data)
{
    const int writing = 1;

    return sector_io(st, sector, data, writing);
}
/*
 * Read one sector from the device into `data`.
 * Returns whatever sector_io() returns for a read request.
 */
static int
sector_read(struct sbd_context *st, int sector, void *data)
{
    const int writing = 0;

    return sector_io(st, sector, data, writing);
}
/*
 * Read the node sector belonging to `slot` into s_node.
 * Returns the result of sector_read().
 */
static int
slot_read(struct sbd_context *st, int slot, struct sector_node_s *s_node)
{
    int sector = SLOT_TO_SECTOR(slot);

    return sector_read(st, sector, s_node);
}
/*
 * Write s_node to the node sector belonging to `slot`.
 * Returns the result of sector_write().
 */
static int
slot_write(struct sbd_context *st, int slot, struct sector_node_s *s_node)
{
    int sector = SLOT_TO_SECTOR(slot);

    return sector_write(st, sector, s_node);
}
/*
 * Write s_mbox to the mailbox sector belonging to `mbox`.
 * Returns the result of sector_write().
 */
static int
mbox_write(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox)
{
    int sector = MBOX_TO_SECTOR(mbox);

    return sector_write(st, sector, s_mbox);
}
/*
 * Read the mailbox sector belonging to `mbox` into s_mbox.
 * Returns the result of sector_read().
 */
static int
mbox_read(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox)
{
    int sector = MBOX_TO_SECTOR(mbox);

    return sector_read(st, sector, s_mbox);
}
/*
 * Write a mailbox sector and read it back to make sure the device holds
 * exactly what was written.
 *
 * Returns 0 when the read-back data matches, -1 when the write, the
 * read or the comparison fails.
 */
static int
mbox_write_verify(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox)
{
    const int sector = MBOX_TO_SECTOR(mbox);
    void *readback;
    int result = -1;

    if (sector_write(st, sector, s_mbox) < 0) {
        return -1;
    }

    readback = sector_alloc();
    if (sector_read(st, sector, readback) >= 0) {
        if (memcmp(s_mbox, readback, sector_size) == 0) {
            result = 0;
        } else {
            cl_log(LOG_ERR, "Write verification failed!");
        }
    }

    free(readback);
    return result;
}
/*
 * Write the header to sector 0.
 *
 * The sector size and the four timeouts are converted to network byte
 * order in place before writing, so the caller's structure holds the
 * converted values afterwards.
 *
 * Returns the result of sector_write().
 */
static int header_write(struct sbd_context *st, struct sector_header_s *s_header)
{
    const int header_sector = 0;

    /* Timeouts first, then the sector size; the fields are independent. */
    s_header->timeout_msgwait = htonl(s_header->timeout_msgwait);
    s_header->timeout_loop = htonl(s_header->timeout_loop);
    s_header->timeout_allocate = htonl(s_header->timeout_allocate);
    s_header->timeout_watchdog = htonl(s_header->timeout_watchdog);
    s_header->sector_size = htonl(s_header->sector_size);

    return sector_write(st, header_sector, s_header);
}
/*
 * Read the header from sector 0 and convert its multi-byte fields from
 * network to host byte order.
 *
 * Side effect: the four timeouts found in the header replace the global
 * timeout_watchdog, timeout_allocate, timeout_loop and timeout_msgwait.
 *
 * Returns 0 on success, -1 when the sector cannot be read.
 */
static int
header_read(struct sbd_context *st, struct sector_header_s *s_header)
{
    const int header_sector = 0;

    if (sector_read(st, header_sector, s_header) < 0) {
        return -1;
    }

    s_header->sector_size = ntohl(s_header->sector_size);

    /* Convert each timeout and make it the global default right away. */
    s_header->timeout_watchdog = ntohl(s_header->timeout_watchdog);
    timeout_watchdog = s_header->timeout_watchdog;

    s_header->timeout_allocate = ntohl(s_header->timeout_allocate);
    timeout_allocate = s_header->timeout_allocate;

    s_header->timeout_loop = ntohl(s_header->timeout_loop);
    timeout_loop = s_header->timeout_loop;

    s_header->timeout_msgwait = ntohl(s_header->timeout_msgwait);
    timeout_msgwait = s_header->timeout_msgwait;

    return 0;
}
/*
 * Check that a header (already in host byte order) carries the expected
 * magic, version and sector size. The first mismatch is logged.
 *
 * Returns 0 when the header is acceptable, -1 otherwise.
 */
static int
valid_header(const struct sector_header_s *s_header)
{
    int verdict = -1;

    if (memcmp(s_header->magic, sbd_magic, sizeof(s_header->magic)) != 0) {
        cl_log(LOG_ERR, "Header magic does not match.");
    } else if (s_header->version != sbd_version) {
        cl_log(LOG_ERR, "Header version does not match.");
    } else if (s_header->sector_size != sector_size) {
        cl_log(LOG_ERR, "Header sector size does not match.");
    } else {
        verdict = 0;
    }

    return verdict;
}
/*
 * Read and validate the header of a device.
 *
 * Returns a freshly allocated header that the caller must free(), or
 * NULL when the header cannot be read or is not valid.
 */
static struct sector_header_s *
header_get(struct sbd_context *st)
{
    struct sector_header_s *hdr = sector_alloc();

    if (header_read(st, hdr) < 0) {
        cl_log(LOG_ERR, "Unable to read header from device %d", st->devfd);
    } else if (valid_header(hdr) < 0) {
        cl_log(LOG_ERR, "header on device %d is not valid.", st->devfd);
    } else {
        return hdr;
    }

    free(hdr);
    return NULL;
}
/*
 * Print the header of a device to stdout in human-readable form.
 * The UUID line is only printed for minor versions greater than 0.
 *
 * Returns 0 on success, -1 when no valid header could be obtained.
 */
static int
header_dump(struct sbd_context *st)
{
    struct sector_header_s *hdr = header_get(st);
    char uuid_text[37];

    if (!hdr) {
        return -1;
    }

    printf("Header version : %u.%u\n", hdr->version,
           hdr->minor_version);
    if (hdr->minor_version > 0) {
        uuid_unparse_lower(hdr->uuid, uuid_text);
        printf("UUID : %s\n", uuid_text);
    }
    printf("Number of slots : %u\n", hdr->slots);
    printf("Sector size : %lu\n",
           (unsigned long)hdr->sector_size);
    printf("Timeout (watchdog) : %lu\n",
           (unsigned long)hdr->timeout_watchdog);
    printf("Timeout (allocate) : %lu\n",
           (unsigned long)hdr->timeout_allocate);
    printf("Timeout (loop) : %lu\n",
           (unsigned long)hdr->timeout_loop);
    printf("Timeout (msgwait) : %lu\n",
           (unsigned long)hdr->timeout_msgwait);

    free(hdr);
    return 0;
}
/*
 * Initialise a device: write a new header built from the current global
 * settings (255 slots, fresh UUID, minor version 1) and then overwrite
 * every slot and mailbox sector with zero-filled sectors.
 *
 * Returns 0 on success, -1 as soon as one write fails.
 */
static int
init_device(struct sbd_context *st)
{
    struct sector_header_s *hdr = sector_alloc();
    struct sector_node_s *blank_node = sector_alloc();
    struct sector_mbox_s *blank_mbox = sector_alloc();
    char uuid_text[37];
    int result = -1;
    int slot;

    memcpy(hdr->magic, sbd_magic, sizeof(hdr->magic));
    hdr->version = sbd_version;
    hdr->slots = 255;
    hdr->sector_size = sector_size;
    hdr->timeout_watchdog = timeout_watchdog;
    hdr->timeout_allocate = timeout_allocate;
    hdr->timeout_loop = timeout_loop;
    hdr->timeout_msgwait = timeout_msgwait;

    hdr->minor_version = 1;
    uuid_generate(hdr->uuid);
    uuid_unparse_lower(hdr->uuid, uuid_text);

    cl_log(LOG_INFO, "Creating version %d.%d header on device %d (uuid: %s)",
           hdr->version, hdr->minor_version,
           st->devfd, uuid_text);
    fprintf(stdout, "Creating version %d.%d header on device %d (uuid: %s)\n",
            hdr->version, hdr->minor_version,
            st->devfd, uuid_text);
    if (header_write(st, hdr) < 0) {
        goto cleanup;
    }

    cl_log(LOG_INFO, "Initializing %d slots on device %d",
           hdr->slots,
           st->devfd);
    fprintf(stdout, "Initializing %d slots on device %d\n",
            hdr->slots,
            st->devfd);
    /* Blank every slot together with its mailbox. */
    for (slot = 0; slot < hdr->slots; slot++) {
        if (slot_write(st, slot, blank_node) < 0) {
            goto cleanup;
        }
        if (mbox_write(st, slot, blank_mbox) < 0) {
            goto cleanup;
        }
    }
    result = 0;

cleanup:
    free(blank_mbox);
    free(blank_node);
    free(hdr);
    return(result);
}
/*
 * Find the slot that is already allocated to `name`.
 * Every slot has to be inspected because allocated slots need not be
 * contiguous. Names are compared case-insensitively over at most
 * SECTOR_NAME_MAX characters.
 *
 * Returns the slot number, -1 when no slot belongs to the name (or no
 * name was given), or -2 when a slot could not be read.
 */
static int
slot_lookup(struct sbd_context *st, const struct sector_header_s *s_header, const char *name)
{
    struct sector_node_s *node;
    int found = -1;
    int slot;

    if (name == NULL) {
        cl_log(LOG_ERR, "slot_lookup(): No name specified.\n");
        return found;
    }

    node = sector_alloc();

    for (slot = 0; slot < s_header->slots; slot++) {
        if (slot_read(st, slot, node) < 0) {
            found = -2;
            break;
        }
        if (node->in_use == 0) {
            continue;
        }
        if (strncasecmp(node->name, name, SECTOR_NAME_MAX) != 0) {
            continue;
        }
        DBGLOG(LOG_INFO, "%s owns slot %d", name, slot);
        found = slot;
        break;
    }

    free(node);
    return found;
}
/* Return the index of the first slot whose in_use flag is 0.
 * Returns -1 both when every slot is taken and when a slot read fails. */
static int
slot_unused(struct sbd_context *st, const struct sector_header_s *s_header)
{
	struct sector_node_s *node = sector_alloc();
	int free_slot = -1;
	int idx;

	for (idx = 0; idx < s_header->slots; idx++) {
		if (slot_read(st, idx, node) < 0) {
			/* read error: give up, free_slot is still -1 */
			break;
		}
		if (node->in_use == 0) {
			free_slot = idx;
			break;
		}
	}

	free(node);
	return free_slot;
}
/* Find the slot registered for name on this device, claiming a free one if
 * none exists yet.
 *
 * Returns the slot index (>= 0) once a slot carrying name is found, -2 when
 * slot_lookup() hit a read error, and -1 for every other failure (no name,
 * unreadable header, failed slot write, no free slot).
 *
 * A freshly written slot is never returned directly: after the write we
 * sleep for timeout_allocate and run the lookup again - presumably so that
 * a competing claim by another node can be noticed (TODO confirm). */
static int
slot_allocate(struct sbd_context *st, const char *name)
{
	struct sector_header_s *hdr = NULL;
	struct sector_node_s *node = NULL;
	struct sector_mbox_s *mbox_buf = NULL;
	int result = 0;
	int slot;

	if (name == NULL) {
		cl_log(LOG_ERR, "slot_allocate(): No name specified.\n");
		fprintf(stderr, "slot_allocate(): No name specified.\n");
		result = -1;
		goto cleanup;
	}

	hdr = header_get(st);
	if (hdr == NULL) {
		result = -1;
		goto cleanup;
	}

	node = sector_alloc();
	mbox_buf = sector_alloc();

	for (;;) {
		slot = slot_lookup(st, hdr, name);
		if (slot >= 0 || slot == -2) {
			/* >= 0: the name already owns a slot.
			 * -2:  read error during lookup, hand it to the caller.
			 * Only -1 ("no slot found") lets us try to claim one. */
			result = slot;
			break;
		}

		slot = slot_unused(st, hdr);
		if (slot < 0) {
			cl_log(LOG_ERR, "No more free slots.");
			fprintf(stderr, "No more free slots.\n");
			result = -1;
			break;
		}

		cl_log(LOG_INFO, "slot %d is unused - trying to own", slot);
		fprintf(stdout, "slot %d is unused - trying to own\n", slot);
		memset(node, 0, sizeof(*node));
		node->in_use = 1;
		strncpy(node->name, name, SECTOR_NAME_MAX);
		if (slot_write(st, slot, node) < 0) {
			result = -1;
			break;
		}
		sleep(timeout_allocate);
	}

cleanup:
	free(mbox_buf);
	free(node);
	free(hdr);
	return(result);
}
/* Print one tab-separated line per slot that is in use:
 * slot index, node name, pending mailbox command, sender of that command.
 * Returns 0 on success, -1 if the header, a slot or a mailbox is unreadable. */
static int
slot_list(struct sbd_context *st)
{
	struct sector_header_s *hdr;
	struct sector_node_s *node = NULL;
	struct sector_mbox_s *msg = NULL;
	int status = -1;
	int idx;

	hdr = header_get(st);
	if (hdr == NULL) {
		goto cleanup;
	}

	node = sector_alloc();
	msg = sector_alloc();

	for (idx = 0; idx < hdr->slots; idx++) {
		if (slot_read(st, idx, node) < 0) {
			goto cleanup;
		}
		if (node->in_use > 0) {
			if (mbox_read(st, idx, msg) < 0) {
				goto cleanup;
			}
			printf("%d\t%s\t%s\t%s\n",
			       idx, node->name, char2cmd(msg->cmd),
			       msg->from);
		}
	}
	status = 0;

cleanup:
	free(msg);
	free(node);
	free(hdr);
	return status;
}
/* Write a command into the mailbox of the named node on one device.
 *
 * name: recipient node name; the literal "LOCAL" is replaced by local_uname.
 * cmd:  command keyword, translated with cmd2char().
 *
 * After a successful write the function sleeps for timeout_msgwait, except
 * for the "exit" command which returns immediately.
 *
 * Returns 0 on success, -1 if an argument is missing, the header cannot be
 * read, the recipient has no slot, the command is invalid or the mailbox
 * write fails. */
static int
slot_msg(struct sbd_context *st, const char *name, const char *cmd)
{
	struct sector_header_s *s_header = NULL;
	struct sector_mbox_s *s_mbox = NULL;
	int mbox;
	int rc = 0;
	char uuid[37];

	if (!name || !cmd) {
		cl_log(LOG_ERR, "slot_msg(): No recipient / cmd specified.\n");
		rc = -1; goto out;
	}

	s_header = header_get(st);
	if (!s_header) {
		rc = -1; goto out;
	}

	if (strcmp(name, "LOCAL") == 0) {
		name = local_uname;
	}

	/* the uuid is only looked at for headers with minor_version > 0 */
	if (s_header->minor_version > 0) {
		uuid_unparse_lower(s_header->uuid, uuid);
		cl_log(LOG_INFO, "Device UUID: %s", uuid);
	}

	/* slot_lookup() yields -1 (not found) or -2 (read error) on failure */
	mbox = slot_lookup(st, s_header, name);
	if (mbox < 0) {
		cl_log(LOG_ERR, "slot_msg(): No slot found for %s.", name);
		rc = -1; goto out;
	}

	s_mbox = sector_alloc();

	s_mbox->cmd = cmd2char(cmd);
	if (s_mbox->cmd < 0) {
		cl_log(LOG_ERR, "slot_msg(): Invalid command %s.", cmd);
		rc = -1; goto out;
	}

	strncpy(s_mbox->from, local_uname, SECTOR_NAME_MAX);

	cl_log(LOG_INFO, "Writing %s to node slot %s",
			cmd, name);
	/* Any negative result counts as failure, like for the other slot and
	 * mailbox writes in this file. The former "< -1" could never match a
	 * -1 failure code, so a failed write was reported as delivered. */
	if (mbox_write_verify(st, mbox, s_mbox) < 0) {
		rc = -1; goto out;
	}
	if (strcasecmp(cmd, "exit") != 0) {
		cl_log(LOG_INFO, "Messaging delay: %d",
				(int)timeout_msgwait);
		sleep(timeout_msgwait);
	}
	cl_log(LOG_INFO, "%s successfully delivered to %s",
			cmd, name);

out:	free(s_mbox);
	free(s_header);
	return rc;
}
/* Send a test message to the named node on one device and wait for it to be
 * consumed.
 *
 * name: recipient node name; the literal "LOCAL" is replaced by local_uname.
 *
 * SBD_MSG_TEST is written to the recipient's mailbox, which is then re-read
 * once per second for up to timeout_msgwait seconds. The ping counts as
 * answered as soon as the mailbox no longer holds SBD_MSG_TEST.
 *
 * Returns 0 if the ping was answered, -1 on any error or timeout. */
static int
slot_ping(struct sbd_context *st, const char *name)
{
	struct sector_header_s *s_header = NULL;
	struct sector_mbox_s *s_mbox = NULL;
	int mbox;
	int waited = 0;
	int rc = 0;

	if (!name) {
		cl_log(LOG_ERR, "slot_ping(): No recipient specified.\n");
		rc = -1; goto out;
	}

	s_header = header_get(st);
	if (!s_header) {
		rc = -1; goto out;
	}

	if (strcmp(name, "LOCAL") == 0) {
		name = local_uname;
	}

	mbox = slot_lookup(st, s_header, name);
	if (mbox < 0) {
		/* message used to name slot_msg(), which made logs misleading */
		cl_log(LOG_ERR, "slot_ping(): No slot found for %s.", name);
		rc = -1; goto out;
	}

	s_mbox = sector_alloc();
	s_mbox->cmd = SBD_MSG_TEST;

	strncpy(s_mbox->from, local_uname, SECTOR_NAME_MAX);

	DBGLOG(LOG_DEBUG, "Pinging node %s", name);
	/* Any negative result counts as failure, like for the other mailbox
	 * writes in this file. The former "< -1" could never match a -1
	 * failure code, so a failed write went unnoticed. */
	if (mbox_write(st, mbox, s_mbox) < 0) {
		rc = -1; goto out;
	}

	/* pessimistic default: only a consumed message turns this into 0 */
	rc = -1;
	while (waited <= timeout_msgwait) {
		if (mbox_read(st, mbox, s_mbox) < 0)
			break;
		if (s_mbox->cmd != SBD_MSG_TEST) {
			rc = 0;
			break;
		}
		sleep(1);
		waited++;
	}

	if (rc == 0) {
		cl_log(LOG_DEBUG, "%s successfully pinged.", name);
	} else {
		cl_log(LOG_ERR, "%s failed to ping.", name);
	}

out:	free(s_mbox);
	free(s_header);
	return rc;
}
/* Run init_device() on every entry of the servants list, in list order.
 * Stops at the first device that cannot be opened or initialized and
 * returns -1 in that case; returns 0 after all devices were initialized. */
int init_devices(struct servants_list_item *servants)
{
	struct servants_list_item *cur = servants;

	while (cur != NULL) {
		struct sbd_context *ctx;
		int result;

		fprintf(stdout, "Initializing device %s\n",
				cur->devname);
		ctx = open_device(cur->devname, LOG_ERR);
		if (ctx == NULL) {
			return -1;
		}

		result = init_device(ctx);
		close_device(ctx);
		if (result == -1) {
			fprintf(stderr, "Failed to init device %s\n", cur->devname);
			return result;
		}

		fprintf(stdout, "Device %s is initialized.\n", cur->devname);
		cur = cur->next;
	}

	fprintf(stdout, "Did you check sbd service down on all nodes before? If not do so now and restart afterwards.\n");
	return 0;
}
static int slot_msg_wrapper(const char* devname, int mode, const void* argp)
{
int rc = 0;
struct sbd_context *st;
const struct slot_msg_arg_t* arg = (const struct slot_msg_arg_t*)argp;
st = open_device(devname, LOG_WARNING);
if (!st)
return -1;
cl_log(LOG_INFO, "Delivery process handling %s",
devname);
rc = slot_msg(st, arg->name, arg->msg);
close_device(st);
return rc;
}
static int slot_ping_wrapper(const char* devname, int mode, const void* argp)
{
int rc = 0;
const char* name = (const char*)argp;
struct sbd_context *st;
st = open_device(devname, LOG_WARNING);
if (!st)
return -1;
rc = slot_ping(st, name);
close_device(st);
return rc;
}
/* Make sure name owns a slot on every device of the servants list.
 * Devices are handled in list order; the first device that cannot be opened
 * (-1) or on which slot_allocate() fails (its negative result) aborts the
 * run. Returns 0 once all devices were handled. */
int allocate_slots(const char *name, struct servants_list_item *servants)
{
	struct servants_list_item *cur;

	for (cur = servants; cur != NULL; cur = cur->next) {
		struct sbd_context *ctx;
		int result;

		fprintf(stdout, "Trying to allocate slot for %s on device %s.\n",
				name,
				cur->devname);
		ctx = open_device(cur->devname, LOG_WARNING);
		if (ctx == NULL) {
			return -1;
		}

		result = slot_allocate(ctx, name);
		close_device(ctx);
		if (result < 0) {
			return result;
		}

		fprintf(stdout, "Slot for %s has been allocated on %s.\n",
				name,
				cur->devname);
	}
	return 0;
}
/* Dump the slot table of every device in the servants list.
 * Unlike the other multi-device helpers this keeps going after a failure;
 * the result is -1 if any device was unreadable or could not be listed,
 * 0 otherwise. */
int list_slots(struct servants_list_item *servants)
{
	struct servants_list_item *cur;
	int overall = 0;

	for (cur = servants; cur != NULL; cur = cur->next) {
		struct sbd_context *ctx = open_device(cur->devname, LOG_WARNING);
		int listed;

		if (ctx == NULL) {
			overall = -1;
			fprintf(stderr, "== disk %s unreadable!\n", cur->devname);
			continue;
		}

		listed = slot_list(ctx);
		close_device(ctx);
		if (listed == -1) {
			overall = -1;
			fprintf(stderr, "== Slots on disk %s NOT dumped\n", cur->devname);
		}
	}
	return overall;
}
/* Ping the named node through every disk servant and wait until all of them
 * have finished.
 *
 * For each list entry accepted by sbd_is_disk() a servant running
 * slot_ping_wrapper() is started via assign_servant() (presumably a forked
 * child - TODO confirm). The function then waits for SIGCHLD and reaps
 * children until disk_count disk servants have been collected.
 *
 * Always returns 0: the exit status of the children is collected but not
 * evaluated here. */
int ping_via_slots(const char *name, struct servants_list_item *servants)
{
int sig = 0;
pid_t pid = 0;
int status = 0;
int servants_finished = 0;
sigset_t procmask;
siginfo_t sinfo;
struct servants_list_item *s;
/* block SIGCHLD so it can be picked up synchronously with sigwaitinfo() */
sigemptyset(&procmask);
sigaddset(&procmask, SIGCHLD);
sigprocmask(SIG_BLOCK, &procmask, NULL);
for (s = servants; s; s = s->next) {
if(sbd_is_disk(s)) {
s->pid = assign_servant(s->devname, &slot_ping_wrapper, 0, (const void*)name);
}
}
while (servants_finished < disk_count) {
sig = sigwaitinfo(&procmask, &sinfo);
if (sig == SIGCHLD) {
/* wait() blocks, so this inner loop only ends once no child is left
 * (ECHILD) */
while ((pid = wait(&status))) {
if (pid == -1 && errno == ECHILD) {
break;
} else {
/* NOTE(review): a -1 from wait() with another errno also ends up
 * here and is passed to lookup_servant_by_pid() - confirm that the
 * lookup result / sbd_is_disk() cope with an unknown pid */
s = lookup_servant_by_pid(pid);
if (sbd_is_disk(s)) {
servants_finished++;
}
}
}
}
}
return 0;
}
/* Non-zero if good_servants exceeds disk_count/2 (integer division),
 * i.e. more than half of the disks were written successfully. */
int quorum_write(int good_servants)
{
	if ((disk_count/2) < good_servants) {
		return 1;
	}
	return 0;
}
/* Deliver msg to node name through all servants in parallel.
 *
 * One servant running slot_msg_wrapper() is started per list entry. Children
 * are then reaped on SIGCHLD until either quorum_write() is satisfied by the
 * number of children that exited with status 0, or disk_count children have
 * finished.
 *
 * Returns 0 if the write quorum was reached, -1 otherwise. */
int messenger(const char *name, const char *msg, struct servants_list_item *servants)
{
	struct slot_msg_arg_t slot_msg_arg = {name, msg};
	struct servants_list_item *cur;
	sigset_t chld_only;
	siginfo_t info;
	pid_t child = 0;
	int wstatus = 0;
	int finished = 0;
	int delivered = 0;

	/* SIGCHLD is blocked and consumed synchronously below */
	sigemptyset(&chld_only);
	sigaddset(&chld_only, SIGCHLD);
	sigprocmask(SIG_BLOCK, &chld_only, NULL);

	for (cur = servants; cur != NULL; cur = cur->next) {
		cur->pid = assign_servant(cur->devname, &slot_msg_wrapper, 0, &slot_msg_arg);
	}

	while (!quorum_write(delivered) && finished != disk_count) {
		if (sigwaitinfo(&chld_only, &info) != SIGCHLD) {
			continue;
		}

		/* collect every child that has exited by now, without blocking */
		for (;;) {
			child = waitpid(-1, &wstatus, WNOHANG);
			if (child == 0) {
				break;
			}
			if (child == -1 && errno == ECHILD) {
				break;
			}

			finished++;
			if (WIFEXITED(wstatus)
				&& WEXITSTATUS(wstatus) == 0) {
				DBGLOG(LOG_INFO, "Process %d succeeded.",
						(int)child);
				delivered++;
			} else {
				cl_log(LOG_WARNING, "Process %d failed to deliver!",
						(int)child);
			}
		}
	}

	if (!quorum_write(delivered)) {
		cl_log(LOG_ERR, "Message is not delivered via more then a half of devices");
		return -1;
	}

	cl_log(LOG_INFO, "Message successfully delivered.");
	return 0;
}
/* Return timeout_msgwait from the header of the first device in the list
 * that can be opened and has a readable header; 0 if there is none. */
unsigned long
get_first_msgwait(struct servants_list_item *servants)
{
	struct servants_list_item *cur;

	for (cur = servants; cur != NULL; cur = cur->next) {
		struct sector_header_s *hdr;
		struct sbd_context *ctx = open_device(cur->devname, LOG_WARNING);

		if (ctx == NULL) {
			continue;
		}

		/* the device is no longer needed once the header is fetched */
		hdr = header_get(ctx);
		close_device(ctx);

		if (hdr != NULL) {
			unsigned long msgwait = (unsigned long)hdr->timeout_msgwait;

			free(hdr);
			return msgwait;
		}
	}

	return 0;
}
/* Dump the header of every device in the servants list via header_dump().
 * All devices are visited even after a failure; the result is -1 if any
 * device was unreadable or header_dump() returned -1, 0 otherwise. */
int dump_headers(struct servants_list_item *servants)
{
	struct servants_list_item *cur;
	int overall = 0;

	for (cur = servants; cur != NULL; cur = cur->next) {
		struct sbd_context *ctx;
		int dumped = -1;

		fprintf(stdout, "==Dumping header on disk %s\n", cur->devname);
		ctx = open_device(cur->devname, LOG_WARNING);
		if (ctx == NULL) {
			fprintf(stderr, "== disk %s unreadable!\n", cur->devname);
		} else {
			dumped = header_dump(ctx);
			close_device(ctx);
		}

		if (dumped != -1) {
			fprintf(stdout, "==Header on disk %s is dumped\n", cur->devname);
			continue;
		}

		overall = -1;
		fprintf(stderr, "==Header on disk %s NOT dumped\n", cur->devname);
	}
	return overall;
}
/* Wait at start-up until the header of any configured device can be read
 * and take over the timeouts stored in it.
 *
 * All devices of the list are tried in order; if none yields a header the
 * attempt is repeated after sleeping timeout_loop seconds, until
 * timeout_startup seconds have passed (measured with CLOCK_MONOTONIC).
 *
 * On success timeout_watchdog, timeout_allocate, timeout_loop and
 * timeout_msgwait are overwritten with the values from that header.
 * On failure an error is logged and the process exits with status 1. */
void open_any_device(struct servants_list_item *servants)
{
struct sector_header_s *hdr_cur = NULL;
struct timespec t_0;
int t_wait = 0;
/* open/read failures are only logged at warning level once per call */
bool logged_once = false;
clock_gettime(CLOCK_MONOTONIC, &t_0);
while (!hdr_cur && t_wait < timeout_startup) {
struct timespec t_now;
struct servants_list_item* s;
for (s = servants; s; s = s->next) {
struct sbd_context *st = open_device(s->devname, LOG_DEBUG);
if (!st) {
if (logged_once == false) {
cl_log(LOG_WARNING, "Failed to open %s. "
"Trying any other configured devices, "
"otherwise retrying every %ds within %ds",
s->devname, timeout_loop, timeout_startup);
logged_once = true;
}
continue;
}
hdr_cur = header_get(st);
close_device(st);
if (hdr_cur) {
/* first readable header wins */
break;
} else {
if (logged_once == false) {
cl_log(LOG_WARNING, "Failed to read header from %s. "
"Trying any other configured devices, "
"otherwise retrying every %ds within %ds",
s->devname, timeout_loop, timeout_startup);
logged_once = true;
}
}
}
clock_gettime(CLOCK_MONOTONIC, &t_now);
- t_wait = t_now.tv_sec - t_0.tv_sec;
+ t_wait = seconds_diff_timespec(&t_now, &t_0);
if (!hdr_cur) {
sleep(timeout_loop);
}
}
if (hdr_cur) {
timeout_watchdog = hdr_cur->timeout_watchdog;
timeout_allocate = hdr_cur->timeout_allocate;
timeout_loop = hdr_cur->timeout_loop;
timeout_msgwait = hdr_cur->timeout_msgwait;
} else {
cl_log(LOG_ERR, "No devices were available at start-up within %i seconds.",
timeout_startup);
exit(1);
}
free(hdr_cur);
return;
}
/*
::-::-::-::-::-::-::-::-::-::-::-::-::
Begin disk based servant code
::-::-::-::-::-::-::-::-::-::-::-::-::
*/
/* Compare the four timeouts stored in a device header (watchdog, allocate,
 * loop, msgwait) with the values currently held in the global timeout
 * variables.
 * Returns 0 if all of them match; otherwise logs a warning for the first
 * mismatch found and returns -1. */
static int servant_check_timeout_inconsistent(struct sector_header_s *hdr)
{
if (timeout_watchdog != hdr->timeout_watchdog) {
cl_log(LOG_WARNING, "watchdog timeout: %d versus %d on this device",
- (int)timeout_watchdog, (int)hdr->timeout_watchdog);
+ timeout_watchdog, (int)hdr->timeout_watchdog);
return -1;
}
if (timeout_allocate != hdr->timeout_allocate) {
cl_log(LOG_WARNING, "allocate timeout: %d versus %d on this device",
(int)timeout_allocate, (int)hdr->timeout_allocate);
return -1;
}
if (timeout_loop != hdr->timeout_loop) {
cl_log(LOG_WARNING, "loop timeout: %d versus %d on this device",
(int)timeout_loop, (int)hdr->timeout_loop);
return -1;
}
if (timeout_msgwait != hdr->timeout_msgwait) {
cl_log(LOG_WARNING, "msgwait timeout: %d versus %d on this device",
(int)timeout_msgwait, (int)hdr->timeout_msgwait);
return -1;
}
return 0;
}
/* Main routine of a disk servant: watches the local node's mailbox on one
 * device and reports to the parent process via signals.
 *
 * diskname: device to monitor.
 * mode:     if > 0, a first start refuses to continue when the mailbox still
 *           holds a message other than SBD_MSG_EXIT / SBD_MSG_EMPTY.
 * argp:     the servants_list_item of this servant (its first_start flag is
 *           read).
 *
 * The function never returns to its caller; it always ends in exit().
 * Exit status: EXIT_MD_SERVANT_IO_FAIL on device/header/slot/mailbox
 * problems, EXIT_MD_SERVANT_REQUEST_RESET / _SHUTOFF / _CRASHDUMP when the
 * matching command was found in the mailbox, 0 when start-up is aborted
 * because of a leftover message. */
int servant_md(const char *diskname, int mode, const void* argp)
{
struct sector_mbox_s *s_mbox = NULL;
struct sector_node_s *s_node = NULL;
struct sector_header_s *s_header = NULL;
int mbox;
int rc = 0;
- time_t t0, t1, latency;
- union sigval signal_value;
+ time_t t0, t1;
+ int latency;
sigset_t servant_masks;
struct sbd_context *st;
pid_t ppid;
char uuid[37];
const struct servants_list_item *s = argp;
cl_log(LOG_INFO, "Servant starting for device %s", diskname);
/* Block most of the signals */
sigfillset(&servant_masks);
sigdelset(&servant_masks, SIGKILL);
sigdelset(&servant_masks, SIGFPE);
sigdelset(&servant_masks, SIGILL);
sigdelset(&servant_masks, SIGSEGV);
sigdelset(&servant_masks, SIGBUS);
sigdelset(&servant_masks, SIGALRM);
/* FIXME: check error */
sigprocmask(SIG_SETMASK, &servant_masks, NULL);
st = open_device(diskname, LOG_WARNING);
if (!st) {
exit(EXIT_MD_SERVANT_IO_FAIL);
}
s_header = header_get(st);
if (!s_header) {
cl_log(LOG_ERR, "Not a valid header on %s", diskname);
rc = EXIT_MD_SERVANT_IO_FAIL;
goto out;
}
if (servant_check_timeout_inconsistent(s_header) < 0) {
cl_log(LOG_ERR, "Timeouts on %s do not match first device",
diskname);
rc = EXIT_MD_SERVANT_IO_FAIL;
goto out;
}
/* uuid is only filled in for headers with minor_version > 0 */
if (s_header->minor_version > 0) {
uuid_unparse_lower(s_header->uuid, uuid);
cl_log(LOG_INFO, "Device %s uuid: %s", diskname, uuid);
}
/* find our slot, claiming a free one if we do not own one yet */
mbox = slot_allocate(st, local_uname);
if (mbox < 0) {
cl_log(LOG_ERR,
"No slot allocated, and automatic allocation failed for disk %s.",
diskname);
rc = EXIT_MD_SERVANT_IO_FAIL;
goto out;
}
/* s_node keeps the start-up copy of our node entry; the loop below
 * compares the on-disk entry against it */
s_node = sector_alloc();
if (slot_read(st, mbox, s_node) < 0) {
cl_log(LOG_ERR, "Unable to read node entry on %s",
diskname);
rc = EXIT_MD_SERVANT_IO_FAIL;
goto out;
}
cl_log(LOG_NOTICE, "Monitoring slot %d on disk %s", mbox, diskname);
if (s_header->minor_version == 0) {
set_proc_title("sbd: watcher: %s - slot: %d", diskname, mbox);
} else {
set_proc_title("sbd: watcher: %s - slot: %d - uuid: %s",
diskname, mbox, uuid);
}
s_mbox = sector_alloc();
if (s->first_start) {
if (mode > 0) {
if (mbox_read(st, mbox, s_mbox) < 0) {
cl_log(LOG_ERR, "mbox read failed during start-up in servant.");
rc = EXIT_MD_SERVANT_IO_FAIL;
goto out;
}
if (s_mbox->cmd != SBD_MSG_EXIT &&
s_mbox->cmd != SBD_MSG_EMPTY) {
/* Not a clean stop. Abort start-up */
cl_log(LOG_WARNING, "Found fencing message - aborting start-up. Manual intervention required!");
ppid = getppid();
- sigqueue(ppid, SIG_EXITREQ, signal_value);
+ sigqueue_zero(ppid, SIG_EXITREQ);
rc = 0;
goto out;
}
}
DBGLOG(LOG_INFO, "First servant start - zeroing inbox");
memset(s_mbox, 0, sizeof(*s_mbox));
if (mbox_write(st, mbox, s_mbox) < 0) {
rc = EXIT_MD_SERVANT_IO_FAIL;
goto out;
}
}
- memset(&signal_value, 0, sizeof(signal_value));
-
/* Monitoring loop: one pass every timeout_loop seconds. Each pass
 * re-validates header and node entry, handles a pending mailbox command
 * and finally signals SIG_LIVENESS to the parent. */
while (1) {
struct sector_header_s *s_header_retry = NULL;
struct sector_node_s *s_node_retry = NULL;
t0 = time(NULL);
sleep(timeout_loop);
ppid = getppid();
if (ppid == 1) {
/* Our parent died unexpectedly. Triggering
* self-fence. */
do_timeout_action();
}
/* These attempts are, by definition, somewhat racy. If
* the device is wiped out or corrupted between here and
* us reading our mbox, there is nothing we can do about
* that. But at least we tried. */
s_header_retry = header_get(st);
if (!s_header_retry) {
cl_log(LOG_ERR, "No longer found a valid header on %s", diskname);
rc = EXIT_MD_SERVANT_IO_FAIL;
goto out;
}
if (memcmp(s_header, s_header_retry, sizeof(*s_header)) != 0) {
cl_log(LOG_ERR, "Header on %s changed since start-up!", diskname);
free(s_header_retry);
rc = EXIT_MD_SERVANT_IO_FAIL;
goto out;
}
free(s_header_retry);
s_node_retry = sector_alloc();
if (slot_read(st, mbox, s_node_retry) < 0) {
cl_log(LOG_ERR, "slot read failed in servant.");
free(s_node_retry);
rc = EXIT_MD_SERVANT_IO_FAIL;
goto out;
}
if (memcmp(s_node, s_node_retry, sizeof(*s_node)) != 0) {
cl_log(LOG_ERR, "Node entry on %s changed since start-up!", diskname);
free(s_node_retry);
rc = EXIT_MD_SERVANT_IO_FAIL;
goto out;
}
free(s_node_retry);
if (mbox_read(st, mbox, s_mbox) < 0) {
cl_log(LOG_ERR, "mbox read failed in servant.");
rc = EXIT_MD_SERVANT_IO_FAIL;
goto out;
}
if (s_mbox->cmd > 0) {
cl_log(LOG_NOTICE,
"Received command %s from %s on disk %s",
char2cmd(s_mbox->cmd), s_mbox->from, diskname);
switch (s_mbox->cmd) {
case SBD_MSG_TEST:
/* clear the mailbox (result of the write is not checked) and
 * notify the parent */
memset(s_mbox, 0, sizeof(*s_mbox));
mbox_write(st, mbox, s_mbox);
- sigqueue(ppid, SIG_TEST, signal_value);
+ sigqueue_zero(ppid, SIG_TEST);
break;
case SBD_MSG_RESET:
rc = EXIT_MD_SERVANT_REQUEST_RESET;
goto out;
case SBD_MSG_OFF:
rc = EXIT_MD_SERVANT_REQUEST_SHUTOFF;
goto out;
case SBD_MSG_EXIT:
/* the mailbox is left untouched here; only the parent is told */
- sigqueue(ppid, SIG_EXITREQ, signal_value);
+ sigqueue_zero(ppid, SIG_EXITREQ);
break;
case SBD_MSG_CRASHDUMP:
rc = EXIT_MD_SERVANT_REQUEST_CRASHDUMP;
goto out;
default:
/* FIXME:
An "unknown" message might result
from a partial write.
log it and clear the slot.
*/
cl_log(LOG_ERR, "Unknown message on disk %s",
diskname);
memset(s_mbox, 0, sizeof(*s_mbox));
mbox_write(st, mbox, s_mbox);
break;
}
}
- sigqueue(ppid, SIG_LIVENESS, signal_value);
+ sigqueue_zero(ppid, SIG_LIVENESS);
/* latency covers the whole pass, including the sleep(timeout_loop) */
t1 = time(NULL);
- latency = t1 - t0;
+ latency = seconds_diff_time_t(t1, t0);
if (timeout_watchdog_warn && (latency > timeout_watchdog_warn)) {
cl_log(LOG_WARNING,
"Latency: %ds exceeded watchdog warning timeout %ds on disk %s",
- (int)latency, (int)timeout_watchdog_warn,
+ latency, timeout_watchdog_warn,
diskname);
} else if (debug) {
- DBGLOG(LOG_DEBUG, "Latency: %ds on disk %s", (int)latency,
+ DBGLOG(LOG_DEBUG, "Latency: %ds on disk %s", latency,
diskname);
}
}
out:
free(s_node);
free(s_mbox);
free(s_header);
close_device(st);
exit(rc);
}
diff --git a/src/sbd-watchdog.c b/src/sbd-watchdog.c
new file mode 100644
index 0000000..b0dd5d6
--- /dev/null
+++ b/src/sbd-watchdog.c
@@ -0,0 +1,601 @@
+/*
+ * Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "sbd.h"
+#ifdef __GLIBC__
+#include <sys/sysmacros.h>
+#endif
+#include <dirent.h>
+#include <limits.h>
+
+/* possibly tunable defaults regarding watchdog operation
+ are found in sbd-common.c
+ */
+
+/* Global, non-tunable variables: */
+int watchdogfd = -1;
+char *watchdogdev = NULL;
+bool watchdogdev_is_default = false;
+bool do_calculate_timeout_watchdog_warn = true;
+int timeout_watchdog_warn =
+ calculate_timeout_watchdog_warn(SBD_WATCHDOG_TIMEOUT_DEFAULT);
+
+#define MAX_WATCHDOGS 64
+#define SYS_CLASS_WATCHDOG "/sys/class/watchdog"
+#define SYS_CHAR_DEV_DIR "/sys/dev/char"
+#define WATCHDOG_NODEDIR "/dev/"
+
+/* Tell whether a device number belongs to a known watchdog device.
+ *
+ * The list of known device numbers is built on the first call and cached in
+ * static storage: 10:130 is always part of it, further numbers are read from
+ * the "dev" attribute of the symlinked entries below SYS_CLASS_WATCHDOG.
+ * At most MAX_WATCHDOGS numbers are remembered.
+ */
+static bool
+is_watchdog(dev_t device)
+{
+    static int known_count = 0;
+    static dev_t known_devs[MAX_WATCHDOGS];
+    int idx;
+
+    if (known_count == 0) {
+        DIR *sysdir;
+        struct dirent *de;
+
+        /* always accepted, even without sysfs */
+        known_devs[0] = makedev(10,130);
+        known_count = 1;
+
+        sysdir = opendir(SYS_CLASS_WATCHDOG);
+        if (sysdir != NULL) {
+            for (de = readdir(sysdir); de != NULL; de = readdir(sysdir)) {
+                char dev_attr[NAME_MAX+sizeof(SYS_CLASS_WATCHDOG)+5];
+                FILE *fp;
+                int dev_major, dev_minor;
+
+                if (de->d_type != DT_LNK) {
+                    continue;
+                }
+
+                snprintf(dev_attr, sizeof(dev_attr),
+                         SYS_CLASS_WATCHDOG "/%s/dev", de->d_name);
+                fp = fopen(dev_attr, "r");
+                if (fp == NULL) {
+                    continue;
+                }
+
+                if (fscanf(fp, "%d:%d", &dev_major, &dev_minor) == 2) {
+                    known_devs[known_count] = makedev(dev_major, dev_minor);
+                    known_count++;
+                }
+                fclose(fp);
+
+                /* table full - ignore any remaining entries */
+                if (known_count == MAX_WATCHDOGS) {
+                    break;
+                }
+            }
+            closedir(sysdir);
+        }
+    }
+
+    idx = 0;
+    while (idx < known_count) {
+        if (known_devs[idx] == device) {
+            return true;
+        }
+        idx++;
+    }
+    return false;
+}
+
+/* Program the timeout (in seconds) of the watchdog behind wdfd using the
+ * WDIOC_SETTIMEOUT ioctl. Returns 0 on success; on failure the problem is
+ * logged and -1 is returned.
+ */
+static int
+watchdog_init_interval_fd(int wdfd, int timeout)
+{
+    int rc = ioctl(wdfd, WDIOC_SETTIMEOUT, &timeout);
+
+    if (rc >= 0) {
+        return 0;
+    }
+
+    cl_perror( "WDIOC_SETTIMEOUT"
+               ": Failed to set watchdog timer to %d seconds.",
+               timeout);
+    cl_log(LOG_CRIT, "Please validate your watchdog configuration!");
+    cl_log(LOG_CRIT, "Choose a different watchdog driver or specify "
+           "-T to skip this if you are completely sure.");
+    return -1;
+}
+
+/* Apply timeout_watchdog to the watchdog that is currently open.
+ * Nothing is done (and 0 returned) if no watchdog is open or if
+ * watchdog_set_timeout is 0. Returns -1 if setting the timeout failed.
+ */
+int
+watchdog_init_interval(void)
+{
+    if (watchdogfd >= 0) {
+        if (watchdog_set_timeout == 0) {
+            cl_log(LOG_INFO,
+                   "NOT setting watchdog timeout on explicit user request!");
+            return 0;
+        }
+
+        if (watchdog_init_interval_fd(watchdogfd, timeout_watchdog) < 0) {
+            return -1;
+        }
+        cl_log(LOG_INFO, "Set watchdog timeout to %d seconds.", timeout_watchdog);
+    }
+    return 0;
+}
+
+/* Tickle the watchdog behind wdfd by writing a single byte to it.
+ * wddev is only used for the error message. Returns 0 on success, -1 if
+ * the byte could not be written.
+ */
+static int
+watchdog_tickle_fd(int wdfd, char *wddev)
+{
+    ssize_t written = write(wdfd, "", 1);
+
+    if (written == 1) {
+        return 0;
+    }
+    cl_perror("Watchdog write failure: %s!", wddev);
+    return -1;
+}
+
+/* Tickle the global watchdog if one is open.
+ * Returns 0 when no watchdog is open, otherwise the result of the write.
+ */
+int
+watchdog_tickle(void)
+{
+    if (watchdogfd < 0) {
+        return 0;
+    }
+    return watchdog_tickle_fd(watchdogfd, watchdogdev);
+}
+
+/* Open the watchdog device wddev for writing, optionally set its timeout
+ * and tickle it once.
+ *
+ * timeout: timeout in seconds; a negative value leaves the timeout of the
+ *          device untouched.
+ *
+ * Returns the open file descriptor, or -1 if the device could not be
+ * opened, the timeout could not be set or the first tickle failed (the
+ * descriptor is closed again in the latter two cases).
+ */
+static int
+watchdog_init_fd(char *wddev, int timeout)
+{
+    int wdfd;
+
+    wdfd = open(wddev, O_WRONLY);
+    if (wdfd >= 0) {
+        if (((timeout >= 0) &&
+             (watchdog_init_interval_fd(wdfd, timeout) < 0)) ||
+            (watchdog_tickle_fd(wdfd, wddev) < 0)) {
+            close(wdfd);
+            return -1;
+        }
+    } else {
+        /* stat() and the sysfs scan done by is_watchdog() may overwrite
+         * errno; remember why open() failed so that the message below
+         * (cl_perror presumably reports errno - TODO confirm) shows the
+         * real cause, e.g. a busy device.
+         */
+        int open_errno = errno;
+        struct stat statbuf;
+        bool looks_like_watchdog =
+            !stat(wddev, &statbuf) && S_ISCHR(statbuf.st_mode) &&
+            is_watchdog(statbuf.st_rdev);
+
+        errno = open_errno;
+        if (looks_like_watchdog) {
+            cl_perror("Cannot open watchdog device '%s'", wddev);
+        } else {
+            cl_perror("Seems as if '%s' isn't a valid watchdog-device", wddev);
+        }
+        return -1;
+    }
+    return wdfd;
+}
+
+/* Open the configured watchdog device (watchdogdev) unless one is already
+ * open or no device is configured.
+ * The timeout is set to timeout_watchdog unless watchdog_set_timeout is 0.
+ * Returns 0 on success or when there was nothing to do, -1 if the device
+ * could not be initialized.
+ */
+int
+watchdog_init(void)
+{
+    int timeout;
+
+    if (watchdogfd >= 0 || watchdogdev == NULL) {
+        return 0;
+    }
+
+    timeout = timeout_watchdog;
+    if (watchdog_set_timeout == 0) {
+        cl_log(LOG_INFO,
+               "NOT setting watchdog timeout on explicit user request!");
+        /* negative timeout: open and tickle only */
+        timeout = -1;
+    }
+
+    watchdogfd = watchdog_init_fd(watchdogdev, timeout);
+    if (watchdogfd < 0) {
+        return -1;
+    }
+
+    cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev);
+    if (watchdog_set_timeout) {
+        cl_log(LOG_INFO, "Set watchdog timeout to %d seconds.",
+               timeout_watchdog);
+    }
+    return 0;
+}
+
+/* Close the watchdog file descriptor wdfd; wddev is used for messages only.
+ * With disarm set the watchdog is disabled first: via WDIOS_DISABLECARD and
+ * additionally by writing the magic character 'V', which is retried until
+ * the write succeeds.
+ */
+static void
+watchdog_close_fd(int wdfd, char *wddev, bool disarm)
+{
+    if (disarm) {
+        int flags = WDIOS_DISABLECARD;
+
+        /* Explicitly disarm it */
+        if (ioctl(wdfd, WDIOC_SETOPTIONS, &flags) < 0) {
+            cl_perror("Failed to disable hardware watchdog %s", wddev);
+        }
+
+        /* To be sure, use magic close logic, too */
+        while (write(wdfd, "V", 1) <= 0) {
+            cl_perror("Cannot disable watchdog device %s", wddev);
+        }
+    }
+
+    if (close(wdfd) < 0) {
+        cl_perror("Watchdog close(%d) failed", wdfd);
+    }
+}
+
+/* Close the global watchdog (disarming it first if requested) and mark it
+ * as closed. Does nothing if no watchdog is open.
+ */
+void
+watchdog_close(bool disarm)
+{
+    if (watchdogfd >= 0) {
+        watchdog_close_fd(watchdogfd, watchdogdev, disarm);
+        watchdogfd = -1;
+    }
+}
+/* One entry of the list of discovered watchdog device nodes.
+ * Entries are created by watchdog_populate_list(); the busy_* fields are
+ * filled in later by watchdog_checkbusy().
+ */
+struct watchdog_list_item {
+ dev_t dev; /* device number (st_rdev) of the node */
+ char *dev_node; /* path of the node, or of a symlink pointing to it */
+ char *dev_ident; /* identity reported by WDIOC_GETSUPPORT, NULL if unknown */
+ char *dev_driver; /* driver name, NULL if it could not be determined */
+ pid_t busy_pid; /* pid of a process holding the node open, 0 if none found */
+ char *busy_name; /* name of that process, NULL if unknown */
+ struct watchdog_list_item *next;
+};
+/* Temporary record of a symlink found in WATCHDOG_NODEDIR, kept while
+ * watchdog_populate_list() runs.
+ */
+struct link_list_item {
+ char *dev_node; /* path the link points to, made absolute below WATCHDOG_NODEDIR */
+ char *link_name; /* path of the symlink itself */
+ struct link_list_item *next;
+};
+
+static struct watchdog_list_item *watchdog_list = NULL;
+static int watchdog_list_items = 0;
+/* Build the global watchdog_list from the nodes found in WATCHDOG_NODEDIR.
+ *
+ * Works in two passes over the directory: the first memorizes symlinks
+ * whose target stays below WATCHDOG_NODEDIR, the second collects character
+ * devices accepted by is_watchdog(). Every such device is opened once to
+ * query its identity and disarmed again; its driver name is taken from
+ * sysfs. For each memorized link pointing at the device an extra list entry
+ * named after the link is added.
+ *
+ * Does nothing if the list is already populated. New entries are pushed to
+ * the front of the list and watchdog_list_items counts them.
+ */
+static void
+watchdog_populate_list(void)
+{
+ struct dirent *entry;
+ char entry_name[sizeof(WATCHDOG_NODEDIR)+NAME_MAX];
+ DIR *dp;
+ char buf[NAME_MAX+sizeof(WATCHDOG_NODEDIR)] = "";
+ struct link_list_item *link_list = NULL;
+
+ if (watchdog_list != NULL) {
+ return;
+ }
+
+ /* search for watchdog nodes in /dev */
+ dp = opendir(WATCHDOG_NODEDIR);
+ if (dp) {
+ /* first go for links and memorize them */
+ while ((entry = readdir(dp))) {
+ if (entry->d_type == DT_LNK) {
+ int len;
+
+ snprintf(entry_name, sizeof(entry_name),
+ WATCHDOG_NODEDIR "%s", entry->d_name);
+
+ /* realpath(entry_name, buf) unfortunately does a stat on
+ * target so we can't really use it to check if links stay
+ * within /dev without triggering e.g. AVC-logs (with
+ * SELinux policy that just allows stat within /dev).
+ * Without canonicalization that doesn't actually touch the
+ * filesystem easily available introduce some limitations
+ * for simplicity:
+ * - just simple path without '..'
+ * - just one level of symlinks (avoid e.g. loop-checking)
+ */
+ len = readlink(entry_name, buf, sizeof(buf) - 1);
+ if ((len < 1) ||
+ (len > sizeof(buf) - sizeof(WATCHDOG_NODEDIR) -1 - 1)) { /* keep room for the prefix added below */
+ continue;
+ }
+ buf[len] = '\0';
+ if (buf[0] != '/') { /* relative target: prepend WATCHDOG_NODEDIR */
+ memmove(&buf[sizeof(WATCHDOG_NODEDIR)-1], buf, len+1);
+ memcpy(buf, WATCHDOG_NODEDIR, sizeof(WATCHDOG_NODEDIR)-1);
+ len += sizeof(WATCHDOG_NODEDIR)-1;
+ }
+ if (strstr(buf, "/../") ||
+ strncmp(WATCHDOG_NODEDIR, buf,
+ sizeof(WATCHDOG_NODEDIR)-1)) { /* target contains ".." or leaves WATCHDOG_NODEDIR */
+ continue;
+ } else {
+ /* just memorize to avoid statting the target - SELinux */
+ struct link_list_item *lli =
+ calloc(1, sizeof(struct link_list_item));
+
+ if (lli == NULL) {
+ break;
+ }
+ lli->dev_node = strdup(buf);
+ lli->link_name = strdup(entry_name);
+ if ((lli->dev_node == NULL) || (lli->link_name == NULL)) {
+ free(lli->dev_node);
+ free(lli->link_name);
+ free(lli);
+ break;
+ }
+ lli->next = link_list;
+ link_list = lli;
+ }
+ }
+ }
+
+ rewinddir(dp); /* second pass: the character devices themselves */
+
+ while ((entry = readdir(dp))) {
+ if (entry->d_type == DT_CHR) {
+ struct stat statbuf;
+
+ snprintf(entry_name, sizeof(entry_name),
+ WATCHDOG_NODEDIR "%s", entry->d_name);
+ if(!stat(entry_name, &statbuf) && S_ISCHR(statbuf.st_mode) &&
+ is_watchdog(statbuf.st_rdev)) {
+
+ int wdfd;
+ struct watchdog_list_item *wdg =
+ calloc(1, sizeof(struct watchdog_list_item));
+ int len;
+ struct link_list_item *tmp_list = NULL;
+
+ if (wdg == NULL) {
+ break;
+ }
+
+ wdg->dev = statbuf.st_rdev;
+ wdg->dev_node = strdup(entry_name);
+ if (wdg->dev_node == NULL) {
+ free(wdg);
+ break;
+ }
+ wdg->next = watchdog_list;
+ watchdog_list = wdg;
+ watchdog_list_items++;
+
+ wdfd = watchdog_init_fd(entry_name, -1); /* -1: open and tickle without touching the timeout */
+ if (wdfd >= 0) {
+ struct watchdog_info ident;
+
+ ident.identity[0] = '\0'; /* stays empty if the ioctl below fails */
+ ioctl(wdfd, WDIOC_GETSUPPORT, &ident);
+ watchdog_close_fd(wdfd, entry_name, true); /* disarm again right away */
+ if (ident.identity[0]) {
+ wdg->dev_ident = strdup((char *) ident.identity);
+ }
+ }
+
+ snprintf(entry_name, sizeof(entry_name),
+ SYS_CHAR_DEV_DIR "/%d:%d/device/driver",
+ major(wdg->dev), minor(wdg->dev)); /* entry_name is reused for the sysfs path from here on */
+ len = readlink(entry_name, buf, sizeof(buf) - 1);
+ if (len > 0) {
+ buf[len] = '\0';
+ wdg->dev_driver = strdup(basename(buf));
+ } else if ((wdg->dev_ident) &&
+ (strcmp(wdg->dev_ident,
+ "Software Watchdog") == 0)) { /* no driver link: recognize softdog by its identity */
+ wdg->dev_driver = strdup("softdog");
+ }
+
+ /* create dupes if we have memorized links
+ * to this node
+ */
+ for (tmp_list = link_list; tmp_list;
+ tmp_list = tmp_list->next) {
+ if (!strcmp(tmp_list->dev_node,
+ wdg->dev_node)) {
+ struct watchdog_list_item *dupe_wdg =
+ calloc(1, sizeof(struct watchdog_list_item));
+
+ if (dupe_wdg == NULL) {
+ break;
+ }
+ /* as long as we never purge watchdog_list
+ * there is no need to dupe strings
+ * (dev_ident and dev_driver are shared with
+ * the original entry, only dev_node is new)
+ */
+ *dupe_wdg = *wdg;
+ dupe_wdg->dev_node = strdup(tmp_list->link_name);
+ if (dupe_wdg->dev_node == NULL) {
+ free(dupe_wdg);
+ break;
+ }
+ dupe_wdg->next = watchdog_list;
+ watchdog_list = dupe_wdg;
+ watchdog_list_items++;
+ }
+ /* for performance reasons we could remove
+ * the link_list entry
+ */
+ }
+ }
+ }
+ }
+
+ closedir(dp);
+ }
+
+ /* cleanup link list */
+ while (link_list) {
+ struct link_list_item *tmp_list = link_list;
+
+ link_list = link_list->next;
+ free(tmp_list->dev_node);
+ free(tmp_list->link_name);
+ free(tmp_list);
+ }
+}
+
+/* Find out which processes hold one of the listed watchdog nodes open.
+ *
+ * Walks /proc/<pid>/fd of every numeric entry in /proc and compares the
+ * target of each descriptor link with the dev_node of every entry in
+ * watchdog_list. For a match busy_pid is set to the pid and busy_name to
+ * the process name read from /proc/<pid>/status (NULL if that cannot be
+ * read). If several descriptors match the same entry the last one wins.
+ *
+ * Silently does nothing if /proc cannot be opened.
+ */
+static void
+watchdog_checkbusy()
+{
+    DIR *dproc;
+    struct dirent *entry;
+
+    dproc = opendir("/proc");
+    if (!dproc) {
+        /* no proc directory to search through */
+        return;
+    }
+
+    while ((entry = readdir(dproc)) != NULL) {
+        pid_t local_pid;
+        char *leftover;
+        DIR *dpid;
+        struct dirent *fd_entry;
+        char procpath[NAME_MAX+10] = { 0 };
+
+        if (entry->d_name[0] == '.') {
+            continue;
+        }
+
+        /* only purely numeric names are process directories */
+        local_pid = strtol(entry->d_name, &leftover, 10);
+        if (leftover[0] != '\0')
+            continue;
+
+        snprintf(procpath, sizeof(procpath), "/proc/%s/fd", entry->d_name);
+        dpid = opendir(procpath);
+        if (!dpid) {
+            /* silently continue - might be just a race */
+            continue;
+        }
+        while ((fd_entry = readdir(dpid)) != NULL) {
+            struct watchdog_list_item *wdg;
+            char entry_name[sizeof(procpath)+NAME_MAX+1] = { 0 };
+            char buf[NAME_MAX+1] = { 0 };
+            int len;
+
+            if (fd_entry->d_type != DT_LNK) {
+                continue;
+            }
+            snprintf(entry_name, sizeof(entry_name),
+                     "%s/%s", procpath, fd_entry->d_name);
+            len = readlink(entry_name, buf, sizeof(buf) - 1);
+            if (len < 1) {
+                continue;
+            }
+            buf[len] = '\0';
+            for (wdg = watchdog_list; wdg != NULL; wdg = wdg->next) {
+                if (!strcmp(buf, wdg->dev_node)) {
+                    char name[16];
+                    /* Separate buffer: procpath must keep pointing at the
+                     * fd directory, otherwise the remaining descriptors of
+                     * this process could no longer be resolved.
+                     */
+                    char statuspath[NAME_MAX+10];
+                    FILE *file;
+
+                    wdg->busy_pid = local_pid;
+                    /* drop a name recorded for an earlier match so it
+                     * neither leaks nor gets attributed to the new pid
+                     */
+                    free(wdg->busy_name);
+                    wdg->busy_name = NULL;
+
+                    snprintf(statuspath, sizeof(statuspath),
+                             "/proc/%d/status", local_pid);
+                    file = fopen(statuspath, "r");
+                    if (file) {
+                        if (fscanf(file, "Name:\t%15[a-zA-Z0-9 _-]",
+                                   name) == 1) {
+                            wdg->busy_name = strdup(name);
+                        }
+                        fclose(file);
+                    }
+                }
+            }
+        }
+        closedir(dpid);
+    }
+
+    closedir(dproc);
+
+    return;
+}
+
+/* Print the discovered watchdog devices to stdout: node, identity (or the
+ * process holding the device) and driver, plus a caution for softdog.
+ * The list is populated and checked for busy devices first.
+ * Always returns 0.
+ */
+int watchdog_info(void)
+{
+    struct watchdog_list_item *wdg = NULL;
+    int ordinal = 0;
+
+    watchdog_populate_list();
+    watchdog_checkbusy();
+    printf("\nDiscovered %d watchdog devices:\n", watchdog_list_items);
+
+    for (wdg = watchdog_list; wdg != NULL; wdg = wdg->next) {
+        const char *driver = wdg->dev_driver?wdg->dev_driver:"<unknown>";
+
+        ordinal++;
+        if (wdg->busy_pid) {
+            /* the identity cannot be queried while somebody else holds it */
+            const char *holder = wdg->busy_name?wdg->busy_name:"<unknown>";
+
+            printf("\n[%d] %s\nIdentity: Busy: PID %d (%s)\nDriver: %s\n",
+                   ordinal, wdg->dev_node,
+                   wdg->busy_pid,
+                   holder,
+                   driver);
+        } else {
+            const char *ident = wdg->dev_ident?wdg->dev_ident:
+                "Error: device hogged via alias major/minor?";
+
+            printf("\n[%d] %s\nIdentity: %s\nDriver: %s\n",
+                   ordinal, wdg->dev_node,
+                   ident,
+                   driver);
+        }
+
+        if ((wdg->dev_driver) && (strcmp(wdg->dev_driver, "softdog") == 0)) {
+            printf("CAUTION: Not recommended for use with sbd.\n");
+        }
+    }
+
+    return 0;
+}
+
+/* Interactive watchdog self test: arm the watchdog and then deliberately
+ * stop tickling it, so that the system is expected to be reset.
+ *
+ * Returns 0 if the test was not run (watchdog disabled or user declined).
+ * Returns -1 if several watchdog devices exist while the default device is
+ * in use, if the watchdog could not be initialized, or if the function
+ * gets to its end - which means the expected reset did not happen.
+ */
+int watchdog_test(void)
+{
+    int remaining;
+
+    if (!watchdog_use || (watchdog_set_timeout == 0)) {
+        printf("\nWatchdog is disabled - aborting test!!!\n");
+        return 0;
+    }
+
+    /* with the default device, refuse to guess between several watchdogs */
+    if (watchdogdev_is_default) {
+        watchdog_populate_list();
+        if (watchdog_list_items > 1) {
+            printf("\nError: Multiple watchdog devices discovered."
+                   "\n Use -w <watchdog> or SBD_WATCHDOG_DEV to specify"
+                   "\n which device to reset the system with\n");
+            watchdog_info();
+            return -1;
+        }
+    }
+
+    /* ask for confirmation only when run from a terminal */
+    if ((isatty(fileno(stdin)))) {
+        char answer[16];
+
+        printf("\n");
+        printf(
+            "WARNING: This operation is expected to force-reboot this system\n"
+            " without following any shutdown procedures.\n\n"
+            "Proceed? [NO/Proceed] ");
+
+        if ((fgets(answer, 16, stdin) == NULL) ||
+            strcmp(answer, "Proceed\n")) {
+            printf("\nAborting watchdog test!!!\n");
+            return 0;
+        }
+        printf("\n");
+    }
+
+    printf("Initializing %s with a reset countdown of %d seconds ...\n",
+           watchdogdev, (int) timeout_watchdog);
+    if ((watchdog_init() < 0) || (watchdog_init_interval() < 0)) {
+        printf("Failed to initialize watchdog!!!\n");
+        watchdog_info();
+        return -1;
+    }
+
+    printf("\n");
+    printf(
+        "NOTICE: The watchdog device is expected to reset the system\n"
+        " in %d seconds. If system remains active beyond that time,\n"
+        " watchdog may not be functional.\n\n", timeout_watchdog);
+
+    /* from here on the watchdog is no longer tickled */
+    remaining = timeout_watchdog;
+    while (remaining > 1) {
+        printf("Reset countdown ... %d seconds\n", remaining);
+        sleep(1);
+        remaining--;
+    }
+    for (remaining = 2; remaining > 0; remaining--) {
+        printf("System expected to reset any moment ...\n");
+        sleep(1);
+    }
+    for (remaining = 5; remaining > 0; remaining--) {
+        printf("System should have reset ...\n");
+        sleep(1);
+    }
+    printf("Error: The watchdog device has failed to reboot the system,\n"
+           " and it may not be suitable for usage with sbd.\n");
+
+    /* test should trigger a reboot thus returning is actually bad */
+    return -1;
+}
diff --git a/src/sbd.h b/src/sbd.h
index bbdc6f1..ffeace9 100644
--- a/src/sbd.h
+++ b/src/sbd.h
@@ -1,219 +1,223 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <arpa/inet.h>
#include <asm/unistd.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <libaio.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/watchdog.h>
#include <malloc.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/ptrace.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/wait.h>
#include <syslog.h>
#include <time.h>
#include <unistd.h>
#include <uuid/uuid.h>
#include <qb/qblog.h>
#include <crm_config.h>
#include <config.h>
/* signals reserved for multi-disk sbd */
#define SIG_LIVENESS (SIGRTMIN + 1) /* report liveness of the disk */
#define SIG_EXITREQ (SIGRTMIN + 2) /* exit request to inquisitor */
#define SIG_TEST (SIGRTMIN + 3) /* trigger self test */
#define SIG_RESTART (SIGRTMIN + 4) /* trigger restart of all failed disk */
#define SIG_PCMK_UNHEALTHY (SIGRTMIN + 5)
/* FIXME: should add dynamic check of SIG_XX >= SIGRTMAX */
/* exit status for disk-servant */
#define EXIT_MD_SERVANT_IO_FAIL 20
#define EXIT_MD_SERVANT_REQUEST_RESET 21
#define EXIT_MD_SERVANT_REQUEST_SHUTOFF 22
#define EXIT_MD_SERVANT_REQUEST_CRASHDUMP 23
/* exit status for pcmk-servant */
#define EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN 30
#define HOG_CHAR 0xff
#define SECTOR_NAME_MAX 63
/* Sector data types */
struct sector_header_s {
char magic[8];
unsigned char version;
unsigned char slots;
/* Caveat: stored in network byte-order */
uint32_t sector_size;
uint32_t timeout_watchdog;
uint32_t timeout_allocate;
uint32_t timeout_loop;
uint32_t timeout_msgwait;
/* Minor version for extensions to the core data set:
* compatible and optional values. */
unsigned char minor_version;
uuid_t uuid; /* 16 bytes */
};
struct sector_mbox_s {
signed char cmd;
char from[SECTOR_NAME_MAX+1];
};
struct sector_node_s {
/* slots will be created with in_use == 0 */
char in_use;
char name[SECTOR_NAME_MAX+1];
};
struct servants_list_item {
const char* devname;
pid_t pid;
int restarts;
int restart_blocked;
int outdated;
int first_start;
struct timespec t_last, t_started;
struct servants_list_item *next;
};
struct sbd_context {
int devfd;
io_context_t ioctx;
struct iocb io;
void *buffer;
};
enum pcmk_health
{
pcmk_health_unknown,
pcmk_health_pending,
pcmk_health_transient,
pcmk_health_unclean,
pcmk_health_shutdown,
pcmk_health_online,
pcmk_health_noquorum,
};
void usage(void);
int watchdog_init_interval(void);
int watchdog_tickle(void);
int watchdog_init(void);
void sysrq_init(void);
void watchdog_close(bool disarm);
int watchdog_info(void);
int watchdog_test(void);
void sysrq_trigger(char t);
void do_crashdump(void);
void do_reset(void);
void do_off(void);
void do_timeout_action(void);
pid_t make_daemon(void);
void maximize_priority(void);
void sbd_get_uname(void);
void sbd_set_format_string(int method, const char *daemon);
+int sigqueue_zero(pid_t pid, int sig);
void notify_parent(void);
/* Tunable defaults: */
-extern unsigned long timeout_watchdog;
-extern unsigned long timeout_watchdog_warn;
-extern bool do_calculate_timeout_watchdog_warn;
-extern unsigned long timeout_watchdog_crashdump;
+extern int timeout_watchdog;
+extern int timeout_watchdog_warn;
+extern bool do_calculate_timeout_watchdog_warn;
+extern int timeout_watchdog_crashdump;
extern int timeout_allocate;
extern int timeout_loop;
extern int timeout_msgwait;
extern int timeout_io;
extern int timeout_startup;
extern int watchdog_use;
extern int watchdog_set_timeout;
extern int skip_rt;
extern int debug;
extern int debug_mode;
extern char *watchdogdev;
extern bool watchdogdev_is_default;
extern char* local_uname;
extern bool do_flush;
extern char timeout_sysrq_char;
extern bool move_to_root_cgroup;
extern bool enforce_moving_to_root_cgroup;
extern bool sync_resource_startup;
/* Global, non-tunable variables: */
extern int sector_size;
extern int watchdogfd;
extern const char* cmdname;
typedef int (*functionp_t)(const char* devname, int mode, const void* argp);
int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp);
#if SUPPORT_SHARED_DISK
void open_any_device(struct servants_list_item *servants);
int init_devices(struct servants_list_item *servants);
int allocate_slots(const char *name, struct servants_list_item *servants);
int list_slots(struct servants_list_item *servants);
int ping_via_slots(const char *name, struct servants_list_item *servants);
int dump_headers(struct servants_list_item *servants);
unsigned long get_first_msgwait(struct servants_list_item *servants);
int messenger(const char *name, const char *msg, struct servants_list_item *servants);
int servant_md(const char *diskname, int mode, const void* argp);
#endif
int servant_pcmk(const char *diskname, int mode, const void* argp);
int servant_cluster(const char *diskname, int mode, const void* argp);
struct servants_list_item *lookup_servant_by_dev(const char *devname);
struct servants_list_item *lookup_servant_by_pid(pid_t pid);
int init_set_proc_title(int argc, char *argv[], char *envp[]);
void set_proc_title(const char *fmt,...);
#define cl_log(level, fmt, args...) qb_log_from_external_source( __func__, __FILE__, fmt, level, __LINE__, 0, ##args)
# define cl_perror(fmt, args...) do { \
const char *err = strerror(errno); \
cl_log(LOG_ERR, fmt ": %s (%d)", ##args, err, errno); \
} while(0)
#define DBGLOG(lvl, fmt, args...) do { \
if (debug > 0) cl_log(lvl, fmt, ##args); \
} while(0)
extern int servant_health;
void set_servant_health(enum pcmk_health state, int level, char const *format, ...) __attribute__ ((__format__ (__printf__, 3, 4)));
bool sbd_is_disk(struct servants_list_item *servant);
bool sbd_is_pcmk(struct servants_list_item *servant);
bool sbd_is_cluster(struct servants_list_item *servant);
+/*
+ * Derive the watchdog warning threshold from a watchdog timeout:
+ * 2 for timeouts below 5, otherwise three fifths of the timeout.
+ * For values of INT_MAX / 3 and above the division is done first so that
+ * the multiplication by 3 cannot overflow an int.
+ *
+ * The argument is fully parenthesized, so any expression may be passed,
+ * but it can be evaluated more than once - avoid side effects.
+ */
#define calculate_timeout_watchdog_warn(timeout) \
-    (timeout < 5 ? 2 : \
-    (timeout < (ULONG_MAX / 3) ? \
-    (((unsigned long) timeout) * 3 / 5) : (((unsigned long) timeout) / 5 * 3)))
+    ((timeout) < 5 ? 2 : \
+    ((timeout) < (INT_MAX / 3) ? \
+    (((int) (timeout)) * 3 / 5) : (((int) (timeout)) / 5 * 3)))
+
+int seconds_diff_time_t(time_t a, time_t b);
+int seconds_diff_timespec(struct timespec *a, struct timespec *b);
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Mon, Apr 21, 7:07 PM (16 h, 46 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1665363
Default Alt Text
(162 KB)
Attached To
Mode
rS SBD
Attached
Detach File
Event Timeline
Log In to Comment