Page MenuHomeClusterLabs Projects

No OneTemporary

diff --git a/src/sbd-cluster.c b/src/sbd-cluster.c
index b6c5512..31fd5da 100644
--- a/src/sbd-cluster.c
+++ b/src/sbd-cluster.c
@@ -1,770 +1,801 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* Based on crm_mon.c, which was:
* Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <config.h>
#include <crm_config.h>
#include <crm/cluster.h>
#include <crm/common/mainloop.h>
#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
#include <glib-unix.h>
+// available since glib 2.58
+#ifndef G_SOURCE_FUNC
+#define G_SOURCE_FUNC(f) ((GSourceFunc) (void (*)(void)) (f))
+#endif
+// available since glib 2.32
+#ifndef G_SOURCE_REMOVE
+#define G_SOURCE_REMOVE FALSE
+#endif
+// available since glib 2.32
+#ifndef G_SOURCE_CONTINUE
+#define G_SOURCE_CONTINUE TRUE
+#endif
#endif
#include "sbd.h"
//undef SUPPORT_PLUGIN
//define SUPPORT_PLUGIN 1
/* binary for pacemaker-remote has changed with pacemaker 2 */
#ifdef CRM_SCORE_INFINITY
#define PACEMAKER_REMOTE_BINARY "pacemaker-remoted"
#else
#define PACEMAKER_REMOTE_BINARY "pacemaker_remoted"
#endif
static bool remote_node = false;
static pid_t remoted_pid = 0;
static int reconnect_msec = 1000;
static GMainLoop *mainloop = NULL;
static guint notify_timer = 0;
static crm_cluster_t cluster;
+static void clean_up(int rc);
static gboolean sbd_remote_check(gpointer user_data);
static long unsigned int find_pacemaker_remote(void);
static void sbd_membership_destroy(gpointer user_data);
#if SUPPORT_PLUGIN
static void
sbd_plugin_membership_dispatch(cpg_handle_t handle,
const struct cpg_name *groupName,
uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
{
if(msg_len > 0) {
set_servant_health(pcmk_health_online, LOG_INFO,
"Connected to %s", name_for_cluster_type(get_cluster_type()));
} else {
set_servant_health(pcmk_health_unclean, LOG_WARNING,
"Broken %s message", name_for_cluster_type(get_cluster_type()));
}
notify_parent();
return;
}
#endif
#if SUPPORT_COROSYNC
#if CHECK_VOTEQUORUM_HANDLE
#include <corosync/votequorum.h>
static votequorum_handle_t votequorum_handle = 0;
#endif
#if CHECK_TWO_NODE
static bool two_node = false;
#endif
static bool ever_seen_both = false;
static int cpg_membership_entries = -1;
#if CHECK_QDEVICE_SYNC_TIMEOUT
#include <corosync/votequorum.h>
static bool using_qdevice = false;
static uint32_t qdevice_sync_timeout = /* in seconds */
VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
#endif
#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
#include <corosync/cmap.h>
static cmap_handle_t cmap_handle = 0;
static cmap_track_handle_t track_handle = 0;
static GSource *cmap_source = NULL;
#endif
void
sbd_cpg_membership_health_update()
{
if(cpg_membership_entries > 0) {
#if CHECK_TWO_NODE
bool quorum_is_suspect_two_node =
(two_node && ever_seen_both && cpg_membership_entries == 1);
#endif
#if CHECK_QDEVICE_SYNC_TIMEOUT
bool quorum_is_suspect_qdevice_timing =
using_qdevice && (qdevice_sync_timeout > timeout_watchdog);
#endif
do {
#if CHECK_TWO_NODE
if (quorum_is_suspect_two_node) {
/* Alternative would be asking votequorum for number of votes.
* Using pacemaker's cpg as source for number of active nodes
* avoids binding to an additional library, is definitely
* less code to write and we wouldn't have to combine data
* from 3 sources (cmap, cpg & votequorum) in a potentially
* racy environment.
*/
set_servant_health(pcmk_health_noquorum, LOG_WARNING,
"Connected to %s but requires both nodes present",
name_for_cluster_type(get_cluster_type())
);
break;
}
#endif
#if CHECK_QDEVICE_SYNC_TIMEOUT
if (quorum_is_suspect_qdevice_timing) {
/* We can't really trust quorum info as qdevice-sync_timeout
* makes reaction of quorum too sluggish for our
* watchdog-timeout.
*/
set_servant_health(pcmk_health_noquorum, LOG_WARNING,
"Connected to %s but quorum using qdevice is distrusted "
"for SBD as qdevice-sync_timeout (%ds) > watchdog-timeout "
"(%lus).",
name_for_cluster_type(get_cluster_type()),
qdevice_sync_timeout, timeout_watchdog
);
break;
}
#endif
set_servant_health(pcmk_health_online, LOG_INFO,
"Connected to %s (%u members)%s",
name_for_cluster_type(get_cluster_type()),
cpg_membership_entries,
#if CHECK_QDEVICE_SYNC_TIMEOUT
using_qdevice?" using qdevice for quorum":""
#else
""
#endif
);
} while (false);
if (cpg_membership_entries > 1) {
ever_seen_both = true;
}
} else {
set_servant_health(pcmk_health_unclean, LOG_WARNING,
"Empty %s membership", name_for_cluster_type(get_cluster_type()));
}
}
void
sbd_cpg_membership_dispatch(cpg_handle_t handle,
const struct cpg_name *groupName,
const struct cpg_address *member_list, size_t member_list_entries,
const struct cpg_address *left_list, size_t left_list_entries,
const struct cpg_address *joined_list, size_t joined_list_entries)
{
cpg_membership_entries = member_list_entries;
sbd_cpg_membership_health_update();
notify_parent();
}
#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
static void sbd_cmap_notify_fn(
cmap_handle_t cmap_handle,
cmap_track_handle_t cmap_track_handle,
int32_t event,
const char *key_name,
struct cmap_notify_value new_val,
struct cmap_notify_value old_val,
void *user_data)
{
switch (event) {
case CMAP_TRACK_ADD:
case CMAP_TRACK_MODIFY:
switch (new_val.type) {
case CMAP_VALUETYPE_UINT8:
#if CHECK_TWO_NODE
if (!strcmp(key_name, "quorum.two_node")) {
two_node = *((uint8_t *) new_val.data);
} else {
return;
}
break;
#else
return;
#endif
case CMAP_VALUETYPE_STRING:
#if CHECK_QDEVICE_SYNC_TIMEOUT
if (!strcmp(key_name, "quorum.device.model")) {
using_qdevice =
((new_val.data) && strlen((char *) new_val.data));
} else {
return;
}
break;
#else
return;
#endif
case CMAP_VALUETYPE_UINT32:
#if CHECK_QDEVICE_SYNC_TIMEOUT
if (!strcmp(key_name, "quorum.device.sync_timeout")) {
if (new_val.data) {
qdevice_sync_timeout =
*((uint32_t *) new_val.data) / 1000;
} else {
qdevice_sync_timeout =
VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
}
} else {
return;
}
break;
#else
return;
#endif
default:
return;
}
break;
case CMAP_TRACK_DELETE:
switch (new_val.type) {
case CMAP_VALUETYPE_UINT8:
#if CHECK_TWO_NODE
if (!strcmp(key_name, "quorum.two_node")) {
two_node = false;
} else {
return;
}
break;
#else
return;
#endif
case CMAP_VALUETYPE_STRING:
#if CHECK_QDEVICE_SYNC_TIMEOUT
if (!strcmp(key_name, "quorum.device.model")) {
using_qdevice = false;
} else {
return;
}
break;
#else
return;
#endif
case CMAP_VALUETYPE_UINT32:
#if CHECK_QDEVICE_SYNC_TIMEOUT
if (!strcmp(key_name, "quorum.device.sync_timeout")) {
qdevice_sync_timeout =
VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
} else {
return;
}
break;
#else
return;
#endif
default:
return;
}
break;
default:
return;
}
sbd_cpg_membership_health_update();
notify_parent();
}
static gboolean
-cmap_dispatch_callback (gpointer user_data)
+cmap_dispatch_callback (gint cmap_fd,
+ GIOCondition condition,
+ gpointer user_data)
{
+ /* CMAP connection lost */
+ if (condition & G_IO_HUP) {
+ cl_log(LOG_WARNING, "CMAP service connection lost\n");
+ clean_up(EXIT_CLUSTER_DISCONNECT);
+ /* remove the source from the main loop */
+ return G_SOURCE_REMOVE; /* never reached */
+ }
cmap_dispatch(cmap_handle, CS_DISPATCH_ALL);
- return TRUE;
+ return G_SOURCE_CONTINUE;
}
static void
cmap_destroy(void)
{
if (cmap_source) {
g_source_destroy(cmap_source);
cmap_source = NULL;
}
if (track_handle) {
cmap_track_delete(cmap_handle, track_handle);
track_handle = 0;
}
if (cmap_handle) {
cmap_finalize(cmap_handle);
cmap_handle = 0;
}
}
static gboolean
verify_against_cmap_config(void)
{
#if CHECK_TWO_NODE
uint8_t two_node_u8 = 0;
#endif
#if CHECK_QDEVICE_SYNC_TIMEOUT
char *qdevice_model = NULL;
#endif
int cmap_fd;
if (!track_handle) {
if (cmap_initialize(&cmap_handle) != CS_OK) {
cl_log(LOG_WARNING, "Cannot initialize CMAP service\n");
goto out;
}
#if CHECK_TWO_NODE
if (cmap_track_add(cmap_handle, "quorum.two_node",
CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
cl_log(LOG_WARNING, "Failed adding CMAP tracker for 2Node-mode\n");
goto out;
}
#endif
#if CHECK_QDEVICE_SYNC_TIMEOUT
if (cmap_track_add(cmap_handle, "quorum.device.model",
CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
cl_log(LOG_WARNING, "Failed adding CMAP tracker for qdevice-model\n");
goto out;
}
if (cmap_track_add(cmap_handle, "quorum.device.sync_timeout",
CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
cl_log(LOG_WARNING,
"Failed adding CMAP tracker for qdevice-sync_timeout\n");
goto out;
}
#endif
/* add the tracker to mainloop */
if (cmap_fd_get(cmap_handle, &cmap_fd) != CS_OK) {
cl_log(LOG_WARNING, "Failed to get a file handle for cmap\n");
goto out;
}
if (!(cmap_source = g_unix_fd_source_new (cmap_fd, G_IO_IN))) {
cl_log(LOG_WARNING, "Couldn't create source for cmap\n");
goto out;
}
- g_source_set_callback(cmap_source, cmap_dispatch_callback, NULL, NULL);
+ g_source_set_callback(cmap_source, G_SOURCE_FUNC(cmap_dispatch_callback), NULL, NULL);
g_source_attach(cmap_source, NULL);
}
#if CHECK_TWO_NODE
if (cmap_get_uint8(cmap_handle, "quorum.two_node", &two_node_u8)
== CS_OK) {
cl_log(two_node_u8? LOG_NOTICE : LOG_INFO,
"Corosync is%s in 2Node-mode", two_node_u8?"":" not");
two_node = two_node_u8;
} else {
cl_log(LOG_INFO, "quorum.two_node not present in cmap\n");
}
#endif
#if CHECK_QDEVICE_SYNC_TIMEOUT
if (cmap_get_string(cmap_handle, "quorum.device.model",
&qdevice_model) == CS_OK) {
using_qdevice = qdevice_model && strlen(qdevice_model);
cl_log(using_qdevice? LOG_NOTICE : LOG_INFO,
"Corosync is%s using qdevice", using_qdevice?"":" not");
} else {
cl_log(LOG_INFO, "quorum.device.model not present in cmap\n");
}
if (cmap_get_uint32(cmap_handle, "quorum.device.sync_timeout",
&qdevice_sync_timeout) == CS_OK) {
qdevice_sync_timeout /= 1000;
cl_log(LOG_INFO,
"Corosync is using qdevice-sync_timeout=%ds",
qdevice_sync_timeout);
} else {
cl_log(LOG_INFO,
"quorum.device.sync_timeout not present in cmap\n");
}
#endif
return TRUE;
out:
cmap_destroy();
return FALSE;
}
#endif
#endif
static gboolean
notify_timer_cb(gpointer data)
{
cl_log(LOG_DEBUG, "Refreshing %sstate", remote_node?"remote ":"");
if(remote_node) {
sbd_remote_check(NULL);
return TRUE;
}
switch (get_cluster_type()) {
#if HAVE_DECL_PCMK_CLUSTER_CLASSIC_AIS
case pcmk_cluster_classic_ais:
send_cluster_text(crm_class_quorum, NULL, TRUE, NULL, crm_msg_ais);
break;
#endif
case pcmk_cluster_corosync:
do {
#if SUPPORT_COROSYNC && CHECK_VOTEQUORUM_HANDLE
struct votequorum_info info;
if (votequorum_getinfo(votequorum_handle, 0, &info) != CS_OK) {
votequorum_finalize(votequorum_handle);
if (votequorum_initialize(&votequorum_handle, NULL) != CS_OK) {
votequorum_handle = 0;
break;
}
if (votequorum_getinfo(votequorum_handle, 0, &info) != CS_OK) {
break;
}
}
#endif
notify_parent();
} while (0);
break;
#if HAVE_DECL_PCMK_CLUSTER_CMAN
case pcmk_cluster_cman:
notify_parent();
break;
#endif
default:
break;
}
return TRUE;
}
static void
sbd_membership_connect(void)
{
bool connected = false;
cl_log(LOG_INFO, "Attempting cluster connection");
cluster.destroy = sbd_membership_destroy;
#if SUPPORT_PLUGIN
cluster.cpg.cpg_deliver_fn = sbd_plugin_membership_dispatch;
#endif
#if SUPPORT_COROSYNC
cluster.cpg.cpg_confchg_fn = sbd_cpg_membership_dispatch;
#endif
while(connected == false) {
enum cluster_type_e stack = get_cluster_type();
if(get_cluster_type() == pcmk_cluster_unknown) {
crm_debug("Attempting pacemaker remote connection");
/* Nothing is up, go looking for the pacemaker remote process */
if(find_pacemaker_remote() > 0) {
connected = true;
}
} else {
cl_log(LOG_INFO, "Attempting connection to %s", name_for_cluster_type(stack));
#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
if (verify_against_cmap_config()) {
#endif
if(crm_cluster_connect(&cluster)) {
connected = true;
}
#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
}
#endif
}
if(connected == false) {
cl_log(LOG_INFO, "Failed, retrying in %ds", reconnect_msec / 1000);
sleep(reconnect_msec / 1000);
}
}
set_servant_health(pcmk_health_transient, LOG_INFO, "Connected, waiting for initial membership");
notify_parent();
notify_timer_cb(NULL);
}
static void
sbd_membership_destroy(gpointer user_data)
{
cl_log(LOG_WARNING, "Lost connection to %s", name_for_cluster_type(get_cluster_type()));
if (get_cluster_type() != pcmk_cluster_unknown) {
#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
cmap_destroy();
#endif
}
set_servant_health(pcmk_health_unclean, LOG_ERR, "Cluster connection terminated");
notify_parent();
/* Attempt to reconnect, the watchdog will take the node down if the problem isn't transient */
sbd_membership_connect();
}
/*
* \internal
* \brief Get process ID and name associated with a /proc directory entry
*
* \param[in] entry Directory entry (must be result of readdir() on /proc)
* \param[out] name If not NULL, a char[16] to hold the process name
* \param[out] pid If not NULL, will be set to process ID of entry
*
* \return 0 on success, -1 if entry is not for a process or info not found
*
* \note This should be called only on Linux systems, as not all systems that
* support /proc store process names and IDs in the same way.
* Copied from the Pacemaker implementation.
*/
int
sbd_procfs_process_info(struct dirent *entry, char *name, int *pid)
{
int fd, local_pid;
FILE *file;
struct stat statbuf;
char procpath[128] = { 0 };
/* We're only interested in entries whose name is a PID,
* so skip anything non-numeric or that is too long.
*
* 114 = 128 - strlen("/proc/") - strlen("/status") - 1
*/
local_pid = atoi(entry->d_name);
if ((local_pid <= 0) || (strlen(entry->d_name) > 114)) {
return -1;
}
if (pid) {
*pid = local_pid;
}
/* Get this entry's file information */
strcpy(procpath, "/proc/");
strcat(procpath, entry->d_name);
fd = open(procpath, O_RDONLY);
if (fd < 0 ) {
return -1;
}
if (fstat(fd, &statbuf) < 0) {
close(fd);
return -1;
}
close(fd);
/* We're only interested in subdirectories */
if (!S_ISDIR(statbuf.st_mode)) {
return -1;
}
/* Read the first entry ("Name:") from the process's status file.
* We could handle the valgrind case if we parsed the cmdline file
* instead, but that's more of a pain than it's worth.
*/
if (name != NULL) {
strcat(procpath, "/status");
file = fopen(procpath, "r");
if (!file) {
return -1;
}
if (fscanf(file, "Name:\t%15[a-zA-Z0-9 _-]", name) != 1) {
fclose(file);
return -1;
}
fclose(file);
}
return 0;
}
static gboolean
sbd_remote_check(gpointer user_data)
{
static int have_proc_pid = 0;
int running = 0;
cl_log(LOG_DEBUG, "Checking pacemaker remote connection: %d/%d", have_proc_pid, remoted_pid);
if(have_proc_pid == 0) {
char proc_path[PATH_MAX], exe_path[PATH_MAX];
/* check to make sure pid hasn't been reused by another process */
snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", (long unsigned int)getpid());
have_proc_pid = 1;
if(readlink(proc_path, exe_path, PATH_MAX - 1) < 0) {
have_proc_pid = -1;
}
}
if (remoted_pid <= 0) {
set_servant_health(pcmk_health_transient, LOG_WARNING, "No Pacemaker Remote connection");
goto notify;
} else if (kill(remoted_pid, 0) < 0 && errno == ESRCH) {
/* Not running */
} else if(have_proc_pid == -1) {
running = 1;
cl_log(LOG_DEBUG, "Poccess %ld is active", (long)remoted_pid);
} else {
int rc = 0;
char proc_path[PATH_MAX], exe_path[PATH_MAX];
/* check to make sure pid hasn't been reused by another process */
snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", (long unsigned int)remoted_pid);
rc = readlink(proc_path, exe_path, PATH_MAX - 1);
if (rc < 0) {
crm_perror(LOG_ERR, "Could not read from %s", proc_path);
goto done;
}
exe_path[rc] = 0;
if (strcmp(exe_path, SBINDIR "/" PACEMAKER_REMOTE_BINARY) == 0) {
cl_log(LOG_DEBUG, "Process %s (%ld) is active",
exe_path, (long)remoted_pid);
running = 1;
}
}
done:
if(running) {
set_servant_health(pcmk_health_online, LOG_INFO,
"Connected to Pacemaker Remote %lu", (long unsigned int)remoted_pid);
} else {
set_servant_health(pcmk_health_unclean, LOG_WARNING,
"Connection to Pacemaker Remote %lu lost", (long unsigned int)remoted_pid);
}
notify:
notify_parent();
if(running == 0) {
sbd_membership_connect();
}
return true;
}
static long unsigned int
find_pacemaker_remote(void)
{
DIR *dp;
char entry_name[16];
struct dirent *entry;
dp = opendir("/proc");
if (!dp) {
/* no proc directory to search through */
cl_log(LOG_NOTICE, "Can not read /proc directory to track existing components");
return FALSE;
}
while ((entry = readdir(dp)) != NULL) {
int pid;
if (sbd_procfs_process_info(entry, entry_name, &pid) < 0) {
continue;
}
/* entry_name is truncated to 16 characters including the nul terminator */
cl_log(LOG_DEBUG, "Found %s at %u", entry_name, pid);
if (strncmp(entry_name, PACEMAKER_REMOTE_BINARY, 15) == 0) {
cl_log(LOG_NOTICE, "Found Pacemaker Remote at PID %u", pid);
remoted_pid = pid;
remote_node = true;
break;
}
}
closedir(dp);
return remoted_pid;
}
static void
clean_up(int rc)
{
#if CHECK_VOTEQUORUM_HANDLE
votequorum_finalize(votequorum_handle);
votequorum_handle = 0; /* there isn't really an invalid handle value
* just to be back where we started
*/
#endif
+
+#if SUPPORT_COROSYNC && CHECK_TWO_NODE
+ cmap_destroy();
+#endif
+
+ if (rc >= 0) {
+ exit(rc);
+ }
+
return;
}
static void
cluster_shutdown(int nsig)
{
clean_up(0);
}
int
servant_cluster(const char *diskname, int mode, const void* argp)
{
enum cluster_type_e cluster_stack = get_cluster_type();
crm_system_name = strdup("sbd:cluster");
cl_log(LOG_NOTICE, "Monitoring %s cluster health", name_for_cluster_type(cluster_stack));
set_proc_title("sbd: watcher: Cluster");
sbd_membership_connect();
/* stonith_our_uname = cluster.uname; */
/* stonith_our_uuid = cluster.uuid; */
mainloop = g_main_loop_new(NULL, FALSE);
notify_timer = g_timeout_add(timeout_loop * 1000, notify_timer_cb, NULL);
mainloop_add_signal(SIGTERM, cluster_shutdown);
mainloop_add_signal(SIGINT, cluster_shutdown);
g_main_loop_run(mainloop);
g_main_loop_unref(mainloop);
clean_up(0);
return 0; /* never reached */
}
diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c
index 4fec2fd..5f8d5d0 100644
--- a/src/sbd-inquisitor.c
+++ b/src/sbd-inquisitor.c
@@ -1,1334 +1,1348 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <crm/common/util.h>
#include "sbd.h"
#define LOCKSTRLEN 11
static struct servants_list_item *servants_leader = NULL;
int disk_priority = 1;
int check_pcmk = 1;
int check_cluster = 1;
int disk_count = 0;
int servant_count = 0;
int servant_restart_interval = 5;
int servant_restart_count = 1;
int start_mode = 0;
char* pidfile = NULL;
bool do_flush = true;
char timeout_sysrq_char = 'b';
bool move_to_root_cgroup = true;
bool enforce_moving_to_root_cgroup = false;
bool sync_resource_startup = false;
int parse_device_line(const char *line);
static const char *
get_env_option(const char *option)
{
const char *value = getenv(option);
size_t max = 0;
size_t lpc = 0;
if (value == NULL) {
return NULL;
}
max = strlen(value);
for (lpc = 0; lpc < max; lpc++) {
if (!isspace(value[lpc])) {
break;
}
}
return (strlen(value + lpc) > 0 ? (value + lpc) : NULL);
}
static int
recruit_servant(const char *devname, pid_t pid)
{
struct servants_list_item *s = servants_leader;
struct servants_list_item *newbie;
if (lookup_servant_by_dev(devname)) {
cl_log(LOG_DEBUG, "Servant %s already exists", devname);
return 0;
}
newbie = malloc(sizeof(*newbie));
if (newbie) {
memset(newbie, 0, sizeof(*newbie));
newbie->devname = strdup(devname);
newbie->pid = pid;
newbie->first_start = 1;
}
if (!newbie || !newbie->devname) {
fprintf(stderr, "heap allocation failed in recruit_servant.\n");
exit(1);
}
/* some sanity-check on our newbie */
if (sbd_is_disk(newbie)) {
cl_log(LOG_INFO, "Monitoring %s", devname);
disk_count++;
} else if (sbd_is_pcmk(newbie) || sbd_is_cluster(newbie)) {
/* alive just after pcmk and cluster servants have shown up */
newbie->outdated = 1;
} else {
/* toss our newbie */
cl_log(LOG_ERR, "Refusing to recruit unrecognized servant %s", devname);
free((void *) newbie->devname);
free(newbie);
return -1;
}
if (!s) {
servants_leader = newbie;
} else {
while (s->next)
s = s->next;
s->next = newbie;
}
servant_count++;
return 0;
}
int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp)
{
pid_t pid = 0;
int rc = 0;
pid = fork();
if (pid == 0) { /* child */
maximize_priority();
sbd_set_format_string(QB_LOG_SYSLOG, devname);
rc = (*functionp)(devname, mode, argp);
if (rc == -1)
exit(1);
else
exit(0);
} else if (pid != -1) { /* parent */
return pid;
} else {
cl_log(LOG_ERR,"Failed to fork servant");
exit(1);
}
}
struct servants_list_item *lookup_servant_by_dev(const char *devname)
{
struct servants_list_item *s;
for (s = servants_leader; s; s = s->next) {
if (strcasecmp(s->devname, devname) == 0)
break;
}
return s;
}
struct servants_list_item *lookup_servant_by_pid(pid_t pid)
{
struct servants_list_item *s;
for (s = servants_leader; s; s = s->next) {
if (s->pid == pid)
break;
}
return s;
}
int check_all_dead(void)
{
struct servants_list_item *s;
int r = 0;
union sigval svalue;
for (s = servants_leader; s; s = s->next) {
if (s->pid != 0) {
r = sigqueue(s->pid, 0, svalue);
if (r == -1 && errno == ESRCH)
continue;
return 0;
}
}
return 1;
}
void servant_start(struct servants_list_item *s)
{
int r = 0;
union sigval svalue;
if (s->pid != 0) {
r = sigqueue(s->pid, 0, svalue);
if ((r != -1 || errno != ESRCH))
return;
}
s->restarts++;
if (sbd_is_disk(s)) {
#if SUPPORT_SHARED_DISK
DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname);
s->pid = assign_servant(s->devname, servant_md, start_mode, s);
#else
cl_log(LOG_ERR, "Shared disk functionality not supported");
return;
#endif
} else if(sbd_is_pcmk(s)) {
DBGLOG(LOG_INFO, "Starting Pacemaker servant");
s->pid = assign_servant(s->devname, servant_pcmk, start_mode, NULL);
} else if(sbd_is_cluster(s)) {
DBGLOG(LOG_INFO, "Starting Cluster servant");
s->pid = assign_servant(s->devname, servant_cluster, start_mode, NULL);
} else {
cl_log(LOG_ERR, "Unrecognized servant: %s", s->devname);
}
clock_gettime(CLOCK_MONOTONIC, &s->t_started);
return;
}
void servants_start(void)
{
struct servants_list_item *s;
for (s = servants_leader; s; s = s->next) {
s->restarts = 0;
servant_start(s);
}
}
void servants_kill(void)
{
struct servants_list_item *s;
union sigval svalue;
for (s = servants_leader; s; s = s->next) {
if (s->pid != 0)
sigqueue(s->pid, SIGKILL, svalue);
}
}
static inline void cleanup_servant_by_pid(pid_t pid)
{
struct servants_list_item* s;
s = lookup_servant_by_pid(pid);
if (s) {
cl_log(LOG_WARNING, "Servant for %s (pid: %i) has terminated",
s->devname, s->pid);
s->pid = 0;
} else {
/* This most likely is a stray signal from somewhere, or
* a SIGCHLD for a process that has previously
* explicitly disconnected. */
DBGLOG(LOG_INFO, "cleanup_servant: Nothing known about pid %i",
pid);
}
}
int inquisitor_decouple(void)
{
pid_t ppid = getppid();
union sigval signal_value;
/* During start-up, we only arm the watchdog once we've got
* quorum at least once. */
if (watchdog_use) {
if (watchdog_init() < 0) {
return -1;
}
}
if (ppid > 1) {
sigqueue(ppid, SIG_LIVENESS, signal_value);
}
return 0;
}
static int sbd_lock_running(long pid)
{
int rc = 0;
long mypid;
int running = 0;
char proc_path[PATH_MAX], exe_path[PATH_MAX], myexe_path[PATH_MAX];
/* check if pid is running */
if (kill(pid, 0) < 0 && errno == ESRCH) {
goto bail;
}
#ifndef HAVE_PROC_PID
return 1;
#endif
/* check to make sure pid hasn't been reused by another process */
snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", pid);
rc = readlink(proc_path, exe_path, PATH_MAX-1);
if(rc < 0) {
cl_perror("Could not read from %s", proc_path);
goto bail;
}
exe_path[rc] = 0;
mypid = (unsigned long) getpid();
snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", mypid);
rc = readlink(proc_path, myexe_path, PATH_MAX-1);
if(rc < 0) {
cl_perror("Could not read from %s", proc_path);
goto bail;
}
myexe_path[rc] = 0;
if(strcmp(exe_path, myexe_path) == 0) {
running = 1;
}
bail:
return running;
}
static int
sbd_lock_pidfile(const char *filename)
{
char lf_name[256], tf_name[256], buf[LOCKSTRLEN+1];
int fd;
long pid, mypid;
int rc;
struct stat sbuf;
if (filename == NULL) {
errno = EFAULT;
return -1;
}
mypid = (unsigned long) getpid();
snprintf(lf_name, sizeof(lf_name), "%s",filename);
snprintf(tf_name, sizeof(tf_name), "%s.%lu",
filename, mypid);
if ((fd = open(lf_name, O_RDONLY)) >= 0) {
if (fstat(fd, &sbuf) >= 0 && sbuf.st_size < LOCKSTRLEN) {
sleep(1); /* if someone was about to create one,
* give'm a sec to do so
* Though if they follow our protocol,
* this won't happen. They should really
* put the pid in, then link, not the
* other way around.
*/
}
if (read(fd, buf, sizeof(buf)) < 1) {
/* lockfile empty -> rm it and go on */;
} else {
if (sscanf(buf, "%ld", &pid) < 1) {
/* lockfile screwed up -> rm it and go on */
} else {
if (pid > 1 && (getpid() != pid)
&& sbd_lock_running(pid)) {
/* is locked by existing process
* -> give up */
close(fd);
return -1;
} else {
/* stale lockfile -> rm it and go on */
}
}
}
unlink(lf_name);
close(fd);
}
if ((fd = open(tf_name, O_CREAT | O_WRONLY | O_EXCL, 0644)) < 0) {
/* Hmmh, why did we fail? Anyway, nothing we can do about it */
return -3;
}
/* Slight overkill with the %*d format ;-) */
snprintf(buf, sizeof(buf), "%*lu\n", LOCKSTRLEN-1, mypid);
if (write(fd, buf, LOCKSTRLEN) != LOCKSTRLEN) {
/* Again, nothing we can do about this */
rc = -3;
close(fd);
goto out;
}
close(fd);
switch (link(tf_name, lf_name)) {
case 0:
if (stat(tf_name, &sbuf) < 0) {
/* something weird happened */
rc = -3;
break;
}
if (sbuf.st_nlink < 2) {
/* somehow, it didn't get through - NFS trouble? */
rc = -2;
break;
}
rc = 0;
break;
case EEXIST:
rc = -1;
break;
default:
rc = -3;
}
out:
unlink(tf_name);
return rc;
}
/*
* Unlock a file (remove its lockfile)
* do we need to check, if its (still) ours? No, IMHO, if someone else
* locked our line, it's his fault -tho
* returns 0 on success
* <0 if some failure occured
*/
static int
sbd_unlock_pidfile(const char *filename)
{
char lf_name[256];
if (filename == NULL) {
errno = EFAULT;
return -1;
}
snprintf(lf_name, sizeof(lf_name), "%s", filename);
return unlink(lf_name);
}
int cluster_alive(bool all)
{
int alive = 1;
struct servants_list_item* s;
if(servant_count == disk_count) {
return 0;
}
for (s = servants_leader; s; s = s->next) {
if (sbd_is_cluster(s) || sbd_is_pcmk(s)) {
if(s->outdated) {
alive = 0;
} else if(all == false) {
return 1;
}
}
}
return alive;
}
int quorum_read(int good_servants)
{
if (disk_count > 2)
return (good_servants > disk_count/2);
else
return (good_servants > 0);
}
void inquisitor_child(void)
{
int sig, pid;
sigset_t procmask;
siginfo_t sinfo;
int status;
struct timespec timeout;
int exiting = 0;
int decoupled = 0;
int cluster_appeared = 0;
int pcmk_override = 0;
time_t latency;
struct timespec t_last_tickle, t_now;
struct servants_list_item* s;
if (debug_mode) {
cl_log(LOG_ERR, "DEBUG MODE %d IS ACTIVE - DO NOT RUN IN PRODUCTION!", debug_mode);
}
set_proc_title("sbd: inquisitor");
if (pidfile) {
if (sbd_lock_pidfile(pidfile) < 0) {
exit(1);
}
}
sigemptyset(&procmask);
sigaddset(&procmask, SIGCHLD);
sigaddset(&procmask, SIGTERM);
sigaddset(&procmask, SIG_LIVENESS);
sigaddset(&procmask, SIG_EXITREQ);
sigaddset(&procmask, SIG_TEST);
sigaddset(&procmask, SIG_PCMK_UNHEALTHY);
sigaddset(&procmask, SIG_RESTART);
sigaddset(&procmask, SIGUSR1);
sigaddset(&procmask, SIGUSR2);
sigprocmask(SIG_BLOCK, &procmask, NULL);
servants_start();
timeout.tv_sec = timeout_loop;
timeout.tv_nsec = 0;
clock_gettime(CLOCK_MONOTONIC, &t_last_tickle);
while (1) {
bool tickle = 0;
bool can_detach = 0;
int good_servants = 0;
sig = sigtimedwait(&procmask, &sinfo, &timeout);
clock_gettime(CLOCK_MONOTONIC, &t_now);
if (sig == SIG_EXITREQ || sig == SIGTERM) {
servants_kill();
watchdog_close(true);
exiting = 1;
} else if (sig == SIGCHLD) {
while ((pid = waitpid(-1, &status, WNOHANG))) {
if (pid == -1 && errno == ECHILD) {
break;
} else {
s = lookup_servant_by_pid(pid);
if (sbd_is_disk(s)) {
if (WIFEXITED(status)) {
switch(WEXITSTATUS(status)) {
case EXIT_MD_SERVANT_IO_FAIL:
DBGLOG(LOG_INFO, "Servant for %s requests to be disowned",
s->devname);
break;
case EXIT_MD_SERVANT_REQUEST_RESET:
cl_log(LOG_WARNING, "%s requested a reset", s->devname);
do_reset();
break;
case EXIT_MD_SERVANT_REQUEST_SHUTOFF:
cl_log(LOG_WARNING, "%s requested a shutoff", s->devname);
do_off();
break;
case EXIT_MD_SERVANT_REQUEST_CRASHDUMP:
cl_log(LOG_WARNING, "%s requested a crashdump", s->devname);
do_crashdump();
break;
default:
break;
}
}
} else if (sbd_is_pcmk(s)) {
if (WIFEXITED(status)) {
switch(WEXITSTATUS(status)) {
case EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN:
DBGLOG(LOG_INFO, "PCMK-Servant has exited gracefully");
/* revert to state prior to pacemaker-detection */
s->restarts = 0;
s->restart_blocked = 0;
cluster_appeared = 0;
s->outdated = 1;
s->t_last.tv_sec = 0;
break;
default:
break;
}
}
+ } else if (sbd_is_cluster(s)) {
+ if (WIFEXITED(status)) {
+ switch(WEXITSTATUS(status)) {
+ case EXIT_CLUSTER_DISCONNECT:
+ cl_log(LOG_WARNING, "Cluster-Servant has exited (connection lost)");
+ s->restarts = 0;
+ s->restart_blocked = 0;
+ s->outdated = 1;
+ s->t_last.tv_sec = 0;
+ break;
+ default:
+ break;
+ }
+ }
}
cleanup_servant_by_pid(pid);
}
}
} else if (sig == SIG_PCMK_UNHEALTHY) {
s = lookup_servant_by_pid(sinfo.si_pid);
if (sbd_is_cluster(s) || sbd_is_pcmk(s)) {
if (s->outdated == 0) {
cl_log(LOG_WARNING, "%s health check: UNHEALTHY", s->devname);
}
s->t_last.tv_sec = 1;
} else {
cl_log(LOG_WARNING, "Ignoring SIG_PCMK_UNHEALTHY from unknown source");
}
} else if (sig == SIG_LIVENESS) {
s = lookup_servant_by_pid(sinfo.si_pid);
if (s) {
s->first_start = 0;
clock_gettime(CLOCK_MONOTONIC, &s->t_last);
}
} else if (sig == SIG_TEST) {
} else if (sig == SIGUSR1) {
if (exiting)
continue;
servants_start();
}
if (exiting) {
if (check_all_dead()) {
if (pidfile) {
sbd_unlock_pidfile(pidfile);
}
exit(0);
} else
continue;
}
good_servants = 0;
for (s = servants_leader; s; s = s->next) {
int age = t_now.tv_sec - s->t_last.tv_sec;
if (!s->t_last.tv_sec)
continue;
if (age < (int)(timeout_io+timeout_loop)) {
if (sbd_is_disk(s)) {
good_servants++;
}
if (s->outdated) {
cl_log(LOG_NOTICE, "Servant %s is healthy (age: %d)", s->devname, age);
}
s->outdated = 0;
} else if (!s->outdated) {
if (!s->restart_blocked) {
cl_log(LOG_WARNING, "Servant %s is outdated (age: %d)", s->devname, age);
}
s->outdated = 1;
}
}
if(disk_count == 0) {
/* NO disks, everything is up to the cluster */
if(cluster_alive(true)) {
/* We LIVE! */
if(cluster_appeared == false) {
cl_log(LOG_INFO, "Active cluster detected");
}
tickle = 1;
can_detach = 1;
cluster_appeared = 1;
} else if(cluster_alive(false)) {
if(!decoupled) {
/* On the way up, detach and arm the watchdog */
cl_log(LOG_INFO, "Partial cluster detected, detaching");
}
can_detach = 1;
tickle = !cluster_appeared;
} else if(!decoupled) {
/* Stay alive until the cluster comes up */
tickle = !cluster_appeared;
}
} else if(disk_priority == 1 || servant_count == disk_count) {
if (quorum_read(good_servants)) {
/* There are disks and we're connected to the majority of them */
tickle = 1;
can_detach = 1;
pcmk_override = 0;
} else if (servant_count > disk_count && cluster_alive(true)) {
tickle = 1;
if(!pcmk_override) {
cl_log(LOG_WARNING, "Majority of devices lost - surviving on pacemaker");
pcmk_override = 1; /* Only log this message once */
}
}
} else if(cluster_alive(true) && quorum_read(good_servants)) {
/* Both disk and cluster servants are healthy */
tickle = 1;
can_detach = 1;
cluster_appeared = 1;
} else if(quorum_read(good_servants)) {
/* The cluster takes priority but only once
* connected for the first time.
*
* Until then, we tickle based on disk quorum.
*/
can_detach = 1;
tickle = !cluster_appeared;
}
/* cl_log(LOG_DEBUG, "Tickle: q=%d, g=%d, p=%d, s=%d", */
/* quorum_read(good_servants), good_servants, tickle, disk_count); */
if(tickle) {
watchdog_tickle();
clock_gettime(CLOCK_MONOTONIC, &t_last_tickle);
}
if (!decoupled && can_detach) {
/* We only do this at the point either the disk or
* cluster servants become healthy
*/
cl_log(LOG_DEBUG, "Decoupling");
if (inquisitor_decouple() < 0) {
servants_kill();
exiting = 1;
continue;
} else {
decoupled = 1;
}
}
/* Note that this can actually be negative, since we set
* last_tickle after we set now. */
latency = t_now.tv_sec - t_last_tickle.tv_sec;
if (timeout_watchdog && (latency > (int)timeout_watchdog)) {
if (!decoupled) {
/* We're still being watched by our
* parent. We don't fence, but exit. */
cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up.");
servants_kill();
exiting = 1;
continue;
}
if (debug_mode < 2) {
/* At level 2 or above, we do nothing, but expect
* things to eventually return to
* normal. */
do_timeout_action();
} else {
cl_log(LOG_ERR, "SBD: DEBUG MODE: Would have fenced due to timeout!");
}
}
if (timeout_watchdog_warn && (latency > (int)timeout_watchdog_warn)) {
cl_log(LOG_WARNING,
"Latency: No liveness for %ds exceeds watchdog warning timeout of %ds (healthy servants: %d)",
(int)latency, (int)timeout_watchdog_warn, good_servants);
if (debug_mode && watchdog_use) {
/* In debug mode, trigger a reset before the watchdog can panic the machine */
do_timeout_action();
}
}
for (s = servants_leader; s; s = s->next) {
int age = t_now.tv_sec - s->t_started.tv_sec;
if (age > servant_restart_interval) {
s->restarts = 0;
s->restart_blocked = 0;
}
if (servant_restart_count
&& (s->restarts >= servant_restart_count)
&& !s->restart_blocked) {
if (servant_restart_count > 1) {
cl_log(LOG_WARNING, "Max retry count (%d) reached: not restarting servant for %s",
(int)servant_restart_count, s->devname);
}
s->restart_blocked = 1;
}
if (!s->restart_blocked) {
servant_start(s);
}
}
}
/* not reached */
exit(0);
}
int inquisitor(void)
{
int sig, pid, inquisitor_pid;
int status;
sigset_t procmask;
siginfo_t sinfo;
/* Where's the best place for sysrq init ?*/
sysrq_init();
sigemptyset(&procmask);
sigaddset(&procmask, SIGCHLD);
sigaddset(&procmask, SIG_LIVENESS);
sigprocmask(SIG_BLOCK, &procmask, NULL);
inquisitor_pid = make_daemon();
if (inquisitor_pid == 0) {
inquisitor_child();
}
/* We're the parent. Wait for a happy signal from our child
* before we proceed - we either get "SIG_LIVENESS" when the
* inquisitor has completed the first successful round, or
* ECHLD when it exits with an error. */
while (1) {
sig = sigwaitinfo(&procmask, &sinfo);
if (sig == SIGCHLD) {
while ((pid = waitpid(-1, &status, WNOHANG))) {
if (pid == -1 && errno == ECHILD) {
break;
}
/* We got here because the inquisitor
* did not succeed. */
return -1;
}
} else if (sig == SIG_LIVENESS) {
/* Inquisitor started up properly. */
return 0;
} else {
fprintf(stderr, "Nobody expected the spanish inquisition!\n");
continue;
}
}
/* not reached */
return -1;
}
int
parse_device_line(const char *line)
{
size_t lpc = 0;
size_t last = 0;
size_t max = 0;
int found = 0;
bool skip_space = true;
int space_run = 0;
if (!line) {
return 0;
}
max = strlen(line);
cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", (int) max, line);
for (lpc = 0; lpc <= max; lpc++) {
if (isspace(line[lpc])) {
if (skip_space) {
last = lpc + 1;
} else {
space_run++;
}
continue;
}
skip_space = false;
if (line[lpc] == ';' || line[lpc] == 0) {
int rc = 0;
char *entry = calloc(1, 1 + lpc - last);
if (entry) {
rc = sscanf(line + last, "%[^;]", entry);
} else {
fprintf(stderr, "Heap allocation failed parsing device-line.\n");
exit(1);
}
if (rc != 1) {
cl_log(LOG_WARNING, "Could not parse: '%s'", line + last);
} else {
entry[strlen(entry)-space_run] = '\0';
cl_log(LOG_DEBUG, "Adding '%s'", entry);
if (recruit_servant(entry, 0) != 0) {
free(entry);
// sbd should refuse to start if any of the configured device names is invalid.
return -1;
}
found++;
}
free(entry);
skip_space = true;
last = lpc + 1;
}
space_run = 0;
}
return found;
}
#define SBD_SOURCE_FILES "sbd-cluster.c,sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,setproctitle.c"
static void
sbd_log_filter_ctl(const char *files, uint8_t priority)
{
if (files == NULL) {
files = SBD_SOURCE_FILES;
}
qb_log_filter_ctl(QB_LOG_SYSLOG, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority);
qb_log_filter_ctl(QB_LOG_STDERR, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority);
}
int
arg_enabled(int arg_count)
{
return arg_count % 2;
}
int main(int argc, char **argv, char **envp)
{
int exit_status = 0;
int c;
int W_count = 0;
int c_count = 0;
int P_count = 0;
int qb_facility;
const char *value = NULL;
bool delay_start = false;
long delay = 0;
char *timeout_action = NULL;
if ((cmdname = strrchr(argv[0], '/')) == NULL) {
cmdname = argv[0];
} else {
++cmdname;
}
watchdogdev = strdup("/dev/watchdog");
watchdogdev_is_default = true;
qb_facility = qb_log_facility2int("daemon");
qb_log_init(cmdname, qb_facility, LOG_WARNING);
sbd_set_format_string(QB_LOG_SYSLOG, "sbd");
qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE);
qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE);
sbd_log_filter_ctl(NULL, LOG_NOTICE);
sbd_get_uname();
value = get_env_option("SBD_PACEMAKER");
if(value) {
check_pcmk = crm_is_true(value);
check_cluster = crm_is_true(value);
}
cl_log(LOG_INFO, "Enable pacemaker checks: %d (%s)", (int)check_pcmk, value?value:"default");
value = get_env_option("SBD_STARTMODE");
if(value == NULL) {
} else if(strcmp(value, "clean") == 0) {
start_mode = 1;
} else if(strcmp(value, "always") == 0) {
start_mode = 0;
}
cl_log(LOG_INFO, "Start mode set to: %d (%s)", (int)start_mode, value?value:"default");
value = get_env_option("SBD_WATCHDOG_DEV");
if(value) {
free(watchdogdev);
watchdogdev = strdup(value);
watchdogdev_is_default = false;
}
/* SBD_WATCHDOG has been dropped from sbd.sysconfig example.
* This is for backward compatibility. */
value = get_env_option("SBD_WATCHDOG");
if(value) {
watchdog_use = crm_is_true(value);
}
value = get_env_option("SBD_WATCHDOG_TIMEOUT");
if(value) {
timeout_watchdog = crm_get_msec(value) / 1000;
}
value = get_env_option("SBD_PIDFILE");
if(value) {
pidfile = strdup(value);
cl_log(LOG_INFO, "pidfile set to %s", pidfile);
}
value = get_env_option("SBD_DELAY_START");
if(value) {
delay_start = crm_is_true(value);
if (!delay_start) {
delay = crm_get_msec(value) / 1000;
if (delay > 0) {
delay_start = true;
}
}
}
cl_log(LOG_DEBUG, "Delay start: %s%s%s",
delay_start? "yes (" : "no",
delay_start? (delay > 0 ? value: "msgwait") : "",
delay_start? ")" : "");
value = get_env_option("SBD_TIMEOUT_ACTION");
if(value) {
timeout_action = strdup(value);
}
value = get_env_option("SBD_MOVE_TO_ROOT_CGROUP");
if(value) {
move_to_root_cgroup = crm_is_true(value);
if (move_to_root_cgroup) {
enforce_moving_to_root_cgroup = true;
} else {
if (strcmp(value, "auto") == 0) {
move_to_root_cgroup = true;
}
}
}
while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) {
switch (c) {
case 'D':
break;
case 'Z':
debug_mode++;
cl_log(LOG_INFO, "Debug mode now at level %d", (int)debug_mode);
break;
case 'R':
skip_rt = 1;
cl_log(LOG_INFO, "Realtime mode deactivated.");
break;
case 'S':
start_mode = atoi(optarg);
cl_log(LOG_INFO, "Start mode set to: %d", (int)start_mode);
break;
case 's':
timeout_startup = atoi(optarg);
cl_log(LOG_INFO, "Start timeout set to: %d", (int)timeout_startup);
break;
case 'v':
debug++;
if(debug == 1) {
sbd_log_filter_ctl(NULL, LOG_INFO);
cl_log(LOG_INFO, "Verbose mode enabled.");
} else if(debug == 2) {
sbd_log_filter_ctl(NULL, LOG_DEBUG);
cl_log(LOG_INFO, "Debug mode enabled.");
} else if(debug == 3) {
/* Go nuts, turn on pacemaker's logging too */
sbd_log_filter_ctl("*", LOG_DEBUG);
cl_log(LOG_INFO, "Debug library mode enabled.");
}
break;
case 'T':
watchdog_set_timeout = 0;
cl_log(LOG_INFO, "Setting watchdog timeout disabled; using defaults.");
break;
case 'W':
W_count++;
break;
case 'w':
cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev);
free(watchdogdev);
watchdogdev = strdup(optarg);
watchdogdev_is_default = false;
break;
case 'd':
#if SUPPORT_SHARED_DISK
if (recruit_servant(optarg, 0) != 0) {
fprintf(stderr, "Invalid device: %s\n", optarg);
exit_status = -1;
goto out;
}
#else
fprintf(stderr, "Shared disk functionality not supported\n");
exit_status = -2;
goto out;
#endif
break;
case 'c':
c_count++;
break;
case 'P':
P_count++;
break;
case 'z':
disk_priority = 0;
break;
case 'n':
local_uname = strdup(optarg);
cl_log(LOG_INFO, "Overriding local hostname to %s", local_uname);
break;
case 'p':
pidfile = strdup(optarg);
cl_log(LOG_INFO, "pidfile set to %s", pidfile);
break;
case 'C':
timeout_watchdog_crashdump = atoi(optarg);
cl_log(LOG_INFO, "Setting crashdump watchdog timeout to %d",
(int)timeout_watchdog_crashdump);
break;
case '1':
timeout_watchdog = atoi(optarg);
break;
case '2':
timeout_allocate = atoi(optarg);
break;
case '3':
timeout_loop = atoi(optarg);
break;
case '4':
timeout_msgwait = atoi(optarg);
break;
case '5':
timeout_watchdog_warn = atoi(optarg);
do_calculate_timeout_watchdog_warn = false;
cl_log(LOG_INFO, "Setting latency warning to %d",
(int)timeout_watchdog_warn);
break;
case 't':
servant_restart_interval = atoi(optarg);
cl_log(LOG_INFO, "Setting servant restart interval to %d",
(int)servant_restart_interval);
break;
case 'I':
timeout_io = atoi(optarg);
cl_log(LOG_INFO, "Setting IO timeout to %d",
(int)timeout_io);
break;
case 'F':
servant_restart_count = atoi(optarg);
cl_log(LOG_INFO, "Servant restart count set to %d",
(int)servant_restart_count);
break;
case 'r':
if (timeout_action) {
free(timeout_action);
}
timeout_action = strdup(optarg);
break;
case 'h':
usage();
goto out;
break;
default:
exit_status = -2;
goto out;
break;
}
}
if (disk_count == 0) {
/* if we already have disks from commandline
then it is probably undesirable to add those
from environment (general rule cmdline has precedence)
*/
value = get_env_option("SBD_DEVICE");
if ((value) && strlen(value)) {
#if SUPPORT_SHARED_DISK
int devices = parse_device_line(value);
if(devices < 1) {
fprintf(stderr, "Invalid device line: %s\n", value);
exit_status = -1;
goto out;
}
#else
fprintf(stderr, "Shared disk functionality not supported\n");
exit_status = -2;
goto out;
#endif
}
}
if (watchdogdev == NULL || strcmp(watchdogdev, "/dev/null") == 0) {
watchdog_use = 0;
} else if (W_count > 0) {
watchdog_use = arg_enabled(W_count);
}
if (watchdog_use) {
cl_log(LOG_INFO, "Watchdog enabled.");
} else {
cl_log(LOG_INFO, "Watchdog disabled.");
}
if (c_count > 0) {
check_cluster = arg_enabled(c_count);
}
if (P_count > 0) {
check_pcmk = arg_enabled(P_count);
}
if ((disk_count > 0) && (strlen(local_uname) > SECTOR_NAME_MAX)) {
fprintf(stderr, "Node name mustn't be longer than %d chars.\n",
SECTOR_NAME_MAX);
fprintf(stderr, "If uname is longer define a name to be used by sbd.\n");
exit_status = -1;
goto out;
}
if (disk_count > 3) {
fprintf(stderr, "You can specify up to 3 devices via the -d option.\n");
exit_status = -1;
goto out;
}
/* There must at least be one command following the options: */
if ((argc - optind) < 1) {
fprintf(stderr, "Not enough arguments.\n");
exit_status = -2;
goto out;
}
if (init_set_proc_title(argc, argv, envp) < 0) {
fprintf(stderr, "Allocation of proc title failed.\n");
exit_status = -1;
goto out;
}
if (timeout_action) {
char *p[2];
int i;
char c;
int nrflags = sscanf(timeout_action, "%m[a-z],%m[a-z]%c", &p[0], &p[1], &c);
bool parse_error = (nrflags < 1) || (nrflags > 2);
for (i = 0; (i < nrflags) && (i < 2); i++) {
if (!strcmp(p[i], "reboot")) {
timeout_sysrq_char = 'b';
} else if (!strcmp(p[i], "crashdump")) {
timeout_sysrq_char = 'c';
} else if (!strcmp(p[i], "off")) {
timeout_sysrq_char = 'o';
} else if (!strcmp(p[i], "flush")) {
do_flush = true;
} else if (!strcmp(p[i], "noflush")) {
do_flush = false;
} else {
parse_error = true;
}
free(p[i]);
}
if (parse_error) {
fprintf(stderr, "Failed to parse timeout-action \"%s\".\n",
timeout_action);
exit_status = -1;
goto out;
}
}
if (strcmp(argv[optind], "watch") == 0) {
value = get_env_option("SBD_SYNC_RESOURCE_STARTUP");
sync_resource_startup =
crm_is_true(value?value:SBD_SYNC_RESOURCE_STARTUP_DEFAULT);
#if !USE_PACEMAKERD_API
if (sync_resource_startup) {
fprintf(stderr, "Failed to sync resource-startup as "
"SBD was built against pacemaker not supporting pacemakerd-API.\n");
exit_status = -1;
goto out;
}
#else
if (!sync_resource_startup) {
cl_log(LOG_WARNING, "SBD built against pacemaker supporting "
"pacemakerd-API. Should think about enabling "
"SBD_SYNC_RESOURCE_STARTUP.");
}
#endif
}
#if SUPPORT_SHARED_DISK
if (strcmp(argv[optind], "create") == 0) {
exit_status = init_devices(servants_leader);
} else if (strcmp(argv[optind], "dump") == 0) {
exit_status = dump_headers(servants_leader);
} else if (strcmp(argv[optind], "allocate") == 0) {
exit_status = allocate_slots(argv[optind + 1], servants_leader);
} else if (strcmp(argv[optind], "list") == 0) {
exit_status = list_slots(servants_leader);
} else if (strcmp(argv[optind], "message") == 0) {
exit_status = messenger(argv[optind + 1], argv[optind + 2], servants_leader);
} else if (strcmp(argv[optind], "ping") == 0) {
exit_status = ping_via_slots(argv[optind + 1], servants_leader);
} else if (strcmp(argv[optind], "watch") == 0) {
if(disk_count > 0) {
/* If no devices are specified, its not an error to be unable to find one */
open_any_device(servants_leader);
}
if (delay_start) {
if (delay <= 0) {
delay = get_first_msgwait(servants_leader);
}
sleep((unsigned long) delay);
}
} else {
exit_status = -2;
}
#endif
/* Re-calculate timeout_watchdog_warn based on any timeout_watchdog from:
* SBD_WATCHDOG_TIMEOUT, -1 option or on-disk setting read with open_any_device() */
if (do_calculate_timeout_watchdog_warn) {
timeout_watchdog_warn = calculate_timeout_watchdog_warn(timeout_watchdog);
}
if (strcmp(argv[optind], "query-watchdog") == 0) {
exit_status = watchdog_info();
} else if (strcmp(argv[optind], "test-watchdog") == 0) {
exit_status = watchdog_test();
} else if (strcmp(argv[optind], "watch") == 0) {
/* sleep $(sbd $SBD_DEVICE_ARGS dump | grep -m 1 msgwait | awk '{print $4}') 2>/dev/null */
/* We only want this to have an effect during watch right now;
* pinging and fencing would be too confused */
cl_log(LOG_INFO, "Turning on pacemaker checks: %d", check_pcmk);
if (check_pcmk) {
recruit_servant("pcmk", 0);
#if SUPPORT_PLUGIN
check_cluster = 1;
#endif
}
cl_log(LOG_INFO, "Turning on cluster checks: %d", check_cluster);
if (check_cluster) {
recruit_servant("cluster", 0);
}
cl_log(LOG_NOTICE, "%s flush + write \'%c\' to sysrq in case of timeout",
do_flush?"Do":"Skip", timeout_sysrq_char);
exit_status = inquisitor();
}
out:
if (timeout_action) {
free(timeout_action);
}
if (exit_status < 0) {
if (exit_status == -2) {
usage();
} else {
fprintf(stderr, "sbd failed; please check the logs.\n");
}
return (1);
}
return (0);
}
diff --git a/src/sbd.h b/src/sbd.h
index 7c3c1ec..ecf7027 100644
--- a/src/sbd.h
+++ b/src/sbd.h
@@ -1,218 +1,221 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <arpa/inet.h>
#include <asm/unistd.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <libaio.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/watchdog.h>
#include <malloc.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/ptrace.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/wait.h>
#include <syslog.h>
#include <time.h>
#include <unistd.h>
#include <uuid/uuid.h>
#include <qb/qblog.h>
#include <crm_config.h>
#include <config.h>
/* signals reserved for multi-disk sbd */
#define SIG_LIVENESS (SIGRTMIN + 1) /* report liveness of the disk */
#define SIG_EXITREQ (SIGRTMIN + 2) /* exit request to inquisitor */
#define SIG_TEST (SIGRTMIN + 3) /* trigger self test */
#define SIG_RESTART (SIGRTMIN + 4) /* trigger restart of all failed disk */
#define SIG_PCMK_UNHEALTHY (SIGRTMIN + 5)
/* FIXME: should add dynamic check of SIG_XX >= SIGRTMAX */
/* exit status for disk-servant */
#define EXIT_MD_SERVANT_IO_FAIL 20
#define EXIT_MD_SERVANT_REQUEST_RESET 21
#define EXIT_MD_SERVANT_REQUEST_SHUTOFF 22
#define EXIT_MD_SERVANT_REQUEST_CRASHDUMP 23
/* exit status for pcmk-servant */
#define EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN 30
+/* exit status for cluster-servant */
+#define EXIT_CLUSTER_DISCONNECT 40
+
#define HOG_CHAR 0xff
#define SECTOR_NAME_MAX 63
/* Sector data types */
struct sector_header_s {
char magic[8];
unsigned char version;
unsigned char slots;
/* Caveat: stored in network byte-order */
uint32_t sector_size;
uint32_t timeout_watchdog;
uint32_t timeout_allocate;
uint32_t timeout_loop;
uint32_t timeout_msgwait;
/* Minor version for extensions to the core data set:
* compatible and optional values. */
unsigned char minor_version;
uuid_t uuid; /* 16 bytes */
};
struct sector_mbox_s {
signed char cmd;
char from[SECTOR_NAME_MAX+1];
};
struct sector_node_s {
/* slots will be created with in_use == 0 */
char in_use;
char name[SECTOR_NAME_MAX+1];
};
struct servants_list_item {
const char* devname;
pid_t pid;
int restarts;
int restart_blocked;
int outdated;
int first_start;
struct timespec t_last, t_started;
struct servants_list_item *next;
};
struct sbd_context {
int devfd;
io_context_t ioctx;
struct iocb io;
};
enum pcmk_health
{
pcmk_health_unknown,
pcmk_health_pending,
pcmk_health_transient,
pcmk_health_unclean,
pcmk_health_shutdown,
pcmk_health_online,
pcmk_health_noquorum,
};
void usage(void);
int watchdog_init_interval(void);
int watchdog_tickle(void);
int watchdog_init(void);
void sysrq_init(void);
void watchdog_close(bool disarm);
int watchdog_info(void);
int watchdog_test(void);
void sysrq_trigger(char t);
void do_crashdump(void);
void do_reset(void);
void do_off(void);
void do_timeout_action(void);
pid_t make_daemon(void);
void maximize_priority(void);
void sbd_get_uname(void);
void sbd_set_format_string(int method, const char *daemon);
void notify_parent(void);
/* Tunable defaults: */
extern unsigned long timeout_watchdog;
extern unsigned long timeout_watchdog_warn;
extern bool do_calculate_timeout_watchdog_warn;
extern unsigned long timeout_watchdog_crashdump;
extern int timeout_allocate;
extern int timeout_loop;
extern int timeout_msgwait;
extern int timeout_io;
extern int timeout_startup;
extern int watchdog_use;
extern int watchdog_set_timeout;
extern int skip_rt;
extern int debug;
extern int debug_mode;
extern char *watchdogdev;
extern bool watchdogdev_is_default;
extern char* local_uname;
extern bool do_flush;
extern char timeout_sysrq_char;
extern bool move_to_root_cgroup;
extern bool enforce_moving_to_root_cgroup;
extern bool sync_resource_startup;
/* Global, non-tunable variables: */
extern int sector_size;
extern int watchdogfd;
extern const char* cmdname;
typedef int (*functionp_t)(const char* devname, int mode, const void* argp);
int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp);
#if SUPPORT_SHARED_DISK
void open_any_device(struct servants_list_item *servants);
int init_devices(struct servants_list_item *servants);
int allocate_slots(const char *name, struct servants_list_item *servants);
int list_slots(struct servants_list_item *servants);
int ping_via_slots(const char *name, struct servants_list_item *servants);
int dump_headers(struct servants_list_item *servants);
unsigned long get_first_msgwait(struct servants_list_item *servants);
int messenger(const char *name, const char *msg, struct servants_list_item *servants);
int servant_md(const char *diskname, int mode, const void* argp);
#endif
int servant_pcmk(const char *diskname, int mode, const void* argp);
int servant_cluster(const char *diskname, int mode, const void* argp);
struct servants_list_item *lookup_servant_by_dev(const char *devname);
struct servants_list_item *lookup_servant_by_pid(pid_t pid);
int init_set_proc_title(int argc, char *argv[], char *envp[]);
void set_proc_title(const char *fmt,...);
#define cl_log(level, fmt, args...) qb_log_from_external_source( __func__, __FILE__, fmt, level, __LINE__, 0, ##args)
# define cl_perror(fmt, args...) do { \
const char *err = strerror(errno); \
cl_log(LOG_ERR, fmt ": %s (%d)", ##args, err, errno); \
} while(0)
#define DBGLOG(lvl, fmt, args...) do { \
if (debug > 0) cl_log(lvl, fmt, ##args); \
} while(0)
extern int servant_health;
void set_servant_health(enum pcmk_health state, int level, char const *format, ...) __attribute__ ((__format__ (__printf__, 3, 4)));
bool sbd_is_disk(struct servants_list_item *servant);
bool sbd_is_pcmk(struct servants_list_item *servant);
bool sbd_is_cluster(struct servants_list_item *servant);
#define calculate_timeout_watchdog_warn(timeout) \
(timeout < 5 ? 2 : \
(timeout < (ULONG_MAX / 3) ? \
(((unsigned long) timeout) * 3 / 5) : (((unsigned long) timeout) / 5 * 3)))

File Metadata

Mime Type
text/x-diff
Expires
Thu, Jul 10, 1:43 AM (20 h, 55 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2009568
Default Alt Text
(68 KB)

Event Timeline