diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
index 4806b0a..cd1308c 100644
--- a/src/sbd-pacemaker.c
+++ b/src/sbd-pacemaker.c
@@ -1,701 +1,793 @@
/*
* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
*
* Based on crm_mon.c, which was:
* Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
/* TODO list:
*
* - Trying to shut down a node while no devices are up will fail, since
* SBD currently uses a message via the disk to achieve this.
*
* - Shutting down cluster nodes while the majority of devices are down
* will eventually take the cluster below the quorum threshold, at which
* point the remaining cluster nodes will all immediately self-fence.
*
*/
#include <sys/param.h>
#include <crm/crm.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdlib.h>
#include <errno.h>
#include <fcntl.h>
#include <libgen.h>
#include <sys/utsname.h>
+#ifdef __linux__
+#include <grp.h> /* getgrnam_r, initgroups */
+#include <pwd.h> /* getpwuid_r */
+#endif
+
#include <config.h>
#include <crm_config.h>
#include <crm/msg_xml.h>
#include <crm/common/util.h>
#include <crm/common/xml.h>
#include <crm/common/ipc.h>
#include <crm/common/mainloop.h>
#include <crm/cib.h>
#include <crm/pengine/status.h>
#include "sbd.h"
#ifndef HAVE_PE_NEW_WORKING_SET
#define pe_reset_working_set(data_set) cleanup_calculations(data_set)
static pe_working_set_t *
pe_new_working_set()
{
pe_working_set_t *data_set = calloc(1, sizeof(pe_working_set_t));
if (data_set != NULL) {
set_working_set_defaults(data_set);
}
return data_set;
}
static void
pe_free_working_set(pe_working_set_t *data_set)
{
if (data_set != NULL) {
pe_reset_working_set(data_set);
free(data_set);
}
}
#endif
static void clean_up(int rc);
#if USE_PACEMAKERD_API
#include <crm/common/ipc_pacemakerd.h>
static pcmk_ipc_api_t *pacemakerd_api = NULL;
static time_t last_ok = (time_t) 0;
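/* Callback for replies on the pacemakerd IPC connection: remembers the
* timestamp of the last successful ping in last_ok while pacemakerd is
* running or shutting down, and exits gracefully once pacemakerd reports
* its shutdown as complete.
*/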
static void
pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api,
enum pcmk_ipc_event event_type, crm_exit_t status,
void *event_data, void *user_data)
{
pcmk_pacemakerd_api_reply_t *reply = event_data;
switch (event_type) {
case pcmk_ipc_event_disconnect:
/* Unexpected */
cl_log(LOG_ERR, "Lost connection to pacemakerd\n");
return;
case pcmk_ipc_event_reply:
break;
default:
return;
}
if (status != CRM_EX_OK) {
cl_log(LOG_ERR, "Bad reply from pacemakerd: %s",
crm_exit_str(status));
return;
}
if (reply->reply_type != pcmk_pacemakerd_reply_ping) {
cl_log(LOG_ERR, "Unknown reply type %d from pacemakerd\n",
reply->reply_type);
} else {
if ((reply->data.ping.last_good != (time_t) 0) &&
(reply->data.ping.status == pcmk_rc_ok)) {
switch (reply->data.ping.state) {
case pcmk_pacemakerd_state_running:
case pcmk_pacemakerd_state_shutting_down:
last_ok = reply->data.ping.last_good;
break;
case pcmk_pacemakerd_state_shutdown_complete:
clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
break;
default:
break;
}
}
}
}
#endif
extern int disk_count;
static void clean_up(int rc);
static void crm_diff_update(const char *event, xmlNode * msg);
static int cib_connect(gboolean full);
static void compute_status(pe_working_set_t * data_set);
static gboolean mon_refresh_state(gpointer user_data);
static GMainLoop *mainloop = NULL;
static guint timer_id_reconnect = 0;
static guint timer_id_notify = 0;
static int reconnect_msec = 1000;
static int cib_connected = 0;
static cib_t *cib = NULL;
static xmlNode *current_cib = NULL;
static pe_working_set_t *data_set = NULL;
static long last_refresh = 0;
static int pcmk_clean_shutdown = 0;
static int pcmk_shutdown = 0;
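/* One-shot reconnect timer: retry the CIB connection, re-arming the
* timer for another attempt on failure.
*/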
static gboolean
mon_timer_reconnect(gpointer data)
{
int rc = 0;
if (timer_id_reconnect > 0) {
g_source_remove(timer_id_reconnect);
}
rc = cib_connect(TRUE);
if (rc != 0) {
cl_log(LOG_WARNING, "CIB reconnect failed: %d", rc);
timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL);
} else {
cl_log(LOG_INFO, "CIB reconnect successful");
}
return FALSE;
}
static void
mon_cib_connection_destroy(gpointer user_data)
{
if (cib) {
cib->cmds->signoff(cib);
/* re-trigger a refresh, as the last one might have been skipped */
mon_refresh_state(NULL);
if ((pcmk_clean_shutdown) && (!sync_resource_startup)) {
/* assume a graceful pacemaker-shutdown */
clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
}
/* if we get here, we aren't sure about the Pacemaker state,
so use the reconnect timeout to re-establish the connection
and get everything sorted out again
*/
pcmk_shutdown = 0;
set_servant_health(pcmk_health_transient, LOG_WARNING, "Disconnected from CIB");
timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL);
}
cib_connected = 0;
/* no sense in looking at an outdated CIB, trying to apply a patch, ... */
if (current_cib) {
free_xml(current_cib);
current_cib = NULL;
}
return;
}
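/* Synchronously query the local CIB and replace current_cib with the
* result; current_cib stays NULL if the query fails or returns anything
* other than a <cib> element.
*/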
static void
mon_retrieve_current_cib()
{
xmlNode *xml_cib = NULL;
int options = cib_scope_local | cib_sync_call;
int rc = pcmk_ok;
const char* element_name;
free_xml(current_cib);
current_cib = NULL;
rc = cib->cmds->query(cib, NULL, &xml_cib, options);
if (rc != pcmk_ok) {
crm_err("Couldn't retrieve the CIB: %s (%d)", pcmk_strerror(rc), rc);
free_xml(xml_cib);
return;
} else if (xml_cib == NULL) {
crm_err("Couldn't retrieve the CIB: empty result");
return;
}
element_name = crm_element_name(xml_cib);
if (element_name && !strcmp(element_name, XML_TAG_CIB)) {
current_cib = xml_cib;
} else {
free_xml(xml_cib);
}
return;
}
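/* Periodic heartbeat, re-armed every timeout_loop seconds: most ticks
* just no-op the CIB connection and notify the inquisitor, but every
* counter_max ticks (i.e. every timeout_watchdog / 2 seconds) the full
* CIB is re-fetched and the status recomputed. For example, with
* timeout_watchdog=30s and timeout_loop=1s, the full refresh happens on
* every 15th tick. When built with the pacemakerd API, the CIB heartbeat
* is skipped unless pacemakerd recently replied to a ping, and a new
* ping is sent each tick.
*/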
static gboolean
mon_timer_notify(gpointer data)
{
static int counter = 0;
int counter_max = timeout_watchdog / timeout_loop / 2;
if (timer_id_notify > 0) {
g_source_remove(timer_id_notify);
}
#if USE_PACEMAKERD_API
{
time_t now = time(NULL);
if ((last_ok <= now) && (now - last_ok < timeout_watchdog)) {
#endif
if (cib_connected) {
if (counter == counter_max) {
mon_retrieve_current_cib();
mon_refresh_state(NULL);
counter = 0;
} else {
cib->cmds->noop(cib, 0);
notify_parent();
counter++;
}
}
#if USE_PACEMAKERD_API
}
}
if (pcmk_connect_ipc(pacemakerd_api,
pcmk_ipc_dispatch_main) == pcmk_rc_ok) {
pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);
}
#endif
timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL);
return FALSE;
}
/*
* Mainloop signal handler.
*/
static void
mon_shutdown(int nsig)
{
clean_up(0);
}
static int
cib_connect(gboolean full)
{
int rc = 0;
CRM_CHECK(cib != NULL, return -EINVAL);
cib_connected = 0;
crm_xml_init();
if (cib->state != cib_connected_query && cib->state != cib_connected_command) {
rc = cib->cmds->signon(cib, crm_system_name, cib_query);
if (rc != 0) {
return rc;
}
mon_retrieve_current_cib();
mon_refresh_state(NULL);
if (full) {
if (rc == 0) {
rc = cib->cmds->set_connection_dnotify(cib, mon_cib_connection_destroy);
if (rc == -EPROTONOSUPPORT) {
/* Notification setup failed, won't be able to reconnect after failure */
rc = 0;
}
}
if (rc == 0) {
cib->cmds->del_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update);
rc = cib->cmds->add_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update);
}
if (rc != 0) {
/* Notification setup failed, could not monitor CIB actions */
clean_up(-rc);
}
}
}
if (!rc) {
cib_connected = 1;
}
return rc;
}
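/* Map the scheduler's view of the local node onto a servant health
* value for the inquisitor: offline/unclean/pending states, quorum, and
* the configured no-quorum-policy all feed into the result.
*/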
static void
compute_status(pe_working_set_t * data_set)
{
static int updates = 0;
static int ever_had_quorum = FALSE;
node_t *node = NULL;
updates++;
if (data_set->dc_node == NULL) {
set_servant_health(pcmk_health_transient, LOG_INFO, "We don't have a DC right now.");
notify_parent();
return;
}
node = pe_find_node(data_set->nodes, local_uname);
if ((node == NULL) || (node->details == NULL)) {
set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: %s is UNKNOWN", local_uname);
notify_parent();
return;
}
if (node->details->online == FALSE) {
set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: OFFLINE");
} else if (node->details->unclean) {
set_servant_health(pcmk_health_unclean, LOG_WARNING, "Node state: UNCLEAN");
} else if (node->details->pending) {
set_servant_health(pcmk_health_pending, LOG_WARNING, "Node state: pending");
} else if (data_set->flags & pe_flag_have_quorum) {
set_servant_health(pcmk_health_online, LOG_INFO, "Node state: online");
ever_had_quorum = TRUE;
} else if(disk_count > 0) {
set_servant_health(pcmk_health_noquorum, LOG_WARNING, "Quorum lost");
} else if(ever_had_quorum == FALSE) {
set_servant_health(pcmk_health_online, LOG_INFO, "We do not have quorum yet");
} else {
/* We lost quorum, and there are no disks present
* Setting healthy > 2 here will result in us self-fencing
*/
switch (data_set->no_quorum_policy) {
case no_quorum_freeze:
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Freeze resources");
break;
#if HAVE_ENUM_NO_QUORUM_DEMOTE
case no_quorum_demote:
set_servant_health(pcmk_health_transient, LOG_INFO,
"Quorum lost: Demote promotable resources and stop others");
break;
#endif
case no_quorum_stop:
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Stop ALL resources");
break;
case no_quorum_ignore:
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Ignore");
break;
default:
/* immediate reboot is the most drastic action we take;
use it for no_quorum_suicide and for any policy we don't know yet
*/
set_servant_health(pcmk_health_unclean, LOG_INFO, "Quorum lost: Self-fence");
break;
}
}
/* Once the node is in shutdown state, it stays there until the end.
* If on top of that no managed resources are left running locally,
* we can assume a clean shutdown.
* The tricky situations are when the node is in maintenance mode
* or resources are unmanaged: if the node is in maintenance, or all
* left-over running resources are unmanaged, we assume the shutdown
* is intentional.
*/
if (node->details->shutdown) {
pcmk_shutdown = 1;
}
if (pcmk_shutdown)
{
pcmk_clean_shutdown = 1;
if (!(node->details->maintenance)) {
GListPtr iter;
for (iter = node->details->running_rsc;
iter != NULL; iter = iter->next) {
resource_t *rsc = (resource_t *) iter->data;
if (is_set(rsc->flags, pe_rsc_managed)) {
pcmk_clean_shutdown = 0;
crm_debug("not clean as %s managed and still running",
rsc->id);
break;
}
}
if (pcmk_clean_shutdown) {
crm_debug("pcmk_clean_shutdown because "
"all managed resources down");
}
} else {
crm_debug("pcmk_clean_shutdown because node is in maintenance");
}
}
notify_parent();
return;
}
static crm_trigger_t *refresh_trigger = NULL;
static gboolean
mon_trigger_refresh(gpointer user_data)
{
mainloop_set_trigger(refresh_trigger);
mon_refresh_state(NULL);
return FALSE;
}
#define XPATH_SHUTDOWN "//" XML_CIB_TAG_STATE "[@uname='%s']/" \
XML_TAG_TRANSIENT_NODEATTRS "/" XML_TAG_ATTR_SETS "/" \
XML_CIB_TAG_NVPAIR "[@name='" XML_CIB_ATTR_SHUTDOWN "']"
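/* With the usual Pacemaker definitions this expands to roughly
* //node_state[@uname='<node>']/transient_attributes/instance_attributes
* /nvpair[@name='shutdown'], i.e. it matches once the cluster has
* recorded a transient "shutdown" attribute for this node.
*/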
static gboolean
shutdown_attr_in_cib(void)
{
xmlNode *match = NULL;
char *xpath_string;
xpath_string = crm_strdup_printf(XPATH_SHUTDOWN, local_uname);
if (xpath_string) {
match = get_xpath_object(xpath_string, current_cib, LOG_TRACE);
free(xpath_string);
}
return (match != NULL);
}
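/* CIB diff notification handler: apply the incoming patch to the cached
* CIB, falling back to a full re-query when patching fails, and decide
* whether to recompute the status immediately or via the refresh timer.
*/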
static void
crm_diff_update(const char *event, xmlNode * msg)
{
int rc = -1;
const char *op = NULL;
long now = time(NULL);
static int updates = 0;
static mainloop_timer_t *refresh_timer = NULL;
if(refresh_timer == NULL) {
refresh_timer = mainloop_timer_add("refresh", reconnect_msec, FALSE, mon_trigger_refresh, NULL);
refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_state, refresh_timer);
}
if (current_cib != NULL) {
xmlNode *cib_last = current_cib;
current_cib = NULL;
rc = cib_apply_patch_event(msg, cib_last, &current_cib, LOG_DEBUG);
free_xml(cib_last);
switch(rc) {
case -pcmk_err_diff_resync:
case -pcmk_err_diff_failed:
crm_warn("[%s] %s Patch aborted: %s (%d)", event, op, pcmk_strerror(rc), rc);
break;
case pcmk_ok:
updates++;
break;
default:
crm_notice("[%s] %s ABORTED: %s (%d)", event, op, pcmk_strerror(rc), rc);
break;
}
}
if (current_cib == NULL) {
mon_retrieve_current_cib();
}
/* Refresh the computed status
* - immediately if the last refresh was more than 1s ago
* - immediately after more than 10 CIB updates
* - immediately when the shutdown attribute for our node first appears
* - otherwise at most 1s after the last update, via the refresh timer
*/
if ((!pcmk_shutdown && shutdown_attr_in_cib()) ||
(updates > 10 || (now - last_refresh) > (reconnect_msec / 1000))) {
mon_refresh_state(refresh_timer);
updates = 0;
} else {
mainloop_set_trigger(refresh_trigger);
mainloop_timer_start(refresh_timer);
}
}
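/* Recompute the cluster status from the cached CIB: run a copy through
* cli_config_update() and the scheduler, then derive our health via
* compute_status(). A failed configuration update forces a CIB
* reconnect.
*/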
static gboolean
mon_refresh_state(gpointer user_data)
{
xmlNode *cib_copy = NULL;
if(current_cib == NULL) {
return FALSE;
}
if(user_data) {
mainloop_timer_t *timer = user_data;
mainloop_timer_stop(timer);
}
cib_copy = copy_xml(current_cib);
if (cli_config_update(&cib_copy, NULL, FALSE) == FALSE) {
cl_log(LOG_WARNING, "cli_config_update() failed - forcing reconnect to CIB");
if (cib) {
cib->cmds->signoff(cib);
}
} else {
last_refresh = time(NULL);
data_set->input = cib_copy;
data_set->flags |= pe_flag_have_stonith_resource;
cluster_status(data_set);
compute_status(data_set);
pe_reset_working_set(data_set);
}
return FALSE;
}
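/* Release timers, the scheduler working set, and the CIB/pacemakerd
* connections; exits the process when rc >= 0.
*/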
static void
clean_up(int rc)
{
if (timer_id_reconnect > 0) {
g_source_remove(timer_id_reconnect);
timer_id_reconnect = 0;
}
if (timer_id_notify > 0) {
g_source_remove(timer_id_notify);
timer_id_notify = 0;
}
if (data_set != NULL) {
pe_free_working_set(data_set);
data_set = NULL;
}
if (cib != NULL) {
cib->cmds->signoff(cib);
cib_delete(cib);
cib = NULL;
}
#if USE_PACEMAKERD_API
if (pacemakerd_api != NULL) {
pcmk_ipc_api_t *capi = pacemakerd_api;
pacemakerd_api = NULL; // Ensure we can't free this twice
pcmk_free_ipc_api(capi);
}
#endif
if (rc >= 0) {
exit(rc);
}
return;
}
+#ifdef __linux__
+#define CLUSTER_GROUP "haclient"
+/* Try to add the well-known cluster stack group (CLUSTER_GROUP) as a
+ supplementary group to this root-privileged process for good measure
+ (see the call site); returns 0 on success, otherwise a positive exit
+ status distinct from 0, 1, and EXIT_MD_IO_FAIL...EXIT_MD_REQUEST_CRASHDUMP. */
+static int
+add_cluster_group()
+{
+ int rc = 0;
+ long gr_limit = -1, pw_limit = -1, limit;
+ char *buf;
+ struct group group, *group_result;
+ struct passwd passwd, *passwd_result;
+ gid_t cluster_gid;
+
+ limit = sysconf(_SC_PAGESIZE);
+ limit = (limit > 0) ? limit : 4096; /* assume sufficient, just in case */
+ gr_limit = sysconf(_SC_GETGR_R_SIZE_MAX);
+ gr_limit = (gr_limit > 0) ? gr_limit : limit;
+ pw_limit = sysconf(_SC_GETPW_R_SIZE_MAX);
+ limit = (gr_limit >= pw_limit) ? gr_limit : pw_limit;
+
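+ /* one buffer sized for the larger of the two recommended sizes serves
+ both the getgrnam_r() and getpwuid_r() calls below */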
+ if ((buf = malloc(limit)) == NULL) {
+ return 74; /* EX_IOERR */
+ }
+
+ do {
+ rc = getgrnam_r(CLUSTER_GROUP, &group, buf, limit, &group_result);
+ } while (rc == -1 && errno == EINTR);
+ if (rc == -1 || group_result == NULL) {
+ if (rc == -1) {
+ cl_perror("Unable to get group entry for %s", CLUSTER_GROUP);
+ rc = 69; /* EX_UNAVAILABLE */
+ } else {
+ cl_log(LOG_ERR, "Unable to get group entry for %s", CLUSTER_GROUP);
+ rc = 78; /* EX_CONFIG */
+ }
+ goto bail;
+ }
+ cluster_gid = group.gr_gid;
+
+ do {
+ rc = getpwuid_r(0, &passwd, buf, limit, &passwd_result);
+ } while (rc == -1 && errno == EINTR);
+ if (rc == -1 || passwd_result == NULL) {
+ if (rc == -1) {
+ cl_perror("Unable to get passwd entry for UID=0");
+ rc = 69; /* EX_UNAVAILABLE */
+ } else {
+ cl_log(LOG_ERR, "Unable to get passwd entry for UID=0");
+ rc = 78; /* EX_CONFIG */
+ }
+ goto bail;
+ }
+
+ /* the caller is supposed to be root, hence (1) this shall succeed
+ and (2) root shall not gain any new unwanted group-based access
+ beyond what it already has via its standard supplementary groups,
+ even if those had been explicitly dropped by now (not the case
+ here; root is usually just in its own group, too) */
+ rc = initgroups(passwd.pw_name, cluster_gid);
+ if (rc == -1) {
+ cl_perror("Unable to set supplementary group %s", CLUSTER_GROUP);
+ rc = (errno == EPERM) ? 77 /* EX_NOPERM */
+ : 74 /* EX_IOERR */;
+ }
+
+bail:
+ free(buf);
+ return rc;
+}
+#endif
+
int
servant_pcmk(const char *diskname, int mode, const void* argp)
{
int exit_code = 0;
crm_system_name = strdup("sbd:pcmk");
cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
set_proc_title("sbd: watcher: Pacemaker");
setenv("PCMK_watchdog", "true", 1);
+#ifdef __linux__
+ /* since we run this as root, we may actually be prevented from
+ accessing hacluster:haclient-owned shared IPC mmap'd files of,
+ e.g., Pacemaker daemons (such as the CIB daemon) in cases where
+ root is not "all powerful" (e.g. under strict SELinux
+ confinement that does not allow DAC_OVERRIDE for any reason).
+ TODO: first check if CAP_DAC_OVERRIDE is missing? */
+ if ((exit_code = add_cluster_group()) > 0) {
+ cl_log(LOG_CRIT, "Unable to ensure Pacemaker can be watched");
+ clean_up(exit_code);
+ }
+#endif
+
if(debug == 0) {
/* We don't want any noisy crm messages */
set_crm_log_level(LOG_CRIT);
}
if (data_set == NULL) {
data_set = pe_new_working_set();
}
if (data_set == NULL) {
return -1;
}
#if USE_PACEMAKERD_API
{
int rc;
rc = pcmk_new_ipc_api(&pacemakerd_api, pcmk_ipc_pacemakerd);
if (pacemakerd_api == NULL) {
cl_log(LOG_ERR, "Could not connect to pacemakerd: %s\n",
pcmk_rc_str(rc));
return -1;
}
pcmk_register_ipc_callback(pacemakerd_api, pacemakerd_event_cb, NULL);
do {
rc = pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_main);
if (rc != pcmk_rc_ok) {
cl_log(LOG_DEBUG, "Could not connect to pacemakerd: %s\n",
pcmk_rc_str(rc));
sleep(reconnect_msec / 1000);
}
} while (rc != pcmk_rc_ok);
/* send a ping to pacemakerd to wake it up */
pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);
/* the CIB should come up now as well, so it's time
* for the inquisitor to take a closer look
*/
notify_parent();
}
#endif
if (current_cib == NULL) {
cib = cib_new();
do {
exit_code = cib_connect(TRUE);
if (exit_code != 0) {
sleep(reconnect_msec / 1000);
}
} while (exit_code == -ENOTCONN);
if (exit_code != 0) {
clean_up(-exit_code);
}
}
mainloop = g_main_loop_new(NULL, FALSE);
mainloop_add_signal(SIGTERM, mon_shutdown);
mainloop_add_signal(SIGINT, mon_shutdown);
timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL);
g_main_loop_run(mainloop);
g_main_loop_unref(mainloop);
clean_up(0);
return 0; /* never reached */
}
