Page MenuHomeClusterLabs Projects

No OneTemporary

diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
index 02070bf116..0ccaa1ced1 100644
--- a/daemons/execd/execd_commands.c
+++ b/daemons/execd/execd_commands.c
@@ -1,1922 +1,1884 @@
/*
* Copyright 2012-2021 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <glib.h>
// Check whether we have a high-resolution monotonic clock
#undef PCMK__TIME_USE_CGT
#if HAVE_DECL_CLOCK_MONOTONIC && defined(CLOCK_MONOTONIC)
# define PCMK__TIME_USE_CGT
# include <time.h> /* clock_gettime */
#endif
#include <unistd.h>
#include <crm/crm.h>
+#include <crm/fencing/internal.h>
#include <crm/services.h>
#include <crm/services_internal.h>
#include <crm/common/mainloop.h>
#include <crm/common/ipc.h>
#include <crm/common/ipc_internal.h>
#include <crm/msg_xml.h>
#include "pacemaker-execd.h"
GHashTable *rsc_list = NULL;
typedef struct lrmd_cmd_s {
int timeout;
guint interval_ms;
int start_delay;
int timeout_orig;
int call_id;
int call_opts;
/* Timer ids, must be removed on cmd destruction. */
int delay_id;
int stonith_recurring_id;
int rsc_deleted;
int service_flags;
char *client_id;
char *origin;
char *rsc_id;
char *action;
char *real_action;
char *userdata_str;
pcmk__action_result_t result;
/* We can track operation queue time and run time, to be saved with the CIB
* resource history (and displayed in cluster status). We need
* high-resolution monotonic time for this purpose, so we use
* clock_gettime(CLOCK_MONOTONIC, ...) (if available, otherwise this feature
* is disabled).
*
* However, we also need epoch timestamps for recording the time the command
* last ran and the time its return value last changed, for use in time
* displays (as opposed to interval calculations). We keep time_t values for
* this purpose.
*
* The last run time is used for both purposes, so we keep redundant
* monotonic and epoch values for this. Technically the two could represent
* different times, but since time_t has only second resolution and the
* values are used for distinct purposes, that is not significant.
*/
#ifdef PCMK__TIME_USE_CGT
/* Recurring and systemd operations may involve more than one executor
* command per operation, so they need info about the original and the most
* recent.
*/
struct timespec t_first_run; // When op first ran
struct timespec t_run; // When op most recently ran
struct timespec t_first_queue; // When op was first queued
struct timespec t_queue; // When op was most recently queued
#endif
time_t epoch_last_run; // Epoch timestamp of when op last ran
time_t epoch_rcchange; // Epoch timestamp of when rc last changed
bool first_notify_sent;
int last_notify_rc;
int last_notify_op_status;
int last_pid;
GHashTable *params;
} lrmd_cmd_t;
static void cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc);
static gboolean lrmd_rsc_dispatch(gpointer user_data);
static void cancel_all_recurring(lrmd_rsc_t * rsc, const char *client_id);
#ifdef PCMK__TIME_USE_CGT
/*!
* \internal
* \brief Check whether a struct timespec has been set
*
* \param[in] timespec Time to check
*
* \return true if timespec has been set (i.e. is nonzero), false otherwise
*/
static inline bool
time_is_set(struct timespec *timespec)
{
return (timespec != NULL) &&
((timespec->tv_sec != 0) || (timespec->tv_nsec != 0));
}
/*
* \internal
* \brief Set a timespec (and its original if unset) to the current time
*
* \param[out] t_current Where to store current time
* \param[out] t_orig Where to copy t_current if unset
*/
static void
get_current_time(struct timespec *t_current, struct timespec *t_orig)
{
clock_gettime(CLOCK_MONOTONIC, t_current);
if ((t_orig != NULL) && !time_is_set(t_orig)) {
*t_orig = *t_current;
}
}
/*!
* \internal
* \brief Return difference between two times in milliseconds
*
* \param[in] now More recent time (or NULL to use current time)
* \param[in] old Earlier time
*
* \return milliseconds difference (or 0 if old is NULL or unset)
*
* \note Can overflow on 32bit machines when the differences is around
* 24 days or more.
*/
static int
time_diff_ms(struct timespec *now, struct timespec *old)
{
int diff_ms = 0;
if (time_is_set(old)) {
struct timespec local_now = { 0, };
if (now == NULL) {
clock_gettime(CLOCK_MONOTONIC, &local_now);
now = &local_now;
}
diff_ms = (now->tv_sec - old->tv_sec) * 1000
+ (now->tv_nsec - old->tv_nsec) / 1000000;
}
return diff_ms;
}
/*!
* \internal
* \brief Reset a command's operation times to their original values.
*
* Reset a command's run and queued timestamps to the timestamps of the original
* command, so we report the entire time since then and not just the time since
* the most recent command (for recurring and systemd operations).
*
* \param[in] cmd Executor command object to reset
*
* \note It's not obvious what the queued time should be for a systemd
* start/stop operation, which might go like this:
* initial command queued 5ms, runs 3s
* monitor command queued 10ms, runs 10s
* monitor command queued 10ms, runs 10s
* Is the queued time for that operation 5ms, 10ms or 25ms? The current
* implementation will report 5ms. If it's 25ms, then we need to
* subtract 20ms from the total exec time so as not to count it twice.
* We can implement that later if it matters to anyone ...
*/
static void
cmd_original_times(lrmd_cmd_t * cmd)
{
cmd->t_run = cmd->t_first_run;
cmd->t_queue = cmd->t_first_queue;
}
#endif
static inline bool
action_matches(lrmd_cmd_t *cmd, const char *action, guint interval_ms)
{
return (cmd->interval_ms == interval_ms)
&& pcmk__str_eq(cmd->action, action, pcmk__str_casei);
}
/*!
* \internal
* \brief Log the result of an asynchronous command
*
* \param[in] cmd Command to log result for
* \param[in] exec_time_ms Execution time in milliseconds, if known
* \param[in] queue_time_ms Queue time in milliseconds, if known
*/
static void
log_finished(lrmd_cmd_t *cmd, int exec_time_ms, int queue_time_ms)
{
int log_level = LOG_INFO;
GString *str = g_string_sized_new(100); // reasonable starting size
if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
log_level = LOG_DEBUG;
}
g_string_printf(str, "%s %s (call %d",
cmd->rsc_id, cmd->action, cmd->call_id);
if (cmd->last_pid != 0) {
g_string_append_printf(str, ", PID %d", cmd->last_pid);
}
if (cmd->result.execution_status == PCMK_EXEC_DONE) {
g_string_append_printf(str, ") exited with status %d",
cmd->result.exit_status);
} else {
g_string_append_printf(str, ") could not be executed: %s",
pcmk_exec_status_str(cmd->result.execution_status));
}
if (cmd->result.exit_reason != NULL) {
g_string_append_printf(str, " (%s)", cmd->result.exit_reason);
}
#ifdef PCMK__TIME_USE_CGT
g_string_append_printf(str, " (execution time %s",
pcmk__readable_interval(exec_time_ms));
if (queue_time_ms > 0) {
g_string_append_printf(str, " after being queued %s",
pcmk__readable_interval(queue_time_ms));
}
g_string_append(str, ")");
#endif
do_crm_log(log_level, "%s", str->str);
g_string_free(str, TRUE);
}
static void
log_execute(lrmd_cmd_t * cmd)
{
int log_level = LOG_INFO;
if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
log_level = LOG_DEBUG;
}
do_crm_log(log_level, "executing - rsc:%s action:%s call_id:%d",
cmd->rsc_id, cmd->action, cmd->call_id);
}
static const char *
normalize_action_name(lrmd_rsc_t * rsc, const char *action)
{
if (pcmk__str_eq(action, "monitor", pcmk__str_casei) &&
pcmk_is_set(pcmk_get_ra_caps(rsc->class), pcmk_ra_cap_status)) {
return "status";
}
return action;
}
static lrmd_rsc_t *
build_rsc_from_xml(xmlNode * msg)
{
xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, msg, LOG_ERR);
lrmd_rsc_t *rsc = NULL;
rsc = calloc(1, sizeof(lrmd_rsc_t));
crm_element_value_int(msg, F_LRMD_CALLOPTS, &rsc->call_opts);
rsc->rsc_id = crm_element_value_copy(rsc_xml, F_LRMD_RSC_ID);
rsc->class = crm_element_value_copy(rsc_xml, F_LRMD_CLASS);
rsc->provider = crm_element_value_copy(rsc_xml, F_LRMD_PROVIDER);
rsc->type = crm_element_value_copy(rsc_xml, F_LRMD_TYPE);
rsc->work = mainloop_add_trigger(G_PRIORITY_HIGH, lrmd_rsc_dispatch, rsc);
rsc->st_probe_rc = -ENODEV; // if stonith, initialize to "not running"
return rsc;
}
static lrmd_cmd_t *
create_lrmd_cmd(xmlNode *msg, pcmk__client_t *client)
{
int call_options = 0;
xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, msg, LOG_ERR);
lrmd_cmd_t *cmd = NULL;
cmd = calloc(1, sizeof(lrmd_cmd_t));
crm_element_value_int(msg, F_LRMD_CALLOPTS, &call_options);
cmd->call_opts = call_options;
cmd->client_id = strdup(client->id);
crm_element_value_int(msg, F_LRMD_CALLID, &cmd->call_id);
crm_element_value_ms(rsc_xml, F_LRMD_RSC_INTERVAL, &cmd->interval_ms);
crm_element_value_int(rsc_xml, F_LRMD_TIMEOUT, &cmd->timeout);
crm_element_value_int(rsc_xml, F_LRMD_RSC_START_DELAY, &cmd->start_delay);
cmd->timeout_orig = cmd->timeout;
cmd->origin = crm_element_value_copy(rsc_xml, F_LRMD_ORIGIN);
cmd->action = crm_element_value_copy(rsc_xml, F_LRMD_RSC_ACTION);
cmd->userdata_str = crm_element_value_copy(rsc_xml, F_LRMD_RSC_USERDATA_STR);
cmd->rsc_id = crm_element_value_copy(rsc_xml, F_LRMD_RSC_ID);
cmd->params = xml2list(rsc_xml);
if (pcmk__str_eq(g_hash_table_lookup(cmd->params, "CRM_meta_on_fail"), "block", pcmk__str_casei)) {
crm_debug("Setting flag to leave pid group on timeout and "
"only kill action pid for " PCMK__OP_FMT,
cmd->rsc_id, cmd->action, cmd->interval_ms);
cmd->service_flags = pcmk__set_flags_as(__func__, __LINE__,
LOG_TRACE, "Action",
cmd->action, 0,
SVC_ACTION_LEAVE_GROUP,
"SVC_ACTION_LEAVE_GROUP");
}
return cmd;
}
static void
stop_recurring_timer(lrmd_cmd_t *cmd)
{
if (cmd) {
if (cmd->stonith_recurring_id) {
g_source_remove(cmd->stonith_recurring_id);
}
cmd->stonith_recurring_id = 0;
}
}
static void
free_lrmd_cmd(lrmd_cmd_t * cmd)
{
stop_recurring_timer(cmd);
if (cmd->delay_id) {
g_source_remove(cmd->delay_id);
}
if (cmd->params) {
g_hash_table_destroy(cmd->params);
}
pcmk__reset_result(&(cmd->result));
free(cmd->origin);
free(cmd->action);
free(cmd->real_action);
free(cmd->userdata_str);
free(cmd->rsc_id);
free(cmd->client_id);
free(cmd);
}
static gboolean
stonith_recurring_op_helper(gpointer data)
{
lrmd_cmd_t *cmd = data;
lrmd_rsc_t *rsc;
cmd->stonith_recurring_id = 0;
if (!cmd->rsc_id) {
return FALSE;
}
rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id);
CRM_ASSERT(rsc != NULL);
/* take it out of recurring_ops list, and put it in the pending ops
* to be executed */
rsc->recurring_ops = g_list_remove(rsc->recurring_ops, cmd);
rsc->pending_ops = g_list_append(rsc->pending_ops, cmd);
#ifdef PCMK__TIME_USE_CGT
get_current_time(&(cmd->t_queue), &(cmd->t_first_queue));
#endif
mainloop_set_trigger(rsc->work);
return FALSE;
}
static inline void
start_recurring_timer(lrmd_cmd_t *cmd)
{
if (cmd && (cmd->interval_ms > 0)) {
cmd->stonith_recurring_id = g_timeout_add(cmd->interval_ms,
stonith_recurring_op_helper,
cmd);
}
}
static gboolean
start_delay_helper(gpointer data)
{
lrmd_cmd_t *cmd = data;
lrmd_rsc_t *rsc = NULL;
cmd->delay_id = 0;
rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL;
if (rsc) {
mainloop_set_trigger(rsc->work);
}
return FALSE;
}
/*!
* \internal
* \brief Check whether a list already contains the equivalent of a given action
*/
static lrmd_cmd_t *
find_duplicate_action(GList *action_list, lrmd_cmd_t *cmd)
{
for (GList *item = action_list; item != NULL; item = item->next) {
lrmd_cmd_t *dup = item->data;
if (action_matches(cmd, dup->action, dup->interval_ms)) {
return dup;
}
}
return NULL;
}
static bool
merge_recurring_duplicate(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
{
lrmd_cmd_t * dup = NULL;
bool dup_pending = true;
if (cmd->interval_ms == 0) {
return false;
}
// Search for a duplicate of this action (in-flight or not)
dup = find_duplicate_action(rsc->pending_ops, cmd);
if (dup == NULL) {
dup_pending = false;
dup = find_duplicate_action(rsc->recurring_ops, cmd);
if (dup == NULL) {
return false;
}
}
/* Do not merge fencing monitors marked for cancellation, so we can reply to
* the cancellation separately.
*/
if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH,
pcmk__str_casei)
&& (dup->result.execution_status == PCMK_EXEC_CANCELLED)) {
return false;
}
/* This should not occur. If it does, we need to investigate how something
* like this is possible in the controller.
*/
crm_warn("Duplicate recurring op entry detected (" PCMK__OP_FMT
"), merging with previous op entry",
rsc->rsc_id, normalize_action_name(rsc, dup->action),
dup->interval_ms);
// Merge new action's call ID and user data into existing action
dup->first_notify_sent = false;
free(dup->userdata_str);
dup->userdata_str = cmd->userdata_str;
cmd->userdata_str = NULL;
dup->call_id = cmd->call_id;
free_lrmd_cmd(cmd);
cmd = NULL;
/* If dup is not pending, that means it has already executed at least once
* and is waiting in the interval. In that case, stop waiting and initiate
* a new instance now.
*/
if (!dup_pending) {
if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH,
pcmk__str_casei)) {
stop_recurring_timer(dup);
stonith_recurring_op_helper(dup);
} else {
services_action_kick(rsc->rsc_id,
normalize_action_name(rsc, dup->action),
dup->interval_ms);
}
}
return true;
}
static void
schedule_lrmd_cmd(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
{
CRM_CHECK(cmd != NULL, return);
CRM_CHECK(rsc != NULL, return);
crm_trace("Scheduling %s on %s", cmd->action, rsc->rsc_id);
if (merge_recurring_duplicate(rsc, cmd)) {
// Equivalent of cmd has already been scheduled
return;
}
/* The controller expects the executor to automatically cancel
* recurring operations before a resource stops.
*/
if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
cancel_all_recurring(rsc, NULL);
}
rsc->pending_ops = g_list_append(rsc->pending_ops, cmd);
#ifdef PCMK__TIME_USE_CGT
get_current_time(&(cmd->t_queue), &(cmd->t_first_queue));
#endif
mainloop_set_trigger(rsc->work);
if (cmd->start_delay) {
cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
}
}
static xmlNode *
create_lrmd_reply(const char *origin, int rc, int call_id)
{
xmlNode *reply = create_xml_node(NULL, T_LRMD_REPLY);
crm_xml_add(reply, F_LRMD_ORIGIN, origin);
crm_xml_add_int(reply, F_LRMD_RC, rc);
crm_xml_add_int(reply, F_LRMD_CALLID, call_id);
return reply;
}
static void
send_client_notify(gpointer key, gpointer value, gpointer user_data)
{
xmlNode *update_msg = user_data;
pcmk__client_t *client = value;
int rc;
int log_level = LOG_WARNING;
const char *msg = NULL;
CRM_CHECK(client != NULL, return);
if (client->name == NULL) {
crm_trace("Skipping notification to client without name");
return;
}
if (pcmk_is_set(client->flags, pcmk__client_to_proxy)) {
/* We only want to notify clients of the executor IPC API. If we are
* running as Pacemaker Remote, we may have clients proxied to other
* IPC services in the cluster, so skip those.
*/
crm_trace("Skipping executor API notification to client %s",
pcmk__client_name(client));
return;
}
rc = lrmd_server_send_notify(client, update_msg);
if (rc == pcmk_rc_ok) {
return;
}
switch (rc) {
case ENOTCONN:
case EPIPE: // Client exited without waiting for notification
log_level = LOG_INFO;
msg = "Disconnected";
break;
default:
msg = pcmk_rc_str(rc);
break;
}
do_crm_log(log_level, "Could not notify client %s: %s " CRM_XS " rc=%d",
pcmk__client_name(client), msg, rc);
}
static void
send_cmd_complete_notify(lrmd_cmd_t * cmd)
{
xmlNode *notify = NULL;
int exec_time = 0;
int queue_time = 0;
#ifdef PCMK__TIME_USE_CGT
exec_time = time_diff_ms(NULL, &(cmd->t_run));
queue_time = time_diff_ms(&cmd->t_run, &(cmd->t_queue));
#endif
log_finished(cmd, exec_time, queue_time);
/* if the first notify result for a cmd has already been sent earlier, and the
* the option to only send notifies on result changes is set. Check to see
* if the last result is the same as the new one. If so, suppress this update */
if (cmd->first_notify_sent && (cmd->call_opts & lrmd_opt_notify_changes_only)) {
if ((cmd->last_notify_rc == cmd->result.exit_status) &&
(cmd->last_notify_op_status == cmd->result.execution_status)) {
/* only send changes */
return;
}
}
cmd->first_notify_sent = true;
cmd->last_notify_rc = cmd->result.exit_status;
cmd->last_notify_op_status = cmd->result.execution_status;
notify = create_xml_node(NULL, T_LRMD_NOTIFY);
crm_xml_add(notify, F_LRMD_ORIGIN, __func__);
crm_xml_add_int(notify, F_LRMD_TIMEOUT, cmd->timeout);
crm_xml_add_ms(notify, F_LRMD_RSC_INTERVAL, cmd->interval_ms);
crm_xml_add_int(notify, F_LRMD_RSC_START_DELAY, cmd->start_delay);
crm_xml_add_int(notify, F_LRMD_EXEC_RC, cmd->result.exit_status);
crm_xml_add_int(notify, F_LRMD_OP_STATUS, cmd->result.execution_status);
crm_xml_add_int(notify, F_LRMD_CALLID, cmd->call_id);
crm_xml_add_int(notify, F_LRMD_RSC_DELETED, cmd->rsc_deleted);
crm_xml_add_ll(notify, F_LRMD_RSC_RUN_TIME,
(long long) cmd->epoch_last_run);
crm_xml_add_ll(notify, F_LRMD_RSC_RCCHANGE_TIME,
(long long) cmd->epoch_rcchange);
#ifdef PCMK__TIME_USE_CGT
crm_xml_add_int(notify, F_LRMD_RSC_EXEC_TIME, exec_time);
crm_xml_add_int(notify, F_LRMD_RSC_QUEUE_TIME, queue_time);
#endif
crm_xml_add(notify, F_LRMD_OPERATION, LRMD_OP_RSC_EXEC);
crm_xml_add(notify, F_LRMD_RSC_ID, cmd->rsc_id);
if(cmd->real_action) {
crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->real_action);
} else {
crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->action);
}
crm_xml_add(notify, F_LRMD_RSC_USERDATA_STR, cmd->userdata_str);
crm_xml_add(notify, F_LRMD_RSC_EXIT_REASON, cmd->result.exit_reason);
if (cmd->result.action_stderr != NULL) {
crm_xml_add(notify, F_LRMD_RSC_OUTPUT, cmd->result.action_stderr);
} else if (cmd->result.action_stdout != NULL) {
crm_xml_add(notify, F_LRMD_RSC_OUTPUT, cmd->result.action_stdout);
}
if (cmd->params) {
char *key = NULL;
char *value = NULL;
GHashTableIter iter;
xmlNode *args = create_xml_node(notify, XML_TAG_ATTRS);
g_hash_table_iter_init(&iter, cmd->params);
while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
hash2smartfield((gpointer) key, (gpointer) value, args);
}
}
if (cmd->client_id && (cmd->call_opts & lrmd_opt_notify_orig_only)) {
pcmk__client_t *client = pcmk__find_client_by_id(cmd->client_id);
if (client) {
send_client_notify(client->id, client, notify);
}
} else {
pcmk__foreach_ipc_client(send_client_notify, notify);
}
free_xml(notify);
}
static void
send_generic_notify(int rc, xmlNode * request)
{
if (pcmk__ipc_client_count() != 0) {
int call_id = 0;
xmlNode *notify = NULL;
xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
const char *op = crm_element_value(request, F_LRMD_OPERATION);
crm_element_value_int(request, F_LRMD_CALLID, &call_id);
notify = create_xml_node(NULL, T_LRMD_NOTIFY);
crm_xml_add(notify, F_LRMD_ORIGIN, __func__);
crm_xml_add_int(notify, F_LRMD_RC, rc);
crm_xml_add_int(notify, F_LRMD_CALLID, call_id);
crm_xml_add(notify, F_LRMD_OPERATION, op);
crm_xml_add(notify, F_LRMD_RSC_ID, rsc_id);
pcmk__foreach_ipc_client(send_client_notify, notify);
free_xml(notify);
}
}
static void
cmd_reset(lrmd_cmd_t * cmd)
{
cmd->last_pid = 0;
#ifdef PCMK__TIME_USE_CGT
memset(&cmd->t_run, 0, sizeof(cmd->t_run));
memset(&cmd->t_queue, 0, sizeof(cmd->t_queue));
#endif
cmd->epoch_last_run = 0;
pcmk__reset_result(&(cmd->result));
cmd->result.execution_status = PCMK_EXEC_DONE;
}
static void
cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc)
{
crm_trace("Resource operation rsc:%s action:%s completed (%p %p)", cmd->rsc_id, cmd->action,
rsc ? rsc->active : NULL, cmd);
if (rsc && (rsc->active == cmd)) {
rsc->active = NULL;
mainloop_set_trigger(rsc->work);
}
if (!rsc) {
cmd->rsc_deleted = 1;
}
/* reset original timeout so client notification has correct information */
cmd->timeout = cmd->timeout_orig;
send_cmd_complete_notify(cmd);
if ((cmd->interval_ms != 0)
&& (cmd->result.execution_status == PCMK_EXEC_CANCELLED)) {
if (rsc) {
rsc->recurring_ops = g_list_remove(rsc->recurring_ops, cmd);
rsc->pending_ops = g_list_remove(rsc->pending_ops, cmd);
}
free_lrmd_cmd(cmd);
} else if (cmd->interval_ms == 0) {
if (rsc) {
rsc->pending_ops = g_list_remove(rsc->pending_ops, cmd);
}
free_lrmd_cmd(cmd);
} else {
/* Clear all the values pertaining just to the last iteration of a recurring op. */
cmd_reset(cmd);
}
}
static int
stonith2uniform_rc(const char *action, int rc)
{
switch (rc) {
case pcmk_ok:
rc = PCMK_OCF_OK;
break;
case -ENODEV:
/* This should be possible only for probes in practice, but
* interpret for all actions to be safe.
*/
if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) {
rc = PCMK_OCF_NOT_RUNNING;
} else if (pcmk__str_eq(action, "stop", pcmk__str_casei)) {
rc = PCMK_OCF_OK;
} else {
rc = PCMK_OCF_NOT_INSTALLED;
}
break;
case -EOPNOTSUPP:
rc = PCMK_OCF_UNIMPLEMENT_FEATURE;
break;
default:
rc = PCMK_OCF_UNKNOWN_ERROR;
break;
}
return rc;
}
static int
action_get_uniform_rc(svc_action_t *action)
{
lrmd_cmd_t *cmd = action->cb_data;
if (pcmk__str_eq(action->standard, PCMK_RESOURCE_CLASS_STONITH,
pcmk__str_casei)) {
return stonith2uniform_rc(cmd->action, action->rc);
} else {
enum ocf_exitcode code = services_result2ocf(action->standard,
cmd->action, action->rc);
// Cast variable instead of function return to keep compilers happy
return (int) code;
}
}
struct notify_new_client_data {
xmlNode *notify;
pcmk__client_t *new_client;
};
static void
notify_one_client(gpointer key, gpointer value, gpointer user_data)
{
pcmk__client_t *client = value;
struct notify_new_client_data *data = user_data;
if (!pcmk__str_eq(client->id, data->new_client->id, pcmk__str_casei)) {
send_client_notify(key, (gpointer) client, (gpointer) data->notify);
}
}
void
notify_of_new_client(pcmk__client_t *new_client)
{
struct notify_new_client_data data;
data.new_client = new_client;
data.notify = create_xml_node(NULL, T_LRMD_NOTIFY);
crm_xml_add(data.notify, F_LRMD_ORIGIN, __func__);
crm_xml_add(data.notify, F_LRMD_OPERATION, LRMD_OP_NEW_CLIENT);
pcmk__foreach_ipc_client(notify_one_client, &data);
free_xml(data.notify);
}
void
client_disconnect_cleanup(const char *client_id)
{
GHashTableIter iter;
lrmd_rsc_t *rsc = NULL;
char *key = NULL;
g_hash_table_iter_init(&iter, rsc_list);
while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & rsc)) {
if (rsc->call_opts & lrmd_opt_drop_recurring) {
/* This client is disconnecting, drop any recurring operations
* it may have initiated on the resource */
cancel_all_recurring(rsc, client_id);
}
}
}
static void
action_complete(svc_action_t * action)
{
lrmd_rsc_t *rsc;
lrmd_cmd_t *cmd = action->cb_data;
#ifdef PCMK__TIME_USE_CGT
const char *rclass = NULL;
bool goagain = false;
#endif
if (!cmd) {
crm_err("Completed executor action (%s) does not match any known operations",
action->id);
return;
}
#ifdef PCMK__TIME_USE_CGT
if (cmd->result.exit_status != action->rc) {
cmd->epoch_rcchange = time(NULL);
}
#endif
cmd->last_pid = action->pid;
pcmk__set_result(&(cmd->result), action_get_uniform_rc(action),
action->status, services__exit_reason(action));
rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL;
#ifdef PCMK__TIME_USE_CGT
if (rsc && pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_SERVICE, pcmk__str_casei)) {
rclass = resources_find_service_class(rsc->type);
} else if(rsc) {
rclass = rsc->class;
}
if (pcmk__str_eq(rclass, PCMK_RESOURCE_CLASS_SYSTEMD, pcmk__str_casei)) {
if (pcmk__result_ok(&(cmd->result))
&& pcmk__strcase_any_of(cmd->action, "start", "stop", NULL)) {
/* systemd returns from start and stop actions after the action
* begins, not after it completes. We have to jump through a few
* hoops so that we don't report 'complete' to the rest of pacemaker
* until it's actually done.
*/
goagain = true;
cmd->real_action = cmd->action;
cmd->action = strdup("monitor");
} else if (cmd->real_action != NULL) {
// This is follow-up monitor to check whether start/stop completed
if (cmd->result.execution_status == PCMK_EXEC_PENDING) {
goagain = true;
} else if (pcmk__result_ok(&(cmd->result))
&& pcmk__str_eq(cmd->real_action, "stop", pcmk__str_casei)) {
goagain = true;
} else {
int time_sum = time_diff_ms(NULL, &(cmd->t_first_run));
int timeout_left = cmd->timeout_orig - time_sum;
crm_debug("%s systemd %s is now complete (elapsed=%dms, "
"remaining=%dms): %s (%d)",
cmd->rsc_id, cmd->real_action, time_sum, timeout_left,
services_ocf_exitcode_str(cmd->result.exit_status),
cmd->result.exit_status);
cmd_original_times(cmd);
// Monitors may return "not running", but start/stop shouldn't
if ((cmd->result.execution_status == PCMK_EXEC_DONE)
&& (cmd->result.exit_status == PCMK_OCF_NOT_RUNNING)) {
if (pcmk__str_eq(cmd->real_action, "start", pcmk__str_casei)) {
cmd->result.exit_status = PCMK_OCF_UNKNOWN_ERROR;
} else if (pcmk__str_eq(cmd->real_action, "stop", pcmk__str_casei)) {
cmd->result.exit_status = PCMK_OCF_OK;
}
}
}
}
}
#endif
#if SUPPORT_NAGIOS
if (rsc && pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_NAGIOS, pcmk__str_casei)) {
if (action_matches(cmd, "monitor", 0)
&& pcmk__result_ok(&(cmd->result))) {
/* Successfully executed --version for the nagios plugin */
cmd->result.exit_status = PCMK_OCF_NOT_RUNNING;
} else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)
&& !pcmk__result_ok(&(cmd->result))) {
#ifdef PCMK__TIME_USE_CGT
goagain = true;
#endif
}
}
#endif
#ifdef PCMK__TIME_USE_CGT
if (goagain) {
int time_sum = time_diff_ms(NULL, &(cmd->t_first_run));
int timeout_left = cmd->timeout_orig - time_sum;
int delay = cmd->timeout_orig / 10;
if(delay >= timeout_left && timeout_left > 20) {
delay = timeout_left/2;
}
delay = QB_MIN(2000, delay);
if (delay < timeout_left) {
cmd->start_delay = delay;
cmd->timeout = timeout_left;
if (pcmk__result_ok(&(cmd->result))) {
crm_debug("%s %s may still be in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)",
cmd->rsc_id, cmd->real_action, time_sum, timeout_left, delay);
} else if (cmd->result.execution_status == PCMK_EXEC_PENDING) {
crm_info("%s %s is still in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)",
cmd->rsc_id, cmd->action, time_sum, timeout_left, delay);
} else {
crm_notice("%s %s failed '%s' (%d): re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)",
cmd->rsc_id, cmd->action,
services_ocf_exitcode_str(cmd->result.exit_status),
cmd->result.exit_status, time_sum, timeout_left,
delay);
}
cmd_reset(cmd);
if(rsc) {
rsc->active = NULL;
}
schedule_lrmd_cmd(rsc, cmd);
/* Don't finalize cmd, we're not done with it yet */
return;
} else {
crm_notice("Giving up on %s %s (rc=%d): timeout (elapsed=%dms, remaining=%dms)",
cmd->rsc_id,
(cmd->real_action? cmd->real_action : cmd->action),
cmd->result.exit_status, time_sum, timeout_left);
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
PCMK_EXEC_TIMEOUT,
"Investigate reason for timeout, and adjust "
"configured operation timeout if necessary");
cmd_original_times(cmd);
}
}
#endif
pcmk__set_result_output(&(cmd->result), services__grab_stdout(action),
services__grab_stderr(action));
cmd_finalize(cmd, rsc);
}
-/*!
- * \internal
- * \brief Determine operation status of a stonith operation
- *
- * Non-stonith resource operations get their operation status directly from the
- * service library, but the fencer does not have an equivalent, so we must infer
- * an operation status from the fencer API's return code.
- *
- * \param[in] action Name of action performed on stonith resource
- * \param[in] interval_ms Action interval
- * \param[in] rc Action result from fencer
- *
- * \return Operation status corresponding to fencer API return code
- */
-static int
-stonith_rc2status(const char *action, guint interval_ms, int rc)
-{
- int status = PCMK_EXEC_DONE;
-
- switch (rc) {
- case pcmk_ok:
- break;
-
- case -EOPNOTSUPP:
- case -EPROTONOSUPPORT:
- status = PCMK_EXEC_NOT_SUPPORTED;
- break;
-
- case -ETIME:
- case -ETIMEDOUT:
- status = PCMK_EXEC_TIMEOUT;
- break;
-
- case -ENOTCONN:
- case -ECOMM:
- // Couldn't talk to fencer
- status = PCMK_EXEC_ERROR;
- break;
-
- case -ENODEV:
- // The device is not registered with the fencer
- status = PCMK_EXEC_ERROR;
- break;
-
- default:
- break;
- }
- return status;
-}
-
static void
stonith_action_complete(lrmd_cmd_t * cmd, int rc)
{
// This can be NULL if resource was removed before command completed
lrmd_rsc_t *rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id);
cmd->result.exit_status = stonith2uniform_rc(cmd->action, rc);
/* This function may be called with status already set to cancelled, if a
* pending action was aborted. Otherwise, we need to determine status from
* the fencer return code.
*/
if (cmd->result.execution_status != PCMK_EXEC_CANCELLED) {
- cmd->result.execution_status = stonith_rc2status(cmd->action,
- cmd->interval_ms, rc);
+ cmd->result.execution_status = stonith__legacy2status(rc);
+
+ // Simplify status codes from fencer
+ switch (cmd->result.execution_status) {
+ case PCMK_EXEC_NOT_CONNECTED:
+ case PCMK_EXEC_INVALID:
+ case PCMK_EXEC_NO_FENCE_DEVICE:
+ case PCMK_EXEC_NO_SECRETS:
+ cmd->result.execution_status = PCMK_EXEC_ERROR;
+ break;
+ default:
+ break;
+ }
// Certain successful actions change the known state of the resource
if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) {
if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK
} else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING
}
}
}
// Give the user more detail than an OCF code
if (rc != -pcmk_err_generic) {
cmd->result.exit_reason = strdup(pcmk_strerror(rc));
}
/* The recurring timer should not be running at this point in any case, but
* as a failsafe, stop it if it is.
*/
stop_recurring_timer(cmd);
/* Reschedule this command if appropriate. If a recurring command is *not*
* rescheduled, its status must be PCMK_EXEC_CANCELLED, otherwise it will
* not be removed from recurring_ops by cmd_finalize().
*/
if (rsc && (cmd->interval_ms > 0)
&& (cmd->result.execution_status != PCMK_EXEC_CANCELLED)) {
start_recurring_timer(cmd);
}
cmd_finalize(cmd, rsc);
}
static void
lrmd_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
{
stonith_action_complete(data->userdata, data->rc);
}
void
stonith_connection_failed(void)
{
GHashTableIter iter;
GList *cmd_list = NULL;
GList *cmd_iter = NULL;
lrmd_rsc_t *rsc = NULL;
char *key = NULL;
g_hash_table_iter_init(&iter, rsc_list);
while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & rsc)) {
if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) {
/* If we registered this fence device, we don't know whether the
* fencer still has the registration or not. Cause future probes to
* return PCMK_OCF_UNKNOWN_ERROR until the resource is stopped or
* started successfully. This is especially important if the
* controller also went away (possibly due to a cluster layer
* restart) and won't receive our client notification of any
* monitors finalized below.
*/
if (rsc->st_probe_rc == pcmk_ok) {
rsc->st_probe_rc = pcmk_err_generic;
}
if (rsc->active) {
cmd_list = g_list_append(cmd_list, rsc->active);
}
if (rsc->recurring_ops) {
cmd_list = g_list_concat(cmd_list, rsc->recurring_ops);
}
if (rsc->pending_ops) {
cmd_list = g_list_concat(cmd_list, rsc->pending_ops);
}
rsc->pending_ops = rsc->recurring_ops = NULL;
}
}
if (!cmd_list) {
return;
}
crm_err("Connection to fencer failed, finalizing %d pending operations",
g_list_length(cmd_list));
for (cmd_iter = cmd_list; cmd_iter; cmd_iter = cmd_iter->next) {
stonith_action_complete(cmd_iter->data, -ENOTCONN);
}
g_list_free(cmd_list);
}
/*!
* \internal
* \brief Execute a stonith resource "start" action
*
* Start a stonith resource by registering it with the fencer.
* (Stonith agents don't have a start command.)
*
* \param[in] stonith_api Connection to fencer
* \param[in] rsc Stonith resource to start
* \param[in] cmd Start command to execute
*
* \return pcmk_ok on success, -errno otherwise
*/
static int
execd_stonith_start(stonith_t *stonith_api, lrmd_rsc_t *rsc, lrmd_cmd_t *cmd)
{
char *key = NULL;
char *value = NULL;
stonith_key_value_t *device_params = NULL;
int rc = pcmk_ok;
// Convert command parameters to stonith API key/values
if (cmd->params) {
GHashTableIter iter;
g_hash_table_iter_init(&iter, cmd->params);
while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
device_params = stonith_key_value_add(device_params, key, value);
}
}
/* The fencer will automatically register devices via CIB notifications
* when the CIB changes, but to avoid a possible race condition between
* the fencer receiving the notification and the executor requesting that
* resource, the executor registers the device as well. The fencer knows how
* to handle duplicate registrations.
*/
rc = stonith_api->cmds->register_device(stonith_api, st_opt_sync_call,
cmd->rsc_id, rsc->provider,
rsc->type, device_params);
stonith_key_value_freeall(device_params, 1, 1);
return rc;
}
/*!
* \internal
* \brief Execute a stonith resource "stop" action
*
* Stop a stonith resource by unregistering it with the fencer.
* (Stonith agents don't have a stop command.)
*
* \param[in] stonith_api Connection to fencer
* \param[in] rsc Stonith resource to stop
*
* \return pcmk_ok on success, -errno otherwise
*/
static inline int
execd_stonith_stop(stonith_t *stonith_api, const lrmd_rsc_t *rsc)
{
/* @TODO Failure would indicate a problem communicating with fencer;
* perhaps we should try reconnecting and retrying a few times?
*/
return stonith_api->cmds->remove_device(stonith_api, st_opt_sync_call,
rsc->rsc_id);
}
/*!
* \internal
* \brief Initiate a stonith resource agent recurring "monitor" action
*
* \param[in] stonith_api Connection to fencer
* \param[in] rsc Stonith resource to monitor
* \param[in] cmd Monitor command being executed
*
* \return pcmk_ok if monitor was successfully initiated, -errno otherwise
*/
static inline int
execd_stonith_monitor(stonith_t *stonith_api, lrmd_rsc_t *rsc, lrmd_cmd_t *cmd)
{
int rc = stonith_api->cmds->monitor(stonith_api, 0, cmd->rsc_id,
cmd->timeout / 1000);
rc = stonith_api->cmds->register_callback(stonith_api, rc, 0, 0, cmd,
"lrmd_stonith_callback",
lrmd_stonith_callback);
if (rc == TRUE) {
rsc->active = cmd;
rc = pcmk_ok;
} else {
rc = -pcmk_err_generic;
}
return rc;
}
static void
lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
{
int rc = 0;
bool do_monitor = FALSE;
stonith_t *stonith_api = get_stonith_connection();
if (!stonith_api) {
rc = -ENOTCONN;
} else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
rc = execd_stonith_start(stonith_api, rsc, cmd);
if (rc == 0) {
do_monitor = TRUE;
}
} else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
rc = execd_stonith_stop(stonith_api, rsc);
} else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
if (cmd->interval_ms > 0) {
do_monitor = TRUE;
} else {
rc = rsc->st_probe_rc;
}
}
if (do_monitor) {
rc = execd_stonith_monitor(stonith_api, rsc, cmd);
if (rc == pcmk_ok) {
// Don't clean up yet, we will find out result of the monitor later
return;
}
}
stonith_action_complete(cmd, rc);
}
static int
lrmd_rsc_execute_service_lib(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
{
svc_action_t *action = NULL;
GHashTable *params_copy = NULL;
CRM_ASSERT(rsc);
CRM_ASSERT(cmd);
crm_trace("Creating action, resource:%s action:%s class:%s provider:%s agent:%s",
rsc->rsc_id, cmd->action, rsc->class, rsc->provider, rsc->type);
#if SUPPORT_NAGIOS
/* Recurring operations are cancelled anyway for a stop operation */
if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_NAGIOS, pcmk__str_casei)
&& pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
cmd->result.exit_status = PCMK_OCF_OK;
goto exec_done;
}
#endif
params_copy = pcmk__str_table_dup(cmd->params);
action = services__create_resource_action(rsc->rsc_id, rsc->class, rsc->provider,
rsc->type,
normalize_action_name(rsc, cmd->action),
cmd->interval_ms, cmd->timeout,
params_copy, cmd->service_flags);
if (action == NULL) {
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
PCMK_EXEC_ERROR, strerror(ENOMEM));
goto exec_done;
}
if (action->rc != PCMK_OCF_UNKNOWN) {
pcmk__set_result(&(cmd->result), action->rc, action->status,
services__exit_reason(action));
services_action_free(action);
goto exec_done;
}
action->cb_data = cmd;
if (services_action_async(action, action_complete)) {
/* When services_action_async() returns TRUE, the callback might have
* been called -- in this case action_complete(), which might free cmd,
* so cmd cannot be used here.
*/
return TRUE;
}
pcmk__set_result(&(cmd->result), action->rc, action->status,
services__exit_reason(action));
services_action_free(action);
action = NULL;
exec_done:
cmd_finalize(cmd, rsc);
return TRUE;
}
static gboolean
lrmd_rsc_execute(lrmd_rsc_t * rsc)
{
lrmd_cmd_t *cmd = NULL;
CRM_CHECK(rsc != NULL, return FALSE);
if (rsc->active) {
crm_trace("%s is still active", rsc->rsc_id);
return TRUE;
}
if (rsc->pending_ops) {
GList *first = rsc->pending_ops;
cmd = first->data;
if (cmd->delay_id) {
crm_trace
("Command %s %s was asked to run too early, waiting for start_delay timeout of %dms",
cmd->rsc_id, cmd->action, cmd->start_delay);
return TRUE;
}
rsc->pending_ops = g_list_remove_link(rsc->pending_ops, first);
g_list_free_1(first);
#ifdef PCMK__TIME_USE_CGT
get_current_time(&(cmd->t_run), &(cmd->t_first_run));
#endif
cmd->epoch_last_run = time(NULL);
}
if (!cmd) {
crm_trace("Nothing further to do for %s", rsc->rsc_id);
return TRUE;
}
rsc->active = cmd; /* only one op at a time for a rsc */
if (cmd->interval_ms) {
rsc->recurring_ops = g_list_append(rsc->recurring_ops, cmd);
}
log_execute(cmd);
if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) {
lrmd_rsc_execute_stonith(rsc, cmd);
} else {
lrmd_rsc_execute_service_lib(rsc, cmd);
}
return TRUE;
}
static gboolean
lrmd_rsc_dispatch(gpointer user_data)
{
return lrmd_rsc_execute(user_data);
}
void
free_rsc(gpointer data)
{
GList *gIter = NULL;
lrmd_rsc_t *rsc = data;
int is_stonith = pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH,
pcmk__str_casei);
gIter = rsc->pending_ops;
while (gIter != NULL) {
GList *next = gIter->next;
lrmd_cmd_t *cmd = gIter->data;
/* command was never executed */
cmd->result.execution_status = PCMK_EXEC_CANCELLED;
cmd_finalize(cmd, NULL);
gIter = next;
}
/* frees list, but not list elements. */
g_list_free(rsc->pending_ops);
gIter = rsc->recurring_ops;
while (gIter != NULL) {
GList *next = gIter->next;
lrmd_cmd_t *cmd = gIter->data;
if (is_stonith) {
cmd->result.execution_status = PCMK_EXEC_CANCELLED;
/* If a stonith command is in-flight, just mark it as cancelled;
* it is not safe to finalize/free the cmd until the stonith api
* says it has either completed or timed out.
*/
if (rsc->active != cmd) {
cmd_finalize(cmd, NULL);
}
} else {
/* This command is already handed off to service library,
* let service library cancel it and tell us via the callback
* when it is cancelled. The rsc can be safely destroyed
* even if we are waiting for the cancel result */
services_action_cancel(rsc->rsc_id,
normalize_action_name(rsc, cmd->action),
cmd->interval_ms);
}
gIter = next;
}
/* frees list, but not list elements. */
g_list_free(rsc->recurring_ops);
free(rsc->rsc_id);
free(rsc->class);
free(rsc->provider);
free(rsc->type);
mainloop_destroy_trigger(rsc->work);
free(rsc);
}
static int
process_lrmd_signon(pcmk__client_t *client, xmlNode *request, int call_id,
xmlNode **reply)
{
int rc = pcmk_ok;
const char *protocol_version = crm_element_value(request, F_LRMD_PROTOCOL_VERSION);
if (compare_version(protocol_version, LRMD_MIN_PROTOCOL_VERSION) < 0) {
crm_err("Cluster API version must be greater than or equal to %s, not %s",
LRMD_MIN_PROTOCOL_VERSION, protocol_version);
rc = -EPROTO;
}
if (pcmk__xe_attr_is_true(request, F_LRMD_IS_IPC_PROVIDER)) {
#ifdef PCMK__COMPILE_REMOTE
if ((client->remote != NULL) && client->remote->tls_handshake_complete) {
// This is a remote connection from a cluster node's controller
ipc_proxy_add_provider(client);
} else {
rc = -EACCES;
}
#else
rc = -EPROTONOSUPPORT;
#endif
}
*reply = create_lrmd_reply(__func__, rc, call_id);
crm_xml_add(*reply, F_LRMD_OPERATION, CRM_OP_REGISTER);
crm_xml_add(*reply, F_LRMD_CLIENTID, client->id);
crm_xml_add(*reply, F_LRMD_PROTOCOL_VERSION, LRMD_PROTOCOL_VERSION);
return rc;
}
static int
process_lrmd_rsc_register(pcmk__client_t *client, uint32_t id, xmlNode *request)
{
int rc = pcmk_ok;
lrmd_rsc_t *rsc = build_rsc_from_xml(request);
lrmd_rsc_t *dup = g_hash_table_lookup(rsc_list, rsc->rsc_id);
if (dup &&
pcmk__str_eq(rsc->class, dup->class, pcmk__str_casei) &&
pcmk__str_eq(rsc->provider, dup->provider, pcmk__str_casei) && pcmk__str_eq(rsc->type, dup->type, pcmk__str_casei)) {
crm_notice("Ignoring duplicate registration of '%s'", rsc->rsc_id);
free_rsc(rsc);
return rc;
}
g_hash_table_replace(rsc_list, rsc->rsc_id, rsc);
crm_info("Cached agent information for '%s'", rsc->rsc_id);
return rc;
}
static xmlNode *
process_lrmd_get_rsc_info(xmlNode *request, int call_id)
{
int rc = pcmk_ok;
xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
xmlNode *reply = NULL;
lrmd_rsc_t *rsc = NULL;
if (rsc_id == NULL) {
rc = -ENODEV;
} else {
rsc = g_hash_table_lookup(rsc_list, rsc_id);
if (rsc == NULL) {
crm_info("Agent information for '%s' not in cache", rsc_id);
rc = -ENODEV;
}
}
reply = create_lrmd_reply(__func__, rc, call_id);
if (rsc) {
crm_xml_add(reply, F_LRMD_RSC_ID, rsc->rsc_id);
crm_xml_add(reply, F_LRMD_CLASS, rsc->class);
crm_xml_add(reply, F_LRMD_PROVIDER, rsc->provider);
crm_xml_add(reply, F_LRMD_TYPE, rsc->type);
}
return reply;
}
static int
process_lrmd_rsc_unregister(pcmk__client_t *client, uint32_t id,
xmlNode *request)
{
int rc = pcmk_ok;
lrmd_rsc_t *rsc = NULL;
xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
if (!rsc_id) {
return -ENODEV;
}
rsc = g_hash_table_lookup(rsc_list, rsc_id);
if (rsc == NULL) {
crm_info("Ignoring unregistration of resource '%s', which is not registered",
rsc_id);
return pcmk_ok;
}
if (rsc->active) {
/* let the caller know there are still active ops on this rsc to watch for */
crm_trace("Operation (0x%p) still in progress for unregistered resource %s",
rsc->active, rsc_id);
rc = -EINPROGRESS;
}
g_hash_table_remove(rsc_list, rsc_id);
return rc;
}
static int
process_lrmd_rsc_exec(pcmk__client_t *client, uint32_t id, xmlNode *request)
{
lrmd_rsc_t *rsc = NULL;
lrmd_cmd_t *cmd = NULL;
xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
int call_id;
if (!rsc_id) {
return -EINVAL;
}
if (!(rsc = g_hash_table_lookup(rsc_list, rsc_id))) {
crm_info("Resource '%s' not found (%d active resources)",
rsc_id, g_hash_table_size(rsc_list));
return -ENODEV;
}
cmd = create_lrmd_cmd(request, client);
call_id = cmd->call_id;
/* Don't reference cmd after handing it off to be scheduled.
* The cmd could get merged and freed. */
schedule_lrmd_cmd(rsc, cmd);
return call_id;
}
static int
cancel_op(const char *rsc_id, const char *action, guint interval_ms)
{
GList *gIter = NULL;
lrmd_rsc_t *rsc = g_hash_table_lookup(rsc_list, rsc_id);
/* How to cancel an action.
* 1. Check pending ops list, if it hasn't been handed off
* to the service library or stonith recurring list remove
* it there and that will stop it.
* 2. If it isn't in the pending ops list, then it's either a
* recurring op in the stonith recurring list, or the service
* library's recurring list. Stop it there
* 3. If not found in any lists, then this operation has either
* been executed already and is not a recurring operation, or
* never existed.
*/
if (!rsc) {
return -ENODEV;
}
for (gIter = rsc->pending_ops; gIter != NULL; gIter = gIter->next) {
lrmd_cmd_t *cmd = gIter->data;
if (action_matches(cmd, action, interval_ms)) {
cmd->result.execution_status = PCMK_EXEC_CANCELLED;
cmd_finalize(cmd, rsc);
return pcmk_ok;
}
}
if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) {
/* The service library does not handle stonith operations.
* We have to handle recurring stonith operations ourselves. */
for (gIter = rsc->recurring_ops; gIter != NULL; gIter = gIter->next) {
lrmd_cmd_t *cmd = gIter->data;
if (action_matches(cmd, action, interval_ms)) {
cmd->result.execution_status = PCMK_EXEC_CANCELLED;
if (rsc->active != cmd) {
cmd_finalize(cmd, rsc);
}
return pcmk_ok;
}
}
} else if (services_action_cancel(rsc_id,
normalize_action_name(rsc, action),
interval_ms) == TRUE) {
/* The service library will tell the action_complete callback function
* this action was cancelled, which will destroy the cmd and remove
* it from the recurring_op list. Do not do that in this function
* if the service library says it cancelled it. */
return pcmk_ok;
}
return -EOPNOTSUPP;
}
static void
cancel_all_recurring(lrmd_rsc_t * rsc, const char *client_id)
{
GList *cmd_list = NULL;
GList *cmd_iter = NULL;
/* Notice a copy of each list is created when concat is called.
* This prevents odd behavior from occurring when the cmd_list
* is iterated through later on. It is possible the cancel_op
* function may end up modifying the recurring_ops and pending_ops
* lists. If we did not copy those lists, our cmd_list iteration
* could get messed up.*/
if (rsc->recurring_ops) {
cmd_list = g_list_concat(cmd_list, g_list_copy(rsc->recurring_ops));
}
if (rsc->pending_ops) {
cmd_list = g_list_concat(cmd_list, g_list_copy(rsc->pending_ops));
}
if (!cmd_list) {
return;
}
for (cmd_iter = cmd_list; cmd_iter; cmd_iter = cmd_iter->next) {
lrmd_cmd_t *cmd = cmd_iter->data;
if (cmd->interval_ms == 0) {
continue;
}
if (client_id && !pcmk__str_eq(cmd->client_id, client_id, pcmk__str_casei)) {
continue;
}
cancel_op(rsc->rsc_id, cmd->action, cmd->interval_ms);
}
/* frees only the copied list data, not the cmds */
g_list_free(cmd_list);
}
static int
process_lrmd_rsc_cancel(pcmk__client_t *client, uint32_t id, xmlNode *request)
{
xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
const char *action = crm_element_value(rsc_xml, F_LRMD_RSC_ACTION);
guint interval_ms = 0;
crm_element_value_ms(rsc_xml, F_LRMD_RSC_INTERVAL, &interval_ms);
if (!rsc_id || !action) {
return -EINVAL;
}
return cancel_op(rsc_id, action, interval_ms);
}
static void
add_recurring_op_xml(xmlNode *reply, lrmd_rsc_t *rsc)
{
xmlNode *rsc_xml = create_xml_node(reply, F_LRMD_RSC);
crm_xml_add(rsc_xml, F_LRMD_RSC_ID, rsc->rsc_id);
for (GList *item = rsc->recurring_ops; item != NULL; item = item->next) {
lrmd_cmd_t *cmd = item->data;
xmlNode *op_xml = create_xml_node(rsc_xml, T_LRMD_RSC_OP);
crm_xml_add(op_xml, F_LRMD_RSC_ACTION,
(cmd->real_action? cmd->real_action : cmd->action));
crm_xml_add_ms(op_xml, F_LRMD_RSC_INTERVAL, cmd->interval_ms);
crm_xml_add_int(op_xml, F_LRMD_TIMEOUT, cmd->timeout_orig);
}
}
static xmlNode *
process_lrmd_get_recurring(xmlNode *request, int call_id)
{
int rc = pcmk_ok;
const char *rsc_id = NULL;
lrmd_rsc_t *rsc = NULL;
xmlNode *reply = NULL;
xmlNode *rsc_xml = NULL;
// Resource ID is optional
rsc_xml = first_named_child(request, F_LRMD_CALLDATA);
if (rsc_xml) {
rsc_xml = first_named_child(rsc_xml, F_LRMD_RSC);
}
if (rsc_xml) {
rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
}
// If resource ID is specified, resource must exist
if (rsc_id != NULL) {
rsc = g_hash_table_lookup(rsc_list, rsc_id);
if (rsc == NULL) {
crm_info("Resource '%s' not found (%d active resources)",
rsc_id, g_hash_table_size(rsc_list));
rc = -ENODEV;
}
}
reply = create_lrmd_reply(__func__, rc, call_id);
// If resource ID is not specified, check all resources
if (rsc_id == NULL) {
GHashTableIter iter;
char *key = NULL;
g_hash_table_iter_init(&iter, rsc_list);
while (g_hash_table_iter_next(&iter, (gpointer *) &key,
(gpointer *) &rsc)) {
add_recurring_op_xml(reply, rsc);
}
} else if (rsc) {
add_recurring_op_xml(reply, rsc);
}
return reply;
}
void
process_lrmd_message(pcmk__client_t *client, uint32_t id, xmlNode *request)
{
int rc = pcmk_ok;
int call_id = 0;
const char *op = crm_element_value(request, F_LRMD_OPERATION);
int do_reply = 0;
int do_notify = 0;
xmlNode *reply = NULL;
/* Certain IPC commands may be done only by privileged users (i.e. root or
* hacluster), because they would otherwise provide a means of bypassing
* ACLs.
*/
bool allowed = pcmk_is_set(client->flags, pcmk__client_privileged);
crm_trace("Processing %s operation from %s", op, client->id);
crm_element_value_int(request, F_LRMD_CALLID, &call_id);
if (pcmk__str_eq(op, CRM_OP_IPC_FWD, pcmk__str_none)) {
#ifdef PCMK__COMPILE_REMOTE
if (allowed) {
ipc_proxy_forward_client(client, request);
} else {
rc = -EACCES;
}
#else
rc = -EPROTONOSUPPORT;
#endif
do_reply = 1;
} else if (pcmk__str_eq(op, CRM_OP_REGISTER, pcmk__str_none)) {
rc = process_lrmd_signon(client, request, call_id, &reply);
do_reply = 1;
} else if (pcmk__str_eq(op, LRMD_OP_RSC_REG, pcmk__str_none)) {
if (allowed) {
rc = process_lrmd_rsc_register(client, id, request);
do_notify = 1;
} else {
rc = -EACCES;
}
do_reply = 1;
} else if (pcmk__str_eq(op, LRMD_OP_RSC_INFO, pcmk__str_none)) {
if (allowed) {
reply = process_lrmd_get_rsc_info(request, call_id);
} else {
rc = -EACCES;
}
do_reply = 1;
} else if (pcmk__str_eq(op, LRMD_OP_RSC_UNREG, pcmk__str_none)) {
if (allowed) {
rc = process_lrmd_rsc_unregister(client, id, request);
/* don't notify anyone about failed un-registers */
if (rc == pcmk_ok || rc == -EINPROGRESS) {
do_notify = 1;
}
} else {
rc = -EACCES;
}
do_reply = 1;
} else if (pcmk__str_eq(op, LRMD_OP_RSC_EXEC, pcmk__str_none)) {
if (allowed) {
rc = process_lrmd_rsc_exec(client, id, request);
} else {
rc = -EACCES;
}
do_reply = 1;
} else if (pcmk__str_eq(op, LRMD_OP_RSC_CANCEL, pcmk__str_none)) {
if (allowed) {
rc = process_lrmd_rsc_cancel(client, id, request);
} else {
rc = -EACCES;
}
do_reply = 1;
} else if (pcmk__str_eq(op, LRMD_OP_POKE, pcmk__str_none)) {
do_notify = 1;
do_reply = 1;
} else if (pcmk__str_eq(op, LRMD_OP_CHECK, pcmk__str_none)) {
if (allowed) {
xmlNode *data = get_message_xml(request, F_LRMD_CALLDATA);
CRM_LOG_ASSERT(data != NULL);
pcmk__valid_sbd_timeout(crm_element_value(data, F_LRMD_WATCHDOG));
} else {
rc = -EACCES;
}
} else if (pcmk__str_eq(op, LRMD_OP_ALERT_EXEC, pcmk__str_none)) {
if (allowed) {
rc = process_lrmd_alert_exec(client, id, request);
} else {
rc = -EACCES;
}
do_reply = 1;
} else if (pcmk__str_eq(op, LRMD_OP_GET_RECURRING, pcmk__str_none)) {
if (allowed) {
reply = process_lrmd_get_recurring(request, call_id);
} else {
rc = -EACCES;
}
do_reply = 1;
} else {
rc = -EOPNOTSUPP;
do_reply = 1;
crm_err("Unknown IPC request '%s' from client %s",
op, pcmk__client_name(client));
}
if (rc == -EACCES) {
crm_warn("Rejecting IPC request '%s' from unprivileged client %s",
op, pcmk__client_name(client));
}
crm_debug("Processed %s operation from %s: rc=%d, reply=%d, notify=%d",
op, client->id, rc, do_reply, do_notify);
if (do_reply) {
int send_rc = pcmk_rc_ok;
if (reply == NULL) {
reply = create_lrmd_reply(__func__, rc, call_id);
}
send_rc = lrmd_server_send_reply(client, id, reply);
free_xml(reply);
if (send_rc != pcmk_rc_ok) {
crm_warn("Reply to client %s failed: %s " CRM_XS " rc=%d",
pcmk__client_name(client), pcmk_rc_str(send_rc), send_rc);
}
}
if (do_notify) {
send_generic_notify(rc, request);
}
}
diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h
index 6a7e4232c0..80f6443bea 100644
--- a/include/crm/fencing/internal.h
+++ b/include/crm/fencing/internal.h
@@ -1,202 +1,204 @@
/*
* Copyright 2011-2021 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#ifndef STONITH_NG_INTERNAL__H
# define STONITH_NG_INTERNAL__H
# include <glib.h>
# include <crm/common/ipc.h>
# include <crm/common/xml.h>
# include <crm/common/output_internal.h>
# include <crm/stonith-ng.h>
enum st_device_flags
{
st_device_supports_list = 0x0001,
st_device_supports_status = 0x0002,
st_device_supports_reboot = 0x0004,
st_device_supports_parameter_plug = 0x0008,
st_device_supports_parameter_port = 0x0010,
};
#define stonith__set_device_flags(device_flags, device_id, flags_to_set) do { \
device_flags = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \
"Fence device", device_id, \
(device_flags), (flags_to_set), \
#flags_to_set); \
} while (0)
#define stonith__set_call_options(st_call_opts, call_for, flags_to_set) do { \
st_call_opts = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \
"Fencer call", (call_for), \
(st_call_opts), (flags_to_set), \
#flags_to_set); \
} while (0)
#define stonith__clear_call_options(st_call_opts, call_for, flags_to_clear) do { \
st_call_opts = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, \
"Fencer call", (call_for), \
(st_call_opts), (flags_to_clear), \
#flags_to_clear); \
} while (0)
struct stonith_action_s;
typedef struct stonith_action_s stonith_action_t;
stonith_action_t *stonith_action_create(const char *agent,
const char *_action,
const char *victim,
uint32_t victim_nodeid,
int timeout,
GHashTable * device_args,
GHashTable * port_map,
const char * host_arg);
void stonith__destroy_action(stonith_action_t *action);
pcmk__action_result_t *stonith__action_result(stonith_action_t *action);
int stonith__result2rc(const pcmk__action_result_t *result);
int
stonith_action_execute_async(stonith_action_t * action,
void *userdata,
void (*done) (int pid,
const pcmk__action_result_t *result,
void *user_data),
void (*fork_cb) (int pid, void *user_data));
xmlNode *create_level_registration_xml(const char *node, const char *pattern,
const char *attr, const char *value,
int level,
stonith_key_value_t *device_list);
xmlNode *create_device_registration_xml(const char *id,
enum stonith_namespace namespace,
const char *agent,
stonith_key_value_t *params,
const char *rsc_provides);
void stonith__register_messages(pcmk__output_t *out);
GList *stonith__parse_targets(const char *hosts);
gboolean stonith__later_succeeded(stonith_history_t *event, stonith_history_t *top_history);
stonith_history_t *stonith__sort_history(stonith_history_t *history);
void stonith__device_parameter_flags(uint32_t *device_flags,
const char *device_name,
xmlNode *metadata);
# define ST_LEVEL_MAX 10
# define F_STONITH_CLIENTID "st_clientid"
# define F_STONITH_CALLOPTS "st_callopt"
# define F_STONITH_CALLID "st_callid"
# define F_STONITH_CALLDATA "st_calldata"
# define F_STONITH_OPERATION "st_op"
# define F_STONITH_TARGET "st_target"
# define F_STONITH_REMOTE_OP_ID "st_remote_op"
# define F_STONITH_REMOTE_OP_ID_RELAY "st_remote_op_relay"
# define F_STONITH_RC "st_rc"
/*! Timeout period per a device execution */
# define F_STONITH_TIMEOUT "st_timeout"
# define F_STONITH_TOLERANCE "st_tolerance"
# define F_STONITH_DELAY "st_delay"
/*! Action specific timeout period returned in query of fencing devices. */
# define F_STONITH_ACTION_TIMEOUT "st_action_timeout"
/*! Host in query result is not allowed to run this action */
# define F_STONITH_ACTION_DISALLOWED "st_action_disallowed"
/*! Maximum of random fencing delay for a device */
# define F_STONITH_DELAY_MAX "st_delay_max"
/*! Base delay used for a fencing delay */
# define F_STONITH_DELAY_BASE "st_delay_base"
/*! Has this device been verified using a monitor type
* operation (monitor, list, status) */
# define F_STONITH_DEVICE_VERIFIED "st_monitor_verified"
/*! device is required for this action */
# define F_STONITH_DEVICE_REQUIRED "st_required"
/*! number of available devices in query result */
# define F_STONITH_AVAILABLE_DEVICES "st-available-devices"
# define F_STONITH_CALLBACK_TOKEN "st_async_id"
# define F_STONITH_CLIENTNAME "st_clientname"
# define F_STONITH_CLIENTNODE "st_clientnode"
# define F_STONITH_NOTIFY_ACTIVATE "st_notify_activate"
# define F_STONITH_NOTIFY_DEACTIVATE "st_notify_deactivate"
# define F_STONITH_DELEGATE "st_delegate"
/*! The node initiating the stonith operation. If an operation
* is relayed, this is the last node the operation lands on. When
* in standalone mode, origin is the client's id that originated the
* operation. */
# define F_STONITH_ORIGIN "st_origin"
# define F_STONITH_HISTORY_LIST "st_history"
# define F_STONITH_DATE "st_date"
# define F_STONITH_DATE_NSEC "st_date_nsec"
# define F_STONITH_STATE "st_state"
# define F_STONITH_ACTIVE "st_active"
# define F_STONITH_DIFFERENTIAL "st_differential"
# define F_STONITH_DEVICE "st_device_id"
# define F_STONITH_ACTION "st_device_action"
# define F_STONITH_MERGED "st_op_merged"
# define T_STONITH_NG "stonith-ng"
# define T_STONITH_REPLY "st-reply"
/*! For async operations, an event from the server containing
* the total amount of time the server is allowing for the operation
* to take place is returned to the client. */
# define T_STONITH_TIMEOUT_VALUE "st-async-timeout-value"
# define T_STONITH_NOTIFY "st_notify"
# define STONITH_ATTR_ACTION_OP "action"
# define STONITH_OP_EXEC "st_execute"
# define STONITH_OP_TIMEOUT_UPDATE "st_timeout_update"
# define STONITH_OP_QUERY "st_query"
# define STONITH_OP_FENCE "st_fence"
# define STONITH_OP_RELAY "st_relay"
# define STONITH_OP_DEVICE_ADD "st_device_register"
# define STONITH_OP_DEVICE_DEL "st_device_remove"
# define STONITH_OP_FENCE_HISTORY "st_fence_history"
# define STONITH_OP_LEVEL_ADD "st_level_add"
# define STONITH_OP_LEVEL_DEL "st_level_remove"
# define STONITH_WATCHDOG_AGENT "fence_watchdog"
/* Don't change 2 below as it would break rolling upgrade */
# define STONITH_WATCHDOG_AGENT_INTERNAL "#watchdog"
# define STONITH_WATCHDOG_ID "watchdog"
/* Exported for crm_mon to reference */
int stonith__failed_history(pcmk__output_t *out, va_list args);
int stonith__history(pcmk__output_t *out, va_list args);
int stonith__full_history(pcmk__output_t *out, va_list args);
int stonith__pending_actions(pcmk__output_t *out, va_list args);
stonith_history_t *stonith__first_matching_event(stonith_history_t *history,
bool (*matching_fn)(stonith_history_t *, void *),
void *user_data);
bool stonith__event_state_pending(stonith_history_t *history, void *user_data);
bool stonith__event_state_eq(stonith_history_t *history, void *user_data);
bool stonith__event_state_neq(stonith_history_t *history, void *user_data);
+int stonith__legacy2status(int rc);
+
/*!
* \internal
* \brief Is a fencing operation in pending state?
*
* \param[in] state State as enum op_state value
*
* \return A boolean
*/
static inline bool
stonith__op_state_pending(enum op_state state)
{
return state != st_failed && state != st_done;
}
gboolean stonith__watchdog_fencing_enabled_for_node(const char *node);
gboolean stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node);
#endif
diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c
index 64d3afd5d9..9e785595a8 100644
--- a/lib/fencing/st_actions.c
+++ b/lib/fencing/st_actions.c
@@ -1,545 +1,581 @@
/*
* Copyright 2004-2021 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <libgen.h>
#include <inttypes.h>
#include <sys/types.h>
#include <glib.h>
#include <crm/crm.h>
#include <crm/stonith-ng.h>
#include <crm/fencing/internal.h>
#include <crm/msg_xml.h>
#include <crm/services_internal.h>
#include "fencing_private.h"
struct stonith_action_s {
/*! user defined data */
char *agent;
char *action;
char *victim;
GHashTable *args;
int timeout;
int async;
void *userdata;
void (*done_cb) (int pid, const pcmk__action_result_t *result,
void *user_data);
void (*fork_cb) (int pid, void *user_data);
svc_action_t *svc_action;
/*! internal timing information */
time_t initial_start_time;
int tries;
int remaining_timeout;
int max_retries;
int pid;
pcmk__action_result_t result;
};
static int internal_stonith_action_execute(stonith_action_t *action);
static void log_action(stonith_action_t *action, pid_t pid);
/*!
* \internal
* \brief Set an action's result based on services library result
*
* \param[in] action Fence action to set result for
* \param[in] svc_action Service action to get result from
*/
static void
set_result_from_svc_action(stonith_action_t *action, svc_action_t *svc_action)
{
pcmk__set_result(&(action->result), svc_action->rc, svc_action->status,
services__exit_reason(svc_action));
pcmk__set_result_output(&(action->result),
services__grab_stdout(svc_action),
services__grab_stderr(svc_action));
}
static void
log_action(stonith_action_t *action, pid_t pid)
{
/* The services library has already logged the output at info or debug
* level, so just raise to warning for stderr.
*/
if (action->result.action_stderr != NULL) {
/* Logging the whole string confuses syslog when the string is xml */
char *prefix = crm_strdup_printf("%s[%d] stderr:", action->agent, pid);
crm_log_output(LOG_WARNING, prefix, action->result.action_stderr);
free(prefix);
}
}
static void
append_config_arg(gpointer key, gpointer value, gpointer user_data)
{
/* The fencer will filter "action" out when it registers the device,
* but ignore it here in case any external API users don't.
*
* Also filter out parameters handled directly by Pacemaker.
*/
if (!pcmk__str_eq(key, STONITH_ATTR_ACTION_OP, pcmk__str_casei)
&& !pcmk_stonith_param(key)
&& (strstr(key, CRM_META) == NULL)
&& !pcmk__str_eq(key, "crm_feature_set", pcmk__str_casei)) {
crm_trace("Passing %s=%s with fence action",
(const char *) key, (const char *) (value? value : ""));
g_hash_table_insert((GHashTable *) user_data,
strdup(key), strdup(value? value : ""));
}
}
static GHashTable *
make_args(const char *agent, const char *action, const char *victim,
uint32_t victim_nodeid, GHashTable * device_args,
GHashTable * port_map, const char *host_arg)
{
GHashTable *arg_list = NULL;
const char *value = NULL;
CRM_CHECK(action != NULL, return NULL);
arg_list = pcmk__strkey_table(free, free);
// Add action to arguments (using an alias if requested)
if (device_args) {
char buffer[512];
snprintf(buffer, sizeof(buffer), "pcmk_%s_action", action);
value = g_hash_table_lookup(device_args, buffer);
if (value) {
crm_debug("Substituting '%s' for fence action %s targeting %s",
value, action, victim);
action = value;
}
}
g_hash_table_insert(arg_list, strdup(STONITH_ATTR_ACTION_OP),
strdup(action));
/* If this is a fencing operation against another node, add more standard
* arguments.
*/
if (victim && device_args) {
const char *param = NULL;
/* Always pass the target's name, per
* https://github.com/ClusterLabs/fence-agents/blob/master/doc/FenceAgentAPI.md
*/
g_hash_table_insert(arg_list, strdup("nodename"), strdup(victim));
// If the target's node ID was specified, pass it, too
if (victim_nodeid) {
char *nodeid = crm_strdup_printf("%" PRIu32, victim_nodeid);
// cts-fencing looks for this log message
crm_info("Passing '%s' as nodeid with fence action '%s' targeting %s",
nodeid, action, victim);
g_hash_table_insert(arg_list, strdup("nodeid"), nodeid);
}
// Check whether target must be specified in some other way
param = g_hash_table_lookup(device_args, PCMK_STONITH_HOST_ARGUMENT);
if (!pcmk__str_eq(agent, "fence_legacy", pcmk__str_none)
&& !pcmk__str_eq(param, "none", pcmk__str_casei)) {
if (param == NULL) {
/* Use the caller's default for pcmk_host_argument, or "port" if
* none was given
*/
param = (host_arg == NULL)? "port" : host_arg;
}
value = g_hash_table_lookup(device_args, param);
if (pcmk__str_eq(value, "dynamic",
pcmk__str_casei|pcmk__str_null_matches)) {
/* If the host argument was "dynamic" or not explicitly specified,
* add it with the target
*/
const char *alias = NULL;
if (port_map) {
alias = g_hash_table_lookup(port_map, victim);
}
if (alias == NULL) {
alias = victim;
}
crm_debug("Passing %s='%s' with fence action %s targeting %s",
param, alias, action, victim);
g_hash_table_insert(arg_list, strdup(param), strdup(alias));
}
}
}
if (device_args) {
g_hash_table_foreach(device_args, append_config_arg, arg_list);
}
return arg_list;
}
/*!
* \internal
* \brief Free all memory used by a stonith action
*
* \param[in,out] action Action to free
*/
void
stonith__destroy_action(stonith_action_t *action)
{
if (action) {
free(action->agent);
if (action->args) {
g_hash_table_destroy(action->args);
}
free(action->action);
free(action->victim);
if (action->svc_action) {
services_action_free(action->svc_action);
}
pcmk__reset_result(&(action->result));
free(action);
}
}
/*!
* \internal
* \brief Get the result of an executed stonith action
*
* \param[in] action Executed action
*
* \return Pointer to action's result (or NULL if \p action is NULL)
*/
pcmk__action_result_t *
stonith__action_result(stonith_action_t *action)
{
return (action == NULL)? NULL : &(action->result);
}
#define FAILURE_MAX_RETRIES 2
stonith_action_t *
stonith_action_create(const char *agent,
const char *_action,
const char *victim,
uint32_t victim_nodeid,
int timeout, GHashTable * device_args,
GHashTable * port_map, const char *host_arg)
{
stonith_action_t *action;
action = calloc(1, sizeof(stonith_action_t));
action->args = make_args(agent, _action, victim, victim_nodeid,
device_args, port_map, host_arg);
crm_debug("Preparing '%s' action for %s using agent %s",
_action, (victim? victim : "no target"), agent);
action->agent = strdup(agent);
action->action = strdup(_action);
if (victim) {
action->victim = strdup(victim);
}
action->timeout = action->remaining_timeout = timeout;
action->max_retries = FAILURE_MAX_RETRIES;
pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN,
"Initialization bug in fencing library");
if (device_args) {
char buffer[512];
const char *value = NULL;
snprintf(buffer, sizeof(buffer), "pcmk_%s_retries", _action);
value = g_hash_table_lookup(device_args, buffer);
if (value) {
action->max_retries = atoi(value);
}
}
return action;
}
static gboolean
update_remaining_timeout(stonith_action_t * action)
{
int diff = time(NULL) - action->initial_start_time;
if (action->tries >= action->max_retries) {
crm_info("Attempted to execute agent %s (%s) the maximum number of times (%d) allowed",
action->agent, action->action, action->max_retries);
action->remaining_timeout = 0;
} else if ((action->result.execution_status != PCMK_EXEC_TIMEOUT)
&& (diff < (action->timeout * 0.7))) {
/* only set remaining timeout period if there is 30%
* or greater of the original timeout period left */
action->remaining_timeout = action->timeout - diff;
} else {
action->remaining_timeout = 0;
}
return action->remaining_timeout ? TRUE : FALSE;
}
/*!
* \internal
* \brief Map a fencing action result to a standard return code
*
* \param[in] result Fencing action result to map
*
* \return Standard Pacemaker return code that best corresponds to \p result
*/
int
stonith__result2rc(const pcmk__action_result_t *result)
{
if (pcmk__result_ok(result)) {
return pcmk_rc_ok;
}
switch (result->execution_status) {
case PCMK_EXEC_PENDING: return EINPROGRESS;
case PCMK_EXEC_CANCELLED: return ECANCELED;
case PCMK_EXEC_TIMEOUT: return ETIME;
case PCMK_EXEC_NOT_INSTALLED: return ENOENT;
case PCMK_EXEC_NOT_SUPPORTED: return EOPNOTSUPP;
case PCMK_EXEC_NOT_CONNECTED: return ENOTCONN;
case PCMK_EXEC_NO_FENCE_DEVICE: return ENODEV;
case PCMK_EXEC_NO_SECRETS: return EACCES;
/* For the fencing API, PCMK_EXEC_INVALID is used with fencer API
* operations that don't involve executing an agent (for example,
* registering devices). This allows us to use the CRM_EX_* codes in the
* exit status for finer-grained responses.
*/
case PCMK_EXEC_INVALID:
switch (result->exit_status) {
case CRM_EX_INSUFFICIENT_PRIV: return EACCES;
case CRM_EX_PROTOCOL: return EPROTO;
/* CRM_EX_EXPIRED is used for orphaned fencing operations left
* over from a previous instance of the fencer. For API backward
* compatibility, this is mapped to the previously used code for
* this case, EHOSTUNREACH.
*/
case CRM_EX_EXPIRED: return EHOSTUNREACH;
default: break;
}
default:
break;
}
// Try to provide useful error code based on result's error output
if (result->action_stderr == NULL) {
return ENODATA;
} else if (strcasestr(result->action_stderr, "timed out")
|| strcasestr(result->action_stderr, "timeout")) {
return ETIME;
} else if (strcasestr(result->action_stderr, "unrecognised action")
|| strcasestr(result->action_stderr, "unrecognized action")
|| strcasestr(result->action_stderr, "unsupported action")) {
return EOPNOTSUPP;
}
// Oh well, we tried
return pcmk_rc_error;
}
+/*!
+ * \internal
+ * \brief Determine execution status equivalent of legacy fencer return code
+ *
+ * Fence action notifications, and fence action callbacks from older fencers
+ * (<=2.1.2) in a rolling upgrade, will have only a legacy return code. Map this
+ * to an execution status as best as possible (essentially, the inverse of
+ * stonith__result2rc()).
+ *
+ * \param[in] rc Legacy return code from fencer
+ *
+ * \return Execution status best corresponding to \p rc
+ */
+int
+stonith__legacy2status(int rc)
+{
+ if (rc >= 0) {
+ return PCMK_EXEC_DONE;
+ }
+ switch (-rc) {
+ case EACCES: return PCMK_EXEC_NO_SECRETS;
+ case ECANCELED: return PCMK_EXEC_CANCELLED;
+ case EHOSTUNREACH: return PCMK_EXEC_INVALID;
+ case EINPROGRESS: return PCMK_EXEC_PENDING;
+ case ENODEV: return PCMK_EXEC_NO_FENCE_DEVICE;
+ case ENOENT: return PCMK_EXEC_NOT_INSTALLED;
+ case ENOTCONN: return PCMK_EXEC_NOT_CONNECTED;
+ case EOPNOTSUPP: return PCMK_EXEC_NOT_SUPPORTED;
+ case EPROTO: return PCMK_EXEC_INVALID;
+ case EPROTONOSUPPORT: return PCMK_EXEC_NOT_SUPPORTED;
+ case ETIME: return PCMK_EXEC_TIMEOUT;
+ case ETIMEDOUT: return PCMK_EXEC_TIMEOUT;
+ default: return PCMK_EXEC_ERROR;
+ }
+}
+
static void
stonith_action_async_done(svc_action_t *svc_action)
{
stonith_action_t *action = (stonith_action_t *) svc_action->cb_data;
set_result_from_svc_action(action, svc_action);
svc_action->params = NULL;
crm_debug("Child process %d performing action '%s' exited with rc %d",
action->pid, action->action, svc_action->rc);
log_action(action, action->pid);
if (!pcmk__result_ok(&(action->result))
&& update_remaining_timeout(action)) {
int rc = internal_stonith_action_execute(action);
if (rc == pcmk_ok) {
return;
}
}
if (action->done_cb) {
action->done_cb(action->pid, &(action->result), action->userdata);
}
action->svc_action = NULL; // don't remove our caller
stonith__destroy_action(action);
}
static void
stonith_action_async_forked(svc_action_t *svc_action)
{
stonith_action_t *action = (stonith_action_t *) svc_action->cb_data;
action->pid = svc_action->pid;
action->svc_action = svc_action;
if (action->fork_cb) {
(action->fork_cb) (svc_action->pid, action->userdata);
}
crm_trace("Child process %d performing action '%s' successfully forked",
action->pid, action->action);
}
static int
internal_stonith_action_execute(stonith_action_t * action)
{
int rc = -EPROTO;
int is_retry = 0;
svc_action_t *svc_action = NULL;
static int stonith_sequence = 0;
char *buffer = NULL;
CRM_CHECK(action != NULL, return -EINVAL);
if ((action->action == NULL) || (action->args == NULL)
|| (action->agent == NULL)) {
pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN_ERROR,
PCMK_EXEC_ERROR_FATAL, "Bug in fencing library");
return -EINVAL;
}
if (!action->tries) {
action->initial_start_time = time(NULL);
}
action->tries++;
if (action->tries > 1) {
crm_info("Attempt %d to execute %s (%s). remaining timeout is %d",
action->tries, action->agent, action->action, action->remaining_timeout);
is_retry = 1;
}
buffer = crm_strdup_printf(PCMK__FENCE_BINDIR "/%s",
basename(action->agent));
svc_action = services_action_create_generic(buffer, NULL);
free(buffer);
if (svc_action->rc != PCMK_OCF_UNKNOWN) {
set_result_from_svc_action(action, svc_action);
services_action_free(svc_action);
return -E2BIG;
}
svc_action->timeout = 1000 * action->remaining_timeout;
svc_action->standard = strdup(PCMK_RESOURCE_CLASS_STONITH);
svc_action->id = crm_strdup_printf("%s_%s_%d", basename(action->agent),
action->action, action->tries);
svc_action->agent = strdup(action->agent);
svc_action->sequence = stonith_sequence++;
svc_action->params = action->args;
svc_action->cb_data = (void *) action;
svc_action->flags = pcmk__set_flags_as(__func__, __LINE__,
LOG_TRACE, "Action",
svc_action->id, svc_action->flags,
SVC_ACTION_NON_BLOCKED,
"SVC_ACTION_NON_BLOCKED");
/* keep retries from executing out of control and free previous results */
if (is_retry) {
pcmk__reset_result(&(action->result));
sleep(1);
}
if (action->async) {
/* async */
if (services_action_async_fork_notify(svc_action,
&stonith_action_async_done,
&stonith_action_async_forked)) {
pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN,
PCMK_EXEC_PENDING, NULL);
return pcmk_ok;
}
} else if (services_action_sync(svc_action)) { // sync success
rc = pcmk_ok;
} else { // sync failure
rc = -ECONNABORTED;
}
set_result_from_svc_action(action, svc_action);
svc_action->params = NULL;
services_action_free(svc_action);
return rc;
}
/*!
* \internal
* \brief Kick off execution of an async stonith action
*
* \param[in,out] action Action to be executed
* \param[in,out] userdata Datapointer to be passed to callbacks
* \param[in] done Callback to notify action has failed/succeeded
* \param[in] fork_callback Callback to notify successful fork of child
*
* \return pcmk_ok if ownership of action has been taken, -errno otherwise
*/
int
stonith_action_execute_async(stonith_action_t * action,
void *userdata,
void (*done) (int pid,
const pcmk__action_result_t *result,
void *user_data),
void (*fork_cb) (int pid, void *user_data))
{
if (!action) {
return -EINVAL;
}
action->userdata = userdata;
action->done_cb = done;
action->fork_cb = fork_cb;
action->async = 1;
return internal_stonith_action_execute(action);
}
/*!
* \internal
* \brief Execute a stonith action
*
* \param[in,out] action Action to execute
*
* \return pcmk_ok on success, -errno otherwise
*/
int
stonith__execute(stonith_action_t *action)
{
int rc = pcmk_ok;
CRM_CHECK(action != NULL, return -EINVAL);
// Keep trying until success, max retries, or timeout
do {
rc = internal_stonith_action_execute(action);
} while ((rc != pcmk_ok) && update_remaining_timeout(action));
return rc;
}

File Metadata

Mime Type
text/x-diff
Expires
Tue, Jul 8, 6:22 PM (17 h, 22 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2002602
Default Alt Text
(91 KB)

Event Timeline