Page Menu
Home
ClusterLabs Projects
Search
Configure Global Search
Log In
Files
F4837927
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
63 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c
index e8509c0c88..49c09f653a 100644
--- a/daemons/controld/controld_te_events.c
+++ b/daemons/controld/controld_te_events.c
@@ -1,613 +1,613 @@
/*
* Copyright 2004-2024 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <sys/param.h>
#include <crm/crm.h>
#include <crm/cib.h>
#include <crm/common/xml.h>
#include <pacemaker-controld.h>
#include <crm/common/attrs_internal.h>
#include <crm/common/ipc_attrd_internal.h>
/*!
* \internal
* \brief Action numbers of outside events processed in current update diff
*
* This table is to be used as a set. It should be empty when the transitioner
* begins processing a CIB update diff. It ensures that if there are multiple
* events (for example, "_last_0" and "_last_failure_0") for the same action,
* only one of them updates the failcount. Events that originate outside the
* cluster can't be confirmed, since they're not in the transition graph.
*
* The table is created lazily by record_outside_event(), so it is NULL until
* the first outside event is recorded (and again after it is destroyed).
*/
static GHashTable *outside_events = NULL;
/*!
 * \internal
 * \brief Forget every outside-event action number recorded so far
 *
 * The table itself is kept for reuse. Nothing happens if the table has not
 * been created yet.
 */
void
controld_remove_all_outside_events(void)
{
    if (outside_events == NULL) {
        return;
    }
    g_hash_table_remove_all(outside_events);
}
/*!
 * \internal
 * \brief Free the set of outside-event action numbers
 *
 * The pointer is reset to NULL afterward. Nothing happens if the table does
 * not exist.
 */
void
controld_destroy_outside_events_table(void)
{
    if (outside_events == NULL) {
        return;
    }
    g_hash_table_destroy(outside_events);
    outside_events = NULL;
}
/*!
 * \internal
 * \brief Remember that an outside event's action number has been seen
 *
 * The backing table is allocated on first use.
 *
 * \param[in] action_num  Action number to add to the set
 *
 * \return Standard Pacemaker return code. Specifically, \p pcmk_rc_ok if the
 *         action number was newly added, or \p pcmk_rc_already if it was
 *         already present.
 */
static int
record_outside_event(gint action_num)
{
    gboolean newly_added = FALSE;

    if (outside_events == NULL) {
        outside_events = g_hash_table_new(NULL, NULL);
    }

    newly_added = g_hash_table_add(outside_events,
                                   GINT_TO_POINTER(action_num));
    return newly_added? pcmk_rc_ok : pcmk_rc_already;
}
/*!
 * \internal
 * \brief Fail unconfirmed graph actions that depend on a node that went down
 *
 * Every synapse that is neither confirmed nor failed is examined. Pseudo
 * actions, already confirmed actions, and cluster-level fencing actions are
 * skipped. Any remaining action whose target node UUID or router node UUID
 * matches \p down_node is marked failed (along with its synapse), its timer is
 * stopped, and the graph is updated. If anything was failed, the transition is
 * aborted.
 *
 * \param[in,out] graph      Transition graph to check (may be NULL)
 * \param[in]     down_node  UUID of the node that is down
 *
 * \return TRUE if at least one action was failed, otherwise FALSE
 */
gboolean
fail_incompletable_actions(pcmk__graph_t *graph, const char *down_node)
{
    xmlNode *last_action = NULL;

    if ((graph == NULL) || graph->complete) {
        return FALSE;
    }

    for (GList *gIter = graph->synapses; gIter != NULL; gIter = gIter->next) {
        pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) gIter->data;

        if (pcmk_any_flags_set(synapse->flags, pcmk__synapse_confirmed|pcmk__synapse_failed)) {
            /* We've already been here */
            continue;
        }

        for (GList *gIter2 = synapse->actions; gIter2 != NULL;
             gIter2 = gIter2->next) {

            pcmk__graph_action_t *action = (pcmk__graph_action_t *) gIter2->data;

            /* These are scoped per action on purpose: a router UUID resolved
             * for one action must never leak into the check for the next
             * action, which may have no router node at all.
             */
            const char *target_uuid = NULL;
            const char *router = NULL;
            const char *router_uuid = NULL;

            if ((action->type == pcmk__pseudo_graph_action)
                || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) {
                continue;

            } else if (action->type == pcmk__cluster_graph_action) {
                const char *task = crm_element_value(action->xml,
                                                     PCMK_XA_OPERATION);

                if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) {
                    continue;
                }
            }

            target_uuid = crm_element_value(action->xml,
                                            PCMK__META_ON_NODE_UUID);
            router = crm_element_value(action->xml, PCMK__XA_ROUTER_NODE);
            if (router != NULL) {
                const crm_node_t *node =
                    pcmk__get_node(0, router, NULL,
                                   pcmk__node_search_cluster_member);

                if (node != NULL) {
                    router_uuid = node->uuid;
                }
            }

            if (pcmk__str_eq(target_uuid, down_node, pcmk__str_casei)
                || pcmk__str_eq(router_uuid, down_node, pcmk__str_casei)) {

                pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
                pcmk__set_synapse_flags(synapse, pcmk__synapse_failed);
                last_action = action->xml;
                stop_te_timer(action);
                pcmk__update_graph(graph, action);

                if (pcmk_is_set(synapse->flags, pcmk__synapse_executed)) {
                    crm_notice("Action %d (%s) was pending on %s (offline)",
                               action->id,
                               crm_element_value(action->xml,
                                                 PCMK__XA_OPERATION_KEY),
                               down_node);
                } else {
                    crm_info("Action %d (%s) is scheduled for %s (offline)",
                             action->id,
                             crm_element_value(action->xml, PCMK__XA_OPERATION_KEY),
                             down_node);
                }
            }
        }
    }

    if (last_action != NULL) {
        crm_info("Node %s shutdown resulted in un-runnable actions", down_node);
        abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
                         "Node failure", last_action);
        return TRUE;
    }

    return FALSE;
}
/*!
* \internal
* \brief Update failure-related node attributes if warranted
*
* When an update is warranted, the resource's last-failure attribute is always
* set to the current time, and (unless \p ignore_failures is set) its fail
* count is either incremented or set to infinity. Both are sent in a single
* update_attrd_list() call.
*
* \param[in] event XML describing operation that (maybe) failed
* \param[in] event_node_uuid Node that event occurred on
* \param[in] rc Actual operation return code
* \param[in] target_rc Expected operation return code
* \param[in] do_update If TRUE, do update regardless of operation type
* \param[in] ignore_failures If TRUE, update last failure but not fail count
*
* \return FALSE if \p rc equals \p target_rc or the event originated from
* "build_active_RAs" (an lrm status refresh); TRUE otherwise, including
* when the node name or operation key could not be determined
*/
static gboolean
update_failcount(const xmlNode *event, const char *event_node_uuid, int rc,
int target_rc, gboolean do_update, gboolean ignore_failures)
{
guint interval_ms = 0;
char *task = NULL;
char *rsc_id = NULL;
const char *value = NULL;
const char *id = crm_element_value(event, PCMK__XA_OPERATION_KEY);
- const char *on_uname = crm_peer_uname(event_node_uuid);
+ const char *on_uname = pcmk__node_name_from_uuid(event_node_uuid);
const char *origin = crm_element_value(event, PCMK_XA_CRM_DEBUG_ORIGIN);
// Nothing needs to be done for success or status refresh
if (rc == target_rc) {
return FALSE;
} else if (pcmk__str_eq(origin, "build_active_RAs", pcmk__str_casei)) {
crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh",
id, rc, on_uname);
return FALSE;
}
/* Sanity check */
CRM_CHECK(on_uname != NULL, return TRUE);
// On success, rsc_id and task are allocated here and freed at "bail"
CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval_ms),
crm_err("Couldn't parse: %s", pcmk__xe_id(event)); goto bail);
/* Decide whether update is necessary and what value to use */
// Recurring, promote, and demote failures always update; value stays NULL
if ((interval_ms > 0)
|| pcmk__str_eq(task, PCMK_ACTION_PROMOTE, pcmk__str_none)
|| pcmk__str_eq(task, PCMK_ACTION_DEMOTE, pcmk__str_none)) {
do_update = TRUE;
} else if (pcmk__str_eq(task, PCMK_ACTION_START, pcmk__str_none)) {
do_update = TRUE;
// Use the graph's failed-start offset, defaulting to infinity if unset
value = pcmk__s(controld_globals.transition_graph->failed_start_offset,
PCMK_VALUE_INFINITY);
} else if (pcmk__str_eq(task, PCMK_ACTION_STOP, pcmk__str_none)) {
do_update = TRUE;
// Use the graph's failed-stop offset, defaulting to infinity if unset
value = pcmk__s(controld_globals.transition_graph->failed_stop_offset,
PCMK_VALUE_INFINITY);
}
if (do_update) {
pcmk__attrd_query_pair_t *fail_pair = NULL;
pcmk__attrd_query_pair_t *last_pair = NULL;
char *fail_name = NULL;
char *last_name = NULL;
GList *attrs = NULL;
uint32_t opts = pcmk__node_attr_none;
char *now = pcmk__ttoa(time(NULL));
// Fail count will be either incremented or set to infinity
if (!pcmk_str_is_infinity(value)) {
value = PCMK_XA_VALUE "++";
}
// Nodes found in the remote peer cache need the remote attribute option
if (g_hash_table_lookup(crm_remote_peer_cache, event_node_uuid)) {
opts |= pcmk__node_attr_remote;
}
crm_info("Updating %s for %s on %s after failed %s: rc=%d (update=%s, time=%s)",
(ignore_failures? "last failure" : "failcount"),
rsc_id, on_uname, task, rc, value, now);
/* Update the fail count, if we're not ignoring failures */
if (!ignore_failures) {
fail_pair = pcmk__assert_alloc(1, sizeof(pcmk__attrd_query_pair_t));
fail_name = pcmk__failcount_name(rsc_id, task, interval_ms);
fail_pair->name = fail_name;
fail_pair->value = value;
fail_pair->node = on_uname;
attrs = g_list_prepend(attrs, fail_pair);
}
/* Update the last failure time (even if we're ignoring failures,
* so that failure can still be detected and shown, e.g. by crm_mon)
*/
last_pair = pcmk__assert_alloc(1, sizeof(pcmk__attrd_query_pair_t));
last_name = pcmk__lastfailure_name(rsc_id, task, interval_ms);
last_pair->name = last_name;
last_pair->value = now;
last_pair->node = on_uname;
attrs = g_list_prepend(attrs, last_pair);
update_attrd_list(attrs, opts);
// g_list_free() releases only the list links, so free the pairs and names
// individually (fail_name and fail_pair are NULL when failures are ignored)
free(fail_name);
free(fail_pair);
free(last_name);
free(last_pair);
g_list_free(attrs);
free(now);
}
bail:
free(rsc_id);
free(task);
return TRUE;
}
pcmk__graph_action_t *
controld_get_action(int id)
{
for (GList *item = controld_globals.transition_graph->synapses;
item != NULL; item = item->next) {
pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) item->data;
for (GList *item2 = synapse->actions; item2; item2 = item2->next) {
pcmk__graph_action_t *action = (pcmk__graph_action_t *) item2->data;
if (action->id == id) {
return action;
}
}
}
return NULL;
}
pcmk__graph_action_t *
get_cancel_action(const char *id, const char *node)
{
GList *gIter = NULL;
GList *gIter2 = NULL;
gIter = controld_globals.transition_graph->synapses;
for (; gIter != NULL; gIter = gIter->next) {
pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) gIter->data;
gIter2 = synapse->actions;
for (; gIter2 != NULL; gIter2 = gIter2->next) {
const char *task = NULL;
const char *target = NULL;
pcmk__graph_action_t *action = (pcmk__graph_action_t *) gIter2->data;
task = crm_element_value(action->xml, PCMK_XA_OPERATION);
if (!pcmk__str_eq(PCMK_ACTION_CANCEL, task, pcmk__str_casei)) {
continue;
}
task = crm_element_value(action->xml, PCMK__XA_OPERATION_KEY);
if (!pcmk__str_eq(task, id, pcmk__str_casei)) {
crm_trace("Wrong key %s for %s on %s", task, id, node);
continue;
}
target = crm_element_value(action->xml, PCMK__META_ON_NODE_UUID);
if (node && !pcmk__str_eq(target, node, pcmk__str_casei)) {
crm_trace("Wrong node %s for %s on %s", target, id, node);
continue;
}
crm_trace("Found %s on %s", id, node);
return action;
}
}
return NULL;
}
bool
confirm_cancel_action(const char *id, const char *node_id)
{
const char *op_key = NULL;
const char *node_name = NULL;
pcmk__graph_action_t *cancel = get_cancel_action(id, node_id);
if (cancel == NULL) {
return FALSE;
}
op_key = crm_element_value(cancel->xml, PCMK__XA_OPERATION_KEY);
node_name = crm_element_value(cancel->xml, PCMK__META_ON_NODE);
stop_te_timer(cancel);
te_action_confirmed(cancel, controld_globals.transition_graph);
crm_info("Cancellation of %s on %s confirmed (action %d)",
op_key, node_name, cancel->id);
return TRUE;
}
/* downed nodes are listed like: <downed> <node id="UUID1" /> ... </downed> */
#define XPATH_DOWNED "//" PCMK__XE_DOWNED \
"/" PCMK_XE_NODE "[@" PCMK_XA_ID "='%s']"
/*!
* \brief Find a transition event that would have made a specified node down
*
* \param[in] target UUID of node to match
*
* \return Matching event if found, NULL otherwise
*/
pcmk__graph_action_t *
match_down_event(const char *target)
{
pcmk__graph_action_t *match = NULL;
xmlXPathObjectPtr xpath_ret = NULL;
GList *gIter, *gIter2;
char *xpath = crm_strdup_printf(XPATH_DOWNED, target);
for (gIter = controld_globals.transition_graph->synapses;
gIter != NULL && match == NULL;
gIter = gIter->next) {
for (gIter2 = ((pcmk__graph_synapse_t * ) gIter->data)->actions;
gIter2 != NULL && match == NULL;
gIter2 = gIter2->next) {
match = (pcmk__graph_action_t *) gIter2->data;
if (pcmk_is_set(match->flags, pcmk__graph_action_executed)) {
xpath_ret = xpath_search(match->xml, xpath);
if (numXpathResults(xpath_ret) < 1) {
match = NULL;
}
freeXpathObject(xpath_ret);
} else {
// Only actions that were actually started can match
match = NULL;
}
}
}
free(xpath);
if (match != NULL) {
crm_debug("Shutdown action %d (%s) found for node %s", match->id,
crm_element_value(match->xml, PCMK__XA_OPERATION_KEY),
target);
} else {
crm_debug("No reason to expect node %s to be down", target);
}
return match;
}
/*!
* \internal
* \brief Process an operation result event against the transition graph
*
* The event's transition key is decoded and the event is classified as
* initiated outside the cluster, initiated by a different DC, belonging to an
* inactive or older transition, or belonging to the current transition. All
* but the last abort the transition; in the last case the matching graph
* action is confirmed. The result is then logged, and failure attributes are
* updated via update_failcount() where applicable.
*
* \param[in] event Operation history XML to process (must not be NULL)
* \param[in] event_node Passed to update_failcount() as the UUID of the node
* that the event occurred on
*/
void
process_graph_event(xmlNode *event, const char *event_node)
{
int rc = -1; // Actual result
int target_rc = -1; // Expected result
int status = -1; // Executor status
int callid = -1; // Executor call ID
int transition_num = -1; // Transition number
int action_num = -1; // Action number within transition
char *update_te_uuid = NULL;
bool ignore_failures = FALSE;
const char *id = NULL;
const char *desc = NULL;
const char *magic = NULL;
const char *uname = NULL;
CRM_ASSERT(event != NULL);
/*
<lrm_rsc_op id="rsc_east-05_last_0" operation_key="rsc_east-05_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" transition-key="9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa" transition-magic="0:7;9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa" call-id="17" rc-code="7" op-status="0" interval="0" last-rc-change="1355361636" exec-time="128" queue-time="0" op-digest="c81f5f40b1c9e859c992e800b1aa6972"/>
*/
// Despite its name, this variable holds the transition key attribute
magic = crm_element_value(event, PCMK__XA_TRANSITION_KEY);
if (magic == NULL) {
/* non-change */
return;
}
crm_element_value_int(event, PCMK__XA_OP_STATUS, &status);
// Results for operations that are still pending are not processed
if (status == PCMK_EXEC_PENDING) {
return;
}
id = crm_element_value(event, PCMK__XA_OPERATION_KEY);
crm_element_value_int(event, PCMK__XA_RC_CODE, &rc);
crm_element_value_int(event, PCMK__XA_CALL_ID, &callid);
rc = pcmk__effective_rc(rc);
// update_te_uuid is filled in here and is freed at "bail"
if (decode_transition_key(magic, &update_te_uuid, &transition_num,
&action_num, &target_rc) == FALSE) {
// decode_transition_key() already logged the bad key
crm_err("Can't process action %s result: Incompatible versions? "
CRM_XS " call-id=%d", id, callid);
abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
"Bad event", event);
return;
}
if (transition_num == -1) {
// E.g. crm_resource --fail
// Process each outside event's action number only once
if (record_outside_event(action_num) != pcmk_rc_ok) {
crm_debug("Outside event with transition key '%s' has already been "
"processed", magic);
goto bail;
}
desc = "initiated outside of the cluster";
abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
"Unexpected event", event);
} else if ((action_num < 0)
|| !pcmk__str_eq(update_te_uuid, controld_globals.te_uuid,
pcmk__str_none)) {
desc = "initiated by a different DC";
abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
"Foreign event", event);
} else if ((controld_globals.transition_graph->id != transition_num)
|| controld_globals.transition_graph->complete) {
// Action is not from currently active transition
guint interval_ms = 0;
if (parse_op_key(id, NULL, NULL, &interval_ms)
&& (interval_ms != 0)) {
/* Recurring actions have the transition number they were first
* scheduled in.
*/
if (status == PCMK_EXEC_CANCELLED) {
confirm_cancel_action(id, get_node_id(event));
goto bail;
}
desc = "arrived after initial scheduling";
abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
"Change in recurring result", event);
} else if (controld_globals.transition_graph->id != transition_num) {
desc = "arrived really late";
abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
"Old event", event);
} else {
// Same transition number, so the graph must already be complete
desc = "arrived late";
abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
"Inactive graph", event);
}
} else {
// Event is result of an action from currently active transition
pcmk__graph_action_t *action = controld_get_action(action_num);
if (action == NULL) {
// Should never happen
desc = "unknown";
abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
"Unknown event", event);
} else if (pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) {
/* Nothing further needs to be done if the action has already been
* confirmed. This can happen e.g. when processing both an
* "xxx_last_0" or "xxx_last_failure_0" record as well as the main
* history record, which would otherwise result in incorrectly
* bumping the fail count twice.
*/
crm_log_xml_debug(event, "Event already confirmed:");
goto bail;
} else {
/* An action result needs to be confirmed.
* (This is the only case where desc == NULL.)
*/
// With on-fail=ignore, the action is never flagged as failed here
if (pcmk__str_eq(crm_meta_value(action->params, PCMK_META_ON_FAIL),
PCMK_VALUE_IGNORE, pcmk__str_casei)) {
ignore_failures = TRUE;
} else if (rc != target_rc) {
pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
}
stop_te_timer(action);
te_action_confirmed(action, controld_globals.transition_graph);
if (pcmk_is_set(action->flags, pcmk__graph_action_failed)) {
abort_transition(action->synapse->priority + 1,
pcmk__graph_restart, "Event failed", event);
}
}
}
// Substitute placeholders so the log messages below never print NULL
if (id == NULL) {
id = "unknown action";
}
uname = crm_element_value(event, PCMK__META_ON_NODE);
if (uname == NULL) {
uname = "unknown node";
}
if (status == PCMK_EXEC_INVALID) {
// We couldn't attempt the action
crm_info("Transition %d action %d (%s on %s): %s",
transition_num, action_num, id, uname,
pcmk_exec_status_str(status));
// For unconfirmable events, failures are never ignored (FALSE is passed)
} else if (desc && update_failcount(event, event_node, rc, target_rc,
(transition_num == -1), FALSE)) {
crm_notice("Transition %d action %d (%s on %s): expected '%s' but got '%s' "
CRM_XS " target-rc=%d rc=%d call-id=%d event='%s'",
transition_num, action_num, id, uname,
services_ocf_exitcode_str(target_rc),
services_ocf_exitcode_str(rc),
target_rc, rc, callid, desc);
} else if (desc) {
crm_info("Transition %d action %d (%s on %s): %s "
CRM_XS " rc=%d target-rc=%d call-id=%d",
transition_num, action_num, id, uname,
desc, rc, target_rc, callid);
} else if (rc == target_rc) {
crm_info("Transition %d action %d (%s on %s) confirmed: %s "
CRM_XS " rc=%d call-id=%d",
transition_num, action_num, id, uname,
services_ocf_exitcode_str(rc), rc, callid);
} else {
// Confirmed action with an unexpected result
update_failcount(event, event_node, rc, target_rc,
(transition_num == -1), ignore_failures);
crm_notice("Transition %d action %d (%s on %s): expected '%s' but got '%s' "
CRM_XS " target-rc=%d rc=%d call-id=%d",
transition_num, action_num, id, uname,
services_ocf_exitcode_str(target_rc),
services_ocf_exitcode_str(rc),
target_rc, rc, callid);
}
bail:
free(update_te_uuid);
}
diff --git a/daemons/controld/controld_te_utils.c b/daemons/controld/controld_te_utils.c
index de3ccd9607..3e7120933b 100644
--- a/daemons/controld/controld_te_utils.c
+++ b/daemons/controld/controld_te_utils.c
@@ -1,507 +1,507 @@
/*
* Copyright 2004-2024 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <crm/crm.h>
#include <crm/common/xml.h>
#include <pacemaker-controld.h>
//! Triggers transition graph processing
static crm_trigger_t *transition_trigger = NULL;
//! Node pending timers (struct abort_timer_s), keyed by a copy of the node UUID
static GHashTable *node_pending_timers = NULL;
/*!
 * \internal
 * \brief Stop a graph action's timer, if it is running
 *
 * \param[in,out] action  Action whose timer should be stopped (may be NULL)
 *
 * \return TRUE if a running timer was stopped, otherwise FALSE
 */
gboolean
stop_te_timer(pcmk__graph_action_t *action)
{
    if (action == NULL) {
        return FALSE;
    }

    if (action->timer == 0) {
        crm_trace("Action timer was already stopped");
        return FALSE;
    }

    crm_trace("Stopping action timer");
    g_source_remove(action->timer);
    action->timer = 0;
    return TRUE;
}
/*!
 * \internal
 * \brief Trigger callback that executes the current transition graph
 *
 * Nothing is done if there is no graph or if the controller is in a state in
 * which graphs are not processed. An incomplete graph is executed with a
 * temporarily throttled batch limit. Once the graph is no longer active or
 * pending, it is marked complete and the controller is notified.
 *
 * \param[in] user_data  Ignored
 *
 * \return Always TRUE
 */
static gboolean
te_graph_trigger(gpointer user_data)
{
    if (controld_globals.transition_graph == NULL) {
        crm_debug("Nothing to do");
        return TRUE;
    }

    crm_trace("Invoking graph %d in state %s",
              controld_globals.transition_graph->id,
              fsa_state2string(controld_globals.fsa_state));

    switch (controld_globals.fsa_state) {
        case S_STARTING:
        case S_PENDING:
        case S_NOT_DC:
        case S_HALT:
        case S_ILLEGAL:
        case S_STOPPING:
        case S_TERMINATE:
            return TRUE;
        default:
            break;
    }

    if (!controld_globals.transition_graph->complete) {
        enum pcmk__graph_status graph_rc;
        int saved_limit = controld_globals.transition_graph->batch_limit;
        int job_limit = throttle_get_total_job_limit(saved_limit);

        // Apply the throttled limit only for the duration of this execution
        controld_globals.transition_graph->batch_limit = job_limit;
        graph_rc = pcmk__execute_graph(controld_globals.transition_graph);
        controld_globals.transition_graph->batch_limit = saved_limit;

        switch (graph_rc) {
            case pcmk__graph_active:
                crm_trace("Transition not yet complete");
                return TRUE;

            case pcmk__graph_pending:
                crm_trace("Transition not yet complete - no actions fired");
                return TRUE;

            case pcmk__graph_complete:
                break;

            default:
                crm_warn("Transition failed: %s",
                         pcmk__graph_status2text(graph_rc));
                pcmk__log_graph(LOG_NOTICE, controld_globals.transition_graph);
                break;
        }
    }

    crm_debug("Transition %d is now complete",
              controld_globals.transition_graph->id);
    controld_globals.transition_graph->complete = true;
    notify_crmd(controld_globals.transition_graph);

    return TRUE;
}
/*!
* \internal
* \brief Initialize transition trigger
*
* Registers te_graph_trigger() as a low-priority mainloop trigger and stores
* the handle in transition_trigger.
*/
void
controld_init_transition_trigger(void)
{
transition_trigger = mainloop_add_trigger(G_PRIORITY_LOW, te_graph_trigger,
NULL);
}
/*!
* \internal
* \brief Destroy transition trigger
*
* The stored handle is reset to NULL afterward.
*/
void
controld_destroy_transition_trigger(void)
{
mainloop_destroy_trigger(transition_trigger);
transition_trigger = NULL;
}
/*!
* \internal
* \brief Request transition graph processing, tracing who asked for it
*
* \param[in] fn Name of the requesting function (used only in the trace log)
* \param[in] line Source line of the request (used only in the trace log)
*/
void
controld_trigger_graph_as(const char *fn, int line)
{
crm_trace("%s:%d - Triggered graph processing", fn, line);
mainloop_set_trigger(transition_trigger);
}
/* State for a delayed transition abort. The file-scope abort_timer instance is
* used by abort_after_delay(); node pending timers allocate their own
* instances of the same structure.
*/
static struct abort_timer_s {
// If true when the timer pops, abort_timer_popped() does not abort
bool aborted;
// GLib source ID of the pending timer, or 0 if none is pending
guint id;
// Priority to pass to abort_transition() when the timer pops
int priority;
// Action to pass to abort_transition() when the timer pops
enum pcmk__graph_next action;
// Reason to pass to abort_transition() (the pointer is stored, not copied)
const char *text;
} abort_timer = { 0, };
/*!
 * \internal
 * \brief Timer callback that aborts the transition unless already aborted
 *
 * The abort happens only if this node is the DC and the timer's aborted flag
 * is unset. The timer's source ID is cleared either way.
 *
 * \param[in,out] data  Timer state (struct abort_timer_s *)
 *
 * \return FALSE (so the timer is not rescheduled)
 */
static gboolean
abort_timer_popped(gpointer data)
{
    struct abort_timer_s *timer = data;

    if (AM_I_DC && !timer->aborted) {
        abort_transition(timer->priority, timer->action, timer->text, NULL);
    }
    timer->id = 0;

    return FALSE; // do not immediately reschedule timer
}
/*!
 * \internal
 * \brief Schedule a transition abort that fires unless one happens first
 *
 * Any abort timer that is already pending is cancelled and replaced.
 *
 * \param[in] abort_priority  Priority to abort with
 * \param[in] abort_action    Action to abort with
 * \param[in] abort_text      Reason for the abort (must be literal string,
 *                            because only the pointer is stored)
 * \param[in] delay_ms        Delay before aborting, in milliseconds
 */
void
abort_after_delay(int abort_priority, enum pcmk__graph_next abort_action,
                  const char *abort_text, guint delay_ms)
{
    if (abort_timer.id != 0) {
        // Timer already in progress, stop and reschedule
        g_source_remove(abort_timer.id);
    }

    abort_timer.text = abort_text;
    abort_timer.action = abort_action;
    abort_timer.priority = abort_priority;
    abort_timer.aborted = FALSE;

    abort_timer.id = g_timeout_add(delay_ms, abort_timer_popped, &abort_timer);
}
/*!
 * \internal
 * \brief Free a node pending timer, cancelling its GLib source if still set
 *
 * \param[in,out] data  Timer to free (struct abort_timer_s *)
 */
static void
free_node_pending_timer(gpointer data)
{
    struct abort_timer_s *timer = data;
    const guint source_id = timer->id;

    if (source_id != 0) {
        g_source_remove(source_id);
        timer->id = 0;
    }
    free(timer);
}
/*!
 * \internal
 * \brief Timer callback for a node that stayed pending for too long
 *
 * If the node still has a timer in the table, a warning is logged, the
 * transition is aborted (when the node pending timeout is enabled), and the
 * timer is removed from the table.
 *
 * \param[in] key  Node UUID used as the key in the node pending timer table
 *
 * \return FALSE (so the timer is not rescheduled)
 */
static gboolean
node_pending_timer_popped(gpointer key)
{
    struct abort_timer_s *timer = NULL;

    if (node_pending_timers != NULL) {
        timer = g_hash_table_lookup(node_pending_timers, key);
    }
    if (timer == NULL) {
        return FALSE;
    }

    crm_warn("Node with " PCMK_XA_ID " '%s' pending timed out (%us) "
             "on joining the process group",
             (const char *) key, controld_globals.node_pending_timeout);

    if (controld_globals.node_pending_timeout > 0) {
        abort_timer_popped(timer);
    }

    g_hash_table_remove(node_pending_timers, key);

    return FALSE; // do not reschedule timer
}
/*!
 * \internal
 * \brief Start a node pending timer for a node, unless it already has one
 *
 * Nothing is done if the node has no UUID. The timer table is created on
 * first use.
 *
 * \param[in] node     Node to start a timer for
 * \param[in] timeout  Seconds until the timer pops
 */
static void
init_node_pending_timer(const crm_node_t *node, guint timeout)
{
    struct abort_timer_s *timer = NULL;
    char *uuid_copy = NULL;

    if (node->uuid == NULL) {
        return;
    }

    if (node_pending_timers == NULL) {
        node_pending_timers = pcmk__strikey_table(free,
                                                  free_node_pending_timer);

    } else if (g_hash_table_lookup(node_pending_timers, node->uuid) != NULL) {
        // The timer is somehow already existing
        return;
    }

    crm_notice("Waiting for pending %s with " PCMK_XA_ID " '%s' "
               "to join the process group (timeout=%us)",
               ((node->uname != NULL)? node->uname : "node"), node->uuid,
               controld_globals.node_pending_timeout);

    uuid_copy = pcmk__str_copy(node->uuid);
    timer = pcmk__assert_alloc(1, sizeof(struct abort_timer_s));

    timer->aborted = FALSE;
    timer->priority = PCMK_SCORE_INFINITY;
    timer->action = pcmk__graph_restart;
    timer->text = "Node pending timed out";

    // The table takes ownership of both the key copy and the timer
    g_hash_table_replace(node_pending_timers, uuid_copy, timer);

    timer->id = g_timeout_add_seconds(timeout, node_pending_timer_popped,
                                      uuid_copy);
    CRM_ASSERT(timer->id != 0);
}
/*!
 * \internal
 * \brief Remove a node's pending timer from the table, if there is one
 *
 * \param[in] node_uuid  UUID of the node whose timer should be removed
 */
static void
remove_node_pending_timer(const char *node_uuid)
{
    if (node_pending_timers != NULL) {
        g_hash_table_remove(node_pending_timers, node_uuid);
    }
}
/*!
 * \internal
 * \brief Start or remove a node's pending timer according to its state
 *
 * \param[in] node  Node to check
 */
void
controld_node_pending_timer(const crm_node_t *node)
{
    long long remaining_timeout = 0;

    /* A timer makes no sense for a node that is not an active cluster node, is
     * leaving the cluster, is already part of CPG, or when
     * PCMK_OPT_NODE_PENDING_TIMEOUT is disabled. Drop any existing timer.
     */
    if (pcmk_is_set(node->flags, crm_remote_node)
        || (node->when_member <= 1) || (node->when_online > 0)
        || (controld_globals.node_pending_timeout == 0)) {

        remove_node_pending_timer(node->uuid);
        return;
    }

    // Node is a cluster member but offline in CPG
    remaining_timeout = node->when_member - time(NULL)
                        + controld_globals.node_pending_timeout;

    if (remaining_timeout > 0) {
        init_node_pending_timer(node, remaining_timeout);

    } else {
        /* The node pending timeout has somehow already passed, so drop any
         * existing timer
         */
        remove_node_pending_timer(node->uuid);
    }
}
/*!
 * \internal
 * \brief Destroy the table of node pending timers, if it exists
 *
 * The table pointer is reset to NULL afterward.
 */
void
controld_free_node_pending_timers(void)
{
    if (node_pending_timers != NULL) {
        g_hash_table_destroy(node_pending_timers);
        node_pending_timers = NULL;
    }
}
/*!
 * \internal
 * \brief Get a log-friendly name for a graph completion action
 *
 * \param[in] abort_action  Action to describe
 *
 * \return Static string describing \p abort_action ("unknown" if the value is
 *         not recognized)
 */
static const char *
abort2text(enum pcmk__graph_next abort_action)
{
    const char *text = "unknown";

    switch (abort_action) {
        case pcmk__graph_done:
            text = "done";
            break;
        case pcmk__graph_wait:
            text = "stop";
            break;
        case pcmk__graph_restart:
            text = "restart";
            break;
        case pcmk__graph_shutdown:
            text = "shutdown";
            break;
    }
    return text;
}
/*!
 * \internal
 * \brief Raise a graph's abort priority and/or completion action if warranted
 *
 * The priority and completion action are each replaced only by a strictly
 * greater value. The abort reason is replaced along with the priority.
 *
 * \param[in,out] graph         Graph to update (may be NULL)
 * \param[in]     priority      Proposed abort priority
 * \param[in]     action        Proposed completion action
 * \param[in]     abort_reason  Reason for the abort (the pointer is stored,
 *                              not copied)
 *
 * \return true if the graph was modified, otherwise false
 */
static bool
update_abort_priority(pcmk__graph_t *graph, int priority,
                      enum pcmk__graph_next action, const char *abort_reason)
{
    bool updated = false;

    if (graph == NULL) {
        return false;
    }

    if (priority > graph->abort_priority) {
        crm_debug("Abort priority upgraded from %d to %d", graph->abort_priority, priority);
        graph->abort_priority = priority;
        if (graph->abort_reason != NULL) {
            crm_debug("'%s' abort superseded by %s", graph->abort_reason, abort_reason);
        }
        graph->abort_reason = abort_reason;
        updated = true;
    }

    if (action > graph->completion_action) {
        crm_debug("Abort action %s superseded by %s: %s",
                  abort2text(graph->completion_action), abort2text(action), abort_reason);
        graph->completion_action = action;
        updated = true;
    }

    return updated;
}
/*!
* \internal
* \brief Abort the current transition, logging what caused the abort
*
* Nothing is done (beyond an info log) in controller states where transitions
* are not processed. Otherwise, the graph's abort priority and completion
* action are raised if the graph is still incomplete, a log message is built
* from \p reason, and either the scheduler is invoked again (graph already
* complete) or graph processing is triggered (graph still in progress).
*
* \param[in] abort_priority Priority of this abort
* \param[in] abort_action What to do once the graph completes
* \param[in] abort_text Reason for the abort (the pointer may be stored in
* the graph as its abort reason, so it must outlive
* the graph)
* \param[in] reason XML that caused the abort (may be NULL)
* \param[in] fn Name of the calling function (for logs)
* \param[in] line Source line of the call (for logs)
*/
void
abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action,
const char *abort_text, const xmlNode *reason,
const char *fn, int line)
{
int add[] = { 0, 0, 0 };
int del[] = { 0, 0, 0 };
int level = LOG_INFO;
const xmlNode *diff = NULL;
const xmlNode *change = NULL;
CRM_CHECK(controld_globals.transition_graph != NULL, return);
switch (controld_globals.fsa_state) {
case S_STARTING:
case S_PENDING:
case S_NOT_DC:
case S_HALT:
case S_ILLEGAL:
case S_STOPPING:
case S_TERMINATE:
crm_info("Abort %s suppressed: state=%s (%scomplete)",
abort_text, fsa_state2string(controld_globals.fsa_state),
(controld_globals.transition_graph->complete? "" : "in"));
return;
default:
break;
}
// Tell any pending abort_after_delay() timer that an abort has happened
abort_timer.aborted = TRUE;
controld_expect_sched_reply(NULL);
// Log at notice level only if this abort actually changed the graph
if (!controld_globals.transition_graph->complete
&& update_abort_priority(controld_globals.transition_graph,
abort_priority, abort_action,
abort_text)) {
level = LOG_NOTICE;
}
if (reason != NULL) {
const xmlNode *search = NULL;
// Walk up from the reason to find the enclosing diff element, if any
for(search = reason; search; search = search->parent) {
if (pcmk__xe_is(search, PCMK_XE_DIFF)) {
diff = search;
break;
}
}
if(diff) {
xml_patch_versions(diff, add, del);
// Walk up again to find the enclosing change element, if any
for(search = reason; search; search = search->parent) {
if (pcmk__xe_is(search, PCMK_XE_CHANGE)) {
change = search;
break;
}
}
}
}
if (reason == NULL) {
do_crm_log(level,
"Transition %d aborted: %s " CRM_XS " source=%s:%d "
"complete=%s", controld_globals.transition_graph->id,
abort_text, fn, line,
pcmk__btoa(controld_globals.transition_graph->complete));
} else if(change == NULL) {
GString *local_path = pcmk__element_xpath(reason);
CRM_ASSERT(local_path != NULL);
do_crm_log(level, "Transition %d aborted by %s.%s: %s "
CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s",
controld_globals.transition_graph->id, reason->name,
pcmk__xe_id(reason), abort_text, add[0], add[1], add[2], fn,
line, (const char *) local_path->str,
pcmk__btoa(controld_globals.transition_graph->complete));
g_string_free(local_path, TRUE);
} else {
const char *op = crm_element_value(change, PCMK_XA_OPERATION);
const char *path = crm_element_value(change, PCMK_XA_PATH);
// NOTE(review): op is passed to strcmp() below without a NULL check, yet
// the final branch guards it with pcmk__s(); confirm a change element
// always carries an operation attribute
if(change == reason) {
// The reason is the change element itself, so log its payload instead
if (strcmp(op, PCMK_VALUE_CREATE) == 0) {
reason = reason->children;
} else if (strcmp(op, PCMK_VALUE_MODIFY) == 0) {
reason = pcmk__xe_first_child(reason, PCMK_XE_CHANGE_RESULT,
NULL, NULL);
if(reason) {
reason = reason->children;
}
}
CRM_CHECK(reason != NULL, goto done);
}
if (strcmp(op, PCMK_VALUE_DELETE) == 0) {
// NOTE(review): path is passed to strrchr() without a NULL check; confirm
// deletions always carry a path attribute
const char *shortpath = strrchr(path, '/');
do_crm_log(level, "Transition %d aborted by deletion of %s: %s "
CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s",
controld_globals.transition_graph->id,
(shortpath? (shortpath + 1) : path), abort_text,
add[0], add[1], add[2], fn, line, path,
pcmk__btoa(controld_globals.transition_graph->complete));
} else if (pcmk__xe_is(reason, PCMK_XE_NVPAIR)) {
do_crm_log(level, "Transition %d aborted by %s doing %s %s=%s: %s "
CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s",
controld_globals.transition_graph->id,
crm_element_value(reason, PCMK_XA_ID), op,
crm_element_value(reason, PCMK_XA_NAME),
crm_element_value(reason, PCMK_XA_VALUE),
abort_text, add[0], add[1], add[2], fn, line, path,
pcmk__btoa(controld_globals.transition_graph->complete));
} else if (pcmk__xe_is(reason, PCMK__XE_LRM_RSC_OP)) {
const char *magic = crm_element_value(reason,
PCMK__XA_TRANSITION_MAGIC);
do_crm_log(level, "Transition %d aborted by operation %s '%s' on %s: %s "
CRM_XS " magic=%s cib=%d.%d.%d source=%s:%d complete=%s",
controld_globals.transition_graph->id,
crm_element_value(reason, PCMK__XA_OPERATION_KEY), op,
crm_element_value(reason, PCMK__META_ON_NODE),
abort_text,
magic, add[0], add[1], add[2], fn, line,
pcmk__btoa(controld_globals.transition_graph->complete));
} else if (pcmk__str_any_of((const char *) reason->name,
PCMK__XE_NODE_STATE, PCMK_XE_NODE, NULL)) {
- const char *uname = crm_peer_uname(pcmk__xe_id(reason));
+ const char *uname = pcmk__node_name_from_uuid(pcmk__xe_id(reason));
// Fall back to the element ID if the node name is not known
do_crm_log(level, "Transition %d aborted by %s '%s' on %s: %s "
CRM_XS " cib=%d.%d.%d source=%s:%d complete=%s",
controld_globals.transition_graph->id,
reason->name, op, pcmk__s(uname, pcmk__xe_id(reason)),
abort_text, add[0], add[1], add[2], fn, line,
pcmk__btoa(controld_globals.transition_graph->complete));
} else {
const char *id = pcmk__xe_id(reason);
do_crm_log(level, "Transition %d aborted by %s.%s '%s': %s "
CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s",
controld_globals.transition_graph->id,
reason->name, pcmk__s(id, ""), pcmk__s(op, "change"),
abort_text, add[0], add[1], add[2], fn, line, path,
pcmk__btoa(controld_globals.transition_graph->complete));
}
}
done:
if (controld_globals.transition_graph->complete) {
// Graph already finished: restart the transition timer if one is
// configured, otherwise ask for a new scheduler run right away
if (controld_get_period_transition_timer() > 0) {
controld_stop_transition_timer();
controld_start_transition_timer();
} else {
register_fsa_input(C_FSA_INTERNAL, I_PE_CALC, NULL);
}
return;
}
// Graph still in progress: let graph processing act on the abort
trigger_graph();
}
diff --git a/include/crm/cluster/internal.h b/include/crm/cluster/internal.h
index a233e8471b..fc24c77310 100644
--- a/include/crm/cluster/internal.h
+++ b/include/crm/cluster/internal.h
@@ -1,196 +1,197 @@
/*
* Copyright 2004-2024 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#ifndef PCMK__CRM_CLUSTER_INTERNAL__H
# define PCMK__CRM_CLUSTER_INTERNAL__H
# include <stdbool.h>
# include <stdint.h> // uint32_t, uint64_t
# include <glib.h> // gboolean
# include <crm/cluster.h>
// Process flags; crm_get_cluster_proc() returns one of these values
enum crm_proc_flag {
/* @COMPAT When crm_node_t:processes is made internal, we can merge this
* into node flags or turn it into a boolean. Until then, in theory
* something could depend on these particular numeric values.
*/
// Fallback value (note that it is nonzero)
crm_proc_none = 0x00000001,
// Cluster layers
// Returned by crm_get_cluster_proc() for the Corosync cluster layer
crm_proc_cpg = 0x04000000,
};
// Used with node cache search functions
// (values are distinct bits, so they may be combined with bitwise OR)
enum pcmk__node_search_flags {
//! Does not affect search
pcmk__node_search_none = 0,
//! Search for cluster nodes from membership cache
pcmk__node_search_cluster_member = (1 << 0),
//! Search for remote nodes
pcmk__node_search_remote = (1 << 1),
//! Search for cluster member nodes and remote nodes
pcmk__node_search_any = pcmk__node_search_cluster_member
|pcmk__node_search_remote,
/* @COMPAT The values before this must stay the same until we can drop
* support for enum crm_get_peer_flags
*/
//! Search for cluster nodes from CIB (as of last cache refresh)
pcmk__node_search_cluster_cib = (1 << 2),
};
/*!
 * \internal
 * \brief Get the process flag that matches the cluster stack in use
 *
 * \return \c crm_proc_cpg if the current cluster layer is Corosync, otherwise
 *         \c crm_proc_none (whose numeric value is 0x00000001)
 */
static inline uint32_t
crm_get_cluster_proc(void)
{
    if (pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync) {
        return crm_proc_cpg;
    }
    return crm_proc_none;
}
/*!
 * \internal
 * \brief Describe a Corosync return code in log-friendly text
 *
 * \param[in] error  Corosync return code
 *
 * \return Static string describing \p error, or the generic text
 *         "Corosync error" if the code is not recognized (or if Corosync
 *         support is not compiled in)
 */
static inline const char *
pcmk__cs_err_str(int error)
{
# if SUPPORT_COROSYNC
    // Known return codes paired with their descriptions
    static const struct {
        int code;
        const char *text;
    } descriptions[] = {
        { CS_OK,                      "OK" },
        { CS_ERR_LIBRARY,             "Library error" },
        { CS_ERR_VERSION,             "Version error" },
        { CS_ERR_INIT,                "Initialization error" },
        { CS_ERR_TIMEOUT,             "Timeout" },
        { CS_ERR_TRY_AGAIN,           "Try again" },
        { CS_ERR_INVALID_PARAM,       "Invalid parameter" },
        { CS_ERR_NO_MEMORY,           "No memory" },
        { CS_ERR_BAD_HANDLE,          "Bad handle" },
        { CS_ERR_BUSY,                "Busy" },
        { CS_ERR_ACCESS,              "Access error" },
        { CS_ERR_NOT_EXIST,           "Doesn't exist" },
        { CS_ERR_NAME_TOO_LONG,       "Name too long" },
        { CS_ERR_EXIST,               "Exists" },
        { CS_ERR_NO_SPACE,            "No space" },
        { CS_ERR_INTERRUPT,           "Interrupt" },
        { CS_ERR_NAME_NOT_FOUND,      "Name not found" },
        { CS_ERR_NO_RESOURCES,        "No resources" },
        { CS_ERR_NOT_SUPPORTED,       "Not supported" },
        { CS_ERR_BAD_OPERATION,       "Bad operation" },
        { CS_ERR_FAILED_OPERATION,    "Failed operation" },
        { CS_ERR_MESSAGE_ERROR,       "Message error" },
        { CS_ERR_QUEUE_FULL,          "Queue full" },
        { CS_ERR_QUEUE_NOT_AVAILABLE, "Queue not available" },
        { CS_ERR_BAD_FLAGS,           "Bad flags" },
        { CS_ERR_TOO_BIG,             "Too big" },
        { CS_ERR_NO_SECTIONS,         "No sections" },
    };
    const unsigned int n_descriptions = sizeof(descriptions)
                                        / sizeof(descriptions[0]);

    for (unsigned int i = 0; i < n_descriptions; i++) {
        if (descriptions[i].code == error) {
            return descriptions[i].text;
        }
    }
# endif
    return "Corosync error";
}
# if SUPPORT_COROSYNC
#if 0
/* This is the new way to do it, but we still support all Corosync 2 versions,
* and this isn't always available. A better alternative here would be to check
* for support in the configure script and enable this conditionally.
*/
#define pcmk__init_cmap(handle) cmap_initialize_map((handle), CMAP_MAP_ICMAP)
#else
#define pcmk__init_cmap(handle) cmap_initialize(handle)
#endif
char *pcmk__corosync_cluster_name(void);
bool pcmk__corosync_add_nodes(xmlNode *xml_parent);
void pcmk__cpg_confchg_cb(cpg_handle_t handle,
const struct cpg_name *group_name,
const struct cpg_address *member_list,
size_t member_list_entries,
const struct cpg_address *left_list,
size_t left_list_entries,
const struct cpg_address *joined_list,
size_t joined_list_entries);
char *pcmk__cpg_message_data(cpg_handle_t handle, uint32_t sender_id,
uint32_t pid, void *content, uint32_t *kind,
const char **from);
# endif
const char *pcmk__cluster_node_uuid(crm_node_t *node);
char *pcmk__cluster_node_name(uint32_t nodeid);
const char *pcmk__cluster_local_node_name(void);
+const char *pcmk__node_name_from_uuid(const char *uuid);
crm_node_t *crm_update_peer_proc(const char *source, crm_node_t * peer,
uint32_t flag, const char *status);
crm_node_t *pcmk__update_peer_state(const char *source, crm_node_t *node,
const char *state, uint64_t membership);
void pcmk__update_peer_expected(const char *source, crm_node_t *node,
const char *expected);
void pcmk__reap_unseen_nodes(uint64_t ring_id);
void pcmk__corosync_quorum_connect(gboolean (*dispatch)(unsigned long long,
gboolean),
void (*destroy) (gpointer));
enum crm_ais_msg_types pcmk__cluster_parse_msg_type(const char *text);
bool pcmk__cluster_send_message(const crm_node_t *node,
enum crm_ais_msg_types service,
const xmlNode *data);
// Membership
void pcmk__cluster_init_node_caches(void);
void pcmk__cluster_destroy_node_caches(void);
void pcmk__cluster_set_autoreap(bool enable);
void pcmk__cluster_set_status_callback(void (*dispatch)(enum crm_status_type,
crm_node_t *,
const void *));
bool pcmk__cluster_is_node_active(const crm_node_t *node);
unsigned int pcmk__cluster_num_active_nodes(void);
unsigned int pcmk__cluster_num_remote_nodes(void);
crm_node_t *pcmk__cluster_lookup_remote_node(const char *node_name);
void pcmk__cluster_forget_cluster_node(uint32_t id, const char *node_name);
void pcmk__cluster_forget_remote_node(const char *node_name);
crm_node_t *pcmk__search_node_caches(unsigned int id, const char *uname,
uint32_t flags);
void pcmk__purge_node_from_cache(const char *node_name, uint32_t node_id);
void pcmk__refresh_node_caches_from_cib(xmlNode *cib);
crm_node_t *pcmk__get_node(unsigned int id, const char *uname,
const char *uuid, uint32_t flags);
#endif // PCMK__CRM_CLUSTER_INTERNAL__H
diff --git a/lib/cluster/cluster.c b/lib/cluster/cluster.c
index eb0e25f578..5360e11bed 100644
--- a/lib/cluster/cluster.c
+++ b/lib/cluster/cluster.c
@@ -1,566 +1,592 @@
/*
* Copyright 2004-2024 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <dlfcn.h>
#include <inttypes.h> // PRIu32
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/utsname.h> // uname()
#include <glib.h> // gboolean
#include <crm/crm.h>
#include <crm/common/ipc.h>
#include <crm/common/xml.h>
#include <crm/cluster/internal.h>
#include "crmcluster_private.h"
CRM_TRACE_INIT_DATA(cluster);
/*!
 * \internal
 * \brief Map a message type name to its enum value
 *
 * The name is first normalized with \c pcmk__message_name(). Recognized
 * daemon names map to their fixed message types; anything else is parsed as
 * a decimal number.
 *
 * \param[in] text  Message type name
 *
 * \return Message type for \p text, or \c crm_msg_none if \p text is \c NULL,
 *         is not a number, or is a number no greater than
 *         \c crm_msg_stonith_ng
 */
enum crm_ais_msg_types
pcmk__cluster_parse_msg_type(const char *text)
{
    int msg_type = crm_msg_none;

    CRM_CHECK(text != NULL, return crm_msg_none);

    text = pcmk__message_name(text);

    if (pcmk__str_eq(text, "ais", pcmk__str_none)) {
        msg_type = crm_msg_ais;

    } else if (pcmk__str_eq(text, CRM_SYSTEM_CIB, pcmk__str_none)) {
        msg_type = crm_msg_cib;

    } else if (pcmk__str_any_of(text, CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL)) {
        msg_type = crm_msg_crmd;

    } else if (pcmk__str_eq(text, CRM_SYSTEM_TENGINE, pcmk__str_none)) {
        msg_type = crm_msg_te;

    } else if (pcmk__str_eq(text, CRM_SYSTEM_PENGINE, pcmk__str_none)) {
        msg_type = crm_msg_pe;

    } else if (pcmk__str_eq(text, CRM_SYSTEM_LRMD, pcmk__str_none)) {
        msg_type = crm_msg_lrmd;

    } else if (pcmk__str_eq(text, CRM_SYSTEM_STONITHD, pcmk__str_none)) {
        msg_type = crm_msg_stonithd;

    } else if (pcmk__str_eq(text, "stonith-ng", pcmk__str_none)) {
        msg_type = crm_msg_stonith_ng;

    } else if (pcmk__str_eq(text, "attrd", pcmk__str_none)) {
        msg_type = crm_msg_attrd;

    /* Otherwise this will normally be a transient client rather than a
     * cluster daemon, and the type is set to the client's PID.
     *
     * @TODO Check whether this is necessary and correct.
     */
    } else if ((sscanf(text, "%d", &msg_type) != 1)
               || (msg_type <= crm_msg_stonith_ng)) {
        // Ensure it's sane; don't falsely return a standard message type
        msg_type = crm_msg_none;
    }
    return msg_type;
}
/*!
 * \internal
 * \brief Return a node's cluster-layer UUID, looking it up on first use
 *
 * If the node has no UUID yet, ask the cluster layer for one and store it in
 * the node object.
 *
 * \param[in,out] node  Node whose UUID is wanted
 *
 * \return UUID of \p node, or \c NULL if \p node is \c NULL, the UUID is
 *         unknown, or the cluster layer is unsupported
 */
const char *
pcmk__cluster_node_uuid(crm_node_t *node)
{
    // Determine the layer first so this call happens even for a NULL node
    const enum pcmk_cluster_layer layer = pcmk_get_cluster_layer();

    if (node == NULL) {
        return NULL;
    }

    if (node->uuid == NULL) {
        switch (layer) {
#if SUPPORT_COROSYNC
            case pcmk_cluster_layer_corosync:
                node->uuid = pcmk__corosync_uuid(node);
                break;
#endif  // SUPPORT_COROSYNC

            default:
                crm_err("Unsupported cluster layer %s",
                        pcmk_cluster_layer_text(layer));
                return NULL;
        }
    }
    return node->uuid;
}
/*!
 * \internal
 * \brief Establish a connection to the cluster layer
 *
 * \param[in,out] cluster  Initialized cluster object to connect
 *
 * \return Standard Pacemaker return code (\c EPROTONOSUPPORT if the detected
 *         cluster layer is not supported)
 */
int
pcmk_cluster_connect(pcmk_cluster_t *cluster)
{
    const enum pcmk_cluster_layer layer = pcmk_get_cluster_layer();
    const char *layer_name = pcmk_cluster_layer_text(layer);

    crm_notice("Connecting to %s cluster layer", layer_name);

    switch (layer) {
#if SUPPORT_COROSYNC
        case pcmk_cluster_layer_corosync:
            return pcmk__corosync_connect(cluster);
#endif  // SUPPORT_COROSYNC

        default:
            crm_err("Failed to connect to unsupported cluster layer %s",
                    layer_name);
            return EPROTONOSUPPORT;
    }
}
/*!
 * \brief Tear down the connection to the cluster layer
 *
 * For Corosync, this disconnects and then destroys the node caches.
 *
 * \param[in,out] cluster  Cluster object to disconnect
 *
 * \return Standard Pacemaker return code (\c EPROTONOSUPPORT if the detected
 *         cluster layer is not supported)
 */
int
pcmk_cluster_disconnect(pcmk_cluster_t *cluster)
{
    const enum pcmk_cluster_layer layer = pcmk_get_cluster_layer();
    const char *layer_name = pcmk_cluster_layer_text(layer);

    crm_info("Disconnecting from %s cluster layer", layer_name);

    switch (layer) {
#if SUPPORT_COROSYNC
        case pcmk_cluster_layer_corosync:
            pcmk__corosync_disconnect(cluster);
            pcmk__cluster_destroy_node_caches();
            return pcmk_rc_ok;
#endif  // SUPPORT_COROSYNC

        default:
            crm_err("Failed to disconnect from unsupported cluster layer %s",
                    layer_name);
            return EPROTONOSUPPORT;
    }
}
/*!
 * \brief Create a new \p pcmk_cluster_t object
 *
 * \return Newly allocated \p pcmk_cluster_t object (guaranteed not \c NULL)
 * \note The caller owns the result and must release it with
 *       \p pcmk_cluster_free().
 */
pcmk_cluster_t *
pcmk_cluster_new(void)
{
    pcmk_cluster_t *cluster = NULL;

    cluster = (pcmk_cluster_t *) pcmk__assert_alloc(1, sizeof(pcmk_cluster_t));
    return cluster;
}
/*!
 * \brief Release a \p pcmk_cluster_t object along with its \c uuid and
 *        \c uname members
 *
 * \param[in,out] cluster  Cluster object to free (may be \c NULL)
 */
void
pcmk_cluster_free(pcmk_cluster_t *cluster)
{
    if (cluster != NULL) {
        free(cluster->uuid);
        free(cluster->uname);
        free(cluster);
    }
}
/*!
 * \brief Register the destroy function of a cluster object
 *
 * \param[in,out] cluster  Cluster object
 * \param[in]     fn       Destroy function to store in \p cluster
 *
 * \return Standard Pacemaker return code (\c EINVAL if \p cluster is
 *         \c NULL, otherwise \c pcmk_rc_ok)
 */
int
pcmk_cluster_set_destroy_fn(pcmk_cluster_t *cluster, void (*fn)(gpointer))
{
    if (cluster != NULL) {
        cluster->destroy = fn;
        return pcmk_rc_ok;
    }
    return EINVAL;
}
/*!
* \internal
* \brief Send an XML message via the cluster messaging layer
*
* \param[in] node Cluster node to send message to
* \param[in] service Message type to use in message host info
* \param[in] data XML message to send
*
* \return \c true on success, or \c false otherwise
*/
bool
pcmk__cluster_send_message(const crm_node_t *node,
enum crm_ais_msg_types service, const xmlNode *data)
{
// @TODO Return standard Pacemaker return code
switch (pcmk_get_cluster_layer()) {
#if SUPPORT_COROSYNC
case pcmk_cluster_layer_corosync:
return pcmk__cpg_send_xml(data, node, service);
#endif // SUPPORT_COROSYNC
default:
break;
}
return false;
}
/*!
 * \internal
 * \brief Get the node name corresponding to a cluster-layer node ID
 *
 * Get the node name from the cluster layer if possible. Otherwise, if for the
 * local node, call \c uname() and get the \c nodename member from the
 * <tt>struct utsname</tt> object.
 *
 * \param[in] nodeid  Node ID to check (or 0 for the local node)
 *
 * \return Node name corresponding to \p nodeid. With Corosync, the result of
 *         \c pcmk__corosync_name() is returned as-is. Otherwise the local
 *         hostname is returned for \p nodeid 0, and \c NULL for any other ID.
 *
 * \note This will fatally exit if \c uname() fails to get the local node name
 *       or we run out of memory.
 * \note The caller is responsible for freeing the return value using \c free().
 */
char *
pcmk__cluster_node_name(uint32_t nodeid)
{
    const enum pcmk_cluster_layer cluster_layer = pcmk_get_cluster_layer();
    const char *cluster_layer_s = pcmk_cluster_layer_text(cluster_layer);

    switch (cluster_layer) {
#if SUPPORT_COROSYNC
        case pcmk_cluster_layer_corosync:
            return pcmk__corosync_name(0, nodeid);
#endif  // SUPPORT_COROSYNC

        default:
            crm_err("Unsupported cluster layer: %s", cluster_layer_s);
            break;
    }

    if (nodeid == 0) {
        struct utsname hostinfo;

        crm_notice("Could not get local node name from %s cluster layer, "
                   "defaulting to local hostname",
                   cluster_layer_s);

        if (uname(&hostinfo) < 0) {
            // @TODO Maybe let the caller decide what to do
            crm_err("Failed to get the local hostname");
            crm_exit(CRM_EX_FATAL);
        }
        return pcmk__str_copy(hostinfo.nodename);
    }

    /* PRIu32 expands to only the conversion letters, so the '%' must be
     * written explicitly; without it the node ID is never printed.
     */
    crm_notice("Could not obtain a node name for node with "
               PCMK_XA_ID "=%" PRIu32,
               nodeid);
    return NULL;
}
/*!
 * \internal
 * \brief Get the cluster-layer name of the local node
 *
 * The name is obtained once via \c pcmk__cluster_node_name(0) and then kept
 * in a static variable for later calls. That function falls back to the
 * \c nodename member filled in by \c uname() when the cluster layer cannot
 * supply a name.
 *
 * \return Local node's name
 *
 * \note This will fatally exit if \c uname() fails to get the local node name
 *       or we run out of memory.
 */
const char *
pcmk__cluster_local_node_name(void)
{
    // @TODO Refactor to avoid trivially leaking name at exit
    static char *cached_name = NULL;

    if (cached_name != NULL) {
        return cached_name;
    }
    cached_name = pcmk__cluster_node_name(0);
    return cached_name;
}
/*!
- * \brief Get the node name corresponding to a node UUID
+ * \internal
+ * \brief Get the node name corresponding to a node UUID
*
- * \param[in] uuid UUID of desired node
+ * Look for the UUID in both the remote node cache and the cluster member cache.
+ * For a Corosync cluster, if no cache entry is found, treat the UUID as a
+ * cluster-layer ID and try again.
*
- * \return name of desired node
+ * \param[in] uuid UUID to search for
*
- * \note This relies on the remote peer cache being populated with all
- * remote nodes in the cluster, so callers should maintain that cache.
+ * \return Node name corresponding to \p uuid if found, or \c NULL otherwise
*/
const char *
-crm_peer_uname(const char *uuid)
+pcmk__node_name_from_uuid(const char *uuid)
{
+ /* @TODO There are too many functions in libcrmcluster that look up a node
+ * from the node caches (possibly creating a cache entry if none exists).
+ * There are at least the following:
+ * * pcmk__cluster_lookup_remote_node()
+ * * pcmk__get_node()
+ * * pcmk__node_name_from_uuid()
+ * * pcmk__search_node_caches()
+ *
+ * There's a lot of duplication among them, but they all do slightly
+ * different things. We should try to clean them up and consolidate them to
+ * the extent possible, likely with new helper functions.
+ */
GHashTableIter iter;
crm_node_t *node = NULL;
CRM_CHECK(uuid != NULL, return NULL);
- /* remote nodes have the same uname and uuid */
+ // Remote nodes have the same uname and uuid
if (g_hash_table_lookup(crm_remote_peer_cache, uuid)) {
return uuid;
}
- /* avoid blocking calls where possible */
+ // Avoid blocking calls where possible
g_hash_table_iter_init(&iter, crm_peer_cache);
while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
if (pcmk__str_eq(node->uuid, uuid, pcmk__str_casei)) {
- if (node->uname != NULL) {
- return node->uname;
- }
- break;
+ return node->uname;
}
}
- node = NULL;
if (pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync) {
- long long id;
+ long long id = 0;
if ((pcmk__scan_ll(uuid, &id, 0LL) != pcmk_rc_ok)
|| (id < 1LL) || (id > UINT32_MAX)) {
+
crm_err("Invalid Corosync node ID '%s'", uuid);
return NULL;
}
node = pcmk__search_node_caches((uint32_t) id, NULL,
pcmk__node_search_cluster_member);
if (node != NULL) {
crm_info("Setting uuid for node %s[%u] to %s",
node->uname, node->id, uuid);
- node->uuid = strdup(uuid);
+ node->uuid = pcmk__str_copy(uuid);
return node->uname;
}
- return NULL;
}
return NULL;
}
+/*!
+ * \brief Get the node name corresponding to a node UUID
+ *
+ * \param[in] uuid UUID of desired node
+ *
+ * \return name of desired node
+ *
+ * \note This relies on the remote peer cache being populated with all
+ * remote nodes in the cluster, so callers should maintain that cache.
+ */
+const char *
+crm_peer_uname(const char *uuid)
+{
+ return pcmk__node_name_from_uuid(uuid);
+}
+
/*!
 * \brief Convert a cluster layer to a log-friendly string
 *
 * \param[in] layer  Cluster layer
 *
 * \return Static string naming \p layer ("invalid", after logging an error,
 *         for any unrecognized value)
 */
const char *
pcmk_cluster_layer_text(enum pcmk_cluster_layer layer)
{
    const char *text = NULL;

    switch (layer) {
        case pcmk_cluster_layer_corosync:
            text = "corosync";
            break;

        case pcmk_cluster_layer_unknown:
            text = "unknown";
            break;

        case pcmk_cluster_layer_invalid:
            text = "invalid";
            break;

        default:
            crm_err("Invalid cluster layer: %d", layer);
            text = "invalid";
            break;
    }
    return text;
}
/*!
 * \brief Determine and validate the local cluster layer
 *
 * When the \c PCMK__ENV_CLUSTER_TYPE local option is set, its value is
 * validated against the supported cluster layers. When it is not set, the
 * supported layers are probed for an active cluster. Once a layer other than
 * "unknown" has been determined, it is cached and returned by later calls.
 *
 * \return Local cluster layer
 *
 * \note This will fatally exit if the configured cluster layer is invalid.
 */
enum pcmk_cluster_layer
pcmk_get_cluster_layer(void)
{
    static enum pcmk_cluster_layer cluster_layer = pcmk_cluster_layer_unknown;
    const char *configured = NULL;

    // Cluster layer is stable once set
    if (cluster_layer != pcmk_cluster_layer_unknown) {
        return cluster_layer;
    }

    configured = pcmk__env_option(PCMK__ENV_CLUSTER_TYPE);

    if (configured == NULL) {
        // Nothing configured, so test supported cluster layers
#if SUPPORT_COROSYNC
        crm_debug("Testing with Corosync");
        if (pcmk__corosync_is_active()) {
            cluster_layer = pcmk_cluster_layer_corosync;
        }
#endif  // SUPPORT_COROSYNC

        if (cluster_layer == pcmk_cluster_layer_unknown) {
            crm_notice("Could not determine the current cluster layer");
        } else {
            crm_info("Detected an active '%s' cluster",
                     pcmk_cluster_layer_text(cluster_layer));
        }
        return cluster_layer;
    }

    // A layer was configured explicitly: it must be one we support
    crm_info("Verifying configured cluster layer '%s'", configured);
    cluster_layer = pcmk_cluster_layer_invalid;

#if SUPPORT_COROSYNC
    if (pcmk__str_eq(configured, PCMK_VALUE_COROSYNC, pcmk__str_casei)) {
        cluster_layer = pcmk_cluster_layer_corosync;
    }
#endif  // SUPPORT_COROSYNC

    if (cluster_layer == pcmk_cluster_layer_invalid) {
        crm_notice("This installation does not support the '%s' cluster "
                   "infrastructure: terminating",
                   configured);
        crm_exit(CRM_EX_FATAL);
    }
    crm_info("Assuming an active '%s' cluster", configured);
    return cluster_layer;
}
// Deprecated functions kept only for backward API compatibility
// LCOV_EXCL_START
#include <crm/cluster/compat.h>
// Deprecated compatibility wrapper: adds the UUID returned by
// pcmk__cluster_node_uuid(node) to xml as attribute attr
void
set_uuid(xmlNode *xml, const char *attr, crm_node_t *node)
{
crm_xml_add(xml, attr, pcmk__cluster_node_uuid(node));
}
// Deprecated compatibility wrapper around pcmk_cluster_connect(): reports
// whether that call returned pcmk_rc_ok
gboolean
crm_cluster_connect(pcmk_cluster_t *cluster)
{
return pcmk_cluster_connect(cluster) == pcmk_rc_ok;
}
// Deprecated compatibility wrapper around pcmk_cluster_disconnect(); the
// return code of that call is discarded
void
crm_cluster_disconnect(pcmk_cluster_t *cluster)
{
pcmk_cluster_disconnect(cluster);
}
// Deprecated: get a log-friendly name for a cluster type ("invalid", after
// logging an error, for any unrecognized value)
const char *
name_for_cluster_type(enum cluster_type_e type)
{
    if (type == pcmk_cluster_corosync) {
        return "corosync";
    }
    if (type == pcmk_cluster_unknown) {
        return "unknown";
    }
    if (type != pcmk_cluster_invalid) {
        crm_err("Invalid cluster type: %d", type);
    }
    return "invalid";
}
// Deprecated compatibility wrapper: returns pcmk_get_cluster_layer() cast to
// the older enum cluster_type_e
enum cluster_type_e
get_cluster_type(void)
{
return (enum cluster_type_e) pcmk_get_cluster_layer();
}
// Deprecated compatibility wrapper: reports whether the current cluster layer
// is Corosync
gboolean
is_corosync_cluster(void)
{
return pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync;
}
// Deprecated compatibility wrapper around pcmk__cluster_send_message(); the
// ordered argument is accepted but not used
gboolean
send_cluster_message(const crm_node_t *node, enum crm_ais_msg_types service,
const xmlNode *data, gboolean ordered)
{
return pcmk__cluster_send_message(node, service, data);
}
// Deprecated compatibility wrapper around pcmk__cluster_node_uuid()
const char *
crm_peer_uuid(crm_node_t *peer)
{
return pcmk__cluster_node_uuid(peer);
}
// Deprecated compatibility wrapper around pcmk__cluster_node_name()
char *
get_node_name(uint32_t nodeid)
{
return pcmk__cluster_node_name(nodeid);
}
// Deprecated compatibility wrapper around pcmk__cluster_local_node_name()
const char *
get_local_node_name(void)
{
return pcmk__cluster_local_node_name();
}
// LCOV_EXCL_STOP
// End deprecated API
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Mon, Jul 21, 3:00 AM (1 d, 11 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2082978
Default Alt Text
(63 KB)
Attached To
Mode
rP Pacemaker
Attached
Detach File
Event Timeline
Log In to Comment