diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c index ddc8524bc3..8fea564e22 100644 --- a/daemons/controld/controld_fencing.c +++ b/daemons/controld/controld_fencing.c @@ -1,1119 +1,1120 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include static void tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event); /* * stonith failure counting * * We don't want to get stuck in a permanent fencing loop. Keep track of the * number of fencing failures for each target node, and the most we'll restart a * transition for. */ struct st_fail_rec { int count; }; static bool fence_reaction_panic = false; static unsigned long int stonith_max_attempts = 10; static GHashTable *stonith_failures = NULL; /*! * \internal * \brief Update max fencing attempts before giving up * * \param[in] value New max fencing attempts */ static void update_stonith_max_attempts(const char *value) { stonith_max_attempts = char2score(value); if (stonith_max_attempts < 1UL) { stonith_max_attempts = 10UL; } } /*! * \internal * \brief Configure reaction to notification of local node being fenced * * \param[in] reaction_s Reaction type */ static void set_fence_reaction(const char *reaction_s) { if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) { fence_reaction_panic = true; } else { if (!pcmk__str_eq(reaction_s, PCMK_VALUE_STOP, pcmk__str_casei)) { crm_warn("Invalid value '%s' for %s, using 'stop'", reaction_s, PCMK_OPT_FENCE_REACTION); } fence_reaction_panic = false; } } /*! * \internal * \brief Configure fencing options based on the CIB * * \param[in,out] options Name/value pairs for configured options */ void controld_configure_fencing(GHashTable *options) { const char *value = NULL; value = g_hash_table_lookup(options, PCMK_OPT_FENCE_REACTION); set_fence_reaction(value); value = g_hash_table_lookup(options, PCMK_OPT_STONITH_MAX_ATTEMPTS); update_stonith_max_attempts(value); } static gboolean too_many_st_failures(const char *target) { GHashTableIter iter; const char *key = NULL; struct st_fail_rec *value = NULL; if (stonith_failures == NULL) { return FALSE; } if (target == NULL) { g_hash_table_iter_init(&iter, stonith_failures); while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) { if (value->count >= stonith_max_attempts) { target = (const char*)key; goto too_many; } } } else { value = g_hash_table_lookup(stonith_failures, target); if ((value != NULL) && (value->count >= stonith_max_attempts)) { goto too_many; } } return FALSE; too_many: crm_warn("Too many failures (%d) to fence %s, giving up", value->count, target); return TRUE; } /*! * \internal * \brief Reset a stonith fail count * * \param[in] target Name of node to reset, or NULL for all */ void st_fail_count_reset(const char *target) { if (stonith_failures == NULL) { return; } if (target) { struct st_fail_rec *rec = NULL; rec = g_hash_table_lookup(stonith_failures, target); if (rec) { rec->count = 0; } } else { GHashTableIter iter; const char *key = NULL; struct st_fail_rec *rec = NULL; g_hash_table_iter_init(&iter, stonith_failures); while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &rec)) { rec->count = 0; } } } static void st_fail_count_increment(const char *target) { struct st_fail_rec *rec = NULL; if (stonith_failures == NULL) { stonith_failures = pcmk__strkey_table(free, free); } rec = g_hash_table_lookup(stonith_failures, target); if (rec) { rec->count++; } else { rec = malloc(sizeof(struct st_fail_rec)); if(rec == NULL) { return; } rec->count = 1; g_hash_table_insert(stonith_failures, pcmk__str_copy(target), rec); } } /* end stonith fail count functions */ static void cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { if (rc < pcmk_ok) { crm_err("Fencing update %d for %s: failed - %s (%d)", call_id, (char *)user_data, pcmk_strerror(rc), rc); crm_log_xml_warn(msg, "Failed update"); abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_shutdown, "CIB update failed", NULL); } else { crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data); } } static void send_stonith_update(pcmk__graph_action_t *action, const char *target, const char *uuid) { int rc = pcmk_ok; pcmk__node_status_t *peer = NULL; /* We (usually) rely on the membership layer to do node_update_cluster, * and the peer status callback to do node_update_peer, because the node * might have already rejoined before we get the stonith result here. */ int flags = node_update_join | node_update_expected; /* zero out the node-status & remove all LRM status info */ xmlNode *node_state = NULL; CRM_CHECK(target != NULL, return); CRM_CHECK(uuid != NULL, return); /* Make sure the membership and join caches are accurate. * Try getting any existing node cache entry also by node uuid in case it * doesn't have an uname yet. */ peer = pcmk__get_node(0, target, uuid, pcmk__node_search_any); CRM_CHECK(peer != NULL, return); if (peer->state == NULL) { /* Usually, we rely on the membership layer to update the cluster state * in the CIB. However, if the node has never been seen, do it here, so * the node is not considered unclean. */ flags |= node_update_cluster; } if (peer->xml_id == NULL) { crm_info("Recording XML ID '%s' for node '%s'", uuid, target); peer->xml_id = pcmk__str_copy(uuid); } crmd_peer_down(peer, TRUE); /* Generate a node state update for the CIB */ node_state = create_node_state_update(peer, flags, NULL, __func__); /* we have to mark whether or not remote nodes have already been fenced */ if (pcmk_is_set(peer->flags, pcmk__node_status_remote)) { char *now_s = pcmk__ttoa(time(NULL)); crm_xml_add(node_state, PCMK__XA_NODE_FENCED, now_s); free(now_s); } /* Force our known ID */ crm_xml_add(node_state, PCMK_XA_ID, uuid); rc = controld_globals.cib_conn->cmds->modify(controld_globals.cib_conn, PCMK_XE_STATUS, node_state, cib_can_create); /* Delay processing the trigger until the update completes */ crm_debug("Sending fencing update %d for %s", rc, target); fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated); // Make sure it sticks /* controld_globals.cib_conn->cmds->bump_epoch(controld_globals.cib_conn, * cib_none); */ controld_delete_node_state(peer->name, controld_section_all, cib_none); pcmk__xml_free(node_state); return; } /*! * \internal * \brief Abort transition due to stonith failure * * \param[in] abort_action Whether to restart or stop transition * \param[in] target Don't restart if this (NULL for any) has too many failures * \param[in] reason Log this stonith action XML as abort reason (or NULL) */ static void abort_for_stonith_failure(enum pcmk__graph_next abort_action, const char *target, const xmlNode *reason) { /* If stonith repeatedly fails, we eventually give up on starting a new * transition for that reason. */ if ((abort_action != pcmk__graph_wait) && too_many_st_failures(target)) { abort_action = pcmk__graph_wait; } abort_transition(PCMK_SCORE_INFINITY, abort_action, "Stonith failed", reason); } /* * stonith cleanup list * * If the DC is shot, proper notifications might not go out. * The stonith cleanup list allows the cluster to (re-)send * notifications once a new DC is elected. */ static GList *stonith_cleanup_list = NULL; /*! * \internal * \brief Add a node to the stonith cleanup list * * \param[in] target Name of node to add */ void add_stonith_cleanup(const char *target) { stonith_cleanup_list = g_list_append(stonith_cleanup_list, pcmk__str_copy(target)); } /*! * \internal * \brief Remove a node from the stonith cleanup list * * \param[in] Name of node to remove */ void remove_stonith_cleanup(const char *target) { GList *iter = stonith_cleanup_list; while (iter != NULL) { GList *tmp = iter; char *iter_name = tmp->data; iter = iter->next; if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) { crm_trace("Removing %s from the cleanup list", iter_name); stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp); free(iter_name); } } } /*! * \internal * \brief Purge all entries from the stonith cleanup list */ void purge_stonith_cleanup(void) { if (stonith_cleanup_list) { GList *iter = NULL; for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { char *target = iter->data; crm_info("Purging %s from stonith cleanup list", target); free(target); } g_list_free(stonith_cleanup_list); stonith_cleanup_list = NULL; } } /*! * \internal * \brief Send stonith updates for all entries in cleanup list, then purge it */ void execute_stonith_cleanup(void) { GList *iter; for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { char *target = iter->data; pcmk__node_status_t *target_node = pcmk__get_node(0, target, NULL, pcmk__node_search_cluster_member); const char *uuid = pcmk__cluster_node_uuid(target_node); crm_notice("Marking %s, target of a previous stonith action, as clean", target); send_stonith_update(NULL, target, uuid); free(target); } g_list_free(stonith_cleanup_list); stonith_cleanup_list = NULL; } /* end stonith cleanup list functions */ /* stonith API client * * Functions that need to interact directly with the fencer via its API */ static stonith_t *stonith_api = NULL; static mainloop_timer_t *controld_fencer_connect_timer = NULL; static char *te_client_id = NULL; static gboolean fail_incompletable_stonith(pcmk__graph_t *graph) { GList *lpc = NULL; const char *task = NULL; xmlNode *last_action = NULL; if (graph == NULL) { return FALSE; } for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { GList *lpc2 = NULL; pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) lpc->data; if (pcmk_is_set(synapse->flags, pcmk__synapse_confirmed)) { continue; } for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) { pcmk__graph_action_t *action = (pcmk__graph_action_t *) lpc2->data; if ((action->type != pcmk__cluster_graph_action) || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) { continue; } task = crm_element_value(action->xml, PCMK_XA_OPERATION); if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) { pcmk__set_graph_action_flags(action, pcmk__graph_action_failed); last_action = action->xml; pcmk__update_graph(graph, action); crm_notice("Failing action %d (%s): fencer terminated", action->id, pcmk__xe_id(action->xml)); } } } if (last_action != NULL) { crm_warn("Fencer failure resulted in unrunnable actions"); abort_for_stonith_failure(pcmk__graph_restart, NULL, last_action); return TRUE; } return FALSE; } static void tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) { te_cleanup_stonith_history_sync(st, FALSE); if (pcmk_is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) { crm_err("Lost fencer connection (will attempt to reconnect)"); if (!mainloop_timer_running(controld_fencer_connect_timer)) { mainloop_timer_start(controld_fencer_connect_timer); } } else { crm_info("Disconnected from fencer"); } if (stonith_api) { /* the client API won't properly reconnect notifications * if they are still in the table - so remove them */ if (stonith_api->state != stonith_disconnected) { stonith_api->cmds->disconnect(st); } stonith_api->cmds->remove_notification(stonith_api, NULL); } if (AM_I_DC) { fail_incompletable_stonith(controld_globals.transition_graph); trigger_graph(); } } /*! * \internal * \brief Handle an event notification from the fencing API * * \param[in] st Fencing API connection (ignored) * \param[in] event Fencing API event notification */ static void handle_fence_notification(stonith_t *st, stonith_event_t *event) { bool succeeded = true; const char *executioner = "the cluster"; const char *client = "a client"; const char *reason = NULL; int exec_status; if (te_client_id == NULL) { te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, (unsigned long) getpid()); } if (event == NULL) { crm_err("Notify data not found"); return; } if (event->executioner != NULL) { executioner = event->executioner; } if (event->client_origin != NULL) { client = event->client_origin; } exec_status = stonith__event_execution_status(event); if ((stonith__event_exit_status(event) != CRM_EX_OK) || (exec_status != PCMK_EXEC_DONE)) { succeeded = false; if (exec_status == PCMK_EXEC_DONE) { exec_status = PCMK_EXEC_ERROR; } } reason = stonith__event_exit_reason(event); crmd_alert_fencing_op(event); if (pcmk__str_eq(PCMK_ACTION_ON, event->action, pcmk__str_none)) { // Unfencing doesn't need special handling, just a log message if (succeeded) { crm_notice("%s was unfenced by %s at the request of %s@%s", event->target, executioner, client, event->origin); } else { crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d", event->target, executioner, pcmk_exec_status_str(exec_status), ((reason == NULL)? "" : ": "), ((reason == NULL)? "" : reason), stonith__event_exit_status(event)); } return; } if (succeeded && pcmk__str_eq(event->target, controld_globals.our_nodename, pcmk__str_casei)) { /* We were notified of our own fencing. Most likely, either fencing was * misconfigured, or fabric fencing that doesn't cut cluster * communication is in use. * * Either way, shutting down the local host is a good idea, to require * administrator intervention. Also, other nodes would otherwise likely * set our status to lost because of the fencing callback and discard * our subsequent election votes as "not part of our cluster". */ crm_crit("We were allegedly just fenced by %s for %s!", executioner, event->origin); // Dumps blackbox if enabled if (fence_reaction_panic) { pcmk__panic(__func__); } else { crm_exit(CRM_EX_FATAL); } return; // Should never get here } /* Update the count of fencing failures for this target, in case we become * DC later. The current DC has already updated its fail count in * tengine_stonith_callback(). */ if (!AM_I_DC) { if (succeeded) { st_fail_count_reset(event->target); } else { st_fail_count_increment(event->target); } } crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: " "%s%s%s%s " QB_XS " event=%s", event->target, (succeeded? "" : " not"), event->action, executioner, client, event->origin, (succeeded? "OK" : pcmk_exec_status_str(exec_status)), ((reason == NULL)? "" : " ("), ((reason == NULL)? "" : reason), ((reason == NULL)? "" : ")"), event->id); if (succeeded) { const uint32_t flags = pcmk__node_search_any |pcmk__node_search_cluster_cib; pcmk__node_status_t *peer = pcmk__search_node_caches(0, event->target, flags); const char *uuid = NULL; if (peer == NULL) { return; } uuid = pcmk__cluster_node_uuid(peer); if (AM_I_DC) { /* The DC always sends updates */ send_stonith_update(NULL, event->target, uuid); /* @TODO Ideally, at this point, we'd check whether the fenced node * hosted any guest nodes, and call remote_node_down() for them. * Unfortunately, the controller doesn't have a simple, reliable way * to map hosts to guests. It might be possible to track this in the * peer cache via refresh_remote_nodes(). For now, we rely on the * scheduler creating fence pseudo-events for the guests. */ if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) { /* Abort the current transition if it wasn't the cluster that * initiated fencing. */ crm_info("External fencing operation from %s fenced %s", client, event->target); abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "External Fencing Operation", NULL); } } else if (pcmk__str_eq(controld_globals.dc_name, event->target, pcmk__str_null_matches|pcmk__str_casei) && !pcmk_is_set(peer->flags, pcmk__node_status_remote)) { // Assume the target was our DC if we don't currently have one if (controld_globals.dc_name != NULL) { crm_notice("Fencing target %s was our DC", event->target); } else { crm_notice("Fencing target %s may have been our DC", event->target); } /* Given the CIB resyncing that occurs around elections, * have one node update the CIB now and, if the new DC is different, * have them do so too after the election */ if (pcmk__str_eq(event->executioner, controld_globals.our_nodename, pcmk__str_casei)) { send_stonith_update(NULL, event->target, uuid); } add_stonith_cleanup(event->target); } /* If the target is a remote node, and we host its connection, * immediately fail all monitors so it can be recovered quickly. * The connection won't necessarily drop when a remote node is fenced, * so the failure might not otherwise be detected until the next poke. */ if (pcmk_is_set(peer->flags, pcmk__node_status_remote)) { remote_ra_fail(event->target); } crmd_peer_down(peer, TRUE); } } /*! * \brief Connect to fencer * * \param[in] user_data If NULL, retry failures now, otherwise retry in mainloop timer * * \return G_SOURCE_REMOVE on success, G_SOURCE_CONTINUE to retry * \note If user_data is NULL, this will wait 2s between attempts, for up to * 30 attempts, meaning the controller could be blocked as long as 58s. */ gboolean controld_timer_fencer_connect(gpointer user_data) { int rc = pcmk_ok; if (stonith_api == NULL) { stonith_api = stonith_api_new(); if (stonith_api == NULL) { crm_err("Could not connect to fencer: API memory allocation failed"); return G_SOURCE_REMOVE; } } if (stonith_api->state != stonith_disconnected) { crm_trace("Already connected to fencer, no need to retry"); return G_SOURCE_REMOVE; } if (user_data == NULL) { // Blocking (retry failures now until successful) rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30); if (rc != pcmk_ok) { crm_err("Could not connect to fencer in 30 attempts: %s " QB_XS " rc=%d", pcmk_strerror(rc), rc); } } else { // Non-blocking (retry failures later in main loop) rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); if (controld_fencer_connect_timer == NULL) { controld_fencer_connect_timer = mainloop_timer_add("controld_fencer_connect", 1000, TRUE, controld_timer_fencer_connect, GINT_TO_POINTER(TRUE)); } if (rc != pcmk_ok) { if (pcmk_is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) { crm_notice("Fencer connection failed (will retry): %s " QB_XS " rc=%d", pcmk_strerror(rc), rc); if (!mainloop_timer_running(controld_fencer_connect_timer)) { mainloop_timer_start(controld_fencer_connect_timer); } return G_SOURCE_CONTINUE; } else { crm_info("Fencer connection failed (ignoring because no longer required): %s " QB_XS " rc=%d", pcmk_strerror(rc), rc); } return G_SOURCE_REMOVE; } } if (rc == pcmk_ok) { stonith_api_operations_t *cmds = stonith_api->cmds; cmds->register_notification(stonith_api, PCMK__VALUE_ST_NOTIFY_DISCONNECT, tengine_stonith_connection_destroy); cmds->register_notification(stonith_api, PCMK__VALUE_ST_NOTIFY_FENCE, handle_fence_notification); cmds->register_notification(stonith_api, PCMK__VALUE_ST_NOTIFY_HISTORY_SYNCED, tengine_stonith_history_synced); te_trigger_stonith_history_sync(TRUE); crm_notice("Fencer successfully connected"); } return G_SOURCE_REMOVE; } void controld_disconnect_fencer(bool destroy) { if (stonith_api) { // Prevent fencer connection from coming up again controld_clear_fsa_input_flags(R_ST_REQUIRED); if (stonith_api->state != stonith_disconnected) { stonith_api->cmds->disconnect(stonith_api); } stonith_api->cmds->remove_notification(stonith_api, NULL); } if (destroy) { if (stonith_api) { stonith_api->cmds->free(stonith_api); stonith_api = NULL; } if (controld_fencer_connect_timer) { mainloop_timer_del(controld_fencer_connect_timer); controld_fencer_connect_timer = NULL; } if (te_client_id) { free(te_client_id); te_client_id = NULL; } } } static gboolean do_stonith_history_sync(gpointer user_data) { if (stonith_api && (stonith_api->state != stonith_disconnected)) { stonith_history_t *history = NULL; te_cleanup_stonith_history_sync(stonith_api, FALSE); stonith_api->cmds->history(stonith_api, st_opt_sync_call | st_opt_broadcast, NULL, &history, 5); stonith_history_free(history); return TRUE; } else { crm_info("Skip triggering stonith history-sync as stonith is disconnected"); return FALSE; } } static void tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) { char *uuid = NULL; int stonith_id = -1; int transition_id = -1; pcmk__graph_action_t *action = NULL; const char *target = NULL; if ((data == NULL) || (data->userdata == NULL)) { crm_err("Ignoring fence operation %d result: " "No transition key given (bug?)", ((data == NULL)? -1 : data->call_id)); return; } if (!AM_I_DC) { const char *reason = stonith__exit_reason(data); if (reason == NULL) { reason = pcmk_exec_status_str(stonith__execution_status(data)); } crm_notice("Result of fence operation %d: %d (%s) " QB_XS " key=%s", data->call_id, stonith__exit_status(data), reason, (const char *) data->userdata); return; } CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id, &stonith_id, NULL), goto bail); if (controld_globals.transition_graph->complete || (stonith_id < 0) || !pcmk__str_eq(uuid, controld_globals.te_uuid, pcmk__str_none) || (controld_globals.transition_graph->id != transition_id)) { crm_info("Ignoring fence operation %d result: " "Not from current transition " QB_XS " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)", data->call_id, pcmk__btoa(controld_globals.transition_graph->complete), stonith_id, uuid, controld_globals.te_uuid, transition_id, controld_globals.transition_graph->id); goto bail; } action = controld_get_action(stonith_id); if (action == NULL) { crm_err("Ignoring fence operation %d result: " "Action %d not found in transition graph (bug?) " QB_XS " uuid=%s transition=%d", data->call_id, stonith_id, uuid, transition_id); goto bail; } target = crm_element_value(action->xml, PCMK__META_ON_NODE); if (target == NULL) { crm_err("Ignoring fence operation %d result: No target given (bug?)", data->call_id); goto bail; } stop_te_timer(action); if (stonith__exit_status(data) == CRM_EX_OK) { const char *uuid = crm_element_value(action->xml, PCMK__META_ON_NODE_UUID); const char *op = crm_meta_value(action->params, PCMK__META_STONITH_ACTION); crm_info("Fence operation %d for %s succeeded", data->call_id, target); if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) { te_action_confirmed(action, NULL); if (pcmk__str_eq(PCMK_ACTION_ON, op, pcmk__str_casei)) { const char *value = NULL; char *now = pcmk__ttoa(time(NULL)); gboolean is_remote_node = FALSE; /* This check is not 100% reliable, since this node is not * guaranteed to have the remote node cached. However, it * doesn't have to be reliable, since the attribute manager can * learn a node's "remoteness" by other means sooner or later. * This allows it to learn more quickly if this node does have * the information. */ - if (g_hash_table_lookup(crm_remote_peer_cache, uuid) != NULL) { + if (g_hash_table_lookup(pcmk__remote_peer_cache, + uuid) != NULL) { is_remote_node = TRUE; } update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, is_remote_node); free(now); value = crm_meta_value(action->params, PCMK__META_DIGESTS_ALL); update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, is_remote_node); value = crm_meta_value(action->params, PCMK__META_DIGESTS_SECURE); update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, is_remote_node); } else if (!(pcmk_is_set(action->flags, pcmk__graph_action_sent_update))) { send_stonith_update(action, target, uuid); pcmk__set_graph_action_flags(action, pcmk__graph_action_sent_update); } } st_fail_count_reset(target); } else { enum pcmk__graph_next abort_action = pcmk__graph_restart; int status = stonith__execution_status(data); const char *reason = stonith__exit_reason(data); if (reason == NULL) { if (status == PCMK_EXEC_DONE) { reason = "Agent returned error"; } else { reason = pcmk_exec_status_str(status); } } pcmk__set_graph_action_flags(action, pcmk__graph_action_failed); /* If no fence devices were available, there's no use in immediately * checking again, so don't start a new transition in that case. */ if (status == PCMK_EXEC_NO_FENCE_DEVICE) { crm_warn("Fence operation %d for %s failed: %s " "(aborting transition and giving up for now)", data->call_id, target, reason); abort_action = pcmk__graph_wait; } else { crm_notice("Fence operation %d for %s failed: %s " "(aborting transition)", data->call_id, target, reason); } /* Increment the fail count now, so abort_for_stonith_failure() can * check it. Non-DC nodes will increment it in * handle_fence_notification(). */ st_fail_count_increment(target); abort_for_stonith_failure(abort_action, target, NULL); } pcmk__update_graph(controld_globals.transition_graph, action); trigger_graph(); bail: free(data->userdata); free(uuid); return; } static int fence_with_delay(const char *target, const char *type, int delay) { uint32_t options = st_opt_none; // Group of enum stonith_call_options int timeout_sec = (int) (controld_globals.transition_graph->stonith_timeout / 1000); if (crmd_join_phase_count(controld_join_confirmed) == 1) { stonith__set_call_options(options, target, st_opt_allow_suicide); } return stonith_api->cmds->fence_with_delay(stonith_api, options, target, type, timeout_sec, 0, delay); } /*! * \internal * \brief Execute a fencing action from a transition graph * * \param[in] graph Transition graph being executed (ignored) * \param[in] action Fencing action to execute * * \return Standard Pacemaker return code */ int controld_execute_fence_action(pcmk__graph_t *graph, pcmk__graph_action_t *action) { int rc = 0; const char *id = pcmk__xe_id(action->xml); const char *uuid = crm_element_value(action->xml, PCMK__META_ON_NODE_UUID); const char *target = crm_element_value(action->xml, PCMK__META_ON_NODE); const char *type = crm_meta_value(action->params, PCMK__META_STONITH_ACTION); char *transition_key = NULL; const char *priority_delay = NULL; int delay_i = 0; gboolean invalid_action = FALSE; int stonith_timeout = (int) (controld_globals.transition_graph->stonith_timeout / 1000); CRM_CHECK(id != NULL, invalid_action = TRUE); CRM_CHECK(uuid != NULL, invalid_action = TRUE); CRM_CHECK(type != NULL, invalid_action = TRUE); CRM_CHECK(target != NULL, invalid_action = TRUE); if (invalid_action) { crm_log_xml_warn(action->xml, "BadAction"); return EPROTO; } priority_delay = crm_meta_value(action->params, PCMK_OPT_PRIORITY_FENCING_DELAY); crm_notice("Requesting fencing (%s) targeting node %s " QB_XS " action=%s timeout=%i%s%s", type, target, id, stonith_timeout, priority_delay ? " priority_delay=" : "", priority_delay ? priority_delay : ""); /* Passing NULL means block until we can connect... */ controld_timer_fencer_connect(NULL); pcmk__scan_min_int(priority_delay, &delay_i, 0); rc = fence_with_delay(target, type, delay_i); transition_key = pcmk__transition_key(controld_globals.transition_graph->id, action->id, 0, controld_globals.te_uuid), stonith_api->cmds->register_callback(stonith_api, rc, (stonith_timeout + (delay_i > 0 ? delay_i : 0)), st_opt_timeout_updates, transition_key, "tengine_stonith_callback", tengine_stonith_callback); return pcmk_rc_ok; } bool controld_verify_stonith_watchdog_timeout(const char *value) { long long st_timeout = (value != NULL)? crm_get_msec(value) : 0; const char *our_nodename = controld_globals.our_nodename; if (st_timeout == 0 || (stonith_api && (stonith_api->state != stonith_disconnected) && stonith__watchdog_fencing_enabled_for_node_api(stonith_api, our_nodename))) { return pcmk__valid_stonith_watchdog_timeout(value); } return true; } /* end stonith API client functions */ /* * stonith history synchronization * * Each node's fencer keeps track of a cluster-wide fencing history. When a node * joins or leaves, we need to synchronize the history across all nodes. */ static crm_trigger_t *stonith_history_sync_trigger = NULL; static mainloop_timer_t *stonith_history_sync_timer_short = NULL; static mainloop_timer_t *stonith_history_sync_timer_long = NULL; void te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers) { if (free_timers) { mainloop_timer_del(stonith_history_sync_timer_short); stonith_history_sync_timer_short = NULL; mainloop_timer_del(stonith_history_sync_timer_long); stonith_history_sync_timer_long = NULL; } else { mainloop_timer_stop(stonith_history_sync_timer_short); mainloop_timer_stop(stonith_history_sync_timer_long); } if (st) { st->cmds->remove_notification(st, PCMK__VALUE_ST_NOTIFY_HISTORY_SYNCED); } } static void tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event) { te_cleanup_stonith_history_sync(st, FALSE); crm_debug("Fence-history synced - cancel all timers"); } static gboolean stonith_history_sync_set_trigger(gpointer user_data) { mainloop_set_trigger(stonith_history_sync_trigger); return FALSE; } void te_trigger_stonith_history_sync(bool long_timeout) { /* trigger a sync in 5s to give more nodes the * chance to show up so that we don't create * unnecessary stonith-history-sync traffic * * the long timeout of 30s is there as a fallback * so that after a successful connection to fenced * we will wait for 30s for the DC to trigger a * history-sync * if this doesn't happen we trigger a sync locally * (e.g. fenced segfaults and is restarted by pacemakerd) */ /* as we are finally checking the stonith-connection * in do_stonith_history_sync we should be fine * leaving stonith_history_sync_time & stonith_history_sync_trigger * around */ if (stonith_history_sync_trigger == NULL) { stonith_history_sync_trigger = mainloop_add_trigger(G_PRIORITY_LOW, do_stonith_history_sync, NULL); } if (long_timeout) { if(stonith_history_sync_timer_long == NULL) { stonith_history_sync_timer_long = mainloop_timer_add("history_sync_long", 30000, FALSE, stonith_history_sync_set_trigger, NULL); } crm_info("Fence history will be synchronized cluster-wide within 30 seconds"); mainloop_timer_start(stonith_history_sync_timer_long); } else { if(stonith_history_sync_timer_short == NULL) { stonith_history_sync_timer_short = mainloop_timer_add("history_sync_short", 5000, FALSE, stonith_history_sync_set_trigger, NULL); } crm_info("Fence history will be synchronized cluster-wide within 5 seconds"); mainloop_timer_start(stonith_history_sync_timer_short); } } /* end stonith history synchronization functions */ diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c index 3a082b1769..90754352e8 100644 --- a/daemons/controld/controld_join_dc.c +++ b/daemons/controld/controld_join_dc.c @@ -1,1083 +1,1083 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include // PRIu32 #include // bool, true, false #include // NULL #include // free(), etc. #include // gboolean, etc. #include // xmlNode #include #include #include #include static char *max_generation_from = NULL; static xmlNodePtr max_generation_xml = NULL; /*! * \internal * \brief Nodes from which a CIB sync has failed since the peer joined * * This table is of the form (node_name -> join_id). \p node_name is * the name of a client node from which a CIB \p sync_from() call has failed in * \p do_dc_join_finalize() since the client joined the cluster as a peer. * \p join_id is the ID of the join round in which the \p sync_from() failed, * and is intended for use in nack log messages. */ static GHashTable *failed_sync_nodes = NULL; void finalize_join_for(gpointer key, gpointer value, gpointer user_data); void finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data); gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); /* Numeric counter used to identify join rounds (an unsigned int would be * appropriate, except we get and set it in XML as int) */ static int current_join_id = 0; /*! * \internal * \brief Get log-friendly string equivalent of a controller group join phase * * \param[in] phase Join phase * * \return Log-friendly string equivalent of \p phase */ static const char * join_phase_text(enum controld_join_phase phase) { switch (phase) { case controld_join_nack: return "nack"; case controld_join_none: return "none"; case controld_join_welcomed: return "welcomed"; case controld_join_integrated: return "integrated"; case controld_join_finalized: return "finalized"; case controld_join_confirmed: return "confirmed"; default: return "invalid"; } } /*! * \internal * \brief Destroy the hash table containing failed sync nodes */ void controld_destroy_failed_sync_table(void) { if (failed_sync_nodes != NULL) { g_hash_table_destroy(failed_sync_nodes); failed_sync_nodes = NULL; } } /*! * \internal * \brief Remove a node from the failed sync nodes table if present * * \param[in] node_name Node name to remove */ void controld_remove_failed_sync_node(const char *node_name) { if (failed_sync_nodes != NULL) { g_hash_table_remove(failed_sync_nodes, (gchar *) node_name); } } /*! * \internal * \brief Add to a hash table a node whose CIB failed to sync * * \param[in] node_name Name of node whose CIB failed to sync * \param[in] join_id Join round when the failure occurred */ static void record_failed_sync_node(const char *node_name, gint join_id) { if (failed_sync_nodes == NULL) { failed_sync_nodes = pcmk__strikey_table(g_free, NULL); } /* If the node is already in the table then we failed to nack it during the * filter offer step */ CRM_LOG_ASSERT(g_hash_table_insert(failed_sync_nodes, g_strdup(node_name), GINT_TO_POINTER(join_id))); } /*! * \internal * \brief Look up a node name in the failed sync table * * \param[in] node_name Name of node to look up * \param[out] join_id Where to store the join ID of when the sync failed * * \return Standard Pacemaker return code. Specifically, \p pcmk_rc_ok if the * node name was found, or \p pcmk_rc_node_unknown otherwise. * \note \p *join_id is set to -1 if the node is not found. */ static int lookup_failed_sync_node(const char *node_name, gint *join_id) { *join_id = -1; if (failed_sync_nodes != NULL) { gpointer result = g_hash_table_lookup(failed_sync_nodes, (gchar *) node_name); if (result != NULL) { *join_id = GPOINTER_TO_INT(result); return pcmk_rc_ok; } } return pcmk_rc_node_unknown; } void crm_update_peer_join(const char *source, pcmk__node_status_t *node, enum controld_join_phase phase) { enum controld_join_phase last = controld_get_join_phase(node); CRM_CHECK(node != NULL, return); /* Remote nodes do not participate in joins */ if (pcmk_is_set(node->flags, pcmk__node_status_remote)) { return; } if (phase == last) { crm_trace("Node %s join-%d phase is still %s " QB_XS " nodeid=%" PRIu32 " source=%s", node->name, current_join_id, join_phase_text(last), node->cluster_layer_id, source); return; } if ((phase <= controld_join_none) || (phase == (last + 1))) { struct controld_node_status_data *data = pcmk__assert_alloc(1, sizeof(struct controld_node_status_data)); data->join_phase = phase; node->user_data = data; crm_trace("Node %s join-%d phase is now %s (was %s) " QB_XS " nodeid=%" PRIu32 " source=%s", node->name, current_join_id, join_phase_text(phase), join_phase_text(last), node->cluster_layer_id, source); return; } crm_warn("Rejecting join-%d phase update for node %s because can't go from " "%s to %s " QB_XS " nodeid=%" PRIu32 " source=%s", current_join_id, node->name, join_phase_text(last), join_phase_text(phase), node->cluster_layer_id, source); } static void start_join_round(void) { GHashTableIter iter; pcmk__node_status_t *peer = NULL; crm_debug("Starting new join round join-%d", current_join_id); g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &peer)) { crm_update_peer_join(__func__, peer, controld_join_none); } if (max_generation_from != NULL) { free(max_generation_from); max_generation_from = NULL; } if (max_generation_xml != NULL) { pcmk__xml_free(max_generation_xml); max_generation_xml = NULL; } controld_clear_fsa_input_flags(R_HAVE_CIB); } /*! * \internal * \brief Create a join message from the DC * * \param[in] join_op Join operation name * \param[in] host_to Recipient of message */ static xmlNode * create_dc_message(const char *join_op, const char *host_to) { xmlNode *msg = create_request(join_op, NULL, host_to, CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL); /* Identify which election this is a part of */ crm_xml_add_int(msg, PCMK__XA_JOIN_ID, current_join_id); /* Add a field specifying whether the DC is shutting down. This keeps the * joining node from fencing the old DC if it becomes the new DC. */ pcmk__xe_set_bool_attr(msg, PCMK__XA_DC_LEAVING, pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)); return msg; } static void join_make_offer(gpointer key, gpointer value, gpointer user_data) { /* @TODO We don't use user_data except to distinguish one particular call * from others. Make this clearer. */ xmlNode *offer = NULL; pcmk__node_status_t *member = (pcmk__node_status_t *) value; CRM_ASSERT(member != NULL); if (!pcmk__cluster_is_node_active(member)) { crm_info("Not making join-%d offer to inactive node %s", current_join_id, pcmk__s(member->name, "with unknown name")); if ((member->expected == NULL) && pcmk__str_eq(member->state, PCMK__VALUE_LOST, pcmk__str_none)) { /* You would think this unsafe, but in fact this plus an * active resource is what causes it to be fenced. * * Yes, this does mean that any node that dies at the same * time as the old DC and is not running resource (still) * won't be fenced. * * I'm not happy about this either. */ pcmk__update_peer_expected(__func__, member, CRMD_JOINSTATE_DOWN); } return; } if (member->name == NULL) { crm_info("Not making join-%d offer to node uuid %s with unknown name", current_join_id, member->xml_id); return; } if (controld_globals.membership_id != controld_globals.peer_seq) { controld_globals.membership_id = controld_globals.peer_seq; crm_info("Making join-%d offers based on membership event %llu", current_join_id, controld_globals.peer_seq); } if (user_data != NULL) { enum controld_join_phase phase = controld_get_join_phase(member); if (phase > controld_join_none) { crm_info("Not making join-%d offer to already known node %s (%s)", current_join_id, member->name, join_phase_text(phase)); return; } } crm_update_peer_join(__func__, (pcmk__node_status_t*) member, controld_join_none); offer = create_dc_message(CRM_OP_JOIN_OFFER, member->name); // Advertise our feature set so the joining node can bail if not compatible crm_xml_add(offer, PCMK_XA_CRM_FEATURE_SET, CRM_FEATURE_SET); crm_info("Sending join-%d offer to %s", current_join_id, member->name); pcmk__cluster_send_message(member, pcmk__cluster_msg_controld, offer); pcmk__xml_free(offer); crm_update_peer_join(__func__, member, controld_join_welcomed); } /* A_DC_JOIN_OFFER_ALL */ void do_dc_join_offer_all(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int count; /* Reset everyone's status back to down or in_ccm in the CIB. * Any nodes that are active in the CIB but not in the cluster membership * will be seen as offline by the scheduler anyway. */ current_join_id++; start_join_round(); update_dc(NULL); if (cause == C_HA_MESSAGE && current_input == I_NODE_JOIN) { crm_info("A new node joined the cluster"); } g_hash_table_foreach(crm_peer_cache, join_make_offer, NULL); count = crmd_join_phase_count(controld_join_welcomed); crm_info("Waiting on join-%d requests from %d outstanding node%s", current_join_id, count, pcmk__plural_s(count)); // Don't waste time by invoking the scheduler yet } /* A_DC_JOIN_OFFER_ONE */ void do_dc_join_offer_one(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { pcmk__node_status_t *member = NULL; ha_msg_input_t *welcome = NULL; int count; const char *join_to = NULL; if (msg_data->data == NULL) { crm_info("Making join-%d offers to any unconfirmed nodes " "because an unknown node joined", current_join_id); g_hash_table_foreach(crm_peer_cache, join_make_offer, &member); check_join_state(cur_state, __func__); return; } welcome = fsa_typed_data(fsa_dt_ha_msg); if (welcome == NULL) { // fsa_typed_data() already logged an error return; } join_to = crm_element_value(welcome->msg, PCMK__XA_SRC); if (join_to == NULL) { crm_err("Can't make join-%d offer to unknown node", current_join_id); return; } member = pcmk__get_node(0, join_to, NULL, pcmk__node_search_cluster_member); /* It is possible that a node will have been sick or starting up when the * original offer was made. However, it will either re-announce itself in * due course, or we can re-store the original offer on the client. */ crm_update_peer_join(__func__, member, controld_join_none); join_make_offer(NULL, member, NULL); /* If the offer isn't to the local node, make an offer to the local node as * well, to ensure the correct value for max_generation_from. */ if (strcasecmp(join_to, controld_globals.our_nodename) != 0) { member = pcmk__get_node(0, controld_globals.our_nodename, NULL, pcmk__node_search_cluster_member); join_make_offer(NULL, member, NULL); } /* This was a genuine join request; cancel any existing transition and * invoke the scheduler. */ abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Node join", NULL); count = crmd_join_phase_count(controld_join_welcomed); crm_info("Waiting on join-%d requests from %d outstanding node%s", current_join_id, count, pcmk__plural_s(count)); // Don't waste time by invoking the scheduler yet } static int compare_int_fields(xmlNode * left, xmlNode * right, const char *field) { const char *elem_l = crm_element_value(left, field); const char *elem_r = crm_element_value(right, field); long long int_elem_l; long long int_elem_r; pcmk__scan_ll(elem_l, &int_elem_l, -1LL); pcmk__scan_ll(elem_r, &int_elem_r, -1LL); if (int_elem_l < int_elem_r) { return -1; } else if (int_elem_l > int_elem_r) { return 1; } return 0; } /* A_DC_JOIN_PROCESS_REQ */ void do_dc_join_filter_offer(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { xmlNode *generation = NULL; int cmp = 0; int join_id = -1; int count = 0; gint value = 0; gboolean ack_nack_bool = TRUE; ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg); const char *join_from = crm_element_value(join_ack->msg, PCMK__XA_SRC); const char *ref = crm_element_value(join_ack->msg, PCMK_XA_REFERENCE); const char *join_version = crm_element_value(join_ack->msg, PCMK_XA_CRM_FEATURE_SET); pcmk__node_status_t *join_node = NULL; if (join_from == NULL) { crm_err("Ignoring invalid join request without node name"); return; } join_node = pcmk__get_node(0, join_from, NULL, pcmk__node_search_cluster_member); crm_element_value_int(join_ack->msg, PCMK__XA_JOIN_ID, &join_id); if (join_id != current_join_id) { crm_debug("Ignoring join-%d request from %s because we are on join-%d", join_id, join_from, current_join_id); check_join_state(cur_state, __func__); return; } generation = join_ack->xml; if (max_generation_xml != NULL && generation != NULL) { int lpc = 0; const char *attributes[] = { PCMK_XA_ADMIN_EPOCH, PCMK_XA_EPOCH, PCMK_XA_NUM_UPDATES, }; /* It's not obvious that join_ack->xml is the PCMK__XE_GENERATION_TUPLE * element from the join client. The "if" guard is for clarity. */ if (pcmk__xe_is(generation, PCMK__XE_GENERATION_TUPLE)) { for (lpc = 0; cmp == 0 && lpc < PCMK__NELEM(attributes); lpc++) { cmp = compare_int_fields(max_generation_xml, generation, attributes[lpc]); } } else { // Should always be PCMK__XE_GENERATION_TUPLE CRM_LOG_ASSERT(false); } } if (ref == NULL) { ref = "none"; // for logging only } if (lookup_failed_sync_node(join_from, &value) == pcmk_rc_ok) { crm_err("Rejecting join-%d request from node %s because we failed to " "sync its CIB in join-%d " QB_XS " ref=%s", join_id, join_from, value, ref); ack_nack_bool = FALSE; } else if (!pcmk__cluster_is_node_active(join_node)) { if (match_down_event(join_from) != NULL) { /* The join request was received after the node was fenced or * otherwise shutdown in a way that we're aware of. No need to log * an error in this rare occurrence; we know the client was recently * shut down, and receiving a lingering in-flight request is not * cause for alarm. */ crm_debug("Rejecting join-%d request from inactive node %s " QB_XS " ref=%s", join_id, join_from, ref); } else { crm_err("Rejecting join-%d request from inactive node %s " QB_XS " ref=%s", join_id, join_from, ref); } ack_nack_bool = FALSE; } else if (generation == NULL) { crm_err("Rejecting invalid join-%d request from node %s " "missing CIB generation " QB_XS " ref=%s", join_id, join_from, ref); ack_nack_bool = FALSE; } else if ((join_version == NULL) || !feature_set_compatible(CRM_FEATURE_SET, join_version)) { crm_err("Rejecting join-%d request from node %s because feature set %s" " is incompatible with ours (%s) " QB_XS " ref=%s", join_id, join_from, (join_version? join_version : "pre-3.1.0"), CRM_FEATURE_SET, ref); ack_nack_bool = FALSE; } else if (max_generation_xml == NULL) { const char *validation = crm_element_value(generation, PCMK_XA_VALIDATE_WITH); if (pcmk__get_schema(validation) == NULL) { crm_err("Rejecting join-%d request from %s (with first CIB " "generation) due to unknown schema version %s " QB_XS " ref=%s", join_id, join_from, pcmk__s(validation, "(missing)"), ref); ack_nack_bool = FALSE; } else { crm_debug("Accepting join-%d request from %s (with first CIB " "generation) " QB_XS " ref=%s", join_id, join_from, ref); max_generation_xml = pcmk__xml_copy(NULL, generation); pcmk__str_update(&max_generation_from, join_from); } } else if ((cmp < 0) || ((cmp == 0) && pcmk__str_eq(join_from, controld_globals.our_nodename, pcmk__str_casei))) { const char *validation = crm_element_value(generation, PCMK_XA_VALIDATE_WITH); if (pcmk__get_schema(validation) == NULL) { crm_err("Rejecting join-%d request from %s (with better CIB " "generation than current best from %s) due to unknown " "schema version %s " QB_XS " ref=%s", join_id, join_from, max_generation_from, pcmk__s(validation, "(missing)"), ref); ack_nack_bool = FALSE; } else { crm_debug("Accepting join-%d request from %s (with better CIB " "generation than current best from %s) " QB_XS " ref=%s", join_id, join_from, max_generation_from, ref); crm_log_xml_debug(max_generation_xml, "Old max generation"); crm_log_xml_debug(generation, "New max generation"); pcmk__xml_free(max_generation_xml); max_generation_xml = pcmk__xml_copy(NULL, join_ack->xml); pcmk__str_update(&max_generation_from, join_from); } } else { crm_debug("Accepting join-%d request from %s " QB_XS " ref=%s", join_id, join_from, ref); } if (!ack_nack_bool) { crm_update_peer_join(__func__, join_node, controld_join_nack); pcmk__update_peer_expected(__func__, join_node, CRMD_JOINSTATE_NACK); } else { crm_update_peer_join(__func__, join_node, controld_join_integrated); pcmk__update_peer_expected(__func__, join_node, CRMD_JOINSTATE_MEMBER); } count = crmd_join_phase_count(controld_join_integrated); crm_debug("%d node%s currently integrated in join-%d", count, pcmk__plural_s(count), join_id); if (check_join_state(cur_state, __func__) == FALSE) { // Don't waste time by invoking the scheduler yet count = crmd_join_phase_count(controld_join_welcomed); crm_debug("Waiting on join-%d requests from %d outstanding node%s", join_id, count, pcmk__plural_s(count)); } } /* A_DC_JOIN_FINALIZE */ void do_dc_join_finalize(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { char *sync_from = NULL; int rc = pcmk_ok; int count_welcomed = crmd_join_phase_count(controld_join_welcomed); int count_finalizable = crmd_join_phase_count(controld_join_integrated) + crmd_join_phase_count(controld_join_nack); /* This we can do straight away and avoid clients timing us out * while we compute the latest CIB */ if (count_welcomed != 0) { crm_debug("Waiting on join-%d requests from %d outstanding node%s " "before finalizing join", current_join_id, count_welcomed, pcmk__plural_s(count_welcomed)); crmd_join_phase_log(LOG_DEBUG); /* crmd_fsa_stall(FALSE); Needed? */ return; } else if (count_finalizable == 0) { crm_debug("Finalization not needed for join-%d at the current time", current_join_id); crmd_join_phase_log(LOG_DEBUG); check_join_state(controld_globals.fsa_state, __func__); return; } controld_clear_fsa_input_flags(R_HAVE_CIB); if (pcmk__str_eq(max_generation_from, controld_globals.our_nodename, pcmk__str_null_matches|pcmk__str_casei)) { controld_set_fsa_input_flags(R_HAVE_CIB); } if (!controld_globals.transition_graph->complete) { crm_warn("Delaying join-%d finalization while transition in progress", current_join_id); crmd_join_phase_log(LOG_DEBUG); crmd_fsa_stall(FALSE); return; } if (pcmk_is_set(controld_globals.fsa_input_register, R_HAVE_CIB)) { // Send our CIB out to everyone sync_from = pcmk__str_copy(controld_globals.our_nodename); crm_debug("Finalizing join-%d for %d node%s (sync'ing from local CIB)", current_join_id, count_finalizable, pcmk__plural_s(count_finalizable)); crm_log_xml_debug(max_generation_xml, "Requested CIB version"); } else { // Ask for the agreed best CIB sync_from = pcmk__str_copy(max_generation_from); crm_notice("Finalizing join-%d for %d node%s (sync'ing CIB from %s)", current_join_id, count_finalizable, pcmk__plural_s(count_finalizable), sync_from); crm_log_xml_notice(max_generation_xml, "Requested CIB version"); } crmd_join_phase_log(LOG_DEBUG); rc = controld_globals.cib_conn->cmds->sync_from(controld_globals.cib_conn, sync_from, NULL, cib_none); fsa_register_cib_callback(rc, sync_from, finalize_sync_callback); } void free_max_generation(void) { free(max_generation_from); max_generation_from = NULL; pcmk__xml_free(max_generation_xml); max_generation_xml = NULL; } void finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { CRM_LOG_ASSERT(-EPERM != rc); if (rc != pcmk_ok) { const char *sync_from = (const char *) user_data; do_crm_log(((rc == -pcmk_err_old_data)? LOG_WARNING : LOG_ERR), "Could not sync CIB from %s in join-%d: %s", sync_from, current_join_id, pcmk_strerror(rc)); if (rc != -pcmk_err_old_data) { record_failed_sync_node(sync_from, current_join_id); } /* restart the whole join process */ register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION_DC, NULL, NULL, __func__); } else if (!AM_I_DC) { crm_debug("Sync'ed CIB for join-%d but no longer DC", current_join_id); } else if (controld_globals.fsa_state != S_FINALIZE_JOIN) { crm_debug("Sync'ed CIB for join-%d but no longer in S_FINALIZE_JOIN " "(%s)", current_join_id, fsa_state2string(controld_globals.fsa_state)); } else { controld_set_fsa_input_flags(R_HAVE_CIB); /* make sure dc_uuid is re-set to us */ if (!check_join_state(controld_globals.fsa_state, __func__)) { int count_finalizable = 0; count_finalizable = crmd_join_phase_count(controld_join_integrated) + crmd_join_phase_count(controld_join_nack); crm_debug("Notifying %d node%s of join-%d results", count_finalizable, pcmk__plural_s(count_finalizable), current_join_id); g_hash_table_foreach(crm_peer_cache, finalize_join_for, NULL); } } } static void join_node_state_commit_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { const char *node = user_data; if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; // for register_fsa_error() macro crm_crit("join-%d node history update (via CIB call %d) for node %s " "failed: %s", current_join_id, call_id, node, pcmk_strerror(rc)); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } crm_debug("join-%d node history update (via CIB call %d) for node %s " "complete", current_join_id, call_id, node); check_join_state(controld_globals.fsa_state, __func__); } /* A_DC_JOIN_PROCESS_ACK */ void do_dc_join_ack(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int join_id = -1; ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg); const char *op = crm_element_value(join_ack->msg, PCMK__XA_CRM_TASK); char *join_from = crm_element_value_copy(join_ack->msg, PCMK__XA_SRC); pcmk__node_status_t *peer = NULL; enum controld_join_phase phase = controld_join_none; enum controld_section_e section = controld_section_lrm; char *xpath = NULL; xmlNode *state = join_ack->xml; xmlNode *execd_state = NULL; cib_t *cib = controld_globals.cib_conn; int rc = pcmk_ok; // Sanity checks if (join_from == NULL) { crm_warn("Ignoring message received without node identification"); goto done; } if (op == NULL) { crm_warn("Ignoring message received from %s without task", join_from); goto done; } if (strcmp(op, CRM_OP_JOIN_CONFIRM)) { crm_debug("Ignoring '%s' message from %s while waiting for '%s'", op, join_from, CRM_OP_JOIN_CONFIRM); goto done; } if (crm_element_value_int(join_ack->msg, PCMK__XA_JOIN_ID, &join_id) != 0) { crm_warn("Ignoring join confirmation from %s without valid join ID", join_from); goto done; } peer = pcmk__get_node(0, join_from, NULL, pcmk__node_search_cluster_member); phase = controld_get_join_phase(peer); if (phase != controld_join_finalized) { crm_info("Ignoring out-of-sequence join-%d confirmation from %s " "(currently %s not %s)", join_id, join_from, join_phase_text(phase), join_phase_text(controld_join_finalized)); goto done; } if (join_id != current_join_id) { crm_err("Rejecting join-%d confirmation from %s " "because currently on join-%d", join_id, join_from, current_join_id); crm_update_peer_join(__func__, peer, controld_join_nack); goto done; } crm_update_peer_join(__func__, peer, controld_join_confirmed); /* Update CIB with node's current executor state. A new transition will be * triggered later, when the CIB manager notifies us of the change. * * The delete and modify requests are part of an atomic transaction. */ rc = cib->cmds->init_transaction(cib); if (rc != pcmk_ok) { goto done; } // Delete relevant parts of node's current executor state from CIB if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { section = controld_section_lrm_unlocked; } controld_node_state_deletion_strings(join_from, section, &xpath, NULL); rc = cib->cmds->remove(cib, xpath, NULL, cib_xpath|cib_multiple|cib_transaction); if (rc != pcmk_ok) { goto done; } // Update CIB with node's latest known executor state if (pcmk__str_eq(join_from, controld_globals.our_nodename, pcmk__str_casei)) { // Use the latest possible state if processing our own join ack execd_state = controld_query_executor_state(); if (execd_state != NULL) { crm_debug("Updating local node history for join-%d from query " "result", current_join_id); state = execd_state; } else { crm_warn("Updating local node history from join-%d confirmation " "because query failed", current_join_id); } } else { crm_debug("Updating node history for %s from join-%d confirmation", join_from, current_join_id); } rc = cib->cmds->modify(cib, PCMK_XE_STATUS, state, cib_can_create|cib_transaction); pcmk__xml_free(execd_state); if (rc != pcmk_ok) { goto done; } // Commit the transaction rc = cib->cmds->end_transaction(cib, true, cib_none); fsa_register_cib_callback(rc, join_from, join_node_state_commit_callback); if (rc > 0) { // join_from will be freed after callback join_from = NULL; rc = pcmk_ok; } done: if (rc != pcmk_ok) { crm_crit("join-%d node history update for node %s failed: %s", current_join_id, join_from, pcmk_strerror(rc)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } free(join_from); free(xpath); } void finalize_join_for(gpointer key, gpointer value, gpointer user_data) { xmlNode *acknak = NULL; xmlNode *tmp1 = NULL; pcmk__node_status_t *join_node = value; const char *join_to = join_node->name; enum controld_join_phase phase = controld_get_join_phase(join_node); bool integrated = false; switch (phase) { case controld_join_integrated: integrated = true; break; case controld_join_nack: break; default: crm_trace("Not updating non-integrated and non-nacked node %s (%s) " "for join-%d", join_to, join_phase_text(phase), current_join_id); return; } /* Update the element with the node's name and UUID, in case they * weren't known before */ crm_trace("Updating node name and UUID in CIB for %s", join_to); tmp1 = pcmk__xe_create(NULL, PCMK_XE_NODE); crm_xml_add(tmp1, PCMK_XA_ID, pcmk__cluster_node_uuid(join_node)); crm_xml_add(tmp1, PCMK_XA_UNAME, join_to); fsa_cib_anon_update(PCMK_XE_NODES, tmp1); pcmk__xml_free(tmp1); join_node = pcmk__get_node(0, join_to, NULL, pcmk__node_search_cluster_member); if (!pcmk__cluster_is_node_active(join_node)) { /* * NACK'ing nodes that the membership layer doesn't know about yet * simply creates more churn * * Better to leave them waiting and let the join restart when * the new membership event comes in * * All other NACKs (due to versions etc) should still be processed */ pcmk__update_peer_expected(__func__, join_node, CRMD_JOINSTATE_PENDING); return; } // Acknowledge or nack node's join request crm_debug("%sing join-%d request from %s", integrated? "Acknowledg" : "Nack", current_join_id, join_to); acknak = create_dc_message(CRM_OP_JOIN_ACKNAK, join_to); pcmk__xe_set_bool_attr(acknak, CRM_OP_JOIN_ACKNAK, integrated); if (integrated) { // No change needed for a nacked node crm_update_peer_join(__func__, join_node, controld_join_finalized); pcmk__update_peer_expected(__func__, join_node, CRMD_JOINSTATE_MEMBER); /* Iterate through the remote peer cache and add information on which * node hosts each to the ACK message. This keeps new controllers in * sync with what has already happened. */ if (pcmk__cluster_num_remote_nodes() > 0) { GHashTableIter iter; pcmk__node_status_t *node = NULL; xmlNode *remotes = pcmk__xe_create(acknak, PCMK_XE_NODES); - g_hash_table_iter_init(&iter, crm_remote_peer_cache); + g_hash_table_iter_init(&iter, pcmk__remote_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { xmlNode *remote = NULL; if (!node->conn_host) { continue; } remote = pcmk__xe_create(remotes, PCMK_XE_NODE); pcmk__xe_set_props(remote, PCMK_XA_ID, node->name, PCMK__XA_NODE_STATE, node->state, PCMK__XA_CONNECTION_HOST, node->conn_host, NULL); } } } pcmk__cluster_send_message(join_node, pcmk__cluster_msg_controld, acknak); pcmk__xml_free(acknak); return; } gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source) { static unsigned long long highest_seq = 0; if (controld_globals.membership_id != controld_globals.peer_seq) { crm_debug("join-%d: Membership changed from %llu to %llu " QB_XS " highest=%llu state=%s for=%s", current_join_id, controld_globals.membership_id, controld_globals.peer_seq, highest_seq, fsa_state2string(cur_state), source); if (highest_seq < controld_globals.peer_seq) { /* Don't spam the FSA with duplicates */ highest_seq = controld_globals.peer_seq; register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } } else if (cur_state == S_INTEGRATION) { if (crmd_join_phase_count(controld_join_welcomed) == 0) { int count = crmd_join_phase_count(controld_join_integrated); crm_debug("join-%d: Integration of %d peer%s complete " QB_XS " state=%s for=%s", current_join_id, count, pcmk__plural_s(count), fsa_state2string(cur_state), source); register_fsa_input_before(C_FSA_INTERNAL, I_INTEGRATED, NULL); return TRUE; } } else if (cur_state == S_FINALIZE_JOIN) { if (!pcmk_is_set(controld_globals.fsa_input_register, R_HAVE_CIB)) { crm_debug("join-%d: Delaying finalization until we have CIB " QB_XS " state=%s for=%s", current_join_id, fsa_state2string(cur_state), source); return TRUE; } else if (crmd_join_phase_count(controld_join_welcomed) != 0) { int count = crmd_join_phase_count(controld_join_welcomed); crm_debug("join-%d: Still waiting on %d welcomed node%s " QB_XS " state=%s for=%s", current_join_id, count, pcmk__plural_s(count), fsa_state2string(cur_state), source); crmd_join_phase_log(LOG_DEBUG); } else if (crmd_join_phase_count(controld_join_integrated) != 0) { int count = crmd_join_phase_count(controld_join_integrated); crm_debug("join-%d: Still waiting on %d integrated node%s " QB_XS " state=%s for=%s", current_join_id, count, pcmk__plural_s(count), fsa_state2string(cur_state), source); crmd_join_phase_log(LOG_DEBUG); } else if (crmd_join_phase_count(controld_join_finalized) != 0) { int count = crmd_join_phase_count(controld_join_finalized); crm_debug("join-%d: Still waiting on %d finalized node%s " QB_XS " state=%s for=%s", current_join_id, count, pcmk__plural_s(count), fsa_state2string(cur_state), source); crmd_join_phase_log(LOG_DEBUG); } else { crm_debug("join-%d: Complete " QB_XS " state=%s for=%s", current_join_id, fsa_state2string(cur_state), source); register_fsa_input_later(C_FSA_INTERNAL, I_FINALIZED, NULL); return TRUE; } } return FALSE; } void do_dc_join_final(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_debug("Ensuring DC, quorum and node attributes are up-to-date"); crm_update_quorum(pcmk__cluster_has_quorum(), TRUE); } int crmd_join_phase_count(enum controld_join_phase phase) { int count = 0; pcmk__node_status_t *peer; GHashTableIter iter; g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &peer)) { if (controld_get_join_phase(peer) == phase) { count++; } } return count; } void crmd_join_phase_log(int level) { pcmk__node_status_t *peer; GHashTableIter iter; g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &peer)) { do_crm_log(level, "join-%d: %s=%s", current_join_id, peer->name, join_phase_text(controld_get_join_phase(peer))); } } diff --git a/daemons/controld/controld_membership.c b/daemons/controld/controld_membership.c index c003c3a427..01c336dbb4 100644 --- a/daemons/controld/controld_membership.c +++ b/daemons/controld/controld_membership.c @@ -1,467 +1,467 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ /* put these first so that uuid_t is defined without conflicts */ #include #include #include #include #include #include #include void post_cache_update(int instance); extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); static void reap_dead_nodes(gpointer key, gpointer value, gpointer user_data) { pcmk__node_status_t *node = value; if (pcmk__cluster_is_node_active(node)) { return; } crm_update_peer_join(__func__, node, controld_join_none); if ((node != NULL) && (node->name != NULL)) { if (pcmk__str_eq(controld_globals.our_nodename, node->name, pcmk__str_casei)) { crm_err("We're not part of the cluster anymore"); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); } else if (!AM_I_DC && pcmk__str_eq(node->name, controld_globals.dc_name, pcmk__str_casei)) { crm_warn("Our DC node (%s) left the cluster", node->name); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); } } if ((controld_globals.fsa_state == S_INTEGRATION) || (controld_globals.fsa_state == S_FINALIZE_JOIN)) { check_join_state(controld_globals.fsa_state, __func__); } if ((node != NULL) && (node->xml_id != NULL)) { fail_incompletable_actions(controld_globals.transition_graph, node->xml_id); } } void post_cache_update(int instance) { xmlNode *no_op = NULL; controld_globals.peer_seq = instance; crm_debug("Updated cache after membership event %d.", instance); g_hash_table_foreach(crm_peer_cache, reap_dead_nodes, NULL); controld_set_fsa_input_flags(R_MEMBERSHIP); if (AM_I_DC) { populate_cib_nodes(node_update_quick | node_update_cluster | node_update_peer | node_update_expected, __func__); } /* * If we lost nodes, we should re-check the election status * Safe to call outside of an election */ controld_set_fsa_action_flags(A_ELECTION_CHECK); controld_trigger_fsa(); /* Membership changed, remind everyone we're here. * This will aid detection of duplicate DCs */ no_op = create_request(CRM_OP_NOOP, NULL, NULL, CRM_SYSTEM_CRMD, AM_I_DC ? CRM_SYSTEM_DC : CRM_SYSTEM_CRMD, NULL); pcmk__cluster_send_message(NULL, pcmk__cluster_msg_controld, no_op); pcmk__xml_free(no_op); } static void crmd_node_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if (rc == pcmk_ok) { crm_trace("Node update %d complete", call_id); } else if(call_id < pcmk_ok) { crm_err("Node update failed: %s (%d)", pcmk_strerror(call_id), call_id); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } else { crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } /*! * \internal * \brief Create an XML node state tag with updates * * \param[in,out] node Node whose state will be used for update * \param[in] flags Bitmask of node_update_flags indicating what to update * \param[in,out] parent XML node to contain update (or NULL) * \param[in] source Who requested the update (only used for logging) * * \return Pointer to created node state tag */ xmlNode * create_node_state_update(pcmk__node_status_t *node, int flags, xmlNode *parent, const char *source) { const char *value = NULL; xmlNode *node_state; if (!node->state) { crm_info("Node update for %s cancelled: no state, not seen yet", node->name); return NULL; } node_state = pcmk__xe_create(parent, PCMK__XE_NODE_STATE); if (pcmk_is_set(node->flags, pcmk__node_status_remote)) { pcmk__xe_set_bool_attr(node_state, PCMK_XA_REMOTE_NODE, true); } if (crm_xml_add(node_state, PCMK_XA_ID, pcmk__cluster_node_uuid(node)) == NULL) { crm_info("Node update for %s cancelled: no ID", node->name); pcmk__xml_free(node_state); return NULL; } crm_xml_add(node_state, PCMK_XA_UNAME, node->name); if ((flags & node_update_cluster) && node->state) { if (compare_version(controld_globals.dc_version, "3.18.0") >= 0) { // A value 0 means the node is not a cluster member. crm_xml_add_ll(node_state, PCMK__XA_IN_CCM, node->when_member); } else { pcmk__xe_set_bool_attr(node_state, PCMK__XA_IN_CCM, pcmk__str_eq(node->state, PCMK_VALUE_MEMBER, pcmk__str_none)); } } if (!pcmk_is_set(node->flags, pcmk__node_status_remote)) { if (flags & node_update_peer) { if (compare_version(controld_globals.dc_version, "3.18.0") >= 0) { // A value 0 means the peer is offline in CPG. crm_xml_add_ll(node_state, PCMK_XA_CRMD, node->when_online); } else { // @COMPAT DCs < 2.1.7 use online/offline rather than timestamp value = PCMK_VALUE_OFFLINE; if (pcmk_is_set(node->processes, crm_get_cluster_proc())) { value = PCMK_VALUE_ONLINE; } crm_xml_add(node_state, PCMK_XA_CRMD, value); } } if (flags & node_update_join) { if (controld_get_join_phase(node) <= controld_join_none) { value = CRMD_JOINSTATE_DOWN; } else { value = CRMD_JOINSTATE_MEMBER; } crm_xml_add(node_state, PCMK__XA_JOIN, value); } if (flags & node_update_expected) { crm_xml_add(node_state, PCMK_XA_EXPECTED, node->expected); } } crm_xml_add(node_state, PCMK_XA_CRM_DEBUG_ORIGIN, source); return node_state; } static void remove_conflicting_node_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { char *node_uuid = user_data; do_crm_log_unlikely(rc == 0 ? LOG_DEBUG : LOG_NOTICE, "Deletion of the unknown conflicting node \"%s\": %s (rc=%d)", node_uuid, pcmk_strerror(rc), rc); } static void search_conflicting_node_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { char *new_node_uuid = user_data; xmlNode *node_xml = NULL; if (rc != pcmk_ok) { if (rc != -ENXIO) { crm_notice("Searching conflicting nodes for %s failed: %s (%d)", new_node_uuid, pcmk_strerror(rc), rc); } return; } else if (output == NULL) { return; } if (pcmk__xe_is(output, PCMK_XE_NODE)) { node_xml = output; } else { node_xml = pcmk__xe_first_child(output, PCMK_XE_NODE, NULL, NULL); } for (; node_xml != NULL; node_xml = pcmk__xe_next_same(node_xml)) { const char *node_uuid = NULL; const char *node_uname = NULL; GHashTableIter iter; pcmk__node_status_t *node = NULL; gboolean known = FALSE; node_uuid = crm_element_value(node_xml, PCMK_XA_ID); node_uname = crm_element_value(node_xml, PCMK_XA_UNAME); if (node_uuid == NULL || node_uname == NULL) { continue; } g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if ((node != NULL) && pcmk__str_eq(node->xml_id, node_uuid, pcmk__str_casei) && pcmk__str_eq(node->name, node_uname, pcmk__str_casei)) { known = TRUE; break; } } if (known == FALSE) { cib_t *cib_conn = controld_globals.cib_conn; int delete_call_id = 0; xmlNode *node_state_xml = NULL; crm_notice("Deleting unknown node %s/%s which has conflicting uname with %s", node_uuid, node_uname, new_node_uuid); delete_call_id = cib_conn->cmds->remove(cib_conn, PCMK_XE_NODES, node_xml, cib_none); fsa_register_cib_callback(delete_call_id, pcmk__str_copy(node_uuid), remove_conflicting_node_callback); node_state_xml = pcmk__xe_create(NULL, PCMK__XE_NODE_STATE); crm_xml_add(node_state_xml, PCMK_XA_ID, node_uuid); crm_xml_add(node_state_xml, PCMK_XA_UNAME, node_uname); delete_call_id = cib_conn->cmds->remove(cib_conn, PCMK_XE_STATUS, node_state_xml, cib_none); fsa_register_cib_callback(delete_call_id, pcmk__str_copy(node_uuid), remove_conflicting_node_callback); pcmk__xml_free(node_state_xml); } } } static void node_list_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if(call_id < pcmk_ok) { crm_err("Node list update failed: %s (%d)", pcmk_strerror(call_id), call_id); crm_log_xml_debug(msg, "update:failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } else if(rc < pcmk_ok) { crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "update:failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } void populate_cib_nodes(enum node_update_flags flags, const char *source) { cib_t *cib_conn = controld_globals.cib_conn; int call_id = 0; gboolean from_hashtable = TRUE; xmlNode *node_list = pcmk__xe_create(NULL, PCMK_XE_NODES); #if SUPPORT_COROSYNC if (!pcmk_is_set(flags, node_update_quick) && (pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync)) { from_hashtable = pcmk__corosync_add_nodes(node_list); } #endif if (from_hashtable) { GHashTableIter iter; pcmk__node_status_t *node = NULL; GString *xpath = NULL; g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { xmlNode *new_node = NULL; if ((node->xml_id != NULL) && (node->name != NULL)) { crm_trace("Creating node entry for %s/%s", node->name, node->xml_id); if (xpath == NULL) { xpath = g_string_sized_new(512); } else { g_string_truncate(xpath, 0); } /* We need both to be valid */ new_node = pcmk__xe_create(node_list, PCMK_XE_NODE); crm_xml_add(new_node, PCMK_XA_ID, node->xml_id); crm_xml_add(new_node, PCMK_XA_UNAME, node->name); /* Search and remove unknown nodes with the conflicting uname from CIB */ pcmk__g_strcat(xpath, "/" PCMK_XE_CIB "/" PCMK_XE_CONFIGURATION "/" PCMK_XE_NODES "/" PCMK_XE_NODE "[@" PCMK_XA_UNAME "='", node->name, "']" "[@" PCMK_XA_ID "!='", node->xml_id, "']", NULL); call_id = cib_conn->cmds->query(cib_conn, (const char *) xpath->str, NULL, cib_xpath); fsa_register_cib_callback(call_id, pcmk__str_copy(node->xml_id), search_conflicting_node_callback); } } if (xpath != NULL) { g_string_free(xpath, TRUE); } } crm_trace("Populating section from %s", from_hashtable ? "hashtable" : "cluster"); if ((controld_update_cib(PCMK_XE_NODES, node_list, cib_none, node_list_update_callback) == pcmk_rc_ok) && (crm_peer_cache != NULL) && AM_I_DC) { /* * There is no need to update the local CIB with our values if * we've not seen valid membership data */ GHashTableIter iter; pcmk__node_status_t *node = NULL; pcmk__xml_free(node_list); node_list = pcmk__xe_create(NULL, PCMK_XE_STATUS); g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { create_node_state_update(node, flags, node_list, source); } - if (crm_remote_peer_cache) { - g_hash_table_iter_init(&iter, crm_remote_peer_cache); + if (pcmk__remote_peer_cache != NULL) { + g_hash_table_iter_init(&iter, pcmk__remote_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { create_node_state_update(node, flags, node_list, source); } } controld_update_cib(PCMK_XE_STATUS, node_list, cib_none, crmd_node_update_complete); } pcmk__xml_free(node_list); } static void cib_quorum_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if (rc == pcmk_ok) { crm_trace("Quorum update %d complete", call_id); } else { crm_err("Quorum update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } void crm_update_quorum(gboolean quorum, gboolean force_update) { bool has_quorum = pcmk_is_set(controld_globals.flags, controld_has_quorum); if (quorum) { controld_set_global_flags(controld_ever_had_quorum); } else if (pcmk_all_flags_set(controld_globals.flags, controld_ever_had_quorum |controld_no_quorum_suicide)) { pcmk__panic(__func__); } if (AM_I_DC && ((has_quorum && !quorum) || (!has_quorum && quorum) || force_update)) { xmlNode *update = NULL; update = pcmk__xe_create(NULL, PCMK_XE_CIB); crm_xml_add_int(update, PCMK_XA_HAVE_QUORUM, quorum); crm_xml_add(update, PCMK_XA_DC_UUID, controld_globals.our_uuid); crm_debug("Updating quorum status to %s", pcmk__btoa(quorum)); controld_update_cib(PCMK_XE_CIB, update, cib_none, cib_quorum_update_complete); pcmk__xml_free(update); /* Quorum changes usually cause a new transition via other activity: * quorum gained via a node joining will abort via the node join, * and quorum lost via a node leaving will usually abort via resource * activity and/or fencing. * * However, it is possible that nothing else causes a transition (e.g. * someone forces quorum via corosync-cmaptcl, or quorum is lost due to * a node in standby shutting down cleanly), so here ensure a new * transition is triggered. */ if (quorum) { /* If quorum was gained, abort after a short delay, in case multiple * nodes are joining around the same time, so the one that brings us * to quorum doesn't cause all the remaining ones to be fenced. */ abort_after_delay(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Quorum gained", 5000); } else { abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Quorum lost", NULL); } } if (quorum) { controld_set_global_flags(controld_has_quorum); } else { controld_clear_global_flags(controld_has_quorum); } } diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c index d2b36c1785..92a428436b 100644 --- a/daemons/controld/controld_te_events.c +++ b/daemons/controld/controld_te_events.c @@ -1,613 +1,613 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include /*! * \internal * \brief Action numbers of outside events processed in current update diff * * This table is to be used as a set. It should be empty when the transitioner * begins processing a CIB update diff. It ensures that if there are multiple * events (for example, "_last_0" and "_last_failure_0") for the same action, * only one of them updates the failcount. Events that originate outside the * cluster can't be confirmed, since they're not in the transition graph. */ static GHashTable *outside_events = NULL; /*! * \internal * \brief Empty the hash table containing action numbers of outside events */ void controld_remove_all_outside_events(void) { if (outside_events != NULL) { g_hash_table_remove_all(outside_events); } } /*! * \internal * \brief Destroy the hash table containing action numbers of outside events */ void controld_destroy_outside_events_table(void) { if (outside_events != NULL) { g_hash_table_destroy(outside_events); outside_events = NULL; } } /*! * \internal * \brief Add an outside event's action number to a set * * \return Standard Pacemaker return code. Specifically, \p pcmk_rc_ok if the * event was not already in the set, or \p pcmk_rc_already otherwise. */ static int record_outside_event(gint action_num) { if (outside_events == NULL) { outside_events = g_hash_table_new(NULL, NULL); } if (g_hash_table_add(outside_events, GINT_TO_POINTER(action_num))) { return pcmk_rc_ok; } return pcmk_rc_already; } gboolean fail_incompletable_actions(pcmk__graph_t *graph, const char *down_node) { const char *target_uuid = NULL; const char *router = NULL; const char *router_uuid = NULL; xmlNode *last_action = NULL; GList *gIter = NULL; GList *gIter2 = NULL; if (graph == NULL || graph->complete) { return FALSE; } gIter = graph->synapses; for (; gIter != NULL; gIter = gIter->next) { pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) gIter->data; if (pcmk_any_flags_set(synapse->flags, pcmk__synapse_confirmed|pcmk__synapse_failed)) { /* We've already been here */ continue; } gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { pcmk__graph_action_t *action = (pcmk__graph_action_t *) gIter2->data; if ((action->type == pcmk__pseudo_graph_action) || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) { continue; } else if (action->type == pcmk__cluster_graph_action) { const char *task = crm_element_value(action->xml, PCMK_XA_OPERATION); if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) { continue; } } target_uuid = crm_element_value(action->xml, PCMK__META_ON_NODE_UUID); router = crm_element_value(action->xml, PCMK__XA_ROUTER_NODE); if (router) { const pcmk__node_status_t *node = pcmk__get_node(0, router, NULL, pcmk__node_search_cluster_member); if (node != NULL) { router_uuid = node->xml_id; } } if (pcmk__str_eq(target_uuid, down_node, pcmk__str_casei) || pcmk__str_eq(router_uuid, down_node, pcmk__str_casei)) { pcmk__set_graph_action_flags(action, pcmk__graph_action_failed); pcmk__set_synapse_flags(synapse, pcmk__synapse_failed); last_action = action->xml; stop_te_timer(action); pcmk__update_graph(graph, action); if (pcmk_is_set(synapse->flags, pcmk__synapse_executed)) { crm_notice("Action %d (%s) was pending on %s (offline)", action->id, crm_element_value(action->xml, PCMK__XA_OPERATION_KEY), down_node); } else { crm_info("Action %d (%s) is scheduled for %s (offline)", action->id, crm_element_value(action->xml, PCMK__XA_OPERATION_KEY), down_node); } } } } if (last_action != NULL) { crm_info("Node %s shutdown resulted in un-runnable actions", down_node); abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Node failure", last_action); return TRUE; } return FALSE; } /*! * \internal * \brief Update failure-related node attributes if warranted * * \param[in] event XML describing operation that (maybe) failed * \param[in] event_node_uuid Node that event occurred on * \param[in] rc Actual operation return code * \param[in] target_rc Expected operation return code * \param[in] do_update If TRUE, do update regardless of operation type * \param[in] ignore_failures If TRUE, update last failure but not fail count * * \return TRUE if this was not a direct nack, success or lrm status refresh */ static gboolean update_failcount(const xmlNode *event, const char *event_node_uuid, int rc, int target_rc, gboolean do_update, gboolean ignore_failures) { guint interval_ms = 0; char *task = NULL; char *rsc_id = NULL; const char *value = NULL; const char *id = crm_element_value(event, PCMK__XA_OPERATION_KEY); const char *on_uname = pcmk__node_name_from_uuid(event_node_uuid); const char *origin = crm_element_value(event, PCMK_XA_CRM_DEBUG_ORIGIN); // Nothing needs to be done for success or status refresh if (rc == target_rc) { return FALSE; } else if (pcmk__str_eq(origin, "build_active_RAs", pcmk__str_casei)) { crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh", id, rc, on_uname); return FALSE; } /* Sanity check */ CRM_CHECK(on_uname != NULL, return TRUE); CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval_ms), crm_err("Couldn't parse: %s", pcmk__xe_id(event)); goto bail); /* Decide whether update is necessary and what value to use */ if ((interval_ms > 0) || pcmk__str_eq(task, PCMK_ACTION_PROMOTE, pcmk__str_none) || pcmk__str_eq(task, PCMK_ACTION_DEMOTE, pcmk__str_none)) { do_update = TRUE; } else if (pcmk__str_eq(task, PCMK_ACTION_START, pcmk__str_none)) { do_update = TRUE; value = pcmk__s(controld_globals.transition_graph->failed_start_offset, PCMK_VALUE_INFINITY); } else if (pcmk__str_eq(task, PCMK_ACTION_STOP, pcmk__str_none)) { do_update = TRUE; value = pcmk__s(controld_globals.transition_graph->failed_stop_offset, PCMK_VALUE_INFINITY); } if (do_update) { pcmk__attrd_query_pair_t *fail_pair = NULL; pcmk__attrd_query_pair_t *last_pair = NULL; char *fail_name = NULL; char *last_name = NULL; GList *attrs = NULL; uint32_t opts = pcmk__node_attr_none; char *now = pcmk__ttoa(time(NULL)); // Fail count will be either incremented or set to infinity if (!pcmk_str_is_infinity(value)) { value = PCMK_XA_VALUE "++"; } - if (g_hash_table_lookup(crm_remote_peer_cache, event_node_uuid)) { + if (g_hash_table_lookup(pcmk__remote_peer_cache, event_node_uuid)) { opts |= pcmk__node_attr_remote; } crm_info("Updating %s for %s on %s after failed %s: rc=%d (update=%s, time=%s)", (ignore_failures? "last failure" : "failcount"), rsc_id, on_uname, task, rc, value, now); /* Update the fail count, if we're not ignoring failures */ if (!ignore_failures) { fail_pair = pcmk__assert_alloc(1, sizeof(pcmk__attrd_query_pair_t)); fail_name = pcmk__failcount_name(rsc_id, task, interval_ms); fail_pair->name = fail_name; fail_pair->value = value; fail_pair->node = on_uname; attrs = g_list_prepend(attrs, fail_pair); } /* Update the last failure time (even if we're ignoring failures, * so that failure can still be detected and shown, e.g. by crm_mon) */ last_pair = pcmk__assert_alloc(1, sizeof(pcmk__attrd_query_pair_t)); last_name = pcmk__lastfailure_name(rsc_id, task, interval_ms); last_pair->name = last_name; last_pair->value = now; last_pair->node = on_uname; attrs = g_list_prepend(attrs, last_pair); update_attrd_list(attrs, opts); free(fail_name); free(fail_pair); free(last_name); free(last_pair); g_list_free(attrs); free(now); } bail: free(rsc_id); free(task); return TRUE; } pcmk__graph_action_t * controld_get_action(int id) { for (GList *item = controld_globals.transition_graph->synapses; item != NULL; item = item->next) { pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) item->data; for (GList *item2 = synapse->actions; item2; item2 = item2->next) { pcmk__graph_action_t *action = (pcmk__graph_action_t *) item2->data; if (action->id == id) { return action; } } } return NULL; } pcmk__graph_action_t * get_cancel_action(const char *id, const char *node) { GList *gIter = NULL; GList *gIter2 = NULL; gIter = controld_globals.transition_graph->synapses; for (; gIter != NULL; gIter = gIter->next) { pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) gIter->data; gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { const char *task = NULL; const char *target = NULL; pcmk__graph_action_t *action = (pcmk__graph_action_t *) gIter2->data; task = crm_element_value(action->xml, PCMK_XA_OPERATION); if (!pcmk__str_eq(PCMK_ACTION_CANCEL, task, pcmk__str_casei)) { continue; } task = crm_element_value(action->xml, PCMK__XA_OPERATION_KEY); if (!pcmk__str_eq(task, id, pcmk__str_casei)) { crm_trace("Wrong key %s for %s on %s", task, id, node); continue; } target = crm_element_value(action->xml, PCMK__META_ON_NODE_UUID); if (node && !pcmk__str_eq(target, node, pcmk__str_casei)) { crm_trace("Wrong node %s for %s on %s", target, id, node); continue; } crm_trace("Found %s on %s", id, node); return action; } } return NULL; } bool confirm_cancel_action(const char *id, const char *node_id) { const char *op_key = NULL; const char *node_name = NULL; pcmk__graph_action_t *cancel = get_cancel_action(id, node_id); if (cancel == NULL) { return FALSE; } op_key = crm_element_value(cancel->xml, PCMK__XA_OPERATION_KEY); node_name = crm_element_value(cancel->xml, PCMK__META_ON_NODE); stop_te_timer(cancel); te_action_confirmed(cancel, controld_globals.transition_graph); crm_info("Cancellation of %s on %s confirmed (action %d)", op_key, node_name, cancel->id); return TRUE; } /* downed nodes are listed like: ... */ #define XPATH_DOWNED "//" PCMK__XE_DOWNED \ "/" PCMK_XE_NODE "[@" PCMK_XA_ID "='%s']" /*! * \brief Find a transition event that would have made a specified node down * * \param[in] target UUID of node to match * * \return Matching event if found, NULL otherwise */ pcmk__graph_action_t * match_down_event(const char *target) { pcmk__graph_action_t *match = NULL; xmlXPathObjectPtr xpath_ret = NULL; GList *gIter, *gIter2; char *xpath = crm_strdup_printf(XPATH_DOWNED, target); for (gIter = controld_globals.transition_graph->synapses; gIter != NULL && match == NULL; gIter = gIter->next) { for (gIter2 = ((pcmk__graph_synapse_t * ) gIter->data)->actions; gIter2 != NULL && match == NULL; gIter2 = gIter2->next) { match = (pcmk__graph_action_t *) gIter2->data; if (pcmk_is_set(match->flags, pcmk__graph_action_executed)) { xpath_ret = xpath_search(match->xml, xpath); if (numXpathResults(xpath_ret) < 1) { match = NULL; } freeXpathObject(xpath_ret); } else { // Only actions that were actually started can match match = NULL; } } } free(xpath); if (match != NULL) { crm_debug("Shutdown action %d (%s) found for node %s", match->id, crm_element_value(match->xml, PCMK__XA_OPERATION_KEY), target); } else { crm_debug("No reason to expect node %s to be down", target); } return match; } void process_graph_event(xmlNode *event, const char *event_node) { int rc = -1; // Actual result int target_rc = -1; // Expected result int status = -1; // Executor status int callid = -1; // Executor call ID int transition_num = -1; // Transition number int action_num = -1; // Action number within transition char *update_te_uuid = NULL; bool ignore_failures = FALSE; const char *id = NULL; const char *desc = NULL; const char *magic = NULL; const char *uname = NULL; CRM_ASSERT(event != NULL); /* */ magic = crm_element_value(event, PCMK__XA_TRANSITION_KEY); if (magic == NULL) { /* non-change */ return; } crm_element_value_int(event, PCMK__XA_OP_STATUS, &status); if (status == PCMK_EXEC_PENDING) { return; } id = crm_element_value(event, PCMK__XA_OPERATION_KEY); crm_element_value_int(event, PCMK__XA_RC_CODE, &rc); crm_element_value_int(event, PCMK__XA_CALL_ID, &callid); rc = pcmk__effective_rc(rc); if (decode_transition_key(magic, &update_te_uuid, &transition_num, &action_num, &target_rc) == FALSE) { // decode_transition_key() already logged the bad key crm_err("Can't process action %s result: Incompatible versions? " QB_XS " call-id=%d", id, callid); abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Bad event", event); return; } if (transition_num == -1) { // E.g. crm_resource --fail if (record_outside_event(action_num) != pcmk_rc_ok) { crm_debug("Outside event with transition key '%s' has already been " "processed", magic); goto bail; } desc = "initiated outside of the cluster"; abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Unexpected event", event); } else if ((action_num < 0) || !pcmk__str_eq(update_te_uuid, controld_globals.te_uuid, pcmk__str_none)) { desc = "initiated by a different DC"; abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Foreign event", event); } else if ((controld_globals.transition_graph->id != transition_num) || controld_globals.transition_graph->complete) { // Action is not from currently active transition guint interval_ms = 0; if (parse_op_key(id, NULL, NULL, &interval_ms) && (interval_ms != 0)) { /* Recurring actions have the transition number they were first * scheduled in. */ if (status == PCMK_EXEC_CANCELLED) { confirm_cancel_action(id, get_node_id(event)); goto bail; } desc = "arrived after initial scheduling"; abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Change in recurring result", event); } else if (controld_globals.transition_graph->id != transition_num) { desc = "arrived really late"; abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Old event", event); } else { desc = "arrived late"; abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Inactive graph", event); } } else { // Event is result of an action from currently active transition pcmk__graph_action_t *action = controld_get_action(action_num); if (action == NULL) { // Should never happen desc = "unknown"; abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Unknown event", event); } else if (pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) { /* Nothing further needs to be done if the action has already been * confirmed. This can happen e.g. when processing both an * "xxx_last_0" or "xxx_last_failure_0" record as well as the main * history record, which would otherwise result in incorrectly * bumping the fail count twice. */ crm_log_xml_debug(event, "Event already confirmed:"); goto bail; } else { /* An action result needs to be confirmed. * (This is the only case where desc == NULL.) */ if (pcmk__str_eq(crm_meta_value(action->params, PCMK_META_ON_FAIL), PCMK_VALUE_IGNORE, pcmk__str_casei)) { ignore_failures = TRUE; } else if (rc != target_rc) { pcmk__set_graph_action_flags(action, pcmk__graph_action_failed); } stop_te_timer(action); te_action_confirmed(action, controld_globals.transition_graph); if (pcmk_is_set(action->flags, pcmk__graph_action_failed)) { abort_transition(action->synapse->priority + 1, pcmk__graph_restart, "Event failed", event); } } } if (id == NULL) { id = "unknown action"; } uname = crm_element_value(event, PCMK__META_ON_NODE); if (uname == NULL) { uname = "unknown node"; } if (status == PCMK_EXEC_INVALID) { // We couldn't attempt the action crm_info("Transition %d action %d (%s on %s): %s", transition_num, action_num, id, uname, pcmk_exec_status_str(status)); } else if (desc && update_failcount(event, event_node, rc, target_rc, (transition_num == -1), FALSE)) { crm_notice("Transition %d action %d (%s on %s): expected '%s' but got '%s' " QB_XS " target-rc=%d rc=%d call-id=%d event='%s'", transition_num, action_num, id, uname, services_ocf_exitcode_str(target_rc), services_ocf_exitcode_str(rc), target_rc, rc, callid, desc); } else if (desc) { crm_info("Transition %d action %d (%s on %s): %s " QB_XS " rc=%d target-rc=%d call-id=%d", transition_num, action_num, id, uname, desc, rc, target_rc, callid); } else if (rc == target_rc) { crm_info("Transition %d action %d (%s on %s) confirmed: %s " QB_XS " rc=%d call-id=%d", transition_num, action_num, id, uname, services_ocf_exitcode_str(rc), rc, callid); } else { update_failcount(event, event_node, rc, target_rc, (transition_num == -1), ignore_failures); crm_notice("Transition %d action %d (%s on %s): expected '%s' but got '%s' " QB_XS " target-rc=%d rc=%d call-id=%d", transition_num, action_num, id, uname, services_ocf_exitcode_str(target_rc), services_ocf_exitcode_str(rc), target_rc, rc, callid); } bail: free(update_te_uuid); } diff --git a/include/crm/cluster.h b/include/crm/cluster.h index 7a005c466d..9fa7492f15 100644 --- a/include/crm/cluster.h +++ b/include/crm/cluster.h @@ -1,101 +1,97 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef PCMK__CRM_CLUSTER__H # define PCMK__CRM_CLUSTER__H # include // uint32_t, uint64_t # include // gboolean, GHashTable # include // xmlNode # include # include #ifdef __cplusplus extern "C" { #endif #if SUPPORT_COROSYNC #include // cpg_callbacks_t #endif // @COMPAT Make this internal when we can break API backward compatibility //! \deprecated Do not use (public access will be removed in a future release) extern GHashTable *crm_peer_cache; -// @COMPAT Make this internal when we can break API backward compatibility -//! \deprecated Do not use (public access will be removed in a future release) -extern GHashTable *crm_remote_peer_cache; - //! \internal Do not use typedef struct pcmk__cluster_private pcmk__cluster_private_t; // Implementation of pcmk_cluster_t // @COMPAT Make contents internal when we can break API backward compatibility //!@{ //! \deprecated Do not use (public access will be removed in a future release) struct pcmk__cluster { /* @COMPAT Once all members are moved to pcmk__cluster_private_t, we can * make that the pcmk_cluster_t implementation and drop this struct * altogether, leaving pcmk_cluster_t as an opaque public type. */ //! \internal Do not use pcmk__cluster_private_t *priv; // NOTE: sbd (as of at least 1.5.2) uses this //! \deprecated Call pcmk_cluster_set_destroy_fn() to set this void (*destroy) (gpointer); #if SUPPORT_COROSYNC // NOTE: sbd (as of at least 1.5.2) uses this /*! * \deprecated Call pcmk_cpg_set_deliver_fn() and pcmk_cpg_set_confchg_fn() * to set these */ cpg_callbacks_t cpg; #endif // SUPPORT_COROSYNC }; //!@} //! Connection to a cluster layer typedef struct pcmk__cluster pcmk_cluster_t; int pcmk_cluster_connect(pcmk_cluster_t *cluster); int pcmk_cluster_disconnect(pcmk_cluster_t *cluster); pcmk_cluster_t *pcmk_cluster_new(void); void pcmk_cluster_free(pcmk_cluster_t *cluster); int pcmk_cluster_set_destroy_fn(pcmk_cluster_t *cluster, void (*fn)(gpointer)); #if SUPPORT_COROSYNC int pcmk_cpg_set_deliver_fn(pcmk_cluster_t *cluster, cpg_deliver_fn_t fn); int pcmk_cpg_set_confchg_fn(pcmk_cluster_t *cluster, cpg_confchg_fn_t fn); #endif // SUPPORT_COROSYNC /*! * \enum pcmk_cluster_layer * \brief Types of cluster layer */ enum pcmk_cluster_layer { pcmk_cluster_layer_unknown = 1, //!< Unknown cluster layer pcmk_cluster_layer_invalid = 2, //!< Invalid cluster layer pcmk_cluster_layer_corosync = 32, //!< Corosync Cluster Engine }; enum pcmk_cluster_layer pcmk_get_cluster_layer(void); const char *pcmk_cluster_layer_text(enum pcmk_cluster_layer layer); #ifdef __cplusplus } #endif #if !defined(PCMK_ALLOW_DEPRECATED) || (PCMK_ALLOW_DEPRECATED == 1) #include #endif #endif diff --git a/include/crm/cluster/internal.h b/include/crm/cluster/internal.h index a864cdcbdc..ad8ebed2a1 100644 --- a/include/crm/cluster/internal.h +++ b/include/crm/cluster/internal.h @@ -1,332 +1,334 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef PCMK__CRM_CLUSTER_INTERNAL__H #define PCMK__CRM_CLUSTER_INTERNAL__H #include #include // uint32_t, uint64_t #include // gboolean #include #if SUPPORT_COROSYNC #include // cpg_name, cpg_handle_t #endif #ifdef __cplusplus extern "C" { #endif /*! * \internal * \enum pcmk__cluster_msg * \brief Types of message sent via the cluster layer */ enum pcmk__cluster_msg { pcmk__cluster_msg_unknown, pcmk__cluster_msg_attrd, pcmk__cluster_msg_based, pcmk__cluster_msg_controld, pcmk__cluster_msg_execd, pcmk__cluster_msg_fenced, }; enum crm_proc_flag { /* @COMPAT When pcmk__node_status_t:processes is made internal, we can merge * this into node flags or turn it into a boolean. Until then, in theory * something could depend on these particular numeric values. */ crm_proc_none = 0x00000001, // Cluster layers crm_proc_cpg = 0x04000000, }; /*! * \internal * \enum pcmk__node_status_flags * \brief Boolean flags for a \c pcmk__node_status_t object * * Some flags may not be related to status specifically. However, we keep these * separate from enum pcmk__node_flags because they're used with * different object types. */ enum pcmk__node_status_flags { /*! * Node is a Pacemaker Remote node and should not be considered for cluster * membership */ pcmk__node_status_remote = (UINT32_C(1) << 0), //! Node's cache entry is dirty pcmk__node_status_dirty = (UINT32_C(1) << 1), }; // Used with node cache search functions enum pcmk__node_search_flags { //! Does not affect search pcmk__node_search_none = 0, //! Search for cluster nodes from membership cache pcmk__node_search_cluster_member = (1 << 0), //! Search for remote nodes pcmk__node_search_remote = (1 << 1), //! Search for cluster member nodes and remote nodes pcmk__node_search_any = pcmk__node_search_cluster_member |pcmk__node_search_remote, //! Search for cluster nodes from CIB (as of last cache refresh) pcmk__node_search_cluster_cib = (1 << 2), }; /*! * \internal * \enum pcmk__node_update * \brief Type of update to a \c pcmk__node_status_t object */ enum pcmk__node_update { pcmk__node_update_name, //!< Node name updated pcmk__node_update_state, //!< Node connection state updated pcmk__node_update_processes, //!< Node process group membership updated }; //! Implementation of pcmk__cluster_private_t struct pcmk__cluster_private { // @TODO Drop and replace with per-daemon cluster-layer ID global variables? uint32_t node_id; //!< Local node ID at cluster layer // @TODO Drop and replace with per-daemon node name global variables? char *node_name; //!< Local node name at cluster layer #if SUPPORT_COROSYNC /* @TODO Make these members a separate struct and use void *cluster_data * here instead, to abstract the cluster layer further. */ struct cpg_name group; //!< Corosync CPG name cpg_handle_t cpg_handle; //!< Corosync CPG handle #endif // SUPPORT_COROSYNC }; //! Node status data (may be a cluster node or a Pacemaker Remote node) typedef struct pcmk__node_status { //! Node name as known to cluster layer, or Pacemaker Remote node name char *name; /* @COMPAT This is less than ideal since the value is not a valid XML ID * (for Corosync, it's the string equivalent of the node's numeric node ID, * but XML IDs can't start with a number) and the three elements should have * different IDs. * * Ideally, we would use something like node-NODEID, node_state-NODEID, and * transient_attributes-NODEID as the element IDs. Unfortunately changing it * would be impractical due to backward compatibility; older nodes in a * rolling upgrade will always write and expect the value in the old format. */ /*! * Value of the PCMK_XA_ID XML attribute to use with the node's * PCMK_XE_NODE, PCMK_XE_NODE_STATE, and PCMK_XE_TRANSIENT_ATTRIBUTES * XML elements in the CIB */ char *xml_id; char *state; // @TODO change to enum //! Group of enum pcmk__node_status_flags uint32_t flags; /*! * Most recent cluster membership in which node was seen (0 for Pacemaker * Remote nodes) */ uint64_t membership_id; uint32_t processes; // @TODO most not needed, merge into flags /* @TODO When we can break public API compatibility, we can make the rest of * these members separate structs and use void *cluster_data and * void *user_data here instead, to abstract the cluster layer further. */ //! Arbitrary data (must be freeable by \c free()) void *user_data; char *expected; time_t peer_lost; char *conn_host; time_t when_member; // Since when node has been a cluster member time_t when_online; // Since when peer has been online in CPG /* @TODO The following are currently needed only by the Corosync stack. * Eventually consider moving them to a cluster-layer-specific data object. */ uint32_t cluster_layer_id; //!< Cluster-layer numeric node ID time_t when_lost; //!< When CPG membership was last lost } pcmk__node_status_t; /*! * \internal * \brief Return the process bit corresponding to the current cluster stack * * \return Process flag if detectable, otherwise 0 */ static inline uint32_t crm_get_cluster_proc(void) { switch (pcmk_get_cluster_layer()) { case pcmk_cluster_layer_corosync: return crm_proc_cpg; default: break; } return crm_proc_none; } /*! * \internal * \brief Get log-friendly string description of a Corosync return code * * \param[in] error Corosync return code * * \return Log-friendly string description corresponding to \p error */ static inline const char * pcmk__cs_err_str(int error) { # if SUPPORT_COROSYNC switch (error) { case CS_OK: return "OK"; case CS_ERR_LIBRARY: return "Library error"; case CS_ERR_VERSION: return "Version error"; case CS_ERR_INIT: return "Initialization error"; case CS_ERR_TIMEOUT: return "Timeout"; case CS_ERR_TRY_AGAIN: return "Try again"; case CS_ERR_INVALID_PARAM: return "Invalid parameter"; case CS_ERR_NO_MEMORY: return "No memory"; case CS_ERR_BAD_HANDLE: return "Bad handle"; case CS_ERR_BUSY: return "Busy"; case CS_ERR_ACCESS: return "Access error"; case CS_ERR_NOT_EXIST: return "Doesn't exist"; case CS_ERR_NAME_TOO_LONG: return "Name too long"; case CS_ERR_EXIST: return "Exists"; case CS_ERR_NO_SPACE: return "No space"; case CS_ERR_INTERRUPT: return "Interrupt"; case CS_ERR_NAME_NOT_FOUND: return "Name not found"; case CS_ERR_NO_RESOURCES: return "No resources"; case CS_ERR_NOT_SUPPORTED: return "Not supported"; case CS_ERR_BAD_OPERATION: return "Bad operation"; case CS_ERR_FAILED_OPERATION: return "Failed operation"; case CS_ERR_MESSAGE_ERROR: return "Message error"; case CS_ERR_QUEUE_FULL: return "Queue full"; case CS_ERR_QUEUE_NOT_AVAILABLE: return "Queue not available"; case CS_ERR_BAD_FLAGS: return "Bad flags"; case CS_ERR_TOO_BIG: return "Too big"; case CS_ERR_NO_SECTIONS: return "No sections"; } # endif return "Corosync error"; } # if SUPPORT_COROSYNC #if 0 /* This is the new way to do it, but we still support all Corosync 2 versions, * and this isn't always available. A better alternative here would be to check * for support in the configure script and enable this conditionally. */ #define pcmk__init_cmap(handle) cmap_initialize_map((handle), CMAP_MAP_ICMAP) #else #define pcmk__init_cmap(handle) cmap_initialize(handle) #endif char *pcmk__corosync_cluster_name(void); bool pcmk__corosync_add_nodes(xmlNode *xml_parent); void pcmk__cpg_confchg_cb(cpg_handle_t handle, const struct cpg_name *group_name, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries); char *pcmk__cpg_message_data(cpg_handle_t handle, uint32_t sender_id, uint32_t pid, void *content, const char **from); # endif const char *pcmk__cluster_node_uuid(pcmk__node_status_t *node); char *pcmk__cluster_node_name(uint32_t nodeid); const char *pcmk__cluster_local_node_name(void); const char *pcmk__node_name_from_uuid(const char *uuid); pcmk__node_status_t *crm_update_peer_proc(const char *source, pcmk__node_status_t *peer, uint32_t flag, const char *status); pcmk__node_status_t *pcmk__update_peer_state(const char *source, pcmk__node_status_t *node, const char *state, uint64_t membership); void pcmk__update_peer_expected(const char *source, pcmk__node_status_t *node, const char *expected); void pcmk__reap_unseen_nodes(uint64_t ring_id); void pcmk__corosync_quorum_connect(gboolean (*dispatch)(unsigned long long, gboolean), void (*destroy) (gpointer)); enum pcmk__cluster_msg pcmk__cluster_parse_msg_type(const char *text); bool pcmk__cluster_send_message(const pcmk__node_status_t *node, enum pcmk__cluster_msg service, const xmlNode *data); // Membership +extern GHashTable *pcmk__remote_peer_cache; + bool pcmk__cluster_has_quorum(void); void pcmk__cluster_init_node_caches(void); void pcmk__cluster_destroy_node_caches(void); void pcmk__cluster_set_autoreap(bool enable); void pcmk__cluster_set_status_callback(void (*dispatch)(enum pcmk__node_update, pcmk__node_status_t *, const void *)); bool pcmk__cluster_is_node_active(const pcmk__node_status_t *node); unsigned int pcmk__cluster_num_active_nodes(void); unsigned int pcmk__cluster_num_remote_nodes(void); pcmk__node_status_t *pcmk__cluster_lookup_remote_node(const char *node_name); void pcmk__cluster_forget_cluster_node(uint32_t id, const char *node_name); void pcmk__cluster_forget_remote_node(const char *node_name); pcmk__node_status_t *pcmk__search_node_caches(unsigned int id, const char *uname, uint32_t flags); void pcmk__purge_node_from_cache(const char *node_name, uint32_t node_id); void pcmk__refresh_node_caches_from_cib(xmlNode *cib); pcmk__node_status_t *pcmk__get_node(unsigned int id, const char *uname, const char *uuid, uint32_t flags); #ifdef __cplusplus } #endif #endif // PCMK__CRM_CLUSTER_INTERNAL__H diff --git a/lib/cluster/cluster.c b/lib/cluster/cluster.c index 313450e88d..e9e85d6e6f 100644 --- a/lib/cluster/cluster.c +++ b/lib/cluster/cluster.c @@ -1,488 +1,488 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #include #include #include // PRIu32 #include #include #include #include #include #include #include #include #include // uname() #include // gboolean #include #include #include #include #include "crmcluster_private.h" CRM_TRACE_INIT_DATA(cluster); /*! * \internal * \brief Get the message type equivalent of a string * * \param[in] text String of message type * * \return Message type equivalent of \p text */ enum pcmk__cluster_msg pcmk__cluster_parse_msg_type(const char *text) { CRM_CHECK(text != NULL, return pcmk__cluster_msg_unknown); text = pcmk__message_name(text); if (pcmk__str_eq(text, "attrd", pcmk__str_none)) { return pcmk__cluster_msg_attrd; } else if (pcmk__str_eq(text, CRM_SYSTEM_CIB, pcmk__str_none)) { return pcmk__cluster_msg_based; } else if (pcmk__str_any_of(text, CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL)) { return pcmk__cluster_msg_controld; } else if (pcmk__str_eq(text, CRM_SYSTEM_LRMD, pcmk__str_none)) { return pcmk__cluster_msg_execd; } else if (pcmk__str_eq(text, "stonith-ng", pcmk__str_none)) { return pcmk__cluster_msg_fenced; } else { return pcmk__cluster_msg_unknown; } } /*! * \internal * \brief Get a node's cluster-layer UUID, setting it if not already set * * \param[in,out] node Node to check * * \return Cluster-layer node UUID of \p node, or \c NULL if unknown */ const char * pcmk__cluster_node_uuid(pcmk__node_status_t *node) { const enum pcmk_cluster_layer cluster_layer = pcmk_get_cluster_layer(); if (node == NULL) { return NULL; } if (node->xml_id != NULL) { return node->xml_id; } switch (cluster_layer) { #if SUPPORT_COROSYNC case pcmk_cluster_layer_corosync: node->xml_id = pcmk__corosync_uuid(node); return node->xml_id; #endif // SUPPORT_COROSYNC default: crm_err("Unsupported cluster layer %s", pcmk_cluster_layer_text(cluster_layer)); return NULL; } } /*! * \internal * \brief Connect to the cluster layer * * \param[in,out] cluster Initialized cluster object to connect * * \return Standard Pacemaker return code */ int pcmk_cluster_connect(pcmk_cluster_t *cluster) { const enum pcmk_cluster_layer cluster_layer = pcmk_get_cluster_layer(); const char *cluster_layer_s = pcmk_cluster_layer_text(cluster_layer); // cts-lab looks for this message crm_notice("Connecting to %s cluster layer", cluster_layer_s); switch (cluster_layer) { #if SUPPORT_COROSYNC case pcmk_cluster_layer_corosync: return pcmk__corosync_connect(cluster); #endif // SUPPORT_COROSYNC default: break; } crm_err("Failed to connect to unsupported cluster layer %s", cluster_layer_s); return EPROTONOSUPPORT; } /*! * \brief Disconnect from the cluster layer * * \param[in,out] cluster Cluster object to disconnect * * \return Standard Pacemaker return code */ int pcmk_cluster_disconnect(pcmk_cluster_t *cluster) { const enum pcmk_cluster_layer cluster_layer = pcmk_get_cluster_layer(); const char *cluster_layer_s = pcmk_cluster_layer_text(cluster_layer); crm_info("Disconnecting from %s cluster layer", cluster_layer_s); switch (cluster_layer) { #if SUPPORT_COROSYNC case pcmk_cluster_layer_corosync: pcmk__corosync_disconnect(cluster); pcmk__cluster_destroy_node_caches(); return pcmk_rc_ok; #endif // SUPPORT_COROSYNC default: break; } crm_err("Failed to disconnect from unsupported cluster layer %s", cluster_layer_s); return EPROTONOSUPPORT; } /*! * \brief Allocate a new \p pcmk_cluster_t object * * \return A newly allocated \p pcmk_cluster_t object (guaranteed not \c NULL) * \note The caller is responsible for freeing the return value using * \p pcmk_cluster_free(). */ pcmk_cluster_t * pcmk_cluster_new(void) { pcmk_cluster_t *cluster = pcmk__assert_alloc(1, sizeof(pcmk_cluster_t)); cluster->priv = pcmk__assert_alloc(1, sizeof(pcmk__cluster_private_t)); return cluster; } /*! * \brief Free a \p pcmk_cluster_t object and its dynamically allocated members * * \param[in,out] cluster Cluster object to free */ void pcmk_cluster_free(pcmk_cluster_t *cluster) { if (cluster == NULL) { return; } free(cluster->priv->node_name); free(cluster->priv); free(cluster); } /*! * \brief Set the destroy function for a cluster object * * \param[in,out] cluster Cluster object * \param[in] fn Destroy function to set * * \return Standard Pacemaker return code */ int pcmk_cluster_set_destroy_fn(pcmk_cluster_t *cluster, void (*fn)(gpointer)) { if (cluster == NULL) { return EINVAL; } cluster->destroy = fn; return pcmk_rc_ok; } /*! * \internal * \brief Send an XML message via the cluster messaging layer * * \param[in] node Cluster node to send message to * \param[in] service Message type to use in message host info * \param[in] data XML message to send * * \return \c true on success, or \c false otherwise */ bool pcmk__cluster_send_message(const pcmk__node_status_t *node, enum pcmk__cluster_msg service, const xmlNode *data) { // @TODO Return standard Pacemaker return code switch (pcmk_get_cluster_layer()) { #if SUPPORT_COROSYNC case pcmk_cluster_layer_corosync: return pcmk__cpg_send_xml(data, node, service); #endif // SUPPORT_COROSYNC default: break; } return false; } /*! * \internal * \brief Get the node name corresponding to a cluster-layer node ID * * Get the node name from the cluster layer if possible. Otherwise, if for the * local node, call \c uname() and get the \c nodename member from the * struct utsname object. * * \param[in] nodeid Node ID to check (or 0 for the local node) * * \return Node name corresponding to \p nodeid * * \note This will fatally exit if \c uname() fails to get the local node name * or we run out of memory. * \note The caller is responsible for freeing the return value using \c free(). */ char * pcmk__cluster_node_name(uint32_t nodeid) { const enum pcmk_cluster_layer cluster_layer = pcmk_get_cluster_layer(); const char *cluster_layer_s = pcmk_cluster_layer_text(cluster_layer); switch (cluster_layer) { #if SUPPORT_COROSYNC case pcmk_cluster_layer_corosync: return pcmk__corosync_name(0, nodeid); #else break; #endif // SUPPORT_COROSYNC default: crm_err("Unsupported cluster layer: %s", cluster_layer_s); break; } if (nodeid == 0) { struct utsname hostinfo; crm_notice("Could not get local node name from %s cluster layer, " "defaulting to local hostname", cluster_layer_s); if (uname(&hostinfo) < 0) { // @TODO Maybe let the caller decide what to do crm_err("Failed to get the local hostname"); crm_exit(CRM_EX_FATAL); } return pcmk__str_copy(hostinfo.nodename); } crm_notice("Could not obtain a node name for node with " PCMK_XA_ID "=" PRIu32, nodeid); return NULL; } /*! * \internal * \brief Get the local node's cluster-layer node name * * If getting the node name from the cluster layer is impossible, call * \c uname() and get the \c nodename member from the struct utsname * object. * * \return Local node's name * * \note This will fatally exit if \c uname() fails to get the local node name * or we run out of memory. */ const char * pcmk__cluster_local_node_name(void) { // @TODO Refactor to avoid trivially leaking name at exit static char *name = NULL; if (name == NULL) { name = pcmk__cluster_node_name(0); } return name; } /*! * \internal * \brief Get the node name corresonding to a node UUID * * Look for the UUID in both the remote node cache and the cluster member cache. * * \param[in] uuid UUID to search for * * \return Node name corresponding to \p uuid if found, or \c NULL otherwise */ const char * pcmk__node_name_from_uuid(const char *uuid) { /* @TODO There are too many functions in libcrmcluster that look up a node * from the node caches (possibly creating a cache entry if none exists). * There are at least the following: * * pcmk__cluster_lookup_remote_node() * * pcmk__get_node() * * pcmk__node_name_from_uuid() * * pcmk__search_node_caches() * * There's a lot of duplication among them, but they all do slightly * different things. We should try to clean them up and consolidate them to * the extent possible, likely with new helper functions. */ GHashTableIter iter; pcmk__node_status_t *node = NULL; CRM_CHECK(uuid != NULL, return NULL); // Remote nodes have the same uname and uuid - if (g_hash_table_lookup(crm_remote_peer_cache, uuid)) { + if (g_hash_table_lookup(pcmk__remote_peer_cache, uuid)) { return uuid; } g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (pcmk__str_eq(node->xml_id, uuid, pcmk__str_casei)) { return node->name; } } return NULL; } /*! * \brief Get a log-friendly string equivalent of a cluster layer * * \param[in] layer Cluster layer * * \return Log-friendly string corresponding to \p layer */ const char * pcmk_cluster_layer_text(enum pcmk_cluster_layer layer) { switch (layer) { case pcmk_cluster_layer_corosync: return "corosync"; case pcmk_cluster_layer_unknown: return "unknown"; case pcmk_cluster_layer_invalid: return "invalid"; default: crm_err("Invalid cluster layer: %d", layer); return "invalid"; } } /*! * \brief Get and validate the local cluster layer * * If a cluster layer is not configured via the \c PCMK__ENV_CLUSTER_TYPE local * option, this will try to detect an active cluster from among the supported * cluster layers. * * \return Local cluster layer * * \note This will fatally exit if the configured cluster layer is invalid. */ enum pcmk_cluster_layer pcmk_get_cluster_layer(void) { static enum pcmk_cluster_layer cluster_layer = pcmk_cluster_layer_unknown; const char *cluster = NULL; // Cluster layer is stable once set if (cluster_layer != pcmk_cluster_layer_unknown) { return cluster_layer; } cluster = pcmk__env_option(PCMK__ENV_CLUSTER_TYPE); if (cluster != NULL) { crm_info("Verifying configured cluster layer '%s'", cluster); cluster_layer = pcmk_cluster_layer_invalid; #if SUPPORT_COROSYNC if (pcmk__str_eq(cluster, PCMK_VALUE_COROSYNC, pcmk__str_casei)) { cluster_layer = pcmk_cluster_layer_corosync; } #endif // SUPPORT_COROSYNC if (cluster_layer == pcmk_cluster_layer_invalid) { crm_notice("This installation does not support the '%s' cluster " "infrastructure: terminating", cluster); crm_exit(CRM_EX_FATAL); } crm_info("Assuming an active '%s' cluster", cluster); } else { // Nothing configured, so test supported cluster layers #if SUPPORT_COROSYNC crm_debug("Testing with Corosync"); if (pcmk__corosync_is_active()) { cluster_layer = pcmk_cluster_layer_corosync; } #endif // SUPPORT_COROSYNC if (cluster_layer == pcmk_cluster_layer_unknown) { crm_notice("Could not determine the current cluster layer"); } else { crm_info("Detected an active '%s' cluster", pcmk_cluster_layer_text(cluster_layer)); } } return cluster_layer; } // Deprecated functions kept only for backward API compatibility // LCOV_EXCL_START #include gboolean crm_cluster_connect(pcmk_cluster_t *cluster) { return pcmk_cluster_connect(cluster) == pcmk_rc_ok; } const char * name_for_cluster_type(enum cluster_type_e type) { switch (type) { case pcmk_cluster_corosync: return "corosync"; case pcmk_cluster_unknown: return "unknown"; case pcmk_cluster_invalid: return "invalid"; } crm_err("Invalid cluster type: %d", type); return "invalid"; } enum cluster_type_e get_cluster_type(void) { return (enum cluster_type_e) pcmk_get_cluster_layer(); } // LCOV_EXCL_STOP // End deprecated API diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c index 506aa79df5..e62ac7a440 100644 --- a/lib/cluster/membership.c +++ b/lib/cluster/membership.c @@ -1,1505 +1,1506 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #include #ifndef _GNU_SOURCE # define _GNU_SOURCE #endif #include // PRIu32 #include // bool #include #include #include #include #include #include #include #include #include #include #include #include "crmcluster_private.h" /* The peer cache remembers cluster nodes that have been seen. * This is managed mostly automatically by libcluster, based on * cluster membership events. * * Because cluster nodes can have conflicting names or UUIDs, * the hash table key is a uniquely generated ID. * * @COMPAT When this is internal, rename to cluster_node_member_cache and make * static. */ GHashTable *crm_peer_cache = NULL; -/* - * The remote peer cache tracks pacemaker_remote nodes. While the +/* The remote peer cache tracks pacemaker_remote nodes. While the * value has the same type as the peer cache's, it is tracked separately for * three reasons: pacemaker_remote nodes can't have conflicting names or UUIDs, * so the name (which is also the UUID) is used as the hash table key; there * is no equivalent of membership events, so management is not automatic; and * most users of the peer cache need to exclude pacemaker_remote nodes. * - * That said, using a single cache would be more logical and less error-prone, - * so it would be a good idea to merge them one day. + * @TODO That said, using a single cache would be more logical and less + * error-prone, so it would be a good idea to merge them one day. * - * libcluster provides two avenues for populating the cache: + * libcrmcluster provides two avenues for populating the cache: * pcmk__cluster_lookup_remote_node() and pcmk__cluster_forget_remote_node() * directly manage it, while refresh_remote_nodes() populates it via the CIB. + * + * @TODO Move caches to pcmk_cluster_t */ -GHashTable *crm_remote_peer_cache = NULL; +GHashTable *pcmk__remote_peer_cache = NULL; /* * The CIB cluster node cache tracks cluster nodes that have been seen in * the CIB. It is useful mainly when a caller needs to know about a node that * may no longer be in the membership, but doesn't want to add the node to the * main peer cache tables. */ static GHashTable *cluster_node_cib_cache = NULL; static bool autoreap = true; static bool has_quorum = false; // Flag setting and clearing for pcmk__node_status_t:flags #define set_peer_flags(peer, flags_to_set) do { \ (peer)->flags = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \ "Peer", (peer)->name, \ (peer)->flags, (flags_to_set), \ #flags_to_set); \ } while (0) #define clear_peer_flags(peer, flags_to_clear) do { \ (peer)->flags = pcmk__clear_flags_as(__func__, __LINE__, \ LOG_TRACE, \ "Peer", (peer)->name, \ (peer)->flags, (flags_to_clear), \ #flags_to_clear); \ } while (0) static void update_peer_uname(pcmk__node_status_t *node, const char *uname); static pcmk__node_status_t *find_cib_cluster_node(const char *id, const char *uname); /*! * \internal * \brief Check whether the cluster currently has quorum * * \return \c true if the cluster has quorum, or \c false otherwise */ bool pcmk__cluster_has_quorum(void) { return has_quorum; } /*! * \internal * \brief Set whether the cluster currently has quorum * * \param[in] quorate \c true if the cluster has quorum, or \c false otherwise */ void pcmk__cluster_set_quorum(bool quorate) { has_quorum = quorate; } /*! * \internal * \brief Get the number of Pacemaker Remote nodes that have been seen * * \return Number of cached Pacemaker Remote nodes */ unsigned int pcmk__cluster_num_remote_nodes(void) { - if (crm_remote_peer_cache == NULL) { + if (pcmk__remote_peer_cache == NULL) { return 0U; } - return g_hash_table_size(crm_remote_peer_cache); + return g_hash_table_size(pcmk__remote_peer_cache); } /*! * \internal * \brief Get a remote node cache entry, creating it if necessary * * \param[in] node_name Name of remote node * * \return Cache entry for node on success, or \c NULL (and set \c errno) * otherwise * * \note When creating a new entry, this will leave the node state undetermined. * The caller should also call \c pcmk__update_peer_state() if the state * is known. * \note Because this can add and remove cache entries, callers should not * assume any previously obtained cache entry pointers remain valid. */ pcmk__node_status_t * pcmk__cluster_lookup_remote_node(const char *node_name) { pcmk__node_status_t *node = NULL; char *node_name_copy = NULL; if (node_name == NULL) { errno = EINVAL; return NULL; } /* It's theoretically possible that the node was added to the cluster peer * cache before it was known to be a Pacemaker Remote node. Remove that * entry unless it has a node ID, which means the name actually is * associated with a cluster node. (@TODO return an error in that case?) */ node = pcmk__search_node_caches(0, node_name, pcmk__node_search_cluster_member); if ((node != NULL) && (node->xml_id == NULL)) { /* node_name could be a pointer into the cache entry being removed, so * reassign it to a copy before the original gets freed */ node_name_copy = strdup(node_name); if (node_name_copy == NULL) { errno = ENOMEM; return NULL; } node_name = node_name_copy; pcmk__cluster_forget_cluster_node(0, node_name); } /* Return existing cache entry if one exists */ - node = g_hash_table_lookup(crm_remote_peer_cache, node_name); + node = g_hash_table_lookup(pcmk__remote_peer_cache, node_name); if (node) { free(node_name_copy); return node; } /* Allocate a new entry */ node = calloc(1, sizeof(pcmk__node_status_t)); if (node == NULL) { free(node_name_copy); return NULL; } /* Populate the essential information */ set_peer_flags(node, pcmk__node_status_remote); node->xml_id = strdup(node_name); if (node->xml_id == NULL) { free(node); errno = ENOMEM; free(node_name_copy); return NULL; } /* Add the new entry to the cache */ - g_hash_table_replace(crm_remote_peer_cache, node->xml_id, node); + g_hash_table_replace(pcmk__remote_peer_cache, node->xml_id, node); crm_trace("added %s to remote cache", node_name); /* Update the entry's uname, ensuring peer status callbacks are called */ update_peer_uname(node, node_name); free(node_name_copy); return node; } /*! * \internal * \brief Remove a node from the Pacemaker Remote node cache * * \param[in] node_name Name of node to remove from cache * * \note The caller must be careful not to use \p node_name after calling this * function if it might be a pointer into the cache entry being removed. */ void pcmk__cluster_forget_remote_node(const char *node_name) { /* Do a lookup first, because node_name could be a pointer within the entry * being removed -- we can't log it *after* removing it. */ - if (g_hash_table_lookup(crm_remote_peer_cache, node_name) != NULL) { + if (g_hash_table_lookup(pcmk__remote_peer_cache, node_name) != NULL) { crm_trace("Removing %s from Pacemaker Remote node cache", node_name); - g_hash_table_remove(crm_remote_peer_cache, node_name); + g_hash_table_remove(pcmk__remote_peer_cache, node_name); } } /*! * \internal * \brief Return node status based on a CIB status entry * * \param[in] node_state XML of node state * * \return \c PCMK_VALUE_MEMBER if \c PCMK__XA_IN_CCM is true in * \c PCMK__XE_NODE_STATE, or \c PCMK__VALUE_LOST otherwise */ static const char * remote_state_from_cib(const xmlNode *node_state) { bool in_ccm = false; if ((pcmk__xe_get_bool_attr(node_state, PCMK__XA_IN_CCM, &in_ccm) == pcmk_rc_ok) && in_ccm) { return PCMK_VALUE_MEMBER; } return PCMK__VALUE_LOST; } /* user data for looping through remote node xpath searches */ struct refresh_data { const char *field; /* XML attribute to check for node name */ gboolean has_state; /* whether to update node state based on XML */ }; /*! * \internal * \brief Process one pacemaker_remote node xpath search result * * \param[in] result XML search result * \param[in] user_data what to look for in the XML */ static void remote_cache_refresh_helper(xmlNode *result, void *user_data) { const struct refresh_data *data = user_data; const char *remote = crm_element_value(result, data->field); const char *state = NULL; pcmk__node_status_t *node; CRM_CHECK(remote != NULL, return); /* Determine node's state, if the result has it */ if (data->has_state) { state = remote_state_from_cib(result); } /* Check whether cache already has entry for node */ - node = g_hash_table_lookup(crm_remote_peer_cache, remote); + node = g_hash_table_lookup(pcmk__remote_peer_cache, remote); if (node == NULL) { /* Node is not in cache, so add a new entry for it */ node = pcmk__cluster_lookup_remote_node(remote); CRM_ASSERT(node); if (state) { pcmk__update_peer_state(__func__, node, state, 0); } } else if (pcmk_is_set(node->flags, pcmk__node_status_dirty)) { /* Node is in cache and hasn't been updated already, so mark it clean */ clear_peer_flags(node, pcmk__node_status_dirty); if (state) { pcmk__update_peer_state(__func__, node, state, 0); } } } static void mark_dirty(gpointer key, gpointer value, gpointer user_data) { set_peer_flags((pcmk__node_status_t *) value, pcmk__node_status_dirty); } static gboolean is_dirty(gpointer key, gpointer value, gpointer user_data) { const pcmk__node_status_t *node = value; return pcmk_is_set(node->flags, pcmk__node_status_dirty); } /*! * \internal * \brief Repopulate the remote node cache based on CIB XML * * \param[in] cib CIB XML to parse */ static void refresh_remote_nodes(xmlNode *cib) { struct refresh_data data; pcmk__cluster_init_node_caches(); /* First, we mark all existing cache entries as dirty, * so that later we can remove any that weren't in the CIB. * We don't empty the cache, because we need to detect changes in state. */ - g_hash_table_foreach(crm_remote_peer_cache, mark_dirty, NULL); + g_hash_table_foreach(pcmk__remote_peer_cache, mark_dirty, NULL); /* Look for guest nodes and remote nodes in the status section */ data.field = PCMK_XA_ID; data.has_state = TRUE; crm_foreach_xpath_result(cib, PCMK__XP_REMOTE_NODE_STATUS, remote_cache_refresh_helper, &data); /* Look for guest nodes and remote nodes in the configuration section, * because they may have just been added and not have a status entry yet. * In that case, the cached node state will be left NULL, so that the * peer status callback isn't called until we're sure the node started * successfully. */ data.field = PCMK_XA_VALUE; data.has_state = FALSE; crm_foreach_xpath_result(cib, PCMK__XP_GUEST_NODE_CONFIG, remote_cache_refresh_helper, &data); data.field = PCMK_XA_ID; data.has_state = FALSE; crm_foreach_xpath_result(cib, PCMK__XP_REMOTE_NODE_CONFIG, remote_cache_refresh_helper, &data); /* Remove all old cache entries that weren't seen in the CIB */ - g_hash_table_foreach_remove(crm_remote_peer_cache, is_dirty, NULL); + g_hash_table_foreach_remove(pcmk__remote_peer_cache, is_dirty, NULL); } /*! * \internal * \brief Check whether a node is an active cluster node * * Remote nodes are never considered active. This guarantees that they can never * become DC. * * \param[in] node Node to check * * \return \c true if the node is an active cluster node, or \c false otherwise */ bool pcmk__cluster_is_node_active(const pcmk__node_status_t *node) { const enum pcmk_cluster_layer cluster_layer = pcmk_get_cluster_layer(); if ((node == NULL) || pcmk_is_set(node->flags, pcmk__node_status_remote)) { return false; } switch (cluster_layer) { case pcmk_cluster_layer_corosync: #if SUPPORT_COROSYNC return pcmk__corosync_is_peer_active(node); #else break; #endif // SUPPORT_COROSYNC default: break; } crm_err("Unhandled cluster layer: %s", pcmk_cluster_layer_text(cluster_layer)); return false; } /*! * \internal * \brief Check if a node's entry should be removed from the cluster node cache * * A node should be removed from the cache if it's inactive and matches another * \c pcmk__node_status_t (the search object). The node is considered a * mismatch if any of the following are true: * * The search object is \c NULL. * * The search object has an ID set and the cached node's ID does not match it. * * The search object does not have an ID set, and the cached node's name does * not match the search node's name. (If both names are \c NULL, it's a * match.) * * Otherwise, the node is considered a match. * * Note that if the search object has both an ID and a name set, the name is * ignored for matching purposes. * * \param[in] key Ignored * \param[in] value \c pcmk__node_status_t object from cluster node cache * \param[in] user_data \c pcmk__node_status_t object to match against (search * object) * * \return \c TRUE if the node entry should be removed from \c crm_peer_cache, * or \c FALSE otherwise */ static gboolean should_forget_cluster_node(gpointer key, gpointer value, gpointer user_data) { pcmk__node_status_t *node = value; pcmk__node_status_t *search = user_data; if (search == NULL) { return FALSE; } if ((search->cluster_layer_id != 0) && (node->cluster_layer_id != search->cluster_layer_id)) { return FALSE; } if ((search->cluster_layer_id == 0) && !pcmk__str_eq(node->name, search->name, pcmk__str_casei)) { // @TODO Consider name even if ID is set? return FALSE; } if (pcmk__cluster_is_node_active(value)) { return FALSE; } crm_info("Removing node with name %s and cluster layer ID " PRIu32 " from membership cache", pcmk__s(node->name, "(unknown)"), node->cluster_layer_id); return TRUE; } /*! * \internal * \brief Remove one or more inactive nodes from the cluster node cache * * All inactive nodes matching \p id and \p node_name as described in * \c should_forget_cluster_node documentation are removed from the cache. * * If \p id is 0 and \p node_name is \c NULL, all inactive nodes are removed * from the cache regardless of ID and name. This differs from clearing the * cache, in that entries for active nodes are preserved. * * \param[in] id ID of node to remove from cache (0 to ignore) * \param[in] node_name Name of node to remove from cache (ignored if \p id is * nonzero) * * \note \p node_name is not modified directly, but it will be freed if it's a * pointer into a cache entry that is removed. */ void pcmk__cluster_forget_cluster_node(uint32_t id, const char *node_name) { pcmk__node_status_t search = { 0, }; char *criterion = NULL; // For logging guint matches = 0; if (crm_peer_cache == NULL) { crm_trace("Membership cache not initialized, ignoring removal request"); return; } search.cluster_layer_id = id; search.name = pcmk__str_copy(node_name); // May log after original freed if (id > 0) { criterion = crm_strdup_printf("cluster layer ID %" PRIu32, id); } else if (node_name != NULL) { criterion = crm_strdup_printf("name %s", node_name); } matches = g_hash_table_foreach_remove(crm_peer_cache, should_forget_cluster_node, &search); if (matches > 0) { if (criterion != NULL) { crm_notice("Removed %u inactive node%s with %s from the membership " "cache", matches, pcmk__plural_s(matches), criterion); } else { crm_notice("Removed all (%u) inactive cluster nodes from the " "membership cache", matches); } } else { crm_info("No inactive cluster nodes%s%s to remove from the membership " "cache", ((criterion != NULL)? " with " : ""), pcmk__s(criterion, "")); } free(search.name); free(criterion); } static void count_peer(gpointer key, gpointer value, gpointer user_data) { unsigned int *count = user_data; pcmk__node_status_t *node = value; if (pcmk__cluster_is_node_active(node)) { *count = *count + 1; } } /*! * \internal * \brief Get the number of active cluster nodes that have been seen * * Remote nodes are never considered active. This guarantees that they can never * become DC. * * \return Number of active nodes in the cluster node cache */ unsigned int pcmk__cluster_num_active_nodes(void) { unsigned int count = 0; if (crm_peer_cache != NULL) { g_hash_table_foreach(crm_peer_cache, count_peer, &count); } return count; } static void destroy_crm_node(gpointer data) { pcmk__node_status_t *node = data; crm_trace("Destroying entry for node %" PRIu32 ": %s", node->cluster_layer_id, node->name); free(node->name); free(node->state); free(node->xml_id); free(node->user_data); free(node->expected); free(node->conn_host); free(node); } /*! * \internal * \brief Initialize node caches */ void pcmk__cluster_init_node_caches(void) { if (crm_peer_cache == NULL) { crm_peer_cache = pcmk__strikey_table(free, destroy_crm_node); } - if (crm_remote_peer_cache == NULL) { - crm_remote_peer_cache = pcmk__strikey_table(NULL, destroy_crm_node); + if (pcmk__remote_peer_cache == NULL) { + pcmk__remote_peer_cache = pcmk__strikey_table(NULL, destroy_crm_node); } if (cluster_node_cib_cache == NULL) { cluster_node_cib_cache = pcmk__strikey_table(free, destroy_crm_node); } } /*! * \internal * \brief Initialize node caches */ void pcmk__cluster_destroy_node_caches(void) { if (crm_peer_cache != NULL) { crm_trace("Destroying peer cache with %d members", g_hash_table_size(crm_peer_cache)); g_hash_table_destroy(crm_peer_cache); crm_peer_cache = NULL; } - if (crm_remote_peer_cache != NULL) { + if (pcmk__remote_peer_cache != NULL) { crm_trace("Destroying remote peer cache with %d members", pcmk__cluster_num_remote_nodes()); - g_hash_table_destroy(crm_remote_peer_cache); - crm_remote_peer_cache = NULL; + g_hash_table_destroy(pcmk__remote_peer_cache); + pcmk__remote_peer_cache = NULL; } if (cluster_node_cib_cache != NULL) { crm_trace("Destroying configured cluster node cache with %d members", g_hash_table_size(cluster_node_cib_cache)); g_hash_table_destroy(cluster_node_cib_cache); cluster_node_cib_cache = NULL; } } static void (*peer_status_callback)(enum pcmk__node_update, pcmk__node_status_t *, const void *) = NULL; /*! * \internal * \brief Set a client function that will be called after peer status changes * * \param[in] dispatch Pointer to function to use as callback * * \note Client callbacks should do only client-specific handling. Callbacks * must not add or remove entries in the peer caches. */ void pcmk__cluster_set_status_callback(void (*dispatch)(enum pcmk__node_update, pcmk__node_status_t *, const void *)) { // @TODO Improve documentation of peer_status_callback peer_status_callback = dispatch; } /*! * \internal * \brief Tell the library whether to automatically reap lost nodes * * If \c true (the default), calling \c crm_update_peer_proc() will also update * the peer state to \c PCMK_VALUE_MEMBER or \c PCMK__VALUE_LOST, and updating * the peer state will reap peers whose state changes to anything other than * \c PCMK_VALUE_MEMBER. * * Callers should leave this enabled unless they plan to manage the cache * separately on their own. * * \param[in] enable \c true to enable automatic reaping, \c false to disable */ void pcmk__cluster_set_autoreap(bool enable) { autoreap = enable; } static void dump_peer_hash(int level, const char *caller) { GHashTableIter iter; const char *id = NULL; pcmk__node_status_t *node = NULL; g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, (gpointer *) &id, (gpointer *) &node)) { do_crm_log(level, "%s: Node %" PRIu32 "/%s = %p - %s", caller, node->cluster_layer_id, node->name, node, id); } } static gboolean hash_find_by_data(gpointer key, gpointer value, gpointer user_data) { return value == user_data; } /*! * \internal * \brief Search cluster member node cache * * \param[in] id If not 0, cluster node ID to search for * \param[in] uname If not NULL, node name to search for * \param[in] uuid If not NULL while id is 0, node UUID instead of cluster * node ID to search for * * \return Cluster node cache entry if found, otherwise NULL */ static pcmk__node_status_t * search_cluster_member_cache(unsigned int id, const char *uname, const char *uuid) { GHashTableIter iter; pcmk__node_status_t *node = NULL; pcmk__node_status_t *by_id = NULL; pcmk__node_status_t *by_name = NULL; CRM_ASSERT(id > 0 || uname != NULL); pcmk__cluster_init_node_caches(); if (uname != NULL) { g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (pcmk__str_eq(node->name, uname, pcmk__str_casei)) { crm_trace("Name match: %s", node->name); by_name = node; break; } } } if (id > 0) { g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (node->cluster_layer_id == id) { crm_trace("ID match: %" PRIu32, node->cluster_layer_id); by_id = node; break; } } } else if (uuid != NULL) { g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (pcmk__str_eq(node->xml_id, uuid, pcmk__str_casei)) { crm_trace("UUID match: %s", node->xml_id); by_id = node; break; } } } node = by_id; /* Good default */ if(by_id == by_name) { /* Nothing to do if they match (both NULL counts) */ crm_trace("Consistent: %p for %u/%s", by_id, id, uname); } else if(by_id == NULL && by_name) { crm_trace("Only one: %p for %u/%s", by_name, id, uname); if (id && by_name->cluster_layer_id) { dump_peer_hash(LOG_WARNING, __func__); crm_crit("Nodes %u and %" PRIu32 " share the same name '%s'", id, by_name->cluster_layer_id, uname); node = NULL; /* Create a new one */ } else { node = by_name; } } else if(by_name == NULL && by_id) { crm_trace("Only one: %p for %u/%s", by_id, id, uname); if ((uname != NULL) && (by_id->name != NULL)) { dump_peer_hash(LOG_WARNING, __func__); crm_crit("Nodes '%s' and '%s' share the same cluster nodeid %u: " "assuming '%s' is correct", uname, by_id->name, id, uname); } } else if ((uname != NULL) && (by_id->name != NULL)) { if (pcmk__str_eq(uname, by_id->name, pcmk__str_casei)) { crm_notice("Node '%s' has changed its cluster layer ID " "from %" PRIu32 " to %" PRIu32, by_id->name, by_name->cluster_layer_id, by_id->cluster_layer_id); g_hash_table_foreach_remove(crm_peer_cache, hash_find_by_data, by_name); } else { crm_warn("Nodes '%s' and '%s' share the same cluster nodeid: %u %s", by_id->name, by_name->name, id, uname); dump_peer_hash(LOG_INFO, __func__); crm_abort(__FILE__, __func__, __LINE__, "member weirdness", TRUE, TRUE); } } else if ((id > 0) && (by_name->cluster_layer_id > 0)) { crm_warn("Nodes %" PRIu32 " and %" PRIu32 " share the same name: '%s'", by_id->cluster_layer_id, by_name->cluster_layer_id, uname); } else { /* Simple merge */ /* Only corosync-based clusters use node IDs. The functions that call * pcmk__update_peer_state() and crm_update_peer_proc() only know * nodeid, so 'by_id' is authoritative when merging. */ dump_peer_hash(LOG_DEBUG, __func__); crm_info("Merging %p into %p", by_name, by_id); g_hash_table_foreach_remove(crm_peer_cache, hash_find_by_data, by_name); } return node; } /*! * \internal * \brief Search caches for a node (cluster or Pacemaker Remote) * * \param[in] id If not 0, cluster node ID to search for * \param[in] uname If not NULL, node name to search for * \param[in] flags Group of enum pcmk__node_search_flags * * \return Node cache entry if found, otherwise NULL */ pcmk__node_status_t * pcmk__search_node_caches(unsigned int id, const char *uname, uint32_t flags) { pcmk__node_status_t *node = NULL; CRM_ASSERT(id > 0 || uname != NULL); pcmk__cluster_init_node_caches(); if ((uname != NULL) && pcmk_is_set(flags, pcmk__node_search_remote)) { - node = g_hash_table_lookup(crm_remote_peer_cache, uname); + node = g_hash_table_lookup(pcmk__remote_peer_cache, uname); } if ((node == NULL) && pcmk_is_set(flags, pcmk__node_search_cluster_member)) { node = search_cluster_member_cache(id, uname, NULL); } if ((node == NULL) && pcmk_is_set(flags, pcmk__node_search_cluster_cib)) { char *id_str = (id == 0)? NULL : crm_strdup_printf("%u", id); node = find_cib_cluster_node(id_str, uname); free(id_str); } return node; } /*! * \internal * \brief Purge a node from cache (both cluster and Pacemaker Remote) * * \param[in] node_name If not NULL, purge only nodes with this name * \param[in] node_id If not 0, purge cluster nodes only if they have this ID * * \note If \p node_name is NULL and \p node_id is 0, no nodes will be purged. * If \p node_name is not NULL and \p node_id is not 0, Pacemaker Remote * nodes that match \p node_name will be purged, and cluster nodes that * match both \p node_name and \p node_id will be purged. * \note The caller must be careful not to use \p node_name after calling this * function if it might be a pointer into a cache entry being removed. */ void pcmk__purge_node_from_cache(const char *node_name, uint32_t node_id) { char *node_name_copy = NULL; if ((node_name == NULL) && (node_id == 0U)) { return; } // Purge from Pacemaker Remote node cache if ((node_name != NULL) - && (g_hash_table_lookup(crm_remote_peer_cache, node_name) != NULL)) { + && (g_hash_table_lookup(pcmk__remote_peer_cache, node_name) != NULL)) { /* node_name could be a pointer into the cache entry being purged, * so reassign it to a copy before the original gets freed */ node_name_copy = pcmk__str_copy(node_name); node_name = node_name_copy; crm_trace("Purging %s from Pacemaker Remote node cache", node_name); - g_hash_table_remove(crm_remote_peer_cache, node_name); + g_hash_table_remove(pcmk__remote_peer_cache, node_name); } pcmk__cluster_forget_cluster_node(node_id, node_name); free(node_name_copy); } #if SUPPORT_COROSYNC static guint remove_conflicting_peer(pcmk__node_status_t *node) { int matches = 0; GHashTableIter iter; pcmk__node_status_t *existing_node = NULL; if ((node->cluster_layer_id == 0) || (node->name == NULL)) { return 0; } if (!pcmk__corosync_has_nodelist()) { return 0; } g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &existing_node)) { if ((existing_node->cluster_layer_id > 0) && (existing_node->cluster_layer_id != node->cluster_layer_id) && pcmk__str_eq(existing_node->name, node->name, pcmk__str_casei)) { if (pcmk__cluster_is_node_active(existing_node)) { continue; } crm_warn("Removing cached offline node %" PRIu32 "/%s which has " "conflicting name with %" PRIu32, existing_node->cluster_layer_id, existing_node->name, node->cluster_layer_id); g_hash_table_iter_remove(&iter); matches++; } } return matches; } #endif /*! * \internal * \brief Get a cluster node cache entry, possibly creating one if not found * * If \c pcmk__node_search_cluster_member is set in \p flags, the return value * is guaranteed not to be \c NULL. A new cache entry is created if one does not * already exist. * * \param[in] id If not 0, cluster node ID to search for * \param[in] uname If not NULL, node name to search for * \param[in] uuid If not NULL while id is 0, node UUID instead of cluster * node ID to search for * \param[in] flags Group of enum pcmk__node_search_flags * * \return (Possibly newly created) cluster node cache entry */ /* coverity[-alloc] Memory is referenced in one or both hashtables */ pcmk__node_status_t * pcmk__get_node(unsigned int id, const char *uname, const char *uuid, uint32_t flags) { pcmk__node_status_t *node = NULL; char *uname_lookup = NULL; CRM_ASSERT(id > 0 || uname != NULL); pcmk__cluster_init_node_caches(); // Check the Pacemaker Remote node cache first if (pcmk_is_set(flags, pcmk__node_search_remote)) { - node = g_hash_table_lookup(crm_remote_peer_cache, uname); + node = g_hash_table_lookup(pcmk__remote_peer_cache, uname); if (node != NULL) { return node; } } if (!pcmk_is_set(flags, pcmk__node_search_cluster_member)) { return NULL; } node = search_cluster_member_cache(id, uname, uuid); /* if uname wasn't provided, and find_peer did not turn up a uname based on id. * we need to do a lookup of the node name using the id in the cluster membership. */ if ((uname == NULL) && ((node == NULL) || (node->name == NULL))) { uname_lookup = pcmk__cluster_node_name(id); } if (uname_lookup) { uname = uname_lookup; crm_trace("Inferred a name of '%s' for node %u", uname, id); /* try to turn up the node one more time now that we know the uname. */ if (node == NULL) { node = search_cluster_member_cache(id, uname, uuid); } } if (node == NULL) { char *uniqueid = crm_generate_uuid(); node = pcmk__assert_alloc(1, sizeof(pcmk__node_status_t)); crm_info("Created entry %s/%p for node %s/%u (%d total)", uniqueid, node, uname, id, 1 + g_hash_table_size(crm_peer_cache)); g_hash_table_replace(crm_peer_cache, uniqueid, node); } if ((id > 0) && (uname != NULL) && ((node->cluster_layer_id == 0) || (node->name == NULL))) { crm_info("Node %u is now known as %s", id, uname); } if ((id > 0) && (node->cluster_layer_id == 0)) { node->cluster_layer_id = id; } if ((uname != NULL) && (node->name == NULL)) { update_peer_uname(node, uname); } if (node->xml_id == NULL) { if (uuid == NULL) { uuid = pcmk__cluster_node_uuid(node); } if (uuid) { crm_info("Node %u has uuid %s", id, uuid); } else { crm_info("Cannot obtain a UUID for node %u/%s", id, node->name); } } free(uname_lookup); return node; } /*! * \internal * \brief Update a node's uname * * \param[in,out] node Node object to update * \param[in] uname New name to set * * \note This function should not be called within a peer cache iteration, * because in some cases it can remove conflicting cache entries, * which would invalidate the iterator. */ static void update_peer_uname(pcmk__node_status_t *node, const char *uname) { CRM_CHECK(uname != NULL, crm_err("Bug: can't update node name without name"); return); CRM_CHECK(node != NULL, crm_err("Bug: can't update node name to %s without node", uname); return); if (pcmk__str_eq(uname, node->name, pcmk__str_casei)) { crm_debug("Node name '%s' did not change", uname); return; } for (const char *c = uname; *c; ++c) { if ((*c >= 'A') && (*c <= 'Z')) { crm_warn("Node names with capitals are discouraged, consider changing '%s'", uname); break; } } pcmk__str_update(&node->name, uname); if (peer_status_callback != NULL) { peer_status_callback(pcmk__node_update_name, node, NULL); } #if SUPPORT_COROSYNC if ((pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync) && !pcmk_is_set(node->flags, pcmk__node_status_remote)) { remove_conflicting_peer(node); } #endif } /*! * \internal * \brief Get log-friendly string equivalent of a process flag * * \param[in] proc Process flag * * \return Log-friendly string equivalent of \p proc */ static inline const char * proc2text(enum crm_proc_flag proc) { const char *text = "unknown"; switch (proc) { case crm_proc_none: text = "none"; break; case crm_proc_cpg: text = "corosync-cpg"; break; } return text; } /*! * \internal * \brief Update a node's process information (and potentially state) * * \param[in] source Caller's function name (for log messages) * \param[in,out] node Node object to update * \param[in] flag Bitmask of new process information * \param[in] status node status (online, offline, etc.) * * \return NULL if any node was reaped from peer caches, value of node otherwise * * \note If this function returns NULL, the supplied node object was likely * freed and should not be used again. This function should not be * called within a cache iteration if reaping is possible, otherwise * reaping could invalidate the iterator. */ pcmk__node_status_t * crm_update_peer_proc(const char *source, pcmk__node_status_t *node, uint32_t flag, const char *status) { uint32_t last = 0; gboolean changed = FALSE; CRM_CHECK(node != NULL, crm_err("%s: Could not set %s to %s for NULL", source, proc2text(flag), status); return NULL); /* Pacemaker doesn't spawn processes on remote nodes */ if (pcmk_is_set(node->flags, pcmk__node_status_remote)) { return node; } last = node->processes; if (status == NULL) { node->processes = flag; if (node->processes != last) { changed = TRUE; } } else if (pcmk__str_eq(status, PCMK_VALUE_ONLINE, pcmk__str_casei)) { if ((node->processes & flag) != flag) { node->processes = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, "Peer process", node->name, node->processes, flag, "processes"); changed = TRUE; } } else if (node->processes & flag) { node->processes = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, "Peer process", node->name, node->processes, flag, "processes"); changed = TRUE; } if (changed) { if (status == NULL && flag <= crm_proc_none) { crm_info("%s: Node %s[%" PRIu32 "] - all processes are now offline", source, node->name, node->cluster_layer_id); } else { crm_info("%s: Node %s[%" PRIu32 "] - %s is now %s", source, node->name, node->cluster_layer_id, proc2text(flag), status); } if (pcmk_is_set(node->processes, crm_get_cluster_proc())) { node->when_online = time(NULL); } else { node->when_online = 0; } /* Call the client callback first, then update the peer state, * in case the node will be reaped */ if (peer_status_callback != NULL) { peer_status_callback(pcmk__node_update_processes, node, &last); } /* The client callback shouldn't touch the peer caches, * but as a safety net, bail if the peer cache was destroyed. */ if (crm_peer_cache == NULL) { return NULL; } if (autoreap) { const char *peer_state = NULL; if (pcmk_is_set(node->processes, crm_get_cluster_proc())) { peer_state = PCMK_VALUE_MEMBER; } else { peer_state = PCMK__VALUE_LOST; } node = pcmk__update_peer_state(__func__, node, peer_state, 0); } } else { crm_trace("%s: Node %s[%" PRIu32 "] - %s is unchanged (%s)", source, node->name, node->cluster_layer_id, proc2text(flag), status); } return node; } /*! * \internal * \brief Update a cluster node cache entry's expected join state * * \param[in] source Caller's function name (for logging) * \param[in,out] node Node to update * \param[in] expected Node's new join state */ void pcmk__update_peer_expected(const char *source, pcmk__node_status_t *node, const char *expected) { char *last = NULL; gboolean changed = FALSE; CRM_CHECK(node != NULL, crm_err("%s: Could not set 'expected' to %s", source, expected); return); /* Remote nodes don't participate in joins */ if (pcmk_is_set(node->flags, pcmk__node_status_remote)) { return; } last = node->expected; if (expected != NULL && !pcmk__str_eq(node->expected, expected, pcmk__str_casei)) { node->expected = strdup(expected); changed = TRUE; } if (changed) { crm_info("%s: Node %s[%" PRIu32 "] - expected state is now %s (was %s)", source, node->name, node->cluster_layer_id, expected, last); free(last); } else { crm_trace("%s: Node %s[%" PRIu32 "] - expected state is unchanged (%s)", source, node->name, node->cluster_layer_id, expected); } } /*! * \internal * \brief Update a node's state and membership information * * \param[in] source Caller's function name (for log messages) * \param[in,out] node Node object to update * \param[in] state Node's new state * \param[in] membership Node's new membership ID * \param[in,out] iter If not NULL, pointer to node's peer cache iterator * * \return NULL if any node was reaped, value of node otherwise * * \note If this function returns NULL, the supplied node object was likely * freed and should not be used again. This function may be called from * within a peer cache iteration if the iterator is supplied. */ static pcmk__node_status_t * update_peer_state_iter(const char *source, pcmk__node_status_t *node, const char *state, uint64_t membership, GHashTableIter *iter) { gboolean is_member; CRM_CHECK(node != NULL, crm_err("Could not set state for unknown host to %s" QB_XS " source=%s", state, source); return NULL); is_member = pcmk__str_eq(state, PCMK_VALUE_MEMBER, pcmk__str_none); if (is_member) { node->when_lost = 0; if (membership) { node->membership_id = membership; } } if (state && !pcmk__str_eq(node->state, state, pcmk__str_casei)) { char *last = node->state; if (is_member) { node->when_member = time(NULL); } else { node->when_member = 0; } node->state = strdup(state); crm_notice("Node %s state is now %s " QB_XS " nodeid=%" PRIu32 " previous=%s source=%s", node->name, state, node->cluster_layer_id, pcmk__s(last, "unknown"), source); if (peer_status_callback != NULL) { peer_status_callback(pcmk__node_update_state, node, last); } free(last); if (autoreap && !is_member && !pcmk_is_set(node->flags, pcmk__node_status_remote)) { /* We only autoreap from the peer cache, not the remote peer cache, * because the latter should be managed only by * refresh_remote_nodes(). */ if(iter) { crm_notice("Purged 1 peer with cluster layer ID=" PRIu32 "and/or name=%s from the membership cache", node->cluster_layer_id, node->name); g_hash_table_iter_remove(iter); } else { pcmk__cluster_forget_cluster_node(node->cluster_layer_id, node->name); } node = NULL; } } else { crm_trace("Node %s state is unchanged (%s) " QB_XS " nodeid=%" PRIu32 " source=%s", node->name, state, node->cluster_layer_id, source); } return node; } /*! * \brief Update a node's state and membership information * * \param[in] source Caller's function name (for log messages) * \param[in,out] node Node object to update * \param[in] state Node's new state * \param[in] membership Node's new membership ID * * \return NULL if any node was reaped, value of node otherwise * * \note If this function returns NULL, the supplied node object was likely * freed and should not be used again. This function should not be * called within a cache iteration if reaping is possible, * otherwise reaping could invalidate the iterator. */ pcmk__node_status_t * pcmk__update_peer_state(const char *source, pcmk__node_status_t *node, const char *state, uint64_t membership) { return update_peer_state_iter(source, node, state, membership, NULL); } /*! * \internal * \brief Reap all nodes from cache whose membership information does not match * * \param[in] membership Membership ID of nodes to keep */ void pcmk__reap_unseen_nodes(uint64_t membership) { GHashTableIter iter; pcmk__node_status_t *node = NULL; crm_trace("Reaping unseen nodes..."); g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *)&node)) { if (node->membership_id != membership) { if (node->state) { /* * Calling update_peer_state_iter() allows us to * remove the node from crm_peer_cache without * invalidating our iterator */ update_peer_state_iter(__func__, node, PCMK__VALUE_LOST, membership, &iter); } else { crm_info("State of node %s[%" PRIu32 "] is still unknown", node->name, node->cluster_layer_id); } } } } static pcmk__node_status_t * find_cib_cluster_node(const char *id, const char *uname) { GHashTableIter iter; pcmk__node_status_t *node = NULL; pcmk__node_status_t *by_id = NULL; pcmk__node_status_t *by_name = NULL; if (uname) { g_hash_table_iter_init(&iter, cluster_node_cib_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (pcmk__str_eq(node->name, uname, pcmk__str_casei)) { crm_trace("Name match: %s = %p", node->name, node); by_name = node; break; } } } if (id) { g_hash_table_iter_init(&iter, cluster_node_cib_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (pcmk__str_eq(node->xml_id, id, pcmk__str_casei)) { crm_trace("ID match: %s= %p", id, node); by_id = node; break; } } } node = by_id; /* Good default */ if (by_id == by_name) { /* Nothing to do if they match (both NULL counts) */ crm_trace("Consistent: %p for %s/%s", by_id, id, uname); } else if (by_id == NULL && by_name) { crm_trace("Only one: %p for %s/%s", by_name, id, uname); if (id) { node = NULL; } else { node = by_name; } } else if (by_name == NULL && by_id) { crm_trace("Only one: %p for %s/%s", by_id, id, uname); if (uname) { node = NULL; } } else if ((uname != NULL) && (by_id->name != NULL) && pcmk__str_eq(uname, by_id->name, pcmk__str_casei)) { /* Multiple nodes have the same uname in the CIB. * Return by_id. */ } else if ((id != NULL) && (by_name->xml_id != NULL) && pcmk__str_eq(id, by_name->xml_id, pcmk__str_casei)) { /* Multiple nodes have the same id in the CIB. * Return by_name. */ node = by_name; } else { node = NULL; } if (node == NULL) { crm_debug("Couldn't find node%s%s%s%s", id? " " : "", id? id : "", uname? " with name " : "", uname? uname : ""); } return node; } static void cluster_node_cib_cache_refresh_helper(xmlNode *xml_node, void *user_data) { const char *id = crm_element_value(xml_node, PCMK_XA_ID); const char *uname = crm_element_value(xml_node, PCMK_XA_UNAME); pcmk__node_status_t * node = NULL; CRM_CHECK(id != NULL && uname !=NULL, return); node = find_cib_cluster_node(id, uname); if (node == NULL) { char *uniqueid = crm_generate_uuid(); node = pcmk__assert_alloc(1, sizeof(pcmk__node_status_t)); node->name = pcmk__str_copy(uname); node->xml_id = pcmk__str_copy(id); g_hash_table_replace(cluster_node_cib_cache, uniqueid, node); } else if (pcmk_is_set(node->flags, pcmk__node_status_dirty)) { pcmk__str_update(&node->name, uname); /* Node is in cache and hasn't been updated already, so mark it clean */ clear_peer_flags(node, pcmk__node_status_dirty); } } static void refresh_cluster_node_cib_cache(xmlNode *cib) { pcmk__cluster_init_node_caches(); g_hash_table_foreach(cluster_node_cib_cache, mark_dirty, NULL); crm_foreach_xpath_result(cib, PCMK__XP_MEMBER_NODE_CONFIG, cluster_node_cib_cache_refresh_helper, NULL); // Remove all old cache entries that weren't seen in the CIB g_hash_table_foreach_remove(cluster_node_cib_cache, is_dirty, NULL); } void pcmk__refresh_node_caches_from_cib(xmlNode *cib) { refresh_remote_nodes(cib); refresh_cluster_node_cib_cache(cib); } // Deprecated functions kept only for backward API compatibility // LCOV_EXCL_START #include void crm_peer_init(void) { pcmk__cluster_init_node_caches(); } // LCOV_EXCL_STOP // End deprecated API