diff --git a/daemons/attrd/attrd_cib.c b/daemons/attrd/attrd_cib.c index 808a7bc7e3..ad2bf2052c 100644 --- a/daemons/attrd/attrd_cib.c +++ b/daemons/attrd/attrd_cib.c @@ -1,692 +1,692 @@ /* * Copyright 2013-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include // PRIu32 #include #include #include #include // cib__* #include #include #include #include #include // pcmk__get_node() #include "pacemaker-attrd.h" static int last_cib_op_done = 0; static void write_attribute(attribute_t *a, bool ignore_delay); static void attrd_cib_destroy_cb(gpointer user_data) { cib_t *cib = user_data; cib->cmds->signoff(cib); if (attrd_shutting_down(false)) { crm_info("Disconnected from the CIB manager"); } else { // @TODO This should trigger a reconnect, not a shutdown crm_crit("Lost connection to the CIB manager, shutting down"); attrd_exit_status = CRM_EX_DISCONNECT; attrd_shutdown(0); } } static void attrd_cib_updated_cb(const char *event, xmlNode *msg) { const xmlNode *patchset = NULL; const char *client_name = NULL; bool status_changed = false; if (cib__get_notify_patchset(msg, &patchset) != pcmk_rc_ok) { return; } if (pcmk__cib_element_in_patchset(patchset, PCMK_XE_ALERTS)) { if (attrd_shutting_down(true)) { crm_debug("Ignoring alerts change in CIB during shutdown"); } else { mainloop_set_trigger(attrd_config_read); } } status_changed = pcmk__cib_element_in_patchset(patchset, PCMK_XE_STATUS); client_name = crm_element_value(msg, PCMK__XA_CIB_CLIENTNAME); if (!cib__client_triggers_refresh(client_name)) { /* This change came from a source that ensured the CIB is consistent * with our attributes table, so we don't need to write anything out. */ return; } if (!attrd_election_won()) { // Don't write attributes if we're not the writer return; } if (status_changed || pcmk__cib_element_in_patchset(patchset, PCMK_XE_NODES)) { if (attrd_shutting_down(true)) { crm_debug("Ignoring node change in CIB during shutdown"); return; } /* An unsafe client modified the PCMK_XE_NODES or PCMK_XE_STATUS * section. Write transient attributes to ensure they're up-to-date in * the CIB. */ if (client_name == NULL) { client_name = crm_element_value(msg, PCMK__XA_CIB_CLIENTID); } crm_notice("Updating all attributes after %s event triggered by %s", event, pcmk__s(client_name, "unidentified client")); attrd_write_attributes(attrd_write_all); } } int attrd_cib_connect(int max_retry) { static int attempts = 0; int rc = -ENOTCONN; the_cib = cib_new(); if (the_cib == NULL) { return -ENOTCONN; } do { if (attempts > 0) { sleep(attempts); } attempts++; crm_debug("Connection attempt %d to the CIB manager", attempts); rc = the_cib->cmds->signon(the_cib, crm_system_name, cib_command); } while ((rc != pcmk_ok) && (attempts < max_retry)); if (rc != pcmk_ok) { crm_err("Connection to the CIB manager failed: %s " QB_XS " rc=%d", pcmk_strerror(rc), rc); goto cleanup; } crm_debug("Connected to the CIB manager after %d attempts", attempts); rc = the_cib->cmds->set_connection_dnotify(the_cib, attrd_cib_destroy_cb); if (rc != pcmk_ok) { crm_err("Could not set disconnection callback"); goto cleanup; } rc = the_cib->cmds->add_notify_callback(the_cib, PCMK__VALUE_CIB_DIFF_NOTIFY, attrd_cib_updated_cb); if (rc != pcmk_ok) { crm_err("Could not set CIB notification callback"); goto cleanup; } return pcmk_ok; cleanup: cib__clean_up_connection(&the_cib); return -ENOTCONN; } void attrd_cib_disconnect(void) { CRM_CHECK(the_cib != NULL, return); the_cib->cmds->del_notify_callback(the_cib, PCMK__VALUE_CIB_DIFF_NOTIFY, attrd_cib_updated_cb); cib__clean_up_connection(&the_cib); mainloop_destroy_trigger(attrd_config_read); } static void attrd_erase_cb(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { const char *node = pcmk__s((const char *) user_data, "a node"); if (rc == pcmk_ok) { crm_info("Cleared transient node attributes for %s from CIB", node); } else { crm_err("Unable to clear transient node attributes for %s from CIB: %s", node, pcmk_strerror(rc)); } } #define XPATH_TRANSIENT "//" PCMK__XE_NODE_STATE \ "[@" PCMK_XA_UNAME "='%s']" \ "/" PCMK__XE_TRANSIENT_ATTRIBUTES /*! * \internal * \brief Wipe all transient node attributes for a node from the CIB * * \param[in] node Node to clear attributes for */ void attrd_cib_erase_transient_attrs(const char *node) { int call_id = 0; char *xpath = NULL; CRM_CHECK(node != NULL, return); xpath = crm_strdup_printf(XPATH_TRANSIENT, node); crm_debug("Clearing transient node attributes for %s from CIB using %s", node, xpath); call_id = the_cib->cmds->remove(the_cib, xpath, NULL, cib_xpath); free(xpath); the_cib->cmds->register_callback_full(the_cib, call_id, 120, FALSE, pcmk__str_copy(node), "attrd_erase_cb", attrd_erase_cb, free); } /*! * \internal * \brief Prepare the CIB after cluster is connected */ void attrd_cib_init(void) { /* We have no attribute values in memory, so wipe the CIB to match. This is * normally done by the DC's controller when this node leaves the cluster, but * this handles the case where the node restarted so quickly that the * cluster layer didn't notice. * * \todo If the attribute manager respawns after crashing (see * PCMK_ENV_RESPAWNED), ideally we'd skip this and sync our attributes * from the writer. However, currently we reject any values for us * that the writer has, in attrd_peer_update(). */ attrd_cib_erase_transient_attrs(attrd_cluster->priv->node_name); // Set a trigger for reading the CIB (for the alerts section) attrd_config_read = mainloop_add_trigger(G_PRIORITY_HIGH, attrd_read_options, NULL); // Always read the CIB at start-up mainloop_set_trigger(attrd_config_read); } static gboolean attribute_timer_cb(gpointer data) { attribute_t *a = data; crm_trace("Dampen interval expired for %s", a->id); attrd_write_or_elect_attribute(a); return FALSE; } static void attrd_cib_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { int level = LOG_ERR; GHashTableIter iter; const char *peer = NULL; attribute_value_t *v = NULL; char *name = user_data; attribute_t *a = g_hash_table_lookup(attributes, name); if(a == NULL) { crm_info("Attribute %s no longer exists", name); return; } a->update = 0; if (rc == pcmk_ok && call_id < 0) { rc = call_id; } switch (rc) { case pcmk_ok: level = LOG_INFO; last_cib_op_done = call_id; if (a->timer && !a->timeout_ms) { // Remove temporary dampening for failed writes mainloop_timer_del(a->timer); a->timer = NULL; } break; case -pcmk_err_diff_failed: /* When an attr changes while the CIB is syncing */ case -ETIME: /* When an attr changes while there is a DC election */ case -ENXIO: /* When an attr changes while the CIB is syncing a * newer config from a node that just came up */ level = LOG_WARNING; break; } do_crm_log(level, "CIB update %d result for %s: %s " QB_XS " rc=%d", call_id, a->id, pcmk_strerror(rc), rc); g_hash_table_iter_init(&iter, a->values); while (g_hash_table_iter_next(&iter, (gpointer *) & peer, (gpointer *) & v)) { if (rc == pcmk_ok) { crm_info("* Wrote %s[%s]=%s", a->id, peer, pcmk__s(v->requested, "(unset)")); pcmk__str_update(&(v->requested), NULL); } else { do_crm_log(level, "* Could not write %s[%s]=%s", a->id, peer, pcmk__s(v->requested, "(unset)")); /* Reattempt write below if we are still the writer */ attrd_set_attr_flags(a, attrd_attr_changed); } } if (pcmk_is_set(a->flags, attrd_attr_changed) && attrd_election_won()) { if (rc == pcmk_ok) { /* We deferred a write of a new update because this update was in * progress. Write out the new value without additional delay. */ crm_debug("Pending update for %s can be written now", a->id); write_attribute(a, false); /* We're re-attempting a write because the original failed; delay * the next attempt so we don't potentially flood the CIB manager * and logs with a zillion attempts per second. * * @TODO We could elect a new writer instead. However, we'd have to * somehow downgrade our vote, and we'd still need something like this * if all peers similarly fail to write this attribute (which may * indicate a corrupted attribute entry rather than a CIB issue). */ } else if (a->timer) { // Attribute has a dampening value, so use that as delay if (!mainloop_timer_running(a->timer)) { crm_trace("Delayed re-attempted write for %s by %s", name, pcmk__readable_interval(a->timeout_ms)); mainloop_timer_start(a->timer); } } else { /* Set a temporary dampening of 2 seconds (timer will continue * to exist until the attribute's dampening gets set or the * write succeeds). */ a->timer = attrd_add_timer(a->id, 2000, a); mainloop_timer_start(a->timer); } } } /*! * \internal * \brief Add a set-attribute update request to the current CIB transaction * * \param[in] attr Attribute to update * \param[in] attr_id ID of attribute to update * \param[in] node_id ID of node for which to update attribute value * \param[in] set_id ID of attribute set * \param[in] value New value for attribute * * \return Standard Pacemaker return code */ static int add_set_attr_update(const attribute_t *attr, const char *attr_id, const char *node_id, const char *set_id, const char *value) { xmlNode *update = pcmk__xe_create(NULL, PCMK__XE_NODE_STATE); xmlNode *child = update; int rc = ENOMEM; crm_xml_add(child, PCMK_XA_ID, node_id); child = pcmk__xe_create(child, PCMK__XE_TRANSIENT_ATTRIBUTES); crm_xml_add(child, PCMK_XA_ID, node_id); child = pcmk__xe_create(child, attr->set_type); crm_xml_add(child, PCMK_XA_ID, set_id); child = pcmk__xe_create(child, PCMK_XE_NVPAIR); crm_xml_add(child, PCMK_XA_ID, attr_id); crm_xml_add(child, PCMK_XA_NAME, attr->id); crm_xml_add(child, PCMK_XA_VALUE, value); rc = the_cib->cmds->modify(the_cib, PCMK_XE_STATUS, update, cib_can_create|cib_transaction); rc = pcmk_legacy2rc(rc); pcmk__xml_free(update); return rc; } /*! * \internal * \brief Add an unset-attribute update request to the current CIB transaction * * \param[in] attr Attribute to update * \param[in] attr_id ID of attribute to update * \param[in] node_id ID of node for which to update attribute value * \param[in] set_id ID of attribute set * * \return Standard Pacemaker return code */ static int add_unset_attr_update(const attribute_t *attr, const char *attr_id, const char *node_id, const char *set_id) { char *xpath = crm_strdup_printf("/" PCMK_XE_CIB "/" PCMK_XE_STATUS "/" PCMK__XE_NODE_STATE "[@" PCMK_XA_ID "='%s']" "/" PCMK__XE_TRANSIENT_ATTRIBUTES "[@" PCMK_XA_ID "='%s']" "/%s[@" PCMK_XA_ID "='%s']" "/" PCMK_XE_NVPAIR "[@" PCMK_XA_ID "='%s' " "and @" PCMK_XA_NAME "='%s']", node_id, node_id, attr->set_type, set_id, attr_id, attr->id); int rc = the_cib->cmds->remove(the_cib, xpath, NULL, cib_xpath|cib_transaction); free(xpath); return pcmk_legacy2rc(rc); } /*! * \internal * \brief Add an attribute update request to the current CIB transaction * * \param[in] attr Attribute to update * \param[in] value New value for attribute * \param[in] node_id ID of node for which to update attribute value * * \return Standard Pacemaker return code */ static int add_attr_update(const attribute_t *attr, const char *value, const char *node_id) { char *set_id = attrd_set_id(attr, node_id); char *nvpair_id = attrd_nvpair_id(attr, node_id); int rc = pcmk_rc_ok; if (value == NULL) { rc = add_unset_attr_update(attr, nvpair_id, node_id, set_id); } else { rc = add_set_attr_update(attr, nvpair_id, node_id, set_id, value); } free(set_id); free(nvpair_id); return rc; } static void send_alert_attributes_value(attribute_t *a, GHashTable *t) { int rc = 0; attribute_value_t *at = NULL; GHashTableIter vIter; g_hash_table_iter_init(&vIter, t); while (g_hash_table_iter_next(&vIter, NULL, (gpointer *) & at)) { rc = attrd_send_attribute_alert(at->nodename, at->nodeid, a->id, at->current); crm_trace("Sent alerts for %s[%s]=%s: nodeid=%d rc=%d", a->id, at->nodename, at->current, at->nodeid, rc); } } static void set_alert_attribute_value(GHashTable *t, attribute_value_t *v) { attribute_value_t *a_v = pcmk__assert_alloc(1, sizeof(attribute_value_t)); a_v->nodeid = v->nodeid; a_v->nodename = pcmk__str_copy(v->nodename); a_v->current = pcmk__str_copy(v->current); g_hash_table_replace(t, a_v->nodename, a_v); } mainloop_timer_t * attrd_add_timer(const char *id, int timeout_ms, attribute_t *attr) { return mainloop_timer_add(id, timeout_ms, FALSE, attribute_timer_cb, attr); } /*! * \internal * \brief Write an attribute's values to the CIB if appropriate * * \param[in,out] a Attribute to write * \param[in] ignore_delay If true, write attribute now regardless of any * configured delay */ static void write_attribute(attribute_t *a, bool ignore_delay) { int private_updates = 0, cib_updates = 0; attribute_value_t *v = NULL; GHashTableIter iter; GHashTable *alert_attribute_value = NULL; int rc = pcmk_ok; if (a == NULL) { return; } /* If this attribute will be written to the CIB ... */ if (!stand_alone && !pcmk_is_set(a->flags, attrd_attr_is_private)) { /* Defer the write if now's not a good time */ if (a->update && (a->update < last_cib_op_done)) { crm_info("Write out of '%s' continuing: update %d considered lost", a->id, a->update); a->update = 0; // Don't log this message again } else if (a->update) { crm_info("Write out of '%s' delayed: update %d in progress", a->id, a->update); goto done; } else if (mainloop_timer_running(a->timer)) { if (ignore_delay) { mainloop_timer_stop(a->timer); crm_debug("Overriding '%s' write delay", a->id); } else { crm_info("Delaying write of '%s'", a->id); goto done; } } // Initiate a transaction for all the peer value updates CRM_CHECK(the_cib != NULL, goto done); the_cib->cmds->set_user(the_cib, a->user); rc = the_cib->cmds->init_transaction(the_cib); if (rc != pcmk_ok) { crm_err("Failed to write %s (set %s): Could not initiate " "CIB transaction", a->id, pcmk__s(a->set_id, "unspecified")); goto done; } } /* Attribute will be written shortly, so clear changed flag and force * write flag, and initialize UUID missing flag to false. */ attrd_clear_attr_flags(a, attrd_attr_changed|attrd_attr_uuid_missing|attrd_attr_force_write); /* Make the table for the attribute trap */ alert_attribute_value = pcmk__strikey_table(NULL, attrd_free_attribute_value); /* Iterate over each peer value of this attribute */ g_hash_table_iter_init(&iter, a->values); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &v)) { const char *node_xml_id = NULL; // Private attributes (or any in standalone mode) are not written to CIB if (stand_alone || pcmk_is_set(a->flags, attrd_attr_is_private)) { private_updates++; continue; } // Try to get the XML ID used for the node in the CIB if (pcmk_is_set(v->flags, attrd_value_remote)) { // A Pacemaker Remote node's XML ID is the same as its name node_xml_id = v->nodename; } else { /* Get cluster node XML IDs from the peer caches. * This will create a cluster node cache entry if none exists. */ pcmk__node_status_t *peer = pcmk__get_node(v->nodeid, v->nodename, NULL, pcmk__node_search_any); - node_xml_id = pcmk__cluster_node_uuid(peer); + node_xml_id = pcmk__cluster_get_xml_id(peer); // Remember peer's node ID if we're just now learning it if ((peer->cluster_layer_id != 0) && (v->nodeid == 0)) { crm_trace("Learned ID %" PRIu32 " for node %s", peer->cluster_layer_id, v->nodename); v->nodeid = peer->cluster_layer_id; } } // Defer write if this is a cluster node that's never been seen if (node_xml_id == NULL) { attrd_set_attr_flags(a, attrd_attr_uuid_missing); crm_notice("Cannot write %s[%s]='%s' to CIB because node's XML ID " "is unknown (will retry if learned)", a->id, v->nodename, v->current); continue; } // Update this value as part of the CIB transaction we're building rc = add_attr_update(a, v->current, node_xml_id); if (rc != pcmk_rc_ok) { crm_err("Couldn't add %s[%s]='%s' to CIB transaction: %s " QB_XS " node XML ID %s", a->id, v->nodename, v->current, pcmk_rc_str(rc), node_xml_id); continue; } crm_debug("Added %s[%s]=%s to CIB transaction (node XML ID %s)", a->id, v->nodename, pcmk__s(v->current, "(unset)"), node_xml_id); cib_updates++; /* Preservation of the attribute to transmit alert */ set_alert_attribute_value(alert_attribute_value, v); // Save this value so we can log it when write completes pcmk__str_update(&(v->requested), v->current); } if (private_updates) { crm_info("Processed %d private change%s for %s (set %s)", private_updates, pcmk__plural_s(private_updates), a->id, pcmk__s(a->set_id, "unspecified")); } if (cib_updates > 0) { char *id = pcmk__str_copy(a->id); // Commit transaction a->update = the_cib->cmds->end_transaction(the_cib, true, cib_none); crm_info("Sent CIB request %d with %d change%s for %s (set %s)", a->update, cib_updates, pcmk__plural_s(cib_updates), a->id, pcmk__s(a->set_id, "unspecified")); if (the_cib->cmds->register_callback_full(the_cib, a->update, CIB_OP_TIMEOUT_S, FALSE, id, "attrd_cib_callback", attrd_cib_callback, free)) { // Transmit alert of the attribute send_alert_attributes_value(a, alert_attribute_value); } } done: // Discard transaction (if any) if (the_cib != NULL) { the_cib->cmds->end_transaction(the_cib, false, cib_none); the_cib->cmds->set_user(the_cib, NULL); } if (alert_attribute_value != NULL) { g_hash_table_destroy(alert_attribute_value); } } /*! * \internal * \brief Write out attributes * * \param[in] options Group of enum attrd_write_options */ void attrd_write_attributes(uint32_t options) { GHashTableIter iter; attribute_t *a = NULL; crm_debug("Writing out %s attributes", pcmk_is_set(options, attrd_write_all)? "all" : "changed"); g_hash_table_iter_init(&iter, attributes); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & a)) { if (!pcmk_is_set(options, attrd_write_all) && pcmk_is_set(a->flags, attrd_attr_uuid_missing)) { // Try writing this attribute again, in case peer ID was learned attrd_set_attr_flags(a, attrd_attr_changed); } else if (pcmk_is_set(a->flags, attrd_attr_force_write)) { /* If the force_write flag is set, write the attribute. */ attrd_set_attr_flags(a, attrd_attr_changed); } if (pcmk_is_set(options, attrd_write_all) || pcmk_is_set(a->flags, attrd_attr_changed)) { bool ignore_delay = pcmk_is_set(options, attrd_write_no_delay); if (pcmk_is_set(a->flags, attrd_attr_force_write)) { // Always ignore delay when forced write flag is set ignore_delay = true; } write_attribute(a, ignore_delay); } else { crm_trace("Skipping unchanged attribute %s", a->id); } } } void attrd_write_or_elect_attribute(attribute_t *a) { if (attrd_election_won()) { write_attribute(a, false); } else { attrd_start_election_if_needed(); } } diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c index 4b00f894ef..ed4751b8d7 100644 --- a/daemons/controld/controld_control.c +++ b/daemons/controld/controld_control.c @@ -1,697 +1,697 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include static qb_ipcs_service_t *ipcs = NULL; static crm_trigger_t *config_read_trigger = NULL; #if SUPPORT_COROSYNC extern gboolean crm_connect_corosync(pcmk_cluster_t *cluster); #endif static void crm_shutdown(int nsig); static gboolean crm_read_options(gpointer user_data); /* A_HA_CONNECT */ void do_ha_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean registered = FALSE; if (controld_globals.cluster == NULL) { controld_globals.cluster = pcmk_cluster_new(); } if (action & A_HA_DISCONNECT) { pcmk_cluster_disconnect(controld_globals.cluster); crm_info("Disconnected from the cluster"); controld_set_fsa_input_flags(R_HA_DISCONNECTED); } if (action & A_HA_CONNECT) { pcmk__cluster_set_status_callback(&peer_update_callback); pcmk__cluster_set_autoreap(false); #if SUPPORT_COROSYNC if (pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync) { registered = crm_connect_corosync(controld_globals.cluster); } #endif // SUPPORT_COROSYNC if (registered) { pcmk__node_status_t *node = controld_get_local_node_status(); controld_election_init(); free(controld_globals.our_uuid); controld_globals.our_uuid = - pcmk__str_copy(pcmk__cluster_node_uuid(node)); + pcmk__str_copy(pcmk__cluster_get_xml_id(node)); if (controld_globals.our_uuid == NULL) { crm_err("Could not obtain local uuid"); registered = FALSE; } } if (!registered) { controld_set_fsa_input_flags(R_HA_DISCONNECTED); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } populate_cib_nodes(node_update_none, __func__); controld_clear_fsa_input_flags(R_HA_DISCONNECTED); crm_info("Connected to the cluster"); } if (action & ~(A_HA_CONNECT | A_HA_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __func__); } } /* A_SHUTDOWN */ void do_shutdown(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { /* just in case */ controld_set_fsa_input_flags(R_SHUTDOWN); controld_disconnect_fencer(FALSE); } /* A_SHUTDOWN_REQ */ void do_shutdown_req(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { xmlNode *msg = NULL; controld_set_fsa_input_flags(R_SHUTDOWN); //controld_set_fsa_input_flags(R_STAYDOWN); crm_info("Sending shutdown request to all peers (DC is %s)", pcmk__s(controld_globals.dc_name, "not set")); msg = pcmk__new_request(pcmk_ipc_controld, CRM_SYSTEM_CRMD, NULL, CRM_SYSTEM_CRMD, CRM_OP_SHUTDOWN_REQ, NULL); if (!pcmk__cluster_send_message(NULL, pcmk_ipc_controld, msg)) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } pcmk__xml_free(msg); } void crmd_fast_exit(crm_exit_t exit_code) { if (pcmk_is_set(controld_globals.fsa_input_register, R_STAYDOWN)) { crm_warn("Inhibiting respawn " QB_XS " remapping exit code %d to %d", exit_code, CRM_EX_FATAL); exit_code = CRM_EX_FATAL; } else if ((exit_code == CRM_EX_OK) && pcmk_is_set(controld_globals.fsa_input_register, R_IN_RECOVERY)) { crm_err("Could not recover from internal error"); exit_code = CRM_EX_ERROR; } if (controld_globals.logger_out != NULL) { controld_globals.logger_out->finish(controld_globals.logger_out, exit_code, true, NULL); pcmk__output_free(controld_globals.logger_out); controld_globals.logger_out = NULL; } crm_exit(exit_code); } crm_exit_t crmd_exit(crm_exit_t exit_code) { GMainLoop *mloop = controld_globals.mainloop; static bool in_progress = FALSE; if (in_progress && (exit_code == CRM_EX_OK)) { crm_debug("Exit is already in progress"); return exit_code; } else if(in_progress) { crm_notice("Error during shutdown process, exiting now with status %d (%s)", exit_code, crm_exit_str(exit_code)); crm_write_blackbox(SIGTRAP, NULL); crmd_fast_exit(exit_code); } in_progress = TRUE; crm_trace("Preparing to exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); /* Suppress secondary errors resulting from us disconnecting everything */ controld_set_fsa_input_flags(R_HA_DISCONNECTED); /* Close all IPC servers and clients to ensure any and all shared memory files are cleaned up */ if(ipcs) { crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; } controld_close_attrd_ipc(); controld_shutdown_schedulerd_ipc(); controld_disconnect_fencer(TRUE); if ((exit_code == CRM_EX_OK) && (controld_globals.mainloop == NULL)) { crm_debug("No mainloop detected"); exit_code = CRM_EX_ERROR; } /* On an error, just get out. * * Otherwise, make the effort to have mainloop exit gracefully so * that it (mostly) cleans up after itself and valgrind has less * to report on - allowing real errors stand out */ if (exit_code != CRM_EX_OK) { crm_notice("Forcing immediate exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); crm_write_blackbox(SIGTRAP, NULL); crmd_fast_exit(exit_code); } /* Clean up as much memory as possible for valgrind */ for (GList *iter = controld_globals.fsa_message_queue; iter != NULL; iter = iter->next) { fsa_data_t *fsa_data = (fsa_data_t *) iter->data; crm_info("Dropping %s: [ state=%s cause=%s origin=%s ]", fsa_input2string(fsa_data->fsa_input), fsa_state2string(controld_globals.fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); delete_fsa_input(fsa_data); } controld_clear_fsa_input_flags(R_MEMBERSHIP); g_list_free(controld_globals.fsa_message_queue); controld_globals.fsa_message_queue = NULL; controld_free_node_pending_timers(); election_reset(controld_globals.cluster); // Stop any election timer /* Tear down the CIB manager connection, but don't free it yet -- it could * be used when we drain the mainloop later. */ controld_disconnect_cib_manager(); verify_stopped(controld_globals.fsa_state, LOG_WARNING); controld_clear_fsa_input_flags(R_LRM_CONNECTED); lrm_state_destroy_all(); mainloop_destroy_trigger(config_read_trigger); config_read_trigger = NULL; controld_destroy_fsa_trigger(); controld_destroy_transition_trigger(); pcmk__client_cleanup(); pcmk__cluster_destroy_node_caches(); controld_free_fsa_timers(); te_cleanup_stonith_history_sync(NULL, TRUE); controld_free_sched_timer(); free(controld_globals.our_uuid); controld_globals.our_uuid = NULL; free(controld_globals.dc_name); controld_globals.dc_name = NULL; free(controld_globals.dc_version); controld_globals.dc_version = NULL; free(controld_globals.cluster_name); controld_globals.cluster_name = NULL; free(controld_globals.te_uuid); controld_globals.te_uuid = NULL; free_max_generation(); controld_destroy_failed_sync_table(); controld_destroy_outside_events_table(); mainloop_destroy_signal(SIGPIPE); mainloop_destroy_signal(SIGUSR1); mainloop_destroy_signal(SIGTERM); mainloop_destroy_signal(SIGTRAP); /* leave SIGCHLD engaged as we might still want to drain some service-actions */ if (mloop) { GMainContext *ctx = g_main_loop_get_context(controld_globals.mainloop); /* Don't re-enter this block */ controld_globals.mainloop = NULL; /* no signals on final draining anymore */ mainloop_destroy_signal(SIGCHLD); crm_trace("Draining mainloop %d %d", g_main_loop_is_running(mloop), g_main_context_pending(ctx)); { int lpc = 0; while((g_main_context_pending(ctx) && lpc < 10)) { lpc++; crm_trace("Iteration %d", lpc); g_main_context_dispatch(ctx); } } crm_trace("Closing mainloop %d %d", g_main_loop_is_running(mloop), g_main_context_pending(ctx)); g_main_loop_quit(mloop); /* Won't do anything yet, since we're inside it now */ g_main_loop_unref(mloop); } else { mainloop_destroy_signal(SIGCHLD); } cib_delete(controld_globals.cib_conn); controld_globals.cib_conn = NULL; throttle_fini(); pcmk_cluster_free(controld_globals.cluster); controld_globals.cluster = NULL; /* Graceful */ crm_trace("Done preparing for exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); return exit_code; } /* A_EXIT_0, A_EXIT_1 */ void do_exit(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_exit_t exit_code = CRM_EX_OK; if (pcmk_is_set(action, A_EXIT_1)) { exit_code = CRM_EX_ERROR; crm_err("Exiting now due to errors"); } verify_stopped(cur_state, LOG_ERR); crmd_exit(exit_code); } static void sigpipe_ignore(int nsig) { return; } /* A_STARTUP */ void do_startup(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_debug("Registering Signal Handlers"); mainloop_add_signal(SIGTERM, crm_shutdown); mainloop_add_signal(SIGPIPE, sigpipe_ignore); config_read_trigger = mainloop_add_trigger(G_PRIORITY_HIGH, crm_read_options, NULL); controld_init_fsa_trigger(); controld_init_transition_trigger(); crm_debug("Creating CIB manager and executor objects"); controld_globals.cib_conn = cib_new(); lrm_state_init_local(); if (controld_init_fsa_timers() == FALSE) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } // \return libqb error code (0 on success, -errno on error) static int32_t accept_controller_client(qb_ipcs_connection_t *c, uid_t uid, gid_t gid) { crm_trace("Accepting new IPC client connection"); if (pcmk__new_client(c, uid, gid) == NULL) { return -ENOMEM; } return 0; } // \return libqb error code (0 on success, -errno on error) static int32_t dispatch_controller_ipc(qb_ipcs_connection_t * c, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; pcmk__client_t *client = pcmk__find_client(c); xmlNode *msg = pcmk__client_data2xml(client, data, &id, &flags); if (msg == NULL) { pcmk__ipc_send_ack(client, id, flags, PCMK__XE_ACK, NULL, CRM_EX_PROTOCOL); return 0; } pcmk__ipc_send_ack(client, id, flags, PCMK__XE_ACK, NULL, CRM_EX_INDETERMINATE); pcmk__assert(client->user != NULL); pcmk__update_acl_user(msg, PCMK__XA_CRM_USER, client->user); crm_xml_add(msg, PCMK__XA_CRM_SYS_FROM, client->id); if (controld_authorize_ipc_message(msg, client, NULL)) { crm_trace("Processing IPC message from client %s", pcmk__client_name(client)); route_message(C_IPC_MESSAGE, msg); } controld_trigger_fsa(); pcmk__xml_free(msg); return 0; } static int32_t ipc_client_disconnected(qb_ipcs_connection_t *c) { pcmk__client_t *client = pcmk__find_client(c); if (client) { crm_trace("Disconnecting %sregistered client %s (%p/%p)", (client->userdata? "" : "un"), pcmk__client_name(client), c, client); free(client->userdata); pcmk__free_client(client); controld_trigger_fsa(); } return 0; } static void ipc_connection_destroyed(qb_ipcs_connection_t *c) { crm_trace("Connection %p", c); ipc_client_disconnected(c); } /* A_STOP */ void do_stop(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } /* A_STARTED */ void do_started(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { static struct qb_ipcs_service_handlers crmd_callbacks = { .connection_accept = accept_controller_client, .connection_created = NULL, .msg_process = dispatch_controller_ipc, .connection_closed = ipc_client_disconnected, .connection_destroyed = ipc_connection_destroyed }; if (cur_state != S_STARTING) { crm_err("Start cancelled... %s", fsa_state2string(cur_state)); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_MEMBERSHIP)) { crm_info("Delaying start, no membership data (%.16llx)", R_MEMBERSHIP); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_LRM_CONNECTED)) { crm_info("Delaying start, not connected to executor (%.16llx)", R_LRM_CONNECTED); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_CIB_CONNECTED)) { crm_info("Delaying start, CIB not connected (%.16llx)", R_CIB_CONNECTED); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_READ_CONFIG)) { crm_info("Delaying start, Config not read (%.16llx)", R_READ_CONFIG); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_PEER_DATA)) { crm_info("Delaying start, No peer data (%.16llx)", R_PEER_DATA); crmd_fsa_stall(TRUE); return; } crm_debug("Init server comms"); ipcs = pcmk__serve_controld_ipc(&crmd_callbacks); if (ipcs == NULL) { crm_err("Failed to create IPC server: shutting down and inhibiting respawn"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } else { crm_notice("Pacemaker controller successfully started and accepting connections"); } controld_set_fsa_input_flags(R_ST_REQUIRED); controld_timer_fencer_connect(GINT_TO_POINTER(TRUE)); controld_clear_fsa_input_flags(R_STARTING); register_fsa_input(msg_data->fsa_cause, I_PENDING, NULL); } /* A_RECOVER */ void do_recover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { controld_set_fsa_input_flags(R_IN_RECOVERY); crm_warn("Fast-tracking shutdown in response to errors"); register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } static void config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { const char *value = NULL; GHashTable *config_hash = NULL; crm_time_t *now = crm_time_new(NULL); xmlNode *crmconfig = NULL; xmlNode *alerts = NULL; pcmk_rule_input_t rule_input = { .now = now, }; if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query resulted in an error: %s", pcmk_strerror(rc)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); if (rc == -EACCES || rc == -pcmk_err_schema_validation) { crm_err("The cluster is mis-configured - shutting down and staying down"); controld_set_fsa_input_flags(R_STAYDOWN); } goto bail; } crmconfig = output; if ((crmconfig != NULL) && !pcmk__xe_is(crmconfig, PCMK_XE_CRM_CONFIG)) { crmconfig = pcmk__xe_first_child(crmconfig, PCMK_XE_CRM_CONFIG, NULL, NULL); } if (!crmconfig) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query for " PCMK_XE_CRM_CONFIG " section failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); goto bail; } crm_debug("Call %d : Parsing CIB options", call_id); config_hash = pcmk__strkey_table(free, free); pcmk_unpack_nvpair_blocks(crmconfig, PCMK_XE_CLUSTER_PROPERTY_SET, PCMK_VALUE_CIB_BOOTSTRAP_OPTIONS, &rule_input, config_hash, NULL); // Validate all options, and use defaults if not already present in hash pcmk__validate_cluster_options(config_hash); /* Validate the watchdog timeout in the context of the local node * environment. If invalid, the controller will exit with a fatal error. * * We do this via a wrapper in the controller, so that we call * pcmk__valid_stonith_watchdog_timeout() only if watchdog fencing is * enabled for the local node. Otherwise, we may exit unnecessarily. * * A validator function in libcrmcommon can't act as such a wrapper, because * it doesn't have a stonith API connection or the local node name. */ value = g_hash_table_lookup(config_hash, PCMK_OPT_STONITH_WATCHDOG_TIMEOUT); controld_verify_stonith_watchdog_timeout(value); value = g_hash_table_lookup(config_hash, PCMK_OPT_NO_QUORUM_POLICY); if (pcmk__strcase_any_of(value, PCMK_VALUE_FENCE, PCMK_VALUE_FENCE_LEGACY, NULL) && (pcmk__locate_sbd() != 0)) { controld_set_global_flags(controld_no_quorum_panic); } value = g_hash_table_lookup(config_hash, PCMK_OPT_SHUTDOWN_LOCK); if (crm_is_true(value)) { controld_set_global_flags(controld_shutdown_lock_enabled); } else { controld_clear_global_flags(controld_shutdown_lock_enabled); } value = g_hash_table_lookup(config_hash, PCMK_OPT_SHUTDOWN_LOCK_LIMIT); pcmk_parse_interval_spec(value, &controld_globals.shutdown_lock_limit); controld_globals.shutdown_lock_limit /= 1000; value = g_hash_table_lookup(config_hash, PCMK_OPT_NODE_PENDING_TIMEOUT); pcmk_parse_interval_spec(value, &controld_globals.node_pending_timeout); controld_globals.node_pending_timeout /= 1000; value = g_hash_table_lookup(config_hash, PCMK_OPT_CLUSTER_NAME); pcmk__str_update(&(controld_globals.cluster_name), value); // Let subcomponents initialize their own static variables controld_configure_election(config_hash); controld_configure_fencing(config_hash); controld_configure_fsa_timers(config_hash); controld_configure_throttle(config_hash); alerts = pcmk__xe_first_child(output, PCMK_XE_ALERTS, NULL, NULL); crmd_unpack_alerts(alerts); controld_set_fsa_input_flags(R_READ_CONFIG); controld_trigger_fsa(); g_hash_table_destroy(config_hash); bail: crm_time_free(now); } /*! * \internal * \brief Trigger read and processing of the configuration * * \param[in] fn Calling function name * \param[in] line Line number where call occurred */ void controld_trigger_config_as(const char *fn, int line) { if (config_read_trigger != NULL) { crm_trace("%s:%d - Triggered config processing", fn, line); mainloop_set_trigger(config_read_trigger); } } gboolean crm_read_options(gpointer user_data) { cib_t *cib_conn = controld_globals.cib_conn; int call_id = cib_conn->cmds->query(cib_conn, "//" PCMK_XE_CRM_CONFIG " | //" PCMK_XE_ALERTS, NULL, cib_xpath); fsa_register_cib_callback(call_id, NULL, config_query_callback); crm_trace("Querying the CIB... call %d", call_id); return TRUE; } /* A_READCONFIG */ void do_read_config(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { throttle_init(); controld_trigger_config(); } static void crm_shutdown(int nsig) { const char *value = NULL; guint default_period_ms = 0; if ((controld_globals.mainloop == NULL) || !g_main_loop_is_running(controld_globals.mainloop)) { crmd_exit(CRM_EX_OK); return; } if (pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) { crm_err("Escalating shutdown"); register_fsa_input_before(C_SHUTDOWN, I_ERROR, NULL); return; } controld_set_fsa_input_flags(R_SHUTDOWN); register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL); /* If shutdown timer doesn't have a period set, use the default * * @TODO: Evaluate whether this is still necessary. As long as * config_query_callback() has been run at least once, it doesn't look like * anything could have changed the timer period since then. */ value = pcmk__cluster_option(NULL, PCMK_OPT_SHUTDOWN_ESCALATION); pcmk_parse_interval_spec(value, &default_period_ms); controld_shutdown_start_countdown(default_period_ms); } diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c index 093f48eb45..7565b6c6c4 100644 --- a/daemons/controld/controld_fencing.c +++ b/daemons/controld/controld_fencing.c @@ -1,1124 +1,1124 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include static void tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event); /* * stonith failure counting * * We don't want to get stuck in a permanent fencing loop. Keep track of the * number of fencing failures for each target node, and the most we'll restart a * transition for. */ struct st_fail_rec { int count; }; #define DEFAULT_STONITH_MAX_ATTEMPTS 10 static bool fence_reaction_panic = false; static unsigned long int stonith_max_attempts = DEFAULT_STONITH_MAX_ATTEMPTS; static GHashTable *stonith_failures = NULL; /*! * \internal * \brief Update max fencing attempts before giving up * * \param[in] value New max fencing attempts */ static void update_stonith_max_attempts(const char *value) { int score = 0; int rc = pcmk_parse_score(value, &score, DEFAULT_STONITH_MAX_ATTEMPTS); // The option validator ensures invalid values shouldn't be possible CRM_CHECK((rc == pcmk_rc_ok) && (score > 0), return); if (stonith_max_attempts != score) { crm_debug("Maximum fencing attempts per transition is now %d (was %lu)", score, stonith_max_attempts); } stonith_max_attempts = score; } /*! * \internal * \brief Configure reaction to notification of local node being fenced * * \param[in] reaction_s Reaction type */ static void set_fence_reaction(const char *reaction_s) { if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) { fence_reaction_panic = true; } else { if (!pcmk__str_eq(reaction_s, PCMK_VALUE_STOP, pcmk__str_casei)) { crm_warn("Invalid value '%s' for %s, using 'stop'", reaction_s, PCMK_OPT_FENCE_REACTION); } fence_reaction_panic = false; } } /*! * \internal * \brief Configure fencing options based on the CIB * * \param[in,out] options Name/value pairs for configured options */ void controld_configure_fencing(GHashTable *options) { const char *value = NULL; value = g_hash_table_lookup(options, PCMK_OPT_FENCE_REACTION); set_fence_reaction(value); value = g_hash_table_lookup(options, PCMK_OPT_STONITH_MAX_ATTEMPTS); update_stonith_max_attempts(value); } static gboolean too_many_st_failures(const char *target) { GHashTableIter iter; const char *key = NULL; struct st_fail_rec *value = NULL; if (stonith_failures == NULL) { return FALSE; } if (target == NULL) { g_hash_table_iter_init(&iter, stonith_failures); while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) { if (value->count >= stonith_max_attempts) { target = (const char*)key; goto too_many; } } } else { value = g_hash_table_lookup(stonith_failures, target); if ((value != NULL) && (value->count >= stonith_max_attempts)) { goto too_many; } } return FALSE; too_many: crm_warn("Too many failures (%d) to fence %s, giving up", value->count, target); return TRUE; } /*! * \internal * \brief Reset a stonith fail count * * \param[in] target Name of node to reset, or NULL for all */ void st_fail_count_reset(const char *target) { if (stonith_failures == NULL) { return; } if (target) { struct st_fail_rec *rec = NULL; rec = g_hash_table_lookup(stonith_failures, target); if (rec) { rec->count = 0; } } else { GHashTableIter iter; const char *key = NULL; struct st_fail_rec *rec = NULL; g_hash_table_iter_init(&iter, stonith_failures); while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &rec)) { rec->count = 0; } } } static void st_fail_count_increment(const char *target) { struct st_fail_rec *rec = NULL; if (stonith_failures == NULL) { stonith_failures = pcmk__strkey_table(free, free); } rec = g_hash_table_lookup(stonith_failures, target); if (rec) { rec->count++; } else { rec = malloc(sizeof(struct st_fail_rec)); if(rec == NULL) { return; } rec->count = 1; g_hash_table_insert(stonith_failures, pcmk__str_copy(target), rec); } } /* end stonith fail count functions */ static void cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { if (rc < pcmk_ok) { crm_err("Fencing update %d for %s: failed - %s (%d)", call_id, (char *)user_data, pcmk_strerror(rc), rc); crm_log_xml_warn(msg, "Failed update"); abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_shutdown, "CIB update failed", NULL); } else { crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data); } } static void send_stonith_update(pcmk__graph_action_t *action, const char *target, const char *uuid) { int rc = pcmk_ok; pcmk__node_status_t *peer = NULL; /* We (usually) rely on the membership layer to do node_update_cluster, * and the peer status callback to do node_update_peer, because the node * might have already rejoined before we get the stonith result here. */ int flags = node_update_join | node_update_expected; /* zero out the node-status & remove all LRM status info */ xmlNode *node_state = NULL; CRM_CHECK(target != NULL, return); CRM_CHECK(uuid != NULL, return); /* Make sure the membership and join caches are accurate. * Try getting any existing node cache entry also by node uuid in case it * doesn't have an uname yet. */ peer = pcmk__get_node(0, target, uuid, pcmk__node_search_any); CRM_CHECK(peer != NULL, return); if (peer->state == NULL) { /* Usually, we rely on the membership layer to update the cluster state * in the CIB. However, if the node has never been seen, do it here, so * the node is not considered unclean. */ flags |= node_update_cluster; } if (peer->xml_id == NULL) { crm_info("Recording XML ID '%s' for node '%s'", uuid, target); peer->xml_id = pcmk__str_copy(uuid); } crmd_peer_down(peer, TRUE); /* Generate a node state update for the CIB */ node_state = create_node_state_update(peer, flags, NULL, __func__); /* we have to mark whether or not remote nodes have already been fenced */ if (pcmk_is_set(peer->flags, pcmk__node_status_remote)) { char *now_s = pcmk__ttoa(time(NULL)); crm_xml_add(node_state, PCMK__XA_NODE_FENCED, now_s); free(now_s); } /* Force our known ID */ crm_xml_add(node_state, PCMK_XA_ID, uuid); rc = controld_globals.cib_conn->cmds->modify(controld_globals.cib_conn, PCMK_XE_STATUS, node_state, cib_can_create); /* Delay processing the trigger until the update completes */ crm_debug("Sending fencing update %d for %s", rc, target); fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated); // Make sure it sticks /* controld_globals.cib_conn->cmds->bump_epoch(controld_globals.cib_conn, * cib_none); */ controld_delete_node_state(peer->name, controld_section_all, cib_none); pcmk__xml_free(node_state); return; } /*! * \internal * \brief Abort transition due to stonith failure * * \param[in] abort_action Whether to restart or stop transition * \param[in] target Don't restart if this (NULL for any) has too many failures * \param[in] reason Log this stonith action XML as abort reason (or NULL) */ static void abort_for_stonith_failure(enum pcmk__graph_next abort_action, const char *target, const xmlNode *reason) { /* If stonith repeatedly fails, we eventually give up on starting a new * transition for that reason. */ if ((abort_action != pcmk__graph_wait) && too_many_st_failures(target)) { abort_action = pcmk__graph_wait; } abort_transition(PCMK_SCORE_INFINITY, abort_action, "Stonith failed", reason); } /* * stonith cleanup list * * If the DC is shot, proper notifications might not go out. * The stonith cleanup list allows the cluster to (re-)send * notifications once a new DC is elected. */ static GList *stonith_cleanup_list = NULL; /*! * \internal * \brief Add a node to the stonith cleanup list * * \param[in] target Name of node to add */ void add_stonith_cleanup(const char *target) { stonith_cleanup_list = g_list_append(stonith_cleanup_list, pcmk__str_copy(target)); } /*! * \internal * \brief Remove a node from the stonith cleanup list * * \param[in] Name of node to remove */ void remove_stonith_cleanup(const char *target) { GList *iter = stonith_cleanup_list; while (iter != NULL) { GList *tmp = iter; char *iter_name = tmp->data; iter = iter->next; if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) { crm_trace("Removing %s from the cleanup list", iter_name); stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp); free(iter_name); } } } /*! * \internal * \brief Purge all entries from the stonith cleanup list */ void purge_stonith_cleanup(void) { if (stonith_cleanup_list) { GList *iter = NULL; for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { char *target = iter->data; crm_info("Purging %s from stonith cleanup list", target); free(target); } g_list_free(stonith_cleanup_list); stonith_cleanup_list = NULL; } } /*! * \internal * \brief Send stonith updates for all entries in cleanup list, then purge it */ void execute_stonith_cleanup(void) { GList *iter; for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { char *target = iter->data; pcmk__node_status_t *target_node = pcmk__get_node(0, target, NULL, pcmk__node_search_cluster_member); - const char *uuid = pcmk__cluster_node_uuid(target_node); + const char *uuid = pcmk__cluster_get_xml_id(target_node); crm_notice("Marking %s, target of a previous stonith action, as clean", target); send_stonith_update(NULL, target, uuid); free(target); } g_list_free(stonith_cleanup_list); stonith_cleanup_list = NULL; } /* end stonith cleanup list functions */ /* stonith API client * * Functions that need to interact directly with the fencer via its API */ static stonith_t *stonith_api = NULL; static mainloop_timer_t *controld_fencer_connect_timer = NULL; static char *te_client_id = NULL; static gboolean fail_incompletable_stonith(pcmk__graph_t *graph) { GList *lpc = NULL; const char *task = NULL; xmlNode *last_action = NULL; if (graph == NULL) { return FALSE; } for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { GList *lpc2 = NULL; pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) lpc->data; if (pcmk_is_set(synapse->flags, pcmk__synapse_confirmed)) { continue; } for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) { pcmk__graph_action_t *action = (pcmk__graph_action_t *) lpc2->data; if ((action->type != pcmk__cluster_graph_action) || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) { continue; } task = crm_element_value(action->xml, PCMK_XA_OPERATION); if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) { pcmk__set_graph_action_flags(action, pcmk__graph_action_failed); last_action = action->xml; pcmk__update_graph(graph, action); crm_notice("Failing action %d (%s): fencer terminated", action->id, pcmk__xe_id(action->xml)); } } } if (last_action != NULL) { crm_warn("Fencer failure resulted in unrunnable actions"); abort_for_stonith_failure(pcmk__graph_restart, NULL, last_action); return TRUE; } return FALSE; } static void tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) { te_cleanup_stonith_history_sync(st, FALSE); if (pcmk_is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) { crm_err("Lost fencer connection (will attempt to reconnect)"); if (!mainloop_timer_running(controld_fencer_connect_timer)) { mainloop_timer_start(controld_fencer_connect_timer); } } else { crm_info("Disconnected from fencer"); } if (stonith_api) { /* the client API won't properly reconnect notifications * if they are still in the table - so remove them */ if (stonith_api->state != stonith_disconnected) { stonith_api->cmds->disconnect(st); } stonith_api->cmds->remove_notification(stonith_api, NULL); } if (AM_I_DC) { fail_incompletable_stonith(controld_globals.transition_graph); trigger_graph(); } } /*! * \internal * \brief Handle an event notification from the fencing API * * \param[in] st Fencing API connection (ignored) * \param[in] event Fencing API event notification */ static void handle_fence_notification(stonith_t *st, stonith_event_t *event) { bool succeeded = true; const char *executioner = "the cluster"; const char *client = "a client"; const char *reason = NULL; int exec_status; if (te_client_id == NULL) { te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, (unsigned long) getpid()); } if (event == NULL) { crm_err("Notify data not found"); return; } if (event->executioner != NULL) { executioner = event->executioner; } if (event->client_origin != NULL) { client = event->client_origin; } exec_status = stonith__event_execution_status(event); if ((stonith__event_exit_status(event) != CRM_EX_OK) || (exec_status != PCMK_EXEC_DONE)) { succeeded = false; if (exec_status == PCMK_EXEC_DONE) { exec_status = PCMK_EXEC_ERROR; } } reason = stonith__event_exit_reason(event); crmd_alert_fencing_op(event); if (pcmk__str_eq(PCMK_ACTION_ON, event->action, pcmk__str_none)) { // Unfencing doesn't need special handling, just a log message if (succeeded) { crm_notice("%s was unfenced by %s at the request of %s@%s", event->target, executioner, client, event->origin); } else { crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d", event->target, executioner, pcmk_exec_status_str(exec_status), ((reason == NULL)? "" : ": "), ((reason == NULL)? "" : reason), stonith__event_exit_status(event)); } return; } if (succeeded && controld_is_local_node(event->target)) { /* We were notified of our own fencing. Most likely, either fencing was * misconfigured, or fabric fencing that doesn't cut cluster * communication is in use. * * Either way, shutting down the local host is a good idea, to require * administrator intervention. Also, other nodes would otherwise likely * set our status to lost because of the fencing callback and discard * our subsequent election votes as "not part of our cluster". */ crm_crit("We were allegedly just fenced by %s for %s!", executioner, event->origin); // Dumps blackbox if enabled if (fence_reaction_panic) { pcmk__panic("Notified of own fencing"); } else { crm_exit(CRM_EX_FATAL); } return; // Should never get here } /* Update the count of fencing failures for this target, in case we become * DC later. The current DC has already updated its fail count in * tengine_stonith_callback(). */ if (!AM_I_DC) { if (succeeded) { st_fail_count_reset(event->target); } else { st_fail_count_increment(event->target); } } crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: " "%s%s%s%s " QB_XS " event=%s", event->target, (succeeded? "" : " not"), event->action, executioner, client, event->origin, (succeeded? "OK" : pcmk_exec_status_str(exec_status)), ((reason == NULL)? "" : " ("), ((reason == NULL)? "" : reason), ((reason == NULL)? "" : ")"), event->id); if (succeeded) { const uint32_t flags = pcmk__node_search_any |pcmk__node_search_cluster_cib; pcmk__node_status_t *peer = pcmk__search_node_caches(0, event->target, NULL, flags); const char *uuid = NULL; if (peer == NULL) { return; } - uuid = pcmk__cluster_node_uuid(peer); + uuid = pcmk__cluster_get_xml_id(peer); if (AM_I_DC) { /* The DC always sends updates */ send_stonith_update(NULL, event->target, uuid); /* @TODO Ideally, at this point, we'd check whether the fenced node * hosted any guest nodes, and call remote_node_down() for them. * Unfortunately, the controller doesn't have a simple, reliable way * to map hosts to guests. It might be possible to track this in the * peer cache via refresh_remote_nodes(). For now, we rely on the * scheduler creating fence pseudo-events for the guests. */ if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) { /* Abort the current transition if it wasn't the cluster that * initiated fencing. */ crm_info("External fencing operation from %s fenced %s", client, event->target); abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "External Fencing Operation", NULL); } } else if (pcmk__str_eq(controld_globals.dc_name, event->target, pcmk__str_null_matches|pcmk__str_casei) && !pcmk_is_set(peer->flags, pcmk__node_status_remote)) { // Assume the target was our DC if we don't currently have one if (controld_globals.dc_name != NULL) { crm_notice("Fencing target %s was our DC", event->target); } else { crm_notice("Fencing target %s may have been our DC", event->target); } /* Given the CIB resyncing that occurs around elections, * have one node update the CIB now and, if the new DC is different, * have them do so too after the election */ if (controld_is_local_node(event->executioner)) { send_stonith_update(NULL, event->target, uuid); } add_stonith_cleanup(event->target); } /* If the target is a remote node, and we host its connection, * immediately fail all monitors so it can be recovered quickly. * The connection won't necessarily drop when a remote node is fenced, * so the failure might not otherwise be detected until the next poke. */ if (pcmk_is_set(peer->flags, pcmk__node_status_remote)) { remote_ra_fail(event->target); } crmd_peer_down(peer, TRUE); } } /*! * \brief Connect to fencer * * \param[in] user_data If NULL, retry failures now, otherwise retry in mainloop timer * * \return G_SOURCE_REMOVE on success, G_SOURCE_CONTINUE to retry * \note If user_data is NULL, this will wait 2s between attempts, for up to * 30 attempts, meaning the controller could be blocked as long as 58s. */ gboolean controld_timer_fencer_connect(gpointer user_data) { int rc = pcmk_ok; if (stonith_api == NULL) { stonith_api = stonith_api_new(); if (stonith_api == NULL) { crm_err("Could not connect to fencer: API memory allocation failed"); return G_SOURCE_REMOVE; } } if (stonith_api->state != stonith_disconnected) { crm_trace("Already connected to fencer, no need to retry"); return G_SOURCE_REMOVE; } if (user_data == NULL) { // Blocking (retry failures now until successful) rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30); if (rc != pcmk_ok) { crm_err("Could not connect to fencer in 30 attempts: %s " QB_XS " rc=%d", pcmk_strerror(rc), rc); } } else { // Non-blocking (retry failures later in main loop) rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); if (controld_fencer_connect_timer == NULL) { controld_fencer_connect_timer = mainloop_timer_add("controld_fencer_connect", 1000, TRUE, controld_timer_fencer_connect, GINT_TO_POINTER(TRUE)); } if (rc != pcmk_ok) { if (pcmk_is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) { crm_notice("Fencer connection failed (will retry): %s " QB_XS " rc=%d", pcmk_strerror(rc), rc); if (!mainloop_timer_running(controld_fencer_connect_timer)) { mainloop_timer_start(controld_fencer_connect_timer); } return G_SOURCE_CONTINUE; } else { crm_info("Fencer connection failed (ignoring because no longer required): %s " QB_XS " rc=%d", pcmk_strerror(rc), rc); } return G_SOURCE_REMOVE; } } if (rc == pcmk_ok) { stonith_api_operations_t *cmds = stonith_api->cmds; cmds->register_notification(stonith_api, PCMK__VALUE_ST_NOTIFY_DISCONNECT, tengine_stonith_connection_destroy); cmds->register_notification(stonith_api, PCMK__VALUE_ST_NOTIFY_FENCE, handle_fence_notification); cmds->register_notification(stonith_api, PCMK__VALUE_ST_NOTIFY_HISTORY_SYNCED, tengine_stonith_history_synced); te_trigger_stonith_history_sync(TRUE); crm_notice("Fencer successfully connected"); } return G_SOURCE_REMOVE; } void controld_disconnect_fencer(bool destroy) { if (stonith_api) { // Prevent fencer connection from coming up again controld_clear_fsa_input_flags(R_ST_REQUIRED); if (stonith_api->state != stonith_disconnected) { stonith_api->cmds->disconnect(stonith_api); } stonith_api->cmds->remove_notification(stonith_api, NULL); } if (destroy) { if (stonith_api) { stonith_api->cmds->free(stonith_api); stonith_api = NULL; } if (controld_fencer_connect_timer) { mainloop_timer_del(controld_fencer_connect_timer); controld_fencer_connect_timer = NULL; } if (te_client_id) { free(te_client_id); te_client_id = NULL; } } } static gboolean do_stonith_history_sync(gpointer user_data) { if (stonith_api && (stonith_api->state != stonith_disconnected)) { stonith_history_t *history = NULL; te_cleanup_stonith_history_sync(stonith_api, FALSE); stonith_api->cmds->history(stonith_api, st_opt_sync_call | st_opt_broadcast, NULL, &history, 5); stonith_history_free(history); return TRUE; } else { crm_info("Skip triggering stonith history-sync as stonith is disconnected"); return FALSE; } } static void tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) { char *uuid = NULL; int stonith_id = -1; int transition_id = -1; pcmk__graph_action_t *action = NULL; const char *target = NULL; if ((data == NULL) || (data->userdata == NULL)) { crm_err("Ignoring fence operation %d result: " "No transition key given (bug?)", ((data == NULL)? -1 : data->call_id)); return; } if (!AM_I_DC) { const char *reason = stonith__exit_reason(data); if (reason == NULL) { reason = pcmk_exec_status_str(stonith__execution_status(data)); } crm_notice("Result of fence operation %d: %d (%s) " QB_XS " key=%s", data->call_id, stonith__exit_status(data), reason, (const char *) data->userdata); return; } CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id, &stonith_id, NULL), goto bail); if (controld_globals.transition_graph->complete || (stonith_id < 0) || !pcmk__str_eq(uuid, controld_globals.te_uuid, pcmk__str_none) || (controld_globals.transition_graph->id != transition_id)) { crm_info("Ignoring fence operation %d result: " "Not from current transition " QB_XS " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)", data->call_id, pcmk__btoa(controld_globals.transition_graph->complete), stonith_id, uuid, controld_globals.te_uuid, transition_id, controld_globals.transition_graph->id); goto bail; } action = controld_get_action(stonith_id); if (action == NULL) { crm_err("Ignoring fence operation %d result: " "Action %d not found in transition graph (bug?) " QB_XS " uuid=%s transition=%d", data->call_id, stonith_id, uuid, transition_id); goto bail; } target = crm_element_value(action->xml, PCMK__META_ON_NODE); if (target == NULL) { crm_err("Ignoring fence operation %d result: No target given (bug?)", data->call_id); goto bail; } stop_te_timer(action); if (stonith__exit_status(data) == CRM_EX_OK) { const char *uuid = crm_element_value(action->xml, PCMK__META_ON_NODE_UUID); const char *op = crm_meta_value(action->params, PCMK__META_STONITH_ACTION); crm_info("Fence operation %d for %s succeeded", data->call_id, target); if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) { te_action_confirmed(action, NULL); if (pcmk__str_eq(PCMK_ACTION_ON, op, pcmk__str_casei)) { const char *value = NULL; char *now = pcmk__ttoa(time(NULL)); gboolean is_remote_node = FALSE; /* This check is not 100% reliable, since this node is not * guaranteed to have the remote node cached. However, it * doesn't have to be reliable, since the attribute manager can * learn a node's "remoteness" by other means sooner or later. * This allows it to learn more quickly if this node does have * the information. */ if (g_hash_table_lookup(pcmk__remote_peer_cache, uuid) != NULL) { is_remote_node = TRUE; } update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, is_remote_node); free(now); value = crm_meta_value(action->params, PCMK__META_DIGESTS_ALL); update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, is_remote_node); value = crm_meta_value(action->params, PCMK__META_DIGESTS_SECURE); update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, is_remote_node); } else if (!(pcmk_is_set(action->flags, pcmk__graph_action_sent_update))) { send_stonith_update(action, target, uuid); pcmk__set_graph_action_flags(action, pcmk__graph_action_sent_update); } } st_fail_count_reset(target); } else { enum pcmk__graph_next abort_action = pcmk__graph_restart; int status = stonith__execution_status(data); const char *reason = stonith__exit_reason(data); if (reason == NULL) { if (status == PCMK_EXEC_DONE) { reason = "Agent returned error"; } else { reason = pcmk_exec_status_str(status); } } pcmk__set_graph_action_flags(action, pcmk__graph_action_failed); /* If no fence devices were available, there's no use in immediately * checking again, so don't start a new transition in that case. */ if (status == PCMK_EXEC_NO_FENCE_DEVICE) { crm_warn("Fence operation %d for %s failed: %s " "(aborting transition and giving up for now)", data->call_id, target, reason); abort_action = pcmk__graph_wait; } else { crm_notice("Fence operation %d for %s failed: %s " "(aborting transition)", data->call_id, target, reason); } /* Increment the fail count now, so abort_for_stonith_failure() can * check it. Non-DC nodes will increment it in * handle_fence_notification(). */ st_fail_count_increment(target); abort_for_stonith_failure(abort_action, target, NULL); } pcmk__update_graph(controld_globals.transition_graph, action); trigger_graph(); bail: free(data->userdata); free(uuid); return; } static int fence_with_delay(const char *target, const char *type, int delay) { uint32_t options = st_opt_none; // Group of enum stonith_call_options int timeout_sec = pcmk__timeout_ms2s(controld_globals.transition_graph->stonith_timeout); if (crmd_join_phase_count(controld_join_confirmed) == 1) { stonith__set_call_options(options, target, st_opt_allow_self_fencing); } return stonith_api->cmds->fence_with_delay(stonith_api, options, target, type, timeout_sec, 0, delay); } /*! * \internal * \brief Execute a fencing action from a transition graph * * \param[in] graph Transition graph being executed (ignored) * \param[in] action Fencing action to execute * * \return Standard Pacemaker return code */ int controld_execute_fence_action(pcmk__graph_t *graph, pcmk__graph_action_t *action) { int rc = 0; const char *id = pcmk__xe_id(action->xml); const char *uuid = crm_element_value(action->xml, PCMK__META_ON_NODE_UUID); const char *target = crm_element_value(action->xml, PCMK__META_ON_NODE); const char *type = crm_meta_value(action->params, PCMK__META_STONITH_ACTION); char *transition_key = NULL; const char *priority_delay = NULL; int delay_i = 0; gboolean invalid_action = FALSE; int stonith_timeout = pcmk__timeout_ms2s(controld_globals.transition_graph->stonith_timeout); CRM_CHECK(id != NULL, invalid_action = TRUE); CRM_CHECK(uuid != NULL, invalid_action = TRUE); CRM_CHECK(type != NULL, invalid_action = TRUE); CRM_CHECK(target != NULL, invalid_action = TRUE); if (invalid_action) { crm_log_xml_warn(action->xml, "BadAction"); return EPROTO; } priority_delay = crm_meta_value(action->params, PCMK_OPT_PRIORITY_FENCING_DELAY); crm_notice("Requesting fencing (%s) targeting node %s " QB_XS " action=%s timeout=%i%s%s", type, target, id, stonith_timeout, priority_delay ? " priority_delay=" : "", priority_delay ? priority_delay : ""); /* Passing NULL means block until we can connect... */ controld_timer_fencer_connect(NULL); pcmk__scan_min_int(priority_delay, &delay_i, 0); rc = fence_with_delay(target, type, delay_i); transition_key = pcmk__transition_key(controld_globals.transition_graph->id, action->id, 0, controld_globals.te_uuid), stonith_api->cmds->register_callback(stonith_api, rc, (stonith_timeout + (delay_i > 0 ? delay_i : 0)), st_opt_timeout_updates, transition_key, "tengine_stonith_callback", tengine_stonith_callback); return pcmk_rc_ok; } bool controld_verify_stonith_watchdog_timeout(const char *value) { long long st_timeout = (value != NULL)? crm_get_msec(value) : 0; const char *our_nodename = controld_globals.cluster->priv->node_name; if (st_timeout == 0 || (stonith_api && (stonith_api->state != stonith_disconnected) && stonith__watchdog_fencing_enabled_for_node_api(stonith_api, our_nodename))) { return pcmk__valid_stonith_watchdog_timeout(value); } return true; } /* end stonith API client functions */ /* * stonith history synchronization * * Each node's fencer keeps track of a cluster-wide fencing history. When a node * joins or leaves, we need to synchronize the history across all nodes. */ static crm_trigger_t *stonith_history_sync_trigger = NULL; static mainloop_timer_t *stonith_history_sync_timer_short = NULL; static mainloop_timer_t *stonith_history_sync_timer_long = NULL; void te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers) { if (free_timers) { mainloop_timer_del(stonith_history_sync_timer_short); stonith_history_sync_timer_short = NULL; mainloop_timer_del(stonith_history_sync_timer_long); stonith_history_sync_timer_long = NULL; } else { mainloop_timer_stop(stonith_history_sync_timer_short); mainloop_timer_stop(stonith_history_sync_timer_long); } if (st) { st->cmds->remove_notification(st, PCMK__VALUE_ST_NOTIFY_HISTORY_SYNCED); } } static void tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event) { te_cleanup_stonith_history_sync(st, FALSE); crm_debug("Fence-history synced - cancel all timers"); } static gboolean stonith_history_sync_set_trigger(gpointer user_data) { mainloop_set_trigger(stonith_history_sync_trigger); return FALSE; } void te_trigger_stonith_history_sync(bool long_timeout) { /* trigger a sync in 5s to give more nodes the * chance to show up so that we don't create * unnecessary stonith-history-sync traffic * * the long timeout of 30s is there as a fallback * so that after a successful connection to fenced * we will wait for 30s for the DC to trigger a * history-sync * if this doesn't happen we trigger a sync locally * (e.g. fenced segfaults and is restarted by pacemakerd) */ /* as we are finally checking the stonith-connection * in do_stonith_history_sync we should be fine * leaving stonith_history_sync_time & stonith_history_sync_trigger * around */ if (stonith_history_sync_trigger == NULL) { stonith_history_sync_trigger = mainloop_add_trigger(G_PRIORITY_LOW, do_stonith_history_sync, NULL); } if (long_timeout) { if(stonith_history_sync_timer_long == NULL) { stonith_history_sync_timer_long = mainloop_timer_add("history_sync_long", 30000, FALSE, stonith_history_sync_set_trigger, NULL); } crm_info("Fence history will be synchronized cluster-wide within 30 seconds"); mainloop_timer_start(stonith_history_sync_timer_long); } else { if(stonith_history_sync_timer_short == NULL) { stonith_history_sync_timer_short = mainloop_timer_add("history_sync_short", 5000, FALSE, stonith_history_sync_set_trigger, NULL); } crm_info("Fence history will be synchronized cluster-wide within 5 seconds"); mainloop_timer_start(stonith_history_sync_timer_short); } } /* end stonith history synchronization functions */ diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c index 7ada26949d..9960966c6b 100644 --- a/daemons/controld/controld_join_dc.c +++ b/daemons/controld/controld_join_dc.c @@ -1,1095 +1,1095 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include // PRIu32 #include // bool, true, false #include // NULL #include // free(), etc. #include // gboolean, etc. #include // xmlNode #include #include #include #include static char *max_generation_from = NULL; static xmlNodePtr max_generation_xml = NULL; /*! * \internal * \brief Nodes from which a CIB sync has failed since the peer joined * * This table is of the form (node_name -> join_id). \p node_name is * the name of a client node from which a CIB \p sync_from() call has failed in * \p do_dc_join_finalize() since the client joined the cluster as a peer. * \p join_id is the ID of the join round in which the \p sync_from() failed, * and is intended for use in nack log messages. */ static GHashTable *failed_sync_nodes = NULL; void finalize_join_for(gpointer key, gpointer value, gpointer user_data); void finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data); gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); /* Numeric counter used to identify join rounds (an unsigned int would be * appropriate, except we get and set it in XML as int) */ static int current_join_id = 0; /*! * \internal * \brief Get log-friendly string equivalent of a controller group join phase * * \param[in] phase Join phase * * \return Log-friendly string equivalent of \p phase */ static const char * join_phase_text(enum controld_join_phase phase) { switch (phase) { case controld_join_nack: return "nack"; case controld_join_none: return "none"; case controld_join_welcomed: return "welcomed"; case controld_join_integrated: return "integrated"; case controld_join_finalized: return "finalized"; case controld_join_confirmed: return "confirmed"; default: return "invalid"; } } /*! * \internal * \brief Destroy the hash table containing failed sync nodes */ void controld_destroy_failed_sync_table(void) { if (failed_sync_nodes != NULL) { g_hash_table_destroy(failed_sync_nodes); failed_sync_nodes = NULL; } } /*! * \internal * \brief Remove a node from the failed sync nodes table if present * * \param[in] node_name Node name to remove */ void controld_remove_failed_sync_node(const char *node_name) { if (failed_sync_nodes != NULL) { g_hash_table_remove(failed_sync_nodes, (gchar *) node_name); } } /*! * \internal * \brief Add to a hash table a node whose CIB failed to sync * * \param[in] node_name Name of node whose CIB failed to sync * \param[in] join_id Join round when the failure occurred */ static void record_failed_sync_node(const char *node_name, gint join_id) { if (failed_sync_nodes == NULL) { failed_sync_nodes = pcmk__strikey_table(g_free, NULL); } /* If the node is already in the table then we failed to nack it during the * filter offer step */ CRM_LOG_ASSERT(g_hash_table_insert(failed_sync_nodes, g_strdup(node_name), GINT_TO_POINTER(join_id))); } /*! * \internal * \brief Look up a node name in the failed sync table * * \param[in] node_name Name of node to look up * \param[out] join_id Where to store the join ID of when the sync failed * * \return Standard Pacemaker return code. Specifically, \p pcmk_rc_ok if the * node name was found, or \p pcmk_rc_node_unknown otherwise. * \note \p *join_id is set to -1 if the node is not found. */ static int lookup_failed_sync_node(const char *node_name, gint *join_id) { *join_id = -1; if (failed_sync_nodes != NULL) { gpointer result = g_hash_table_lookup(failed_sync_nodes, (gchar *) node_name); if (result != NULL) { *join_id = GPOINTER_TO_INT(result); return pcmk_rc_ok; } } return pcmk_rc_node_unknown; } void crm_update_peer_join(const char *source, pcmk__node_status_t *node, enum controld_join_phase phase) { enum controld_join_phase last = controld_get_join_phase(node); CRM_CHECK(node != NULL, return); /* Remote nodes do not participate in joins */ if (pcmk_is_set(node->flags, pcmk__node_status_remote)) { return; } if (phase == last) { crm_trace("Node %s join-%d phase is still %s " QB_XS " nodeid=%" PRIu32 " source=%s", node->name, current_join_id, join_phase_text(last), node->cluster_layer_id, source); return; } if ((phase <= controld_join_none) || (phase == (last + 1))) { struct controld_node_status_data *data = NULL; if (node->user_data == NULL) { node->user_data = pcmk__assert_alloc(1, sizeof(struct controld_node_status_data)); } data = node->user_data; data->join_phase = phase; crm_trace("Node %s join-%d phase is now %s (was %s) " QB_XS " nodeid=%" PRIu32 " source=%s", node->name, current_join_id, join_phase_text(phase), join_phase_text(last), node->cluster_layer_id, source); return; } crm_warn("Rejecting join-%d phase update for node %s because can't go from " "%s to %s " QB_XS " nodeid=%" PRIu32 " source=%s", current_join_id, node->name, join_phase_text(last), join_phase_text(phase), node->cluster_layer_id, source); } static void start_join_round(void) { GHashTableIter iter; pcmk__node_status_t *peer = NULL; crm_debug("Starting new join round join-%d", current_join_id); g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &peer)) { crm_update_peer_join(__func__, peer, controld_join_none); } if (max_generation_from != NULL) { free(max_generation_from); max_generation_from = NULL; } if (max_generation_xml != NULL) { pcmk__xml_free(max_generation_xml); max_generation_xml = NULL; } controld_clear_fsa_input_flags(R_HAVE_CIB); } /*! * \internal * \brief Create a join message from the DC * * \param[in] join_op Join operation name * \param[in] host_to Recipient of message */ static xmlNode * create_dc_message(const char *join_op, const char *host_to) { xmlNode *msg = pcmk__new_request(pcmk_ipc_controld, CRM_SYSTEM_DC, host_to, CRM_SYSTEM_CRMD, join_op, NULL); /* Identify which election this is a part of */ crm_xml_add_int(msg, PCMK__XA_JOIN_ID, current_join_id); /* Add a field specifying whether the DC is shutting down. This keeps the * joining node from fencing the old DC if it becomes the new DC. */ pcmk__xe_set_bool_attr(msg, PCMK__XA_DC_LEAVING, pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)); return msg; } static void join_make_offer(gpointer key, gpointer value, gpointer user_data) { /* @TODO We don't use user_data except to distinguish one particular call * from others. Make this clearer. */ xmlNode *offer = NULL; pcmk__node_status_t *member = (pcmk__node_status_t *) value; pcmk__assert(member != NULL); if (!pcmk__cluster_is_node_active(member)) { crm_info("Not making join-%d offer to inactive node %s", current_join_id, pcmk__s(member->name, "with unknown name")); if ((member->expected == NULL) && pcmk__str_eq(member->state, PCMK__VALUE_LOST, pcmk__str_none)) { /* You would think this unsafe, but in fact this plus an * active resource is what causes it to be fenced. * * Yes, this does mean that any node that dies at the same * time as the old DC and is not running resource (still) * won't be fenced. * * I'm not happy about this either. */ pcmk__update_peer_expected(__func__, member, CRMD_JOINSTATE_DOWN); } return; } if (member->name == NULL) { crm_info("Not making join-%d offer to node uuid %s with unknown name", current_join_id, member->xml_id); return; } if (controld_globals.membership_id != controld_globals.peer_seq) { controld_globals.membership_id = controld_globals.peer_seq; crm_info("Making join-%d offers based on membership event %llu", current_join_id, controld_globals.peer_seq); } if (user_data != NULL) { enum controld_join_phase phase = controld_get_join_phase(member); if (phase > controld_join_none) { crm_info("Not making join-%d offer to already known node %s (%s)", current_join_id, member->name, join_phase_text(phase)); return; } } crm_update_peer_join(__func__, (pcmk__node_status_t*) member, controld_join_none); offer = create_dc_message(CRM_OP_JOIN_OFFER, member->name); // Advertise our feature set so the joining node can bail if not compatible crm_xml_add(offer, PCMK_XA_CRM_FEATURE_SET, CRM_FEATURE_SET); crm_info("Sending join-%d offer to %s", current_join_id, member->name); pcmk__cluster_send_message(member, pcmk_ipc_controld, offer); pcmk__xml_free(offer); crm_update_peer_join(__func__, member, controld_join_welcomed); } /* A_DC_JOIN_OFFER_ALL */ void do_dc_join_offer_all(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int count; /* Reset everyone's status back to down or in_ccm in the CIB. * Any nodes that are active in the CIB but not in the cluster membership * will be seen as offline by the scheduler anyway. */ current_join_id++; start_join_round(); update_dc(NULL); if (cause == C_HA_MESSAGE && current_input == I_NODE_JOIN) { crm_info("A new node joined the cluster"); } g_hash_table_foreach(pcmk__peer_cache, join_make_offer, NULL); count = crmd_join_phase_count(controld_join_welcomed); crm_info("Waiting on join-%d requests from %d outstanding node%s", current_join_id, count, pcmk__plural_s(count)); // Don't waste time by invoking the scheduler yet } /* A_DC_JOIN_OFFER_ONE */ void do_dc_join_offer_one(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { pcmk__node_status_t *member = NULL; ha_msg_input_t *welcome = NULL; int count; const char *join_to = NULL; if (msg_data->data == NULL) { crm_info("Making join-%d offers to any unconfirmed nodes " "because an unknown node joined", current_join_id); g_hash_table_foreach(pcmk__peer_cache, join_make_offer, &member); check_join_state(cur_state, __func__); return; } welcome = fsa_typed_data(fsa_dt_ha_msg); if (welcome == NULL) { // fsa_typed_data() already logged an error return; } join_to = crm_element_value(welcome->msg, PCMK__XA_SRC); if (join_to == NULL) { crm_err("Can't make join-%d offer to unknown node", current_join_id); return; } member = pcmk__get_node(0, join_to, NULL, pcmk__node_search_cluster_member); /* It is possible that a node will have been sick or starting up when the * original offer was made. However, it will either re-announce itself in * due course, or we can re-store the original offer on the client. */ crm_update_peer_join(__func__, member, controld_join_none); join_make_offer(NULL, member, NULL); /* If the offer isn't to the local node, make an offer to the local node as * well, to ensure the correct value for max_generation_from. */ if (!controld_is_local_node(join_to)) { member = controld_get_local_node_status(); join_make_offer(NULL, member, NULL); } /* This was a genuine join request; cancel any existing transition and * invoke the scheduler. */ abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Node join", NULL); count = crmd_join_phase_count(controld_join_welcomed); crm_info("Waiting on join-%d requests from %d outstanding node%s", current_join_id, count, pcmk__plural_s(count)); // Don't waste time by invoking the scheduler yet } static int compare_int_fields(xmlNode * left, xmlNode * right, const char *field) { const char *elem_l = crm_element_value(left, field); const char *elem_r = crm_element_value(right, field); long long int_elem_l; long long int_elem_r; int rc = pcmk_rc_ok; rc = pcmk__scan_ll(elem_l, &int_elem_l, -1LL); if (rc != pcmk_rc_ok) { // Shouldn't be possible crm_warn("Comparing current CIB %s as -1 " "because '%s' is not an integer", field, elem_l); } rc = pcmk__scan_ll(elem_r, &int_elem_r, -1LL); if (rc != pcmk_rc_ok) { // Shouldn't be possible crm_warn("Comparing joining node's CIB %s as -1 " "because '%s' is not an integer", field, elem_r); } if (int_elem_l < int_elem_r) { return -1; } else if (int_elem_l > int_elem_r) { return 1; } return 0; } /* A_DC_JOIN_PROCESS_REQ */ void do_dc_join_filter_offer(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { xmlNode *generation = NULL; int cmp = 0; int join_id = -1; int count = 0; gint value = 0; gboolean ack_nack_bool = TRUE; ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg); const char *join_from = crm_element_value(join_ack->msg, PCMK__XA_SRC); const char *ref = crm_element_value(join_ack->msg, PCMK_XA_REFERENCE); const char *join_version = crm_element_value(join_ack->msg, PCMK_XA_CRM_FEATURE_SET); pcmk__node_status_t *join_node = NULL; if (join_from == NULL) { crm_err("Ignoring invalid join request without node name"); return; } join_node = pcmk__get_node(0, join_from, NULL, pcmk__node_search_cluster_member); crm_element_value_int(join_ack->msg, PCMK__XA_JOIN_ID, &join_id); if (join_id != current_join_id) { crm_debug("Ignoring join-%d request from %s because we are on join-%d", join_id, join_from, current_join_id); check_join_state(cur_state, __func__); return; } generation = join_ack->xml; if (max_generation_xml != NULL && generation != NULL) { int lpc = 0; const char *attributes[] = { PCMK_XA_ADMIN_EPOCH, PCMK_XA_EPOCH, PCMK_XA_NUM_UPDATES, }; /* It's not obvious that join_ack->xml is the PCMK__XE_GENERATION_TUPLE * element from the join client. The "if" guard is for clarity. */ if (pcmk__xe_is(generation, PCMK__XE_GENERATION_TUPLE)) { for (lpc = 0; cmp == 0 && lpc < PCMK__NELEM(attributes); lpc++) { cmp = compare_int_fields(max_generation_xml, generation, attributes[lpc]); } } else { // Should always be PCMK__XE_GENERATION_TUPLE CRM_LOG_ASSERT(false); } } if (ref == NULL) { ref = "none"; // for logging only } if (lookup_failed_sync_node(join_from, &value) == pcmk_rc_ok) { crm_err("Rejecting join-%d request from node %s because we failed to " "sync its CIB in join-%d " QB_XS " ref=%s", join_id, join_from, value, ref); ack_nack_bool = FALSE; } else if (!pcmk__cluster_is_node_active(join_node)) { if (match_down_event(join_from) != NULL) { /* The join request was received after the node was fenced or * otherwise shutdown in a way that we're aware of. No need to log * an error in this rare occurrence; we know the client was recently * shut down, and receiving a lingering in-flight request is not * cause for alarm. */ crm_debug("Rejecting join-%d request from inactive node %s " QB_XS " ref=%s", join_id, join_from, ref); } else { crm_err("Rejecting join-%d request from inactive node %s " QB_XS " ref=%s", join_id, join_from, ref); } ack_nack_bool = FALSE; } else if (generation == NULL) { crm_err("Rejecting invalid join-%d request from node %s " "missing CIB generation " QB_XS " ref=%s", join_id, join_from, ref); ack_nack_bool = FALSE; } else if ((join_version == NULL) || !feature_set_compatible(CRM_FEATURE_SET, join_version)) { crm_err("Rejecting join-%d request from node %s because feature set %s" " is incompatible with ours (%s) " QB_XS " ref=%s", join_id, join_from, (join_version? join_version : "pre-3.1.0"), CRM_FEATURE_SET, ref); ack_nack_bool = FALSE; } else if (max_generation_xml == NULL) { const char *validation = crm_element_value(generation, PCMK_XA_VALIDATE_WITH); if (pcmk__get_schema(validation) == NULL) { crm_err("Rejecting join-%d request from %s (with first CIB " "generation) due to %s schema version %s " QB_XS " ref=%s", join_id, join_from, ((validation == NULL)? "missing" : "unknown"), pcmk__s(validation, ""), ref); ack_nack_bool = FALSE; } else { crm_debug("Accepting join-%d request from %s (with first CIB " "generation) " QB_XS " ref=%s", join_id, join_from, ref); max_generation_xml = pcmk__xml_copy(NULL, generation); pcmk__str_update(&max_generation_from, join_from); } } else if ((cmp < 0) || ((cmp == 0) && controld_is_local_node(join_from))) { const char *validation = crm_element_value(generation, PCMK_XA_VALIDATE_WITH); if (pcmk__get_schema(validation) == NULL) { crm_err("Rejecting join-%d request from %s (with better CIB " "generation than current best from %s) due to %s " "schema version %s " QB_XS " ref=%s", join_id, join_from, max_generation_from, ((validation == NULL)? "missing" : "unknown"), pcmk__s(validation, ""), ref); ack_nack_bool = FALSE; } else { crm_debug("Accepting join-%d request from %s (with better CIB " "generation than current best from %s) " QB_XS " ref=%s", join_id, join_from, max_generation_from, ref); crm_log_xml_debug(max_generation_xml, "Old max generation"); crm_log_xml_debug(generation, "New max generation"); pcmk__xml_free(max_generation_xml); max_generation_xml = pcmk__xml_copy(NULL, join_ack->xml); pcmk__str_update(&max_generation_from, join_from); } } else { crm_debug("Accepting join-%d request from %s " QB_XS " ref=%s", join_id, join_from, ref); } if (!ack_nack_bool) { crm_update_peer_join(__func__, join_node, controld_join_nack); pcmk__update_peer_expected(__func__, join_node, CRMD_JOINSTATE_NACK); } else { crm_update_peer_join(__func__, join_node, controld_join_integrated); pcmk__update_peer_expected(__func__, join_node, CRMD_JOINSTATE_MEMBER); } count = crmd_join_phase_count(controld_join_integrated); crm_debug("%d node%s currently integrated in join-%d", count, pcmk__plural_s(count), join_id); if (check_join_state(cur_state, __func__) == FALSE) { // Don't waste time by invoking the scheduler yet count = crmd_join_phase_count(controld_join_welcomed); crm_debug("Waiting on join-%d requests from %d outstanding node%s", join_id, count, pcmk__plural_s(count)); } } /* A_DC_JOIN_FINALIZE */ void do_dc_join_finalize(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { char *sync_from = NULL; int rc = pcmk_ok; int count_welcomed = crmd_join_phase_count(controld_join_welcomed); int count_finalizable = crmd_join_phase_count(controld_join_integrated) + crmd_join_phase_count(controld_join_nack); /* This we can do straight away and avoid clients timing us out * while we compute the latest CIB */ if (count_welcomed != 0) { crm_debug("Waiting on join-%d requests from %d outstanding node%s " "before finalizing join", current_join_id, count_welcomed, pcmk__plural_s(count_welcomed)); crmd_join_phase_log(LOG_DEBUG); /* crmd_fsa_stall(FALSE); Needed? */ return; } else if (count_finalizable == 0) { crm_debug("Finalization not needed for join-%d at the current time", current_join_id); crmd_join_phase_log(LOG_DEBUG); check_join_state(controld_globals.fsa_state, __func__); return; } controld_clear_fsa_input_flags(R_HAVE_CIB); if ((max_generation_from == NULL) || controld_is_local_node(max_generation_from)) { controld_set_fsa_input_flags(R_HAVE_CIB); } if (!controld_globals.transition_graph->complete) { crm_warn("Delaying join-%d finalization while transition in progress", current_join_id); crmd_join_phase_log(LOG_DEBUG); crmd_fsa_stall(FALSE); return; } if (pcmk_is_set(controld_globals.fsa_input_register, R_HAVE_CIB)) { // Send our CIB out to everyone sync_from = pcmk__str_copy(controld_globals.cluster->priv->node_name); crm_debug("Finalizing join-%d for %d node%s (sync'ing from local CIB)", current_join_id, count_finalizable, pcmk__plural_s(count_finalizable)); crm_log_xml_debug(max_generation_xml, "Requested CIB version"); } else { // Ask for the agreed best CIB sync_from = pcmk__str_copy(max_generation_from); crm_notice("Finalizing join-%d for %d node%s (sync'ing CIB from %s)", current_join_id, count_finalizable, pcmk__plural_s(count_finalizable), sync_from); crm_log_xml_notice(max_generation_xml, "Requested CIB version"); } crmd_join_phase_log(LOG_DEBUG); rc = controld_globals.cib_conn->cmds->sync_from(controld_globals.cib_conn, sync_from, NULL, cib_none); fsa_register_cib_callback(rc, sync_from, finalize_sync_callback); } void free_max_generation(void) { free(max_generation_from); max_generation_from = NULL; pcmk__xml_free(max_generation_xml); max_generation_xml = NULL; } void finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { CRM_LOG_ASSERT(-EPERM != rc); if (rc != pcmk_ok) { const char *sync_from = (const char *) user_data; do_crm_log(((rc == -pcmk_err_old_data)? LOG_WARNING : LOG_ERR), "Could not sync CIB from %s in join-%d: %s", sync_from, current_join_id, pcmk_strerror(rc)); if (rc != -pcmk_err_old_data) { record_failed_sync_node(sync_from, current_join_id); } /* restart the whole join process */ register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION_DC, NULL, NULL, __func__); } else if (!AM_I_DC) { crm_debug("Sync'ed CIB for join-%d but no longer DC", current_join_id); } else if (controld_globals.fsa_state != S_FINALIZE_JOIN) { crm_debug("Sync'ed CIB for join-%d but no longer in S_FINALIZE_JOIN " "(%s)", current_join_id, fsa_state2string(controld_globals.fsa_state)); } else { controld_set_fsa_input_flags(R_HAVE_CIB); /* make sure dc_uuid is re-set to us */ if (!check_join_state(controld_globals.fsa_state, __func__)) { int count_finalizable = 0; count_finalizable = crmd_join_phase_count(controld_join_integrated) + crmd_join_phase_count(controld_join_nack); crm_debug("Notifying %d node%s of join-%d results", count_finalizable, pcmk__plural_s(count_finalizable), current_join_id); g_hash_table_foreach(pcmk__peer_cache, finalize_join_for, NULL); } } } static void join_node_state_commit_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { const char *node = user_data; if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; // for register_fsa_error() macro crm_crit("join-%d node history update (via CIB call %d) for node %s " "failed: %s", current_join_id, call_id, node, pcmk_strerror(rc)); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } crm_debug("join-%d node history update (via CIB call %d) for node %s " "complete", current_join_id, call_id, node); check_join_state(controld_globals.fsa_state, __func__); } /* A_DC_JOIN_PROCESS_ACK */ void do_dc_join_ack(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int join_id = -1; ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg); const char *op = crm_element_value(join_ack->msg, PCMK__XA_CRM_TASK); char *join_from = crm_element_value_copy(join_ack->msg, PCMK__XA_SRC); pcmk__node_status_t *peer = NULL; enum controld_join_phase phase = controld_join_none; enum controld_section_e section = controld_section_lrm; char *xpath = NULL; xmlNode *state = join_ack->xml; xmlNode *execd_state = NULL; cib_t *cib = controld_globals.cib_conn; int rc = pcmk_ok; // Sanity checks if (join_from == NULL) { crm_warn("Ignoring message received without node identification"); goto done; } if (op == NULL) { crm_warn("Ignoring message received from %s without task", join_from); goto done; } if (strcmp(op, CRM_OP_JOIN_CONFIRM)) { crm_debug("Ignoring '%s' message from %s while waiting for '%s'", op, join_from, CRM_OP_JOIN_CONFIRM); goto done; } if (crm_element_value_int(join_ack->msg, PCMK__XA_JOIN_ID, &join_id) != 0) { crm_warn("Ignoring join confirmation from %s without valid join ID", join_from); goto done; } peer = pcmk__get_node(0, join_from, NULL, pcmk__node_search_cluster_member); phase = controld_get_join_phase(peer); if (phase != controld_join_finalized) { crm_info("Ignoring out-of-sequence join-%d confirmation from %s " "(currently %s not %s)", join_id, join_from, join_phase_text(phase), join_phase_text(controld_join_finalized)); goto done; } if (join_id != current_join_id) { crm_err("Rejecting join-%d confirmation from %s " "because currently on join-%d", join_id, join_from, current_join_id); crm_update_peer_join(__func__, peer, controld_join_nack); goto done; } crm_update_peer_join(__func__, peer, controld_join_confirmed); /* Update CIB with node's current executor state. A new transition will be * triggered later, when the CIB manager notifies us of the change. * * The delete and modify requests are part of an atomic transaction. */ rc = cib->cmds->init_transaction(cib); if (rc != pcmk_ok) { goto done; } // Delete relevant parts of node's current executor state from CIB if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { section = controld_section_lrm_unlocked; } controld_node_state_deletion_strings(join_from, section, &xpath, NULL); rc = cib->cmds->remove(cib, xpath, NULL, cib_xpath|cib_multiple|cib_transaction); if (rc != pcmk_ok) { goto done; } // Update CIB with node's latest known executor state if (controld_is_local_node(join_from)) { // Use the latest possible state if processing our own join ack execd_state = controld_query_executor_state(); if (execd_state != NULL) { crm_debug("Updating local node history for join-%d from query " "result", current_join_id); state = execd_state; } else { crm_warn("Updating local node history from join-%d confirmation " "because query failed", current_join_id); } } else { crm_debug("Updating node history for %s from join-%d confirmation", join_from, current_join_id); } rc = cib->cmds->modify(cib, PCMK_XE_STATUS, state, cib_can_create|cib_transaction); pcmk__xml_free(execd_state); if (rc != pcmk_ok) { goto done; } // Commit the transaction rc = cib->cmds->end_transaction(cib, true, cib_none); fsa_register_cib_callback(rc, join_from, join_node_state_commit_callback); if (rc > 0) { // join_from will be freed after callback join_from = NULL; rc = pcmk_ok; } done: if (rc != pcmk_ok) { crm_crit("join-%d node history update for node %s failed: %s", current_join_id, join_from, pcmk_strerror(rc)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } free(join_from); free(xpath); } void finalize_join_for(gpointer key, gpointer value, gpointer user_data) { xmlNode *acknak = NULL; xmlNode *tmp1 = NULL; pcmk__node_status_t *join_node = value; const char *join_to = join_node->name; enum controld_join_phase phase = controld_get_join_phase(join_node); bool integrated = false; switch (phase) { case controld_join_integrated: integrated = true; break; case controld_join_nack: break; default: crm_trace("Not updating non-integrated and non-nacked node %s (%s) " "for join-%d", join_to, join_phase_text(phase), current_join_id); return; } /* Update the element with the node's name and UUID, in case they * weren't known before */ crm_trace("Updating node name and UUID in CIB for %s", join_to); tmp1 = pcmk__xe_create(NULL, PCMK_XE_NODE); - crm_xml_add(tmp1, PCMK_XA_ID, pcmk__cluster_node_uuid(join_node)); + crm_xml_add(tmp1, PCMK_XA_ID, pcmk__cluster_get_xml_id(join_node)); crm_xml_add(tmp1, PCMK_XA_UNAME, join_to); fsa_cib_anon_update(PCMK_XE_NODES, tmp1); pcmk__xml_free(tmp1); join_node = pcmk__get_node(0, join_to, NULL, pcmk__node_search_cluster_member); if (!pcmk__cluster_is_node_active(join_node)) { /* * NACK'ing nodes that the membership layer doesn't know about yet * simply creates more churn * * Better to leave them waiting and let the join restart when * the new membership event comes in * * All other NACKs (due to versions etc) should still be processed */ pcmk__update_peer_expected(__func__, join_node, CRMD_JOINSTATE_PENDING); return; } // Acknowledge or nack node's join request crm_debug("%sing join-%d request from %s", integrated? "Acknowledg" : "Nack", current_join_id, join_to); acknak = create_dc_message(CRM_OP_JOIN_ACKNAK, join_to); pcmk__xe_set_bool_attr(acknak, CRM_OP_JOIN_ACKNAK, integrated); if (integrated) { // No change needed for a nacked node crm_update_peer_join(__func__, join_node, controld_join_finalized); pcmk__update_peer_expected(__func__, join_node, CRMD_JOINSTATE_MEMBER); /* Iterate through the remote peer cache and add information on which * node hosts each to the ACK message. This keeps new controllers in * sync with what has already happened. */ if (pcmk__cluster_num_remote_nodes() > 0) { GHashTableIter iter; pcmk__node_status_t *node = NULL; xmlNode *remotes = pcmk__xe_create(acknak, PCMK_XE_NODES); g_hash_table_iter_init(&iter, pcmk__remote_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { xmlNode *remote = NULL; if (!node->conn_host) { continue; } remote = pcmk__xe_create(remotes, PCMK_XE_NODE); pcmk__xe_set_props(remote, PCMK_XA_ID, node->name, PCMK__XA_NODE_STATE, node->state, PCMK__XA_CONNECTION_HOST, node->conn_host, NULL); } } } pcmk__cluster_send_message(join_node, pcmk_ipc_controld, acknak); pcmk__xml_free(acknak); return; } gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source) { static unsigned long long highest_seq = 0; if (controld_globals.membership_id != controld_globals.peer_seq) { crm_debug("join-%d: Membership changed from %llu to %llu " QB_XS " highest=%llu state=%s for=%s", current_join_id, controld_globals.membership_id, controld_globals.peer_seq, highest_seq, fsa_state2string(cur_state), source); if (highest_seq < controld_globals.peer_seq) { /* Don't spam the FSA with duplicates */ highest_seq = controld_globals.peer_seq; register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } } else if (cur_state == S_INTEGRATION) { if (crmd_join_phase_count(controld_join_welcomed) == 0) { int count = crmd_join_phase_count(controld_join_integrated); crm_debug("join-%d: Integration of %d peer%s complete " QB_XS " state=%s for=%s", current_join_id, count, pcmk__plural_s(count), fsa_state2string(cur_state), source); register_fsa_input_before(C_FSA_INTERNAL, I_INTEGRATED, NULL); return TRUE; } } else if (cur_state == S_FINALIZE_JOIN) { if (!pcmk_is_set(controld_globals.fsa_input_register, R_HAVE_CIB)) { crm_debug("join-%d: Delaying finalization until we have CIB " QB_XS " state=%s for=%s", current_join_id, fsa_state2string(cur_state), source); return TRUE; } else if (crmd_join_phase_count(controld_join_welcomed) != 0) { int count = crmd_join_phase_count(controld_join_welcomed); crm_debug("join-%d: Still waiting on %d welcomed node%s " QB_XS " state=%s for=%s", current_join_id, count, pcmk__plural_s(count), fsa_state2string(cur_state), source); crmd_join_phase_log(LOG_DEBUG); } else if (crmd_join_phase_count(controld_join_integrated) != 0) { int count = crmd_join_phase_count(controld_join_integrated); crm_debug("join-%d: Still waiting on %d integrated node%s " QB_XS " state=%s for=%s", current_join_id, count, pcmk__plural_s(count), fsa_state2string(cur_state), source); crmd_join_phase_log(LOG_DEBUG); } else if (crmd_join_phase_count(controld_join_finalized) != 0) { int count = crmd_join_phase_count(controld_join_finalized); crm_debug("join-%d: Still waiting on %d finalized node%s " QB_XS " state=%s for=%s", current_join_id, count, pcmk__plural_s(count), fsa_state2string(cur_state), source); crmd_join_phase_log(LOG_DEBUG); } else { crm_debug("join-%d: Complete " QB_XS " state=%s for=%s", current_join_id, fsa_state2string(cur_state), source); register_fsa_input_later(C_FSA_INTERNAL, I_FINALIZED, NULL); return TRUE; } } return FALSE; } void do_dc_join_final(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_debug("Ensuring DC, quorum and node attributes are up-to-date"); crm_update_quorum(pcmk__cluster_has_quorum(), TRUE); } int crmd_join_phase_count(enum controld_join_phase phase) { int count = 0; pcmk__node_status_t *peer; GHashTableIter iter; g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &peer)) { if (controld_get_join_phase(peer) == phase) { count++; } } return count; } void crmd_join_phase_log(int level) { pcmk__node_status_t *peer; GHashTableIter iter; g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &peer)) { do_crm_log(level, "join-%d: %s=%s", current_join_id, peer->name, join_phase_text(controld_get_join_phase(peer))); } } diff --git a/daemons/controld/controld_membership.c b/daemons/controld/controld_membership.c index 8075955953..daf0c5fd43 100644 --- a/daemons/controld/controld_membership.c +++ b/daemons/controld/controld_membership.c @@ -1,468 +1,468 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ /* put these first so that uuid_t is defined without conflicts */ #include #include #include #include #include #include #include void post_cache_update(int instance); extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); static void reap_dead_nodes(gpointer key, gpointer value, gpointer user_data) { pcmk__node_status_t *node = value; if (pcmk__cluster_is_node_active(node)) { return; } crm_update_peer_join(__func__, node, controld_join_none); if ((node != NULL) && (node->name != NULL)) { if (controld_is_local_node(node->name)) { crm_err("We're not part of the cluster anymore"); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); } else if (!AM_I_DC && pcmk__str_eq(node->name, controld_globals.dc_name, pcmk__str_casei)) { crm_warn("Our DC node (%s) left the cluster", node->name); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); } } if ((controld_globals.fsa_state == S_INTEGRATION) || (controld_globals.fsa_state == S_FINALIZE_JOIN)) { check_join_state(controld_globals.fsa_state, __func__); } if ((node != NULL) && (node->xml_id != NULL)) { fail_incompletable_actions(controld_globals.transition_graph, node->xml_id); } } void post_cache_update(int instance) { xmlNode *no_op = NULL; controld_globals.peer_seq = instance; crm_debug("Updated cache after membership event %d.", instance); g_hash_table_foreach(pcmk__peer_cache, reap_dead_nodes, NULL); controld_set_fsa_input_flags(R_MEMBERSHIP); if (AM_I_DC) { populate_cib_nodes(node_update_quick | node_update_cluster | node_update_peer | node_update_expected, __func__); } /* * If we lost nodes, we should re-check the election status * Safe to call outside of an election */ controld_set_fsa_action_flags(A_ELECTION_CHECK); controld_trigger_fsa(); /* Membership changed, remind everyone we're here. * This will aid detection of duplicate DCs */ no_op = pcmk__new_request(pcmk_ipc_controld, (AM_I_DC? CRM_SYSTEM_DC : CRM_SYSTEM_CRMD), NULL, CRM_SYSTEM_CRMD, CRM_OP_NOOP, NULL); pcmk__cluster_send_message(NULL, pcmk_ipc_controld, no_op); pcmk__xml_free(no_op); } static void crmd_node_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if (rc == pcmk_ok) { crm_trace("Node update %d complete", call_id); } else if(call_id < pcmk_ok) { crm_err("Node update failed: %s (%d)", pcmk_strerror(call_id), call_id); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } else { crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } /*! * \internal * \brief Create an XML node state tag with updates * * \param[in,out] node Node whose state will be used for update * \param[in] flags Bitmask of node_update_flags indicating what to update * \param[in,out] parent XML node to contain update (or NULL) * \param[in] source Who requested the update (only used for logging) * * \return Pointer to created node state tag */ xmlNode * create_node_state_update(pcmk__node_status_t *node, int flags, xmlNode *parent, const char *source) { // @TODO Ensure all callers handle NULL returns const char *value = NULL; xmlNode *node_state; if (!node->state) { crm_info("Node update for %s cancelled: no state, not seen yet", node->name); return NULL; } node_state = pcmk__xe_create(parent, PCMK__XE_NODE_STATE); if (pcmk_is_set(node->flags, pcmk__node_status_remote)) { pcmk__xe_set_bool_attr(node_state, PCMK_XA_REMOTE_NODE, true); } if (crm_xml_add(node_state, PCMK_XA_ID, - pcmk__cluster_node_uuid(node)) == NULL) { + pcmk__cluster_get_xml_id(node)) == NULL) { crm_info("Node update for %s cancelled: no ID", node->name); pcmk__xml_free(node_state); return NULL; } crm_xml_add(node_state, PCMK_XA_UNAME, node->name); if ((flags & node_update_cluster) && node->state) { if (compare_version(controld_globals.dc_version, "3.18.0") >= 0) { // A value 0 means the node is not a cluster member. crm_xml_add_ll(node_state, PCMK__XA_IN_CCM, node->when_member); } else { pcmk__xe_set_bool_attr(node_state, PCMK__XA_IN_CCM, pcmk__str_eq(node->state, PCMK_VALUE_MEMBER, pcmk__str_none)); } } if (!pcmk_is_set(node->flags, pcmk__node_status_remote)) { if (flags & node_update_peer) { if (compare_version(controld_globals.dc_version, "3.18.0") >= 0) { // A value 0 means the peer is offline in CPG. crm_xml_add_ll(node_state, PCMK_XA_CRMD, node->when_online); } else { // @COMPAT DCs < 2.1.7 use online/offline rather than timestamp value = PCMK_VALUE_OFFLINE; if (pcmk_is_set(node->processes, crm_get_cluster_proc())) { value = PCMK_VALUE_ONLINE; } crm_xml_add(node_state, PCMK_XA_CRMD, value); } } if (flags & node_update_join) { if (controld_get_join_phase(node) <= controld_join_none) { value = CRMD_JOINSTATE_DOWN; } else { value = CRMD_JOINSTATE_MEMBER; } crm_xml_add(node_state, PCMK__XA_JOIN, value); } if (flags & node_update_expected) { crm_xml_add(node_state, PCMK_XA_EXPECTED, node->expected); } } crm_xml_add(node_state, PCMK_XA_CRM_DEBUG_ORIGIN, source); return node_state; } static void remove_conflicting_node_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { char *node_uuid = user_data; do_crm_log_unlikely(rc == 0 ? LOG_DEBUG : LOG_NOTICE, "Deletion of the unknown conflicting node \"%s\": %s (rc=%d)", node_uuid, pcmk_strerror(rc), rc); } static void search_conflicting_node_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { char *new_node_uuid = user_data; xmlNode *node_xml = NULL; if (rc != pcmk_ok) { if (rc != -ENXIO) { crm_notice("Searching conflicting nodes for %s failed: %s (%d)", new_node_uuid, pcmk_strerror(rc), rc); } return; } else if (output == NULL) { return; } if (pcmk__xe_is(output, PCMK_XE_NODE)) { node_xml = output; } else { node_xml = pcmk__xe_first_child(output, PCMK_XE_NODE, NULL, NULL); } for (; node_xml != NULL; node_xml = pcmk__xe_next(node_xml, PCMK_XE_NODE)) { const char *node_uuid = NULL; const char *node_uname = NULL; GHashTableIter iter; pcmk__node_status_t *node = NULL; gboolean known = FALSE; node_uuid = crm_element_value(node_xml, PCMK_XA_ID); node_uname = crm_element_value(node_xml, PCMK_XA_UNAME); if (node_uuid == NULL || node_uname == NULL) { continue; } g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if ((node != NULL) && pcmk__str_eq(node->xml_id, node_uuid, pcmk__str_casei) && pcmk__str_eq(node->name, node_uname, pcmk__str_casei)) { known = TRUE; break; } } if (known == FALSE) { cib_t *cib_conn = controld_globals.cib_conn; int delete_call_id = 0; xmlNode *node_state_xml = NULL; crm_notice("Deleting unknown node %s/%s which has conflicting uname with %s", node_uuid, node_uname, new_node_uuid); delete_call_id = cib_conn->cmds->remove(cib_conn, PCMK_XE_NODES, node_xml, cib_none); fsa_register_cib_callback(delete_call_id, pcmk__str_copy(node_uuid), remove_conflicting_node_callback); node_state_xml = pcmk__xe_create(NULL, PCMK__XE_NODE_STATE); crm_xml_add(node_state_xml, PCMK_XA_ID, node_uuid); crm_xml_add(node_state_xml, PCMK_XA_UNAME, node_uname); delete_call_id = cib_conn->cmds->remove(cib_conn, PCMK_XE_STATUS, node_state_xml, cib_none); fsa_register_cib_callback(delete_call_id, pcmk__str_copy(node_uuid), remove_conflicting_node_callback); pcmk__xml_free(node_state_xml); } } } static void node_list_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if(call_id < pcmk_ok) { crm_err("Node list update failed: %s (%d)", pcmk_strerror(call_id), call_id); crm_log_xml_debug(msg, "update:failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } else if(rc < pcmk_ok) { crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "update:failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } void populate_cib_nodes(enum node_update_flags flags, const char *source) { cib_t *cib_conn = controld_globals.cib_conn; int call_id = 0; gboolean from_hashtable = TRUE; xmlNode *node_list = pcmk__xe_create(NULL, PCMK_XE_NODES); #if SUPPORT_COROSYNC if (!pcmk_is_set(flags, node_update_quick) && (pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync)) { from_hashtable = pcmk__corosync_add_nodes(node_list); } #endif if (from_hashtable) { GHashTableIter iter; pcmk__node_status_t *node = NULL; GString *xpath = NULL; g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { xmlNode *new_node = NULL; if ((node->xml_id != NULL) && (node->name != NULL)) { crm_trace("Creating node entry for %s/%s", node->name, node->xml_id); if (xpath == NULL) { xpath = g_string_sized_new(512); } else { g_string_truncate(xpath, 0); } /* We need both to be valid */ new_node = pcmk__xe_create(node_list, PCMK_XE_NODE); crm_xml_add(new_node, PCMK_XA_ID, node->xml_id); crm_xml_add(new_node, PCMK_XA_UNAME, node->name); /* Search and remove unknown nodes with the conflicting uname from CIB */ pcmk__g_strcat(xpath, "/" PCMK_XE_CIB "/" PCMK_XE_CONFIGURATION "/" PCMK_XE_NODES "/" PCMK_XE_NODE "[@" PCMK_XA_UNAME "='", node->name, "']" "[@" PCMK_XA_ID "!='", node->xml_id, "']", NULL); call_id = cib_conn->cmds->query(cib_conn, (const char *) xpath->str, NULL, cib_xpath); fsa_register_cib_callback(call_id, pcmk__str_copy(node->xml_id), search_conflicting_node_callback); } } if (xpath != NULL) { g_string_free(xpath, TRUE); } } crm_trace("Populating section from %s", from_hashtable ? "hashtable" : "cluster"); if ((controld_update_cib(PCMK_XE_NODES, node_list, cib_none, node_list_update_callback) == pcmk_rc_ok) && (pcmk__peer_cache != NULL) && AM_I_DC) { /* * There is no need to update the local CIB with our values if * we've not seen valid membership data */ GHashTableIter iter; pcmk__node_status_t *node = NULL; pcmk__xml_free(node_list); node_list = pcmk__xe_create(NULL, PCMK_XE_STATUS); g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { create_node_state_update(node, flags, node_list, source); } if (pcmk__remote_peer_cache != NULL) { g_hash_table_iter_init(&iter, pcmk__remote_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { create_node_state_update(node, flags, node_list, source); } } controld_update_cib(PCMK_XE_STATUS, node_list, cib_none, crmd_node_update_complete); } pcmk__xml_free(node_list); } static void cib_quorum_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if (rc == pcmk_ok) { crm_trace("Quorum update %d complete", call_id); } else { crm_err("Quorum update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } void crm_update_quorum(gboolean quorum, gboolean force_update) { bool has_quorum = pcmk_is_set(controld_globals.flags, controld_has_quorum); if (quorum) { controld_set_global_flags(controld_ever_had_quorum); } else if (pcmk_all_flags_set(controld_globals.flags, controld_ever_had_quorum |controld_no_quorum_panic)) { pcmk__panic("Quorum lost"); } if (AM_I_DC && ((has_quorum && !quorum) || (!has_quorum && quorum) || force_update)) { xmlNode *update = NULL; update = pcmk__xe_create(NULL, PCMK_XE_CIB); crm_xml_add_int(update, PCMK_XA_HAVE_QUORUM, quorum); crm_xml_add(update, PCMK_XA_DC_UUID, controld_globals.our_uuid); crm_debug("Updating quorum status to %s", pcmk__btoa(quorum)); controld_update_cib(PCMK_XE_CIB, update, cib_none, cib_quorum_update_complete); pcmk__xml_free(update); /* Quorum changes usually cause a new transition via other activity: * quorum gained via a node joining will abort via the node join, * and quorum lost via a node leaving will usually abort via resource * activity and/or fencing. * * However, it is possible that nothing else causes a transition (e.g. * someone forces quorum via corosync-cmaptcl, or quorum is lost due to * a node in standby shutting down cleanly), so here ensure a new * transition is triggered. */ if (quorum) { /* If quorum was gained, abort after a short delay, in case multiple * nodes are joining around the same time, so the one that brings us * to quorum doesn't cause all the remaining ones to be fenced. */ abort_after_delay(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Quorum gained", 5000); } else { abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Quorum lost", NULL); } } if (quorum) { controld_set_global_flags(controld_has_quorum); } else { controld_clear_global_flags(controld_has_quorum); } } diff --git a/include/crm/cluster/internal.h b/include/crm/cluster/internal.h index 0afca28950..bc722cb3de 100644 --- a/include/crm/cluster/internal.h +++ b/include/crm/cluster/internal.h @@ -1,325 +1,325 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef PCMK__CRM_CLUSTER_INTERNAL__H #define PCMK__CRM_CLUSTER_INTERNAL__H #include #include // uint32_t, uint64_t #include // gboolean #include // xmlNode #include // enum crm_ipc_server #include #if SUPPORT_COROSYNC #include // cpg_name, cpg_handle_t #endif #ifdef __cplusplus extern "C" { #endif // @TODO Replace this with a pcmk__node_status_flags value enum crm_proc_flag { crm_proc_none = 0x00000001, // Cluster layers crm_proc_cpg = 0x04000000, }; /*! * \internal * \enum pcmk__node_status_flags * \brief Boolean flags for a \c pcmk__node_status_t object * * Some flags may not be related to status specifically. However, we keep these * separate from enum pcmk__node_flags because they're used with * different object types. */ enum pcmk__node_status_flags { /*! * Node is a Pacemaker Remote node and should not be considered for cluster * membership */ pcmk__node_status_remote = (UINT32_C(1) << 0), //! Node's cache entry is dirty pcmk__node_status_dirty = (UINT32_C(1) << 1), }; // Used with node cache search functions enum pcmk__node_search_flags { //! Does not affect search pcmk__node_search_none = 0, //! Search for cluster nodes from membership cache pcmk__node_search_cluster_member = (1 << 0), //! Search for remote nodes pcmk__node_search_remote = (1 << 1), //! Search for cluster member nodes and remote nodes pcmk__node_search_any = pcmk__node_search_cluster_member |pcmk__node_search_remote, //! Search for cluster nodes from CIB (as of last cache refresh) pcmk__node_search_cluster_cib = (1 << 2), }; /*! * \internal * \enum pcmk__node_update * \brief Type of update to a \c pcmk__node_status_t object */ enum pcmk__node_update { pcmk__node_update_name, //!< Node name updated pcmk__node_update_state, //!< Node connection state updated pcmk__node_update_processes, //!< Node process group membership updated }; typedef struct pcmk__election pcmk__election_t; //! Implementation of pcmk__cluster_private_t struct pcmk__cluster_private { enum pcmk_ipc_server server; //!< Server this connection is for (if any) char *node_name; //!< Local node name at cluster layer pcmk__election_t *election; //!< Election state (if election is needed) /* @TODO Corosync uses an integer node ID, but cluster layers in the * abstract do not necessarily need to */ uint32_t node_id; //!< Local node ID at cluster layer #if SUPPORT_COROSYNC /* @TODO Make these members a separate struct and use void *cluster_data * here instead, to abstract the cluster layer further. */ struct cpg_name group; //!< Corosync CPG name cpg_handle_t cpg_handle; //!< Corosync CPG handle #endif // SUPPORT_COROSYNC }; //! Node status data (may be a cluster node or a Pacemaker Remote node) typedef struct pcmk__node_status { //! Node name as known to cluster layer, or Pacemaker Remote node name char *name; /* @COMPAT This is less than ideal since the value is not a valid XML ID * (for Corosync, it's the string equivalent of the node's numeric node ID, * but XML IDs can't start with a number) and the three elements should have * different IDs. * * Ideally, we would use something like node-NODEID, node_state-NODEID, and * transient_attributes-NODEID as the element IDs. Unfortunately changing it * would be impractical due to backward compatibility; older nodes in a * rolling upgrade will always write and expect the value in the old format. */ /*! * Value of the PCMK_XA_ID XML attribute to use with the node's * PCMK_XE_NODE, PCMK_XE_NODE_STATE, and PCMK_XE_TRANSIENT_ATTRIBUTES * XML elements in the CIB */ char *xml_id; char *state; // @TODO change to enum //! Group of enum pcmk__node_status_flags uint32_t flags; /*! * Most recent cluster membership in which node was seen (0 for Pacemaker * Remote nodes) */ uint64_t membership_id; uint32_t processes; // @TODO most not needed, merge into flags /* @TODO When we can break public API compatibility, we can make the rest of * these members separate structs and use void *cluster_data and * void *user_data here instead, to abstract the cluster layer further. */ //! Arbitrary data (must be freeable by \c free()) void *user_data; char *expected; time_t peer_lost; char *conn_host; time_t when_member; // Since when node has been a cluster member time_t when_online; // Since when peer has been online in CPG /* @TODO The following are currently needed only by the Corosync stack. * Eventually consider moving them to a cluster-layer-specific data object. */ uint32_t cluster_layer_id; //!< Cluster-layer numeric node ID time_t when_lost; //!< When CPG membership was last lost } pcmk__node_status_t; /*! * \internal * \brief Return the process bit corresponding to the current cluster stack * * \return Process flag if detectable, otherwise 0 */ static inline uint32_t crm_get_cluster_proc(void) { switch (pcmk_get_cluster_layer()) { case pcmk_cluster_layer_corosync: return crm_proc_cpg; default: break; } return crm_proc_none; } /*! * \internal * \brief Get log-friendly string description of a Corosync return code * * \param[in] error Corosync return code * * \return Log-friendly string description corresponding to \p error */ static inline const char * pcmk__cs_err_str(int error) { # if SUPPORT_COROSYNC switch (error) { case CS_OK: return "OK"; case CS_ERR_LIBRARY: return "Library error"; case CS_ERR_VERSION: return "Version error"; case CS_ERR_INIT: return "Initialization error"; case CS_ERR_TIMEOUT: return "Timeout"; case CS_ERR_TRY_AGAIN: return "Try again"; case CS_ERR_INVALID_PARAM: return "Invalid parameter"; case CS_ERR_NO_MEMORY: return "No memory"; case CS_ERR_BAD_HANDLE: return "Bad handle"; case CS_ERR_BUSY: return "Busy"; case CS_ERR_ACCESS: return "Access error"; case CS_ERR_NOT_EXIST: return "Doesn't exist"; case CS_ERR_NAME_TOO_LONG: return "Name too long"; case CS_ERR_EXIST: return "Exists"; case CS_ERR_NO_SPACE: return "No space"; case CS_ERR_INTERRUPT: return "Interrupt"; case CS_ERR_NAME_NOT_FOUND: return "Name not found"; case CS_ERR_NO_RESOURCES: return "No resources"; case CS_ERR_NOT_SUPPORTED: return "Not supported"; case CS_ERR_BAD_OPERATION: return "Bad operation"; case CS_ERR_FAILED_OPERATION: return "Failed operation"; case CS_ERR_MESSAGE_ERROR: return "Message error"; case CS_ERR_QUEUE_FULL: return "Queue full"; case CS_ERR_QUEUE_NOT_AVAILABLE: return "Queue not available"; case CS_ERR_BAD_FLAGS: return "Bad flags"; case CS_ERR_TOO_BIG: return "Too big"; case CS_ERR_NO_SECTIONS: return "No sections"; } # endif return "Corosync error"; } # if SUPPORT_COROSYNC #if 0 /* This is the new way to do it, but we still support all Corosync 2 versions, * and this isn't always available. A better alternative here would be to check * for support in the configure script and enable this conditionally. */ #define pcmk__init_cmap(handle) cmap_initialize_map((handle), CMAP_MAP_ICMAP) #else #define pcmk__init_cmap(handle) cmap_initialize(handle) #endif char *pcmk__corosync_cluster_name(void); bool pcmk__corosync_add_nodes(xmlNode *xml_parent); void pcmk__cpg_confchg_cb(cpg_handle_t handle, const struct cpg_name *group_name, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries); char *pcmk__cpg_message_data(cpg_handle_t handle, uint32_t sender_id, uint32_t pid, void *content, const char **from); # endif -const char *pcmk__cluster_node_uuid(pcmk__node_status_t *node); +const char *pcmk__cluster_get_xml_id(pcmk__node_status_t *node); char *pcmk__cluster_node_name(uint32_t nodeid); const char *pcmk__cluster_local_node_name(void); const char *pcmk__node_name_from_uuid(const char *uuid); pcmk__node_status_t *crm_update_peer_proc(const char *source, pcmk__node_status_t *peer, uint32_t flag, const char *status); pcmk__node_status_t *pcmk__update_peer_state(const char *source, pcmk__node_status_t *node, const char *state, uint64_t membership); void pcmk__update_peer_expected(const char *source, pcmk__node_status_t *node, const char *expected); void pcmk__reap_unseen_nodes(uint64_t ring_id); void pcmk__corosync_quorum_connect(gboolean (*dispatch)(unsigned long long, gboolean), void (*destroy) (gpointer)); bool pcmk__cluster_send_message(const pcmk__node_status_t *node, enum pcmk_ipc_server service, const xmlNode *data); // Membership extern GHashTable *pcmk__peer_cache; extern GHashTable *pcmk__remote_peer_cache; bool pcmk__cluster_has_quorum(void); void pcmk__cluster_init_node_caches(void); void pcmk__cluster_destroy_node_caches(void); void pcmk__cluster_set_autoreap(bool enable); void pcmk__cluster_set_status_callback(void (*dispatch)(enum pcmk__node_update, pcmk__node_status_t *, const void *)); bool pcmk__cluster_is_node_active(const pcmk__node_status_t *node); unsigned int pcmk__cluster_num_active_nodes(void); unsigned int pcmk__cluster_num_remote_nodes(void); pcmk__node_status_t *pcmk__cluster_lookup_remote_node(const char *node_name); void pcmk__cluster_forget_cluster_node(uint32_t id, const char *node_name); void pcmk__cluster_forget_remote_node(const char *node_name); pcmk__node_status_t *pcmk__search_node_caches(unsigned int id, const char *uname, const char *xml_id, uint32_t flags); void pcmk__purge_node_from_cache(const char *node_name, uint32_t node_id); void pcmk__refresh_node_caches_from_cib(xmlNode *cib); pcmk__node_status_t *pcmk__get_node(unsigned int id, const char *uname, const char *xml_id, uint32_t flags); #ifdef __cplusplus } #endif #endif // PCMK__CRM_CLUSTER_INTERNAL__H diff --git a/lib/cluster/cluster.c b/lib/cluster/cluster.c index 3427a409f3..87abcfc43e 100644 --- a/lib/cluster/cluster.c +++ b/lib/cluster/cluster.c @@ -1,471 +1,474 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #include #include #include // PRIu32 #include #include #include #include #include #include #include #include #include // uname() #include // gboolean #include #include #include #include #include "crmcluster_private.h" CRM_TRACE_INIT_DATA(cluster); /*! * \internal - * \brief Get a node's cluster-layer UUID, setting it if not already set + * \brief Get a node's XML ID in the CIB, setting it if not already set * * \param[in,out] node Node to check * - * \return Cluster-layer node UUID of \p node, or \c NULL if unknown + * \return CIB XML ID of \p node if known, otherwise \c NULL */ const char * -pcmk__cluster_node_uuid(pcmk__node_status_t *node) +pcmk__cluster_get_xml_id(pcmk__node_status_t *node) { const enum pcmk_cluster_layer cluster_layer = pcmk_get_cluster_layer(); if (node == NULL) { return NULL; } if (node->xml_id != NULL) { return node->xml_id; } + // xml_id is always set when a Pacemaker Remote node entry is created + CRM_CHECK(!pcmk_is_set(node->flags, pcmk__node_status_remote), return NULL); + switch (cluster_layer) { #if SUPPORT_COROSYNC case pcmk_cluster_layer_corosync: node->xml_id = pcmk__corosync_uuid(node); return node->xml_id; #endif // SUPPORT_COROSYNC default: crm_err("Unsupported cluster layer %s", pcmk_cluster_layer_text(cluster_layer)); return NULL; } } /*! * \internal * \brief Connect to the cluster layer * * \param[in,out] cluster Initialized cluster object to connect * * \return Standard Pacemaker return code */ int pcmk_cluster_connect(pcmk_cluster_t *cluster) { const enum pcmk_cluster_layer cluster_layer = pcmk_get_cluster_layer(); const char *cluster_layer_s = pcmk_cluster_layer_text(cluster_layer); if (cluster == NULL) { return EINVAL; } // cts-lab looks for this message crm_notice("Connecting to %s cluster layer", cluster_layer_s); switch (cluster_layer) { #if SUPPORT_COROSYNC case pcmk_cluster_layer_corosync: return pcmk__corosync_connect(cluster); #endif // SUPPORT_COROSYNC default: break; } crm_err("Failed to connect to unsupported cluster layer %s", cluster_layer_s); return EPROTONOSUPPORT; } /*! * \brief Disconnect from the cluster layer * * \param[in,out] cluster Cluster object to disconnect * * \return Standard Pacemaker return code */ int pcmk_cluster_disconnect(pcmk_cluster_t *cluster) { const enum pcmk_cluster_layer cluster_layer = pcmk_get_cluster_layer(); const char *cluster_layer_s = pcmk_cluster_layer_text(cluster_layer); crm_info("Disconnecting from %s cluster layer", cluster_layer_s); switch (cluster_layer) { #if SUPPORT_COROSYNC case pcmk_cluster_layer_corosync: pcmk__corosync_disconnect(cluster); pcmk__cluster_destroy_node_caches(); return pcmk_rc_ok; #endif // SUPPORT_COROSYNC default: break; } crm_err("Failed to disconnect from unsupported cluster layer %s", cluster_layer_s); return EPROTONOSUPPORT; } /*! * \brief Allocate a new \p pcmk_cluster_t object * * \return A newly allocated \p pcmk_cluster_t object (guaranteed not \c NULL) * \note The caller is responsible for freeing the return value using * \p pcmk_cluster_free(). */ pcmk_cluster_t * pcmk_cluster_new(void) { pcmk_cluster_t *cluster = pcmk__assert_alloc(1, sizeof(pcmk_cluster_t)); cluster->priv = pcmk__assert_alloc(1, sizeof(pcmk__cluster_private_t)); cluster->priv->server = pcmk__parse_server(crm_system_name); return cluster; } /*! * \brief Free a \p pcmk_cluster_t object and its dynamically allocated members * * \param[in,out] cluster Cluster object to free */ void pcmk_cluster_free(pcmk_cluster_t *cluster) { if (cluster == NULL) { return; } election_fini(cluster); free(cluster->priv->node_name); free(cluster->priv); free(cluster); } /*! * \brief Set the destroy function for a cluster object * * \param[in,out] cluster Cluster object * \param[in] fn Destroy function to set * * \return Standard Pacemaker return code */ int pcmk_cluster_set_destroy_fn(pcmk_cluster_t *cluster, void (*fn)(gpointer)) { if (cluster == NULL) { return EINVAL; } cluster->destroy = fn; return pcmk_rc_ok; } /*! * \internal * \brief Send an XML message via the cluster messaging layer * * \param[in] node Cluster node to send message to * \param[in] service Message type to use in message host info * \param[in] data XML message to send * * \return \c true on success, or \c false otherwise */ bool pcmk__cluster_send_message(const pcmk__node_status_t *node, enum pcmk_ipc_server service, const xmlNode *data) { // @TODO Return standard Pacemaker return code switch (pcmk_get_cluster_layer()) { #if SUPPORT_COROSYNC case pcmk_cluster_layer_corosync: return pcmk__cpg_send_xml(data, node, service); #endif // SUPPORT_COROSYNC default: break; } return false; } /*! * \internal * \brief Get the node name corresponding to a cluster-layer node ID * * Get the node name from the cluster layer if possible. Otherwise, if for the * local node, call \c uname() and get the \c nodename member from the * struct utsname object. * * \param[in] nodeid Node ID to check (or 0 for the local node) * * \return Node name corresponding to \p nodeid * * \note This will fatally exit if \c uname() fails to get the local node name * or we run out of memory. * \note The caller is responsible for freeing the return value using \c free(). */ char * pcmk__cluster_node_name(uint32_t nodeid) { char *name = NULL; const enum pcmk_cluster_layer cluster_layer = pcmk_get_cluster_layer(); const char *cluster_layer_s = pcmk_cluster_layer_text(cluster_layer); switch (cluster_layer) { #if SUPPORT_COROSYNC case pcmk_cluster_layer_corosync: name = pcmk__corosync_name(0, nodeid); if (name != NULL) { return name; } break; #endif // SUPPORT_COROSYNC default: crm_err("Unsupported cluster layer: %s", cluster_layer_s); break; } if (nodeid == 0) { struct utsname hostinfo; crm_notice("Could not get local node name from %s cluster layer, " "defaulting to local hostname", cluster_layer_s); if (uname(&hostinfo) < 0) { // @TODO Maybe let the caller decide what to do crm_err("Failed to get the local hostname"); crm_exit(CRM_EX_FATAL); } return pcmk__str_copy(hostinfo.nodename); } crm_notice("Could not obtain a node name for node with " PCMK_XA_ID "=" PRIu32, nodeid); return NULL; } /*! * \internal * \brief Get the local node's cluster-layer node name * * If getting the node name from the cluster layer is impossible, call * \c uname() and get the \c nodename member from the struct utsname * object. * * \return Local node's name * * \note This will fatally exit if \c uname() fails to get the local node name * or we run out of memory. */ const char * pcmk__cluster_local_node_name(void) { // @TODO Refactor to avoid trivially leaking name at exit static char *name = NULL; if (name == NULL) { name = pcmk__cluster_node_name(0); } return name; } /*! * \internal * \brief Get the node name corresonding to a node UUID * * Look for the UUID in both the remote node cache and the cluster member cache. * * \param[in] uuid UUID to search for * * \return Node name corresponding to \p uuid if found, or \c NULL otherwise */ const char * pcmk__node_name_from_uuid(const char *uuid) { /* @TODO There are too many functions in libcrmcluster that look up a node * from the node caches (possibly creating a cache entry if none exists). * There are at least the following: * * pcmk__cluster_lookup_remote_node() * * pcmk__get_node() * * pcmk__node_name_from_uuid() * * pcmk__search_node_caches() * * There's a lot of duplication among them, but they all do slightly * different things. We should try to clean them up and consolidate them to * the extent possible, likely with new helper functions. */ GHashTableIter iter; pcmk__node_status_t *node = NULL; CRM_CHECK(uuid != NULL, return NULL); // Remote nodes have the same uname and uuid if (g_hash_table_lookup(pcmk__remote_peer_cache, uuid)) { return uuid; } g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (pcmk__str_eq(node->xml_id, uuid, pcmk__str_casei)) { return node->name; } } return NULL; } /*! * \brief Get a log-friendly string equivalent of a cluster layer * * \param[in] layer Cluster layer * * \return Log-friendly string corresponding to \p layer */ const char * pcmk_cluster_layer_text(enum pcmk_cluster_layer layer) { switch (layer) { case pcmk_cluster_layer_corosync: return "corosync"; case pcmk_cluster_layer_unknown: return "unknown"; case pcmk_cluster_layer_invalid: return "invalid"; default: crm_err("Invalid cluster layer: %d", layer); return "invalid"; } } /*! * \brief Get and validate the local cluster layer * * If a cluster layer is not configured via the \c PCMK__ENV_CLUSTER_TYPE local * option, this will try to detect an active cluster from among the supported * cluster layers. * * \return Local cluster layer * * \note This will fatally exit if the configured cluster layer is invalid. */ enum pcmk_cluster_layer pcmk_get_cluster_layer(void) { static enum pcmk_cluster_layer cluster_layer = pcmk_cluster_layer_unknown; const char *cluster = NULL; // Cluster layer is stable once set if (cluster_layer != pcmk_cluster_layer_unknown) { return cluster_layer; } cluster = pcmk__env_option(PCMK__ENV_CLUSTER_TYPE); if (cluster != NULL) { crm_info("Verifying configured cluster layer '%s'", cluster); cluster_layer = pcmk_cluster_layer_invalid; #if SUPPORT_COROSYNC if (pcmk__str_eq(cluster, PCMK_VALUE_COROSYNC, pcmk__str_casei)) { cluster_layer = pcmk_cluster_layer_corosync; } #endif // SUPPORT_COROSYNC if (cluster_layer == pcmk_cluster_layer_invalid) { crm_notice("This installation does not support the '%s' cluster " "infrastructure: terminating", cluster); crm_exit(CRM_EX_FATAL); } crm_info("Assuming an active '%s' cluster", cluster); } else { // Nothing configured, so test supported cluster layers #if SUPPORT_COROSYNC crm_debug("Testing with Corosync"); if (pcmk__corosync_is_active()) { cluster_layer = pcmk_cluster_layer_corosync; } #endif // SUPPORT_COROSYNC if (cluster_layer == pcmk_cluster_layer_unknown) { crm_notice("Could not determine the current cluster layer"); } else { crm_info("Detected an active '%s' cluster", pcmk_cluster_layer_text(cluster_layer)); } } return cluster_layer; } // Deprecated functions kept only for backward API compatibility // LCOV_EXCL_START #include gboolean crm_cluster_connect(pcmk_cluster_t *cluster) { if (cluster == NULL) { return FALSE; } if (cluster->priv == NULL) { /* sbd (as of at least 1.5.2) doesn't call pcmk_cluster_new() to * allocate the pcmk_cluster_t */ cluster->priv = pcmk__assert_alloc(1, sizeof(pcmk__cluster_private_t)); } return pcmk_cluster_connect(cluster) == pcmk_rc_ok; } const char * name_for_cluster_type(enum cluster_type_e type) { switch (type) { case pcmk_cluster_corosync: return "corosync"; case pcmk_cluster_unknown: return "unknown"; case pcmk_cluster_invalid: return "invalid"; } crm_err("Invalid cluster type: %d", type); return "invalid"; } enum cluster_type_e get_cluster_type(void) { return (enum cluster_type_e) pcmk_get_cluster_layer(); } // LCOV_EXCL_STOP // End deprecated API diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c index bccbe12aa7..ad55658d78 100644 --- a/lib/cluster/membership.c +++ b/lib/cluster/membership.c @@ -1,1509 +1,1509 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #include #include // PRIu32 #include // bool #include #include #include #include #include #include #include #include #include #include #include #include "crmcluster_private.h" /* The peer cache remembers cluster nodes that have been seen. This is managed * mostly automatically by libcrmcluster, based on cluster membership events. * * Because cluster nodes can have conflicting names or UUIDs, the hash table key * is a uniquely generated ID. * * @TODO Move caches to pcmk_cluster_t */ GHashTable *pcmk__peer_cache = NULL; /* The remote peer cache tracks pacemaker_remote nodes. While the * value has the same type as the peer cache's, it is tracked separately for * three reasons: pacemaker_remote nodes can't have conflicting names or UUIDs, * so the name (which is also the UUID) is used as the hash table key; there * is no equivalent of membership events, so management is not automatic; and * most users of the peer cache need to exclude pacemaker_remote nodes. * * @TODO That said, using a single cache would be more logical and less * error-prone, so it would be a good idea to merge them one day. * * libcrmcluster provides two avenues for populating the cache: * pcmk__cluster_lookup_remote_node() and pcmk__cluster_forget_remote_node() * directly manage it, while refresh_remote_nodes() populates it via the CIB. * * @TODO Move caches to pcmk_cluster_t */ GHashTable *pcmk__remote_peer_cache = NULL; /* * The CIB cluster node cache tracks cluster nodes that have been seen in * the CIB. It is useful mainly when a caller needs to know about a node that * may no longer be in the membership, but doesn't want to add the node to the * main peer cache tables. */ static GHashTable *cluster_node_cib_cache = NULL; static bool autoreap = true; static bool has_quorum = false; // Flag setting and clearing for pcmk__node_status_t:flags #define set_peer_flags(peer, flags_to_set) do { \ (peer)->flags = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \ "Peer", (peer)->name, \ (peer)->flags, (flags_to_set), \ #flags_to_set); \ } while (0) #define clear_peer_flags(peer, flags_to_clear) do { \ (peer)->flags = pcmk__clear_flags_as(__func__, __LINE__, \ LOG_TRACE, \ "Peer", (peer)->name, \ (peer)->flags, (flags_to_clear), \ #flags_to_clear); \ } while (0) static void update_peer_uname(pcmk__node_status_t *node, const char *uname); static pcmk__node_status_t *find_cib_cluster_node(const char *id, const char *uname); /*! * \internal * \brief Check whether the cluster currently has quorum * * \return \c true if the cluster has quorum, or \c false otherwise */ bool pcmk__cluster_has_quorum(void) { return has_quorum; } /*! * \internal * \brief Set whether the cluster currently has quorum * * \param[in] quorate \c true if the cluster has quorum, or \c false otherwise */ void pcmk__cluster_set_quorum(bool quorate) { has_quorum = quorate; } /*! * \internal * \brief Get the number of Pacemaker Remote nodes that have been seen * * \return Number of cached Pacemaker Remote nodes */ unsigned int pcmk__cluster_num_remote_nodes(void) { if (pcmk__remote_peer_cache == NULL) { return 0U; } return g_hash_table_size(pcmk__remote_peer_cache); } /*! * \internal * \brief Get a remote node cache entry, creating it if necessary * * \param[in] node_name Name of remote node * * \return Cache entry for node on success, or \c NULL (and set \c errno) * otherwise * * \note When creating a new entry, this will leave the node state undetermined. * The caller should also call \c pcmk__update_peer_state() if the state * is known. * \note Because this can add and remove cache entries, callers should not * assume any previously obtained cache entry pointers remain valid. */ pcmk__node_status_t * pcmk__cluster_lookup_remote_node(const char *node_name) { pcmk__node_status_t *node = NULL; char *node_name_copy = NULL; if (node_name == NULL) { errno = EINVAL; return NULL; } /* It's theoretically possible that the node was added to the cluster peer * cache before it was known to be a Pacemaker Remote node. Remove that * entry unless it has a node ID, which means the name actually is * associated with a cluster node. (@TODO return an error in that case?) */ node = pcmk__search_node_caches(0, node_name, NULL, pcmk__node_search_cluster_member); if ((node != NULL) && (node->xml_id == NULL)) { /* node_name could be a pointer into the cache entry being removed, so * reassign it to a copy before the original gets freed */ node_name_copy = strdup(node_name); if (node_name_copy == NULL) { errno = ENOMEM; return NULL; } node_name = node_name_copy; pcmk__cluster_forget_cluster_node(0, node_name); } /* Return existing cache entry if one exists */ node = g_hash_table_lookup(pcmk__remote_peer_cache, node_name); if (node) { free(node_name_copy); return node; } /* Allocate a new entry */ node = calloc(1, sizeof(pcmk__node_status_t)); if (node == NULL) { free(node_name_copy); return NULL; } /* Populate the essential information */ set_peer_flags(node, pcmk__node_status_remote); node->xml_id = strdup(node_name); if (node->xml_id == NULL) { free(node); errno = ENOMEM; free(node_name_copy); return NULL; } /* Add the new entry to the cache */ g_hash_table_replace(pcmk__remote_peer_cache, node->xml_id, node); crm_trace("added %s to remote cache", node_name); /* Update the entry's uname, ensuring peer status callbacks are called */ update_peer_uname(node, node_name); free(node_name_copy); return node; } /*! * \internal * \brief Remove a node from the Pacemaker Remote node cache * * \param[in] node_name Name of node to remove from cache * * \note The caller must be careful not to use \p node_name after calling this * function if it might be a pointer into the cache entry being removed. */ void pcmk__cluster_forget_remote_node(const char *node_name) { /* Do a lookup first, because node_name could be a pointer within the entry * being removed -- we can't log it *after* removing it. */ if (g_hash_table_lookup(pcmk__remote_peer_cache, node_name) != NULL) { crm_trace("Removing %s from Pacemaker Remote node cache", node_name); g_hash_table_remove(pcmk__remote_peer_cache, node_name); } } /*! * \internal * \brief Return node status based on a CIB status entry * * \param[in] node_state XML of node state * * \return \c PCMK_VALUE_MEMBER if \c PCMK__XA_IN_CCM is true in * \c PCMK__XE_NODE_STATE, or \c PCMK__VALUE_LOST otherwise */ static const char * remote_state_from_cib(const xmlNode *node_state) { bool in_ccm = false; if ((pcmk__xe_get_bool_attr(node_state, PCMK__XA_IN_CCM, &in_ccm) == pcmk_rc_ok) && in_ccm) { return PCMK_VALUE_MEMBER; } return PCMK__VALUE_LOST; } /* user data for looping through remote node xpath searches */ struct refresh_data { const char *field; /* XML attribute to check for node name */ gboolean has_state; /* whether to update node state based on XML */ }; /*! * \internal * \brief Process one pacemaker_remote node xpath search result * * \param[in] result XML search result * \param[in] user_data what to look for in the XML */ static void remote_cache_refresh_helper(xmlNode *result, void *user_data) { const struct refresh_data *data = user_data; const char *remote = crm_element_value(result, data->field); const char *state = NULL; pcmk__node_status_t *node; CRM_CHECK(remote != NULL, return); /* Determine node's state, if the result has it */ if (data->has_state) { state = remote_state_from_cib(result); } /* Check whether cache already has entry for node */ node = g_hash_table_lookup(pcmk__remote_peer_cache, remote); if (node == NULL) { /* Node is not in cache, so add a new entry for it */ node = pcmk__cluster_lookup_remote_node(remote); pcmk__assert(node != NULL); if (state) { pcmk__update_peer_state(__func__, node, state, 0); } } else if (pcmk_is_set(node->flags, pcmk__node_status_dirty)) { /* Node is in cache and hasn't been updated already, so mark it clean */ clear_peer_flags(node, pcmk__node_status_dirty); if (state) { pcmk__update_peer_state(__func__, node, state, 0); } } } static void mark_dirty(gpointer key, gpointer value, gpointer user_data) { set_peer_flags((pcmk__node_status_t *) value, pcmk__node_status_dirty); } static gboolean is_dirty(gpointer key, gpointer value, gpointer user_data) { const pcmk__node_status_t *node = value; return pcmk_is_set(node->flags, pcmk__node_status_dirty); } /*! * \internal * \brief Repopulate the remote node cache based on CIB XML * * \param[in] cib CIB XML to parse */ static void refresh_remote_nodes(xmlNode *cib) { struct refresh_data data; pcmk__cluster_init_node_caches(); /* First, we mark all existing cache entries as dirty, * so that later we can remove any that weren't in the CIB. * We don't empty the cache, because we need to detect changes in state. */ g_hash_table_foreach(pcmk__remote_peer_cache, mark_dirty, NULL); /* Look for guest nodes and remote nodes in the status section */ data.field = PCMK_XA_ID; data.has_state = TRUE; crm_foreach_xpath_result(cib, PCMK__XP_REMOTE_NODE_STATUS, remote_cache_refresh_helper, &data); /* Look for guest nodes and remote nodes in the configuration section, * because they may have just been added and not have a status entry yet. * In that case, the cached node state will be left NULL, so that the * peer status callback isn't called until we're sure the node started * successfully. */ data.field = PCMK_XA_VALUE; data.has_state = FALSE; crm_foreach_xpath_result(cib, PCMK__XP_GUEST_NODE_CONFIG, remote_cache_refresh_helper, &data); data.field = PCMK_XA_ID; data.has_state = FALSE; crm_foreach_xpath_result(cib, PCMK__XP_REMOTE_NODE_CONFIG, remote_cache_refresh_helper, &data); /* Remove all old cache entries that weren't seen in the CIB */ g_hash_table_foreach_remove(pcmk__remote_peer_cache, is_dirty, NULL); } /*! * \internal * \brief Check whether a node is an active cluster node * * Remote nodes are never considered active. This guarantees that they can never * become DC. * * \param[in] node Node to check * * \return \c true if the node is an active cluster node, or \c false otherwise */ bool pcmk__cluster_is_node_active(const pcmk__node_status_t *node) { const enum pcmk_cluster_layer cluster_layer = pcmk_get_cluster_layer(); if ((node == NULL) || pcmk_is_set(node->flags, pcmk__node_status_remote)) { return false; } switch (cluster_layer) { case pcmk_cluster_layer_corosync: #if SUPPORT_COROSYNC return pcmk__corosync_is_peer_active(node); #else break; #endif // SUPPORT_COROSYNC default: break; } crm_err("Unhandled cluster layer: %s", pcmk_cluster_layer_text(cluster_layer)); return false; } /*! * \internal * \brief Check if a node's entry should be removed from the cluster node cache * * A node should be removed from the cache if it's inactive and matches another * \c pcmk__node_status_t (the search object). The node is considered a * mismatch if any of the following are true: * * The search object is \c NULL. * * The search object has an ID set and the cached node's ID does not match it. * * The search object does not have an ID set, and the cached node's name does * not match the search node's name. (If both names are \c NULL, it's a * match.) * * Otherwise, the node is considered a match. * * Note that if the search object has both an ID and a name set, the name is * ignored for matching purposes. * * \param[in] key Ignored * \param[in] value \c pcmk__node_status_t object from cluster node cache * \param[in] user_data \c pcmk__node_status_t object to match against (search * object) * * \return \c TRUE if the node entry should be removed from \c pcmk__peer_cache, * or \c FALSE otherwise */ static gboolean should_forget_cluster_node(gpointer key, gpointer value, gpointer user_data) { pcmk__node_status_t *node = value; pcmk__node_status_t *search = user_data; if (search == NULL) { return FALSE; } if ((search->cluster_layer_id != 0) && (node->cluster_layer_id != search->cluster_layer_id)) { return FALSE; } if ((search->cluster_layer_id == 0) && !pcmk__str_eq(node->name, search->name, pcmk__str_casei)) { // @TODO Consider name even if ID is set? return FALSE; } if (pcmk__cluster_is_node_active(value)) { return FALSE; } crm_info("Removing node with name %s and cluster layer ID %" PRIu32 " from membership cache", pcmk__s(node->name, "(unknown)"), node->cluster_layer_id); return TRUE; } /*! * \internal * \brief Remove one or more inactive nodes from the cluster node cache * * All inactive nodes matching \p id and \p node_name as described in * \c should_forget_cluster_node documentation are removed from the cache. * * If \p id is 0 and \p node_name is \c NULL, all inactive nodes are removed * from the cache regardless of ID and name. This differs from clearing the * cache, in that entries for active nodes are preserved. * * \param[in] id ID of node to remove from cache (0 to ignore) * \param[in] node_name Name of node to remove from cache (ignored if \p id is * nonzero) * * \note \p node_name is not modified directly, but it will be freed if it's a * pointer into a cache entry that is removed. */ void pcmk__cluster_forget_cluster_node(uint32_t id, const char *node_name) { pcmk__node_status_t search = { 0, }; char *criterion = NULL; // For logging guint matches = 0; if (pcmk__peer_cache == NULL) { crm_trace("Membership cache not initialized, ignoring removal request"); return; } search.cluster_layer_id = id; search.name = pcmk__str_copy(node_name); // May log after original freed if (id > 0) { criterion = crm_strdup_printf("cluster layer ID %" PRIu32, id); } else if (node_name != NULL) { criterion = crm_strdup_printf("name %s", node_name); } matches = g_hash_table_foreach_remove(pcmk__peer_cache, should_forget_cluster_node, &search); if (matches > 0) { if (criterion != NULL) { crm_notice("Removed %u inactive node%s with %s from the membership " "cache", matches, pcmk__plural_s(matches), criterion); } else { crm_notice("Removed all (%u) inactive cluster nodes from the " "membership cache", matches); } } else { crm_info("No inactive cluster nodes%s%s to remove from the membership " "cache", ((criterion != NULL)? " with " : ""), pcmk__s(criterion, "")); } free(search.name); free(criterion); } static void count_peer(gpointer key, gpointer value, gpointer user_data) { unsigned int *count = user_data; pcmk__node_status_t *node = value; if (pcmk__cluster_is_node_active(node)) { *count = *count + 1; } } /*! * \internal * \brief Get the number of active cluster nodes that have been seen * * Remote nodes are never considered active. This guarantees that they can never * become DC. * * \return Number of active nodes in the cluster node cache */ unsigned int pcmk__cluster_num_active_nodes(void) { unsigned int count = 0; if (pcmk__peer_cache != NULL) { g_hash_table_foreach(pcmk__peer_cache, count_peer, &count); } return count; } static void destroy_crm_node(gpointer data) { pcmk__node_status_t *node = data; crm_trace("Destroying entry for node %" PRIu32 ": %s", node->cluster_layer_id, node->name); free(node->name); free(node->state); free(node->xml_id); free(node->user_data); free(node->expected); free(node->conn_host); free(node); } /*! * \internal * \brief Initialize node caches */ void pcmk__cluster_init_node_caches(void) { if (pcmk__peer_cache == NULL) { pcmk__peer_cache = pcmk__strikey_table(free, destroy_crm_node); } if (pcmk__remote_peer_cache == NULL) { pcmk__remote_peer_cache = pcmk__strikey_table(NULL, destroy_crm_node); } if (cluster_node_cib_cache == NULL) { cluster_node_cib_cache = pcmk__strikey_table(free, destroy_crm_node); } } /*! * \internal * \brief Initialize node caches */ void pcmk__cluster_destroy_node_caches(void) { if (pcmk__peer_cache != NULL) { crm_trace("Destroying peer cache with %d members", g_hash_table_size(pcmk__peer_cache)); g_hash_table_destroy(pcmk__peer_cache); pcmk__peer_cache = NULL; } if (pcmk__remote_peer_cache != NULL) { crm_trace("Destroying remote peer cache with %d members", pcmk__cluster_num_remote_nodes()); g_hash_table_destroy(pcmk__remote_peer_cache); pcmk__remote_peer_cache = NULL; } if (cluster_node_cib_cache != NULL) { crm_trace("Destroying configured cluster node cache with %d members", g_hash_table_size(cluster_node_cib_cache)); g_hash_table_destroy(cluster_node_cib_cache); cluster_node_cib_cache = NULL; } } static void (*peer_status_callback)(enum pcmk__node_update, pcmk__node_status_t *, const void *) = NULL; /*! * \internal * \brief Set a client function that will be called after peer status changes * * \param[in] dispatch Pointer to function to use as callback * * \note Client callbacks should do only client-specific handling. Callbacks * must not add or remove entries in the peer caches. */ void pcmk__cluster_set_status_callback(void (*dispatch)(enum pcmk__node_update, pcmk__node_status_t *, const void *)) { // @TODO Improve documentation of peer_status_callback peer_status_callback = dispatch; } /*! * \internal * \brief Tell the library whether to automatically reap lost nodes * * If \c true (the default), calling \c crm_update_peer_proc() will also update * the peer state to \c PCMK_VALUE_MEMBER or \c PCMK__VALUE_LOST, and updating * the peer state will reap peers whose state changes to anything other than * \c PCMK_VALUE_MEMBER. * * Callers should leave this enabled unless they plan to manage the cache * separately on their own. * * \param[in] enable \c true to enable automatic reaping, \c false to disable */ void pcmk__cluster_set_autoreap(bool enable) { autoreap = enable; } static void dump_peer_hash(int level, const char *caller) { GHashTableIter iter; const char *id = NULL; pcmk__node_status_t *node = NULL; g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, (gpointer *) &id, (gpointer *) &node)) { do_crm_log(level, "%s: Node %" PRIu32 "/%s = %p - %s", caller, node->cluster_layer_id, node->name, node, id); } } static gboolean hash_find_by_data(gpointer key, gpointer value, gpointer user_data) { return value == user_data; } /*! * \internal * \brief Search cluster member node cache * * \param[in] id If not 0, cluster node ID to search for * \param[in] uname If not NULL, node name to search for * \param[in] uuid If not NULL while id is 0, node UUID instead of cluster * node ID to search for * * \return Cluster node cache entry if found, otherwise NULL */ static pcmk__node_status_t * search_cluster_member_cache(unsigned int id, const char *uname, const char *uuid) { GHashTableIter iter; pcmk__node_status_t *node = NULL; pcmk__node_status_t *by_id = NULL; pcmk__node_status_t *by_name = NULL; pcmk__assert((id > 0) || (uname != NULL)); pcmk__cluster_init_node_caches(); if (uname != NULL) { g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (pcmk__str_eq(node->name, uname, pcmk__str_casei)) { crm_trace("Name match: %s", node->name); by_name = node; break; } } } if (id > 0) { g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (node->cluster_layer_id == id) { crm_trace("ID match: %" PRIu32, node->cluster_layer_id); by_id = node; break; } } } else if (uuid != NULL) { g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (pcmk__str_eq(node->xml_id, uuid, pcmk__str_casei)) { crm_trace("UUID match: %s", node->xml_id); by_id = node; break; } } } node = by_id; /* Good default */ if(by_id == by_name) { /* Nothing to do if they match (both NULL counts) */ crm_trace("Consistent: %p for %u/%s", by_id, id, uname); } else if(by_id == NULL && by_name) { crm_trace("Only one: %p for %u/%s", by_name, id, uname); if (id && by_name->cluster_layer_id) { dump_peer_hash(LOG_WARNING, __func__); crm_crit("Nodes %u and %" PRIu32 " share the same name '%s'", id, by_name->cluster_layer_id, uname); node = NULL; /* Create a new one */ } else { node = by_name; } } else if(by_name == NULL && by_id) { crm_trace("Only one: %p for %u/%s", by_id, id, uname); if ((uname != NULL) && (by_id->name != NULL)) { dump_peer_hash(LOG_WARNING, __func__); crm_crit("Nodes '%s' and '%s' share the same cluster nodeid %u: " "assuming '%s' is correct", uname, by_id->name, id, uname); } } else if ((uname != NULL) && (by_id->name != NULL)) { if (pcmk__str_eq(uname, by_id->name, pcmk__str_casei)) { crm_notice("Node '%s' has changed its cluster layer ID " "from %" PRIu32 " to %" PRIu32, by_id->name, by_name->cluster_layer_id, by_id->cluster_layer_id); g_hash_table_foreach_remove(pcmk__peer_cache, hash_find_by_data, by_name); } else { crm_warn("Nodes '%s' and '%s' share the same cluster nodeid: %u %s", by_id->name, by_name->name, id, uname); dump_peer_hash(LOG_INFO, __func__); crm_abort(__FILE__, __func__, __LINE__, "member weirdness", TRUE, TRUE); } } else if ((id > 0) && (by_name->cluster_layer_id > 0)) { crm_warn("Nodes %" PRIu32 " and %" PRIu32 " share the same name: '%s'", by_id->cluster_layer_id, by_name->cluster_layer_id, uname); } else { /* Simple merge */ /* Only corosync-based clusters use node IDs. The functions that call * pcmk__update_peer_state() and crm_update_peer_proc() only know * nodeid, so 'by_id' is authoritative when merging. */ dump_peer_hash(LOG_DEBUG, __func__); crm_info("Merging %p into %p", by_name, by_id); g_hash_table_foreach_remove(pcmk__peer_cache, hash_find_by_data, by_name); } return node; } /*! * \internal * \brief Search caches for a node (cluster or Pacemaker Remote) * * \param[in] id If not 0, cluster node ID to search for * \param[in] uname If not NULL, node name to search for * \param[in] xml_id If not NULL, CIB XML ID of node to search for * \param[in] flags Group of enum pcmk__node_search_flags * * \return Node cache entry if found, otherwise NULL */ pcmk__node_status_t * pcmk__search_node_caches(unsigned int id, const char *uname, const char *xml_id, uint32_t flags) { pcmk__node_status_t *node = NULL; pcmk__assert((id > 0) || (uname != NULL) || (xml_id != NULL)); pcmk__cluster_init_node_caches(); if (pcmk_is_set(flags, pcmk__node_search_remote)) { if (uname != NULL) { node = g_hash_table_lookup(pcmk__remote_peer_cache, uname); } else if (xml_id != NULL) { node = g_hash_table_lookup(pcmk__remote_peer_cache, xml_id); } } if ((node == NULL) && pcmk_is_set(flags, pcmk__node_search_cluster_member)) { node = search_cluster_member_cache(id, uname, xml_id); } if ((node == NULL) && pcmk_is_set(flags, pcmk__node_search_cluster_cib)) { if (xml_id != NULL) { node = find_cib_cluster_node(xml_id, uname); } else { // Assumes XML ID is node ID as string (as with Corosync) char *id_str = (id == 0)? NULL : crm_strdup_printf("%u", id); node = find_cib_cluster_node(id_str, uname); free(id_str); } } return node; } /*! * \internal * \brief Purge a node from cache (both cluster and Pacemaker Remote) * * \param[in] node_name If not NULL, purge only nodes with this name * \param[in] node_id If not 0, purge cluster nodes only if they have this ID * * \note If \p node_name is NULL and \p node_id is 0, no nodes will be purged. * If \p node_name is not NULL and \p node_id is not 0, Pacemaker Remote * nodes that match \p node_name will be purged, and cluster nodes that * match both \p node_name and \p node_id will be purged. * \note The caller must be careful not to use \p node_name after calling this * function if it might be a pointer into a cache entry being removed. */ void pcmk__purge_node_from_cache(const char *node_name, uint32_t node_id) { char *node_name_copy = NULL; if ((node_name == NULL) && (node_id == 0U)) { return; } // Purge from Pacemaker Remote node cache if ((node_name != NULL) && (g_hash_table_lookup(pcmk__remote_peer_cache, node_name) != NULL)) { /* node_name could be a pointer into the cache entry being purged, * so reassign it to a copy before the original gets freed */ node_name_copy = pcmk__str_copy(node_name); node_name = node_name_copy; crm_trace("Purging %s from Pacemaker Remote node cache", node_name); g_hash_table_remove(pcmk__remote_peer_cache, node_name); } pcmk__cluster_forget_cluster_node(node_id, node_name); free(node_name_copy); } #if SUPPORT_COROSYNC static guint remove_conflicting_peer(pcmk__node_status_t *node) { int matches = 0; GHashTableIter iter; pcmk__node_status_t *existing_node = NULL; if ((node->cluster_layer_id == 0) || (node->name == NULL)) { return 0; } if (!pcmk__corosync_has_nodelist()) { return 0; } g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &existing_node)) { if ((existing_node->cluster_layer_id > 0) && (existing_node->cluster_layer_id != node->cluster_layer_id) && pcmk__str_eq(existing_node->name, node->name, pcmk__str_casei)) { if (pcmk__cluster_is_node_active(existing_node)) { continue; } crm_warn("Removing cached offline node %" PRIu32 "/%s which has " "conflicting name with %" PRIu32, existing_node->cluster_layer_id, existing_node->name, node->cluster_layer_id); g_hash_table_iter_remove(&iter); matches++; } } return matches; } #endif /*! * \internal * \brief Get a cluster node cache entry, possibly creating one if not found * * If \c pcmk__node_search_cluster_member is set in \p flags, the return value * is guaranteed not to be \c NULL. A new cache entry is created if one does not * already exist. * * \param[in] id If not 0, cluster node ID to search for * \param[in] uname If not NULL, node name to search for * \param[in] xml_id If not NULL while \p id is 0, search for this CIB XML ID * instead of a cluster ID * \param[in] flags Group of enum pcmk__node_search_flags * * \return (Possibly newly created) cluster node cache entry */ /* coverity[-alloc] Memory is referenced in one or both hashtables */ pcmk__node_status_t * pcmk__get_node(unsigned int id, const char *uname, const char *xml_id, uint32_t flags) { pcmk__node_status_t *node = NULL; char *uname_lookup = NULL; pcmk__assert((id > 0) || (uname != NULL)); pcmk__cluster_init_node_caches(); // Check the Pacemaker Remote node cache first if (pcmk_is_set(flags, pcmk__node_search_remote)) { node = g_hash_table_lookup(pcmk__remote_peer_cache, uname); if (node != NULL) { return node; } } if (!pcmk_is_set(flags, pcmk__node_search_cluster_member)) { return NULL; } node = search_cluster_member_cache(id, uname, xml_id); /* if uname wasn't provided, and find_peer did not turn up a uname based on id. * we need to do a lookup of the node name using the id in the cluster membership. */ if ((uname == NULL) && ((node == NULL) || (node->name == NULL))) { uname_lookup = pcmk__cluster_node_name(id); } if (uname_lookup) { uname = uname_lookup; crm_trace("Inferred a name of '%s' for node %u", uname, id); /* try to turn up the node one more time now that we know the uname. */ if (node == NULL) { node = search_cluster_member_cache(id, uname, xml_id); } } if (node == NULL) { char *uniqueid = crm_generate_uuid(); node = pcmk__assert_alloc(1, sizeof(pcmk__node_status_t)); crm_info("Created entry %s/%p for node %s/%u (%d total)", uniqueid, node, uname, id, 1 + g_hash_table_size(pcmk__peer_cache)); g_hash_table_replace(pcmk__peer_cache, uniqueid, node); } if ((id > 0) && (uname != NULL) && ((node->cluster_layer_id == 0) || (node->name == NULL))) { crm_info("Node %u is now known as %s", id, uname); } if ((id > 0) && (node->cluster_layer_id == 0)) { node->cluster_layer_id = id; } if ((uname != NULL) && (node->name == NULL)) { update_peer_uname(node, uname); } if ((xml_id == NULL) && (node->xml_id == NULL)) { - xml_id = pcmk__cluster_node_uuid(node); + xml_id = pcmk__cluster_get_xml_id(node); if (xml_id == NULL) { crm_debug("Cannot obtain an XML ID for node %s[%u] at this time", node->name, id); } else { crm_info("Node %s[%u] has XML ID %s", node->name, id, xml_id); } } free(uname_lookup); return node; } /*! * \internal * \brief Update a node's uname * * \param[in,out] node Node object to update * \param[in] uname New name to set * * \note This function should not be called within a peer cache iteration, * because in some cases it can remove conflicting cache entries, * which would invalidate the iterator. */ static void update_peer_uname(pcmk__node_status_t *node, const char *uname) { CRM_CHECK(uname != NULL, crm_err("Bug: can't update node name without name"); return); CRM_CHECK(node != NULL, crm_err("Bug: can't update node name to %s without node", uname); return); if (pcmk__str_eq(uname, node->name, pcmk__str_casei)) { crm_debug("Node name '%s' did not change", uname); return; } for (const char *c = uname; *c; ++c) { if ((*c >= 'A') && (*c <= 'Z')) { crm_warn("Node names with capitals are discouraged, consider changing '%s'", uname); break; } } pcmk__str_update(&node->name, uname); if (peer_status_callback != NULL) { peer_status_callback(pcmk__node_update_name, node, NULL); } #if SUPPORT_COROSYNC if ((pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync) && !pcmk_is_set(node->flags, pcmk__node_status_remote)) { remove_conflicting_peer(node); } #endif } /*! * \internal * \brief Get log-friendly string equivalent of a process flag * * \param[in] proc Process flag * * \return Log-friendly string equivalent of \p proc */ static inline const char * proc2text(enum crm_proc_flag proc) { const char *text = "unknown"; switch (proc) { case crm_proc_none: text = "none"; break; case crm_proc_cpg: text = "corosync-cpg"; break; } return text; } /*! * \internal * \brief Update a node's process information (and potentially state) * * \param[in] source Caller's function name (for log messages) * \param[in,out] node Node object to update * \param[in] flag Bitmask of new process information * \param[in] status node status (online, offline, etc.) * * \return NULL if any node was reaped from peer caches, value of node otherwise * * \note If this function returns NULL, the supplied node object was likely * freed and should not be used again. This function should not be * called within a cache iteration if reaping is possible, otherwise * reaping could invalidate the iterator. */ pcmk__node_status_t * crm_update_peer_proc(const char *source, pcmk__node_status_t *node, uint32_t flag, const char *status) { uint32_t last = 0; gboolean changed = FALSE; CRM_CHECK(node != NULL, crm_err("%s: Could not set %s to %s for NULL", source, proc2text(flag), status); return NULL); /* Pacemaker doesn't spawn processes on remote nodes */ if (pcmk_is_set(node->flags, pcmk__node_status_remote)) { return node; } last = node->processes; if (status == NULL) { node->processes = flag; if (node->processes != last) { changed = TRUE; } } else if (pcmk__str_eq(status, PCMK_VALUE_ONLINE, pcmk__str_casei)) { if ((node->processes & flag) != flag) { node->processes = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, "Peer process", node->name, node->processes, flag, "processes"); changed = TRUE; } } else if (node->processes & flag) { node->processes = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, "Peer process", node->name, node->processes, flag, "processes"); changed = TRUE; } if (changed) { if (status == NULL && flag <= crm_proc_none) { crm_info("%s: Node %s[%" PRIu32 "] - all processes are now offline", source, node->name, node->cluster_layer_id); } else { crm_info("%s: Node %s[%" PRIu32 "] - %s is now %s", source, node->name, node->cluster_layer_id, proc2text(flag), status); } if (pcmk_is_set(node->processes, crm_get_cluster_proc())) { node->when_online = time(NULL); } else { node->when_online = 0; } /* Call the client callback first, then update the peer state, * in case the node will be reaped */ if (peer_status_callback != NULL) { peer_status_callback(pcmk__node_update_processes, node, &last); } /* The client callback shouldn't touch the peer caches, * but as a safety net, bail if the peer cache was destroyed. */ if (pcmk__peer_cache == NULL) { return NULL; } if (autoreap) { const char *peer_state = NULL; if (pcmk_is_set(node->processes, crm_get_cluster_proc())) { peer_state = PCMK_VALUE_MEMBER; } else { peer_state = PCMK__VALUE_LOST; } node = pcmk__update_peer_state(__func__, node, peer_state, 0); } } else { crm_trace("%s: Node %s[%" PRIu32 "] - %s is unchanged (%s)", source, node->name, node->cluster_layer_id, proc2text(flag), status); } return node; } /*! * \internal * \brief Update a cluster node cache entry's expected join state * * \param[in] source Caller's function name (for logging) * \param[in,out] node Node to update * \param[in] expected Node's new join state */ void pcmk__update_peer_expected(const char *source, pcmk__node_status_t *node, const char *expected) { char *last = NULL; gboolean changed = FALSE; CRM_CHECK(node != NULL, crm_err("%s: Could not set 'expected' to %s", source, expected); return); /* Remote nodes don't participate in joins */ if (pcmk_is_set(node->flags, pcmk__node_status_remote)) { return; } last = node->expected; if (expected != NULL && !pcmk__str_eq(node->expected, expected, pcmk__str_casei)) { node->expected = strdup(expected); changed = TRUE; } if (changed) { crm_info("%s: Node %s[%" PRIu32 "] - expected state is now %s (was %s)", source, node->name, node->cluster_layer_id, expected, last); free(last); } else { crm_trace("%s: Node %s[%" PRIu32 "] - expected state is unchanged (%s)", source, node->name, node->cluster_layer_id, expected); } } /*! * \internal * \brief Update a node's state and membership information * * \param[in] source Caller's function name (for log messages) * \param[in,out] node Node object to update * \param[in] state Node's new state * \param[in] membership Node's new membership ID * \param[in,out] iter If not NULL, pointer to node's peer cache iterator * * \return NULL if any node was reaped, value of node otherwise * * \note If this function returns NULL, the supplied node object was likely * freed and should not be used again. This function may be called from * within a peer cache iteration if the iterator is supplied. */ static pcmk__node_status_t * update_peer_state_iter(const char *source, pcmk__node_status_t *node, const char *state, uint64_t membership, GHashTableIter *iter) { gboolean is_member; CRM_CHECK(node != NULL, crm_err("Could not set state for unknown host to %s " QB_XS " source=%s", state, source); return NULL); is_member = pcmk__str_eq(state, PCMK_VALUE_MEMBER, pcmk__str_none); if (is_member) { node->when_lost = 0; if (membership) { node->membership_id = membership; } } if (state && !pcmk__str_eq(node->state, state, pcmk__str_casei)) { char *last = node->state; if (is_member) { node->when_member = time(NULL); } else { node->when_member = 0; } node->state = strdup(state); crm_notice("Node %s state is now %s " QB_XS " nodeid=%" PRIu32 " previous=%s source=%s", node->name, state, node->cluster_layer_id, pcmk__s(last, "unknown"), source); if (peer_status_callback != NULL) { peer_status_callback(pcmk__node_update_state, node, last); } free(last); if (autoreap && !is_member && !pcmk_is_set(node->flags, pcmk__node_status_remote)) { /* We only autoreap from the peer cache, not the remote peer cache, * because the latter should be managed only by * refresh_remote_nodes(). */ if(iter) { crm_notice("Purged 1 peer with cluster layer ID %" PRIu32 "and/or name=%s from the membership cache", node->cluster_layer_id, node->name); g_hash_table_iter_remove(iter); } else { pcmk__cluster_forget_cluster_node(node->cluster_layer_id, node->name); } node = NULL; } } else { crm_trace("Node %s state is unchanged (%s) " QB_XS " nodeid=%" PRIu32 " source=%s", node->name, state, node->cluster_layer_id, source); } return node; } /*! * \brief Update a node's state and membership information * * \param[in] source Caller's function name (for log messages) * \param[in,out] node Node object to update * \param[in] state Node's new state * \param[in] membership Node's new membership ID * * \return NULL if any node was reaped, value of node otherwise * * \note If this function returns NULL, the supplied node object was likely * freed and should not be used again. This function should not be * called within a cache iteration if reaping is possible, * otherwise reaping could invalidate the iterator. */ pcmk__node_status_t * pcmk__update_peer_state(const char *source, pcmk__node_status_t *node, const char *state, uint64_t membership) { return update_peer_state_iter(source, node, state, membership, NULL); } /*! * \internal * \brief Reap all nodes from cache whose membership information does not match * * \param[in] membership Membership ID of nodes to keep */ void pcmk__reap_unseen_nodes(uint64_t membership) { GHashTableIter iter; pcmk__node_status_t *node = NULL; crm_trace("Reaping unseen nodes..."); g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *)&node)) { if (node->membership_id != membership) { if (node->state) { /* Calling update_peer_state_iter() allows us to remove the node * from pcmk__peer_cache without invalidating our iterator */ update_peer_state_iter(__func__, node, PCMK__VALUE_LOST, membership, &iter); } else { crm_info("State of node %s[%" PRIu32 "] is still unknown", node->name, node->cluster_layer_id); } } } } static pcmk__node_status_t * find_cib_cluster_node(const char *id, const char *uname) { GHashTableIter iter; pcmk__node_status_t *node = NULL; pcmk__node_status_t *by_id = NULL; pcmk__node_status_t *by_name = NULL; if (uname) { g_hash_table_iter_init(&iter, cluster_node_cib_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (pcmk__str_eq(node->name, uname, pcmk__str_casei)) { crm_trace("Name match: %s = %p", node->name, node); by_name = node; break; } } } if (id) { g_hash_table_iter_init(&iter, cluster_node_cib_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (pcmk__str_eq(node->xml_id, id, pcmk__str_casei)) { crm_trace("ID match: %s= %p", id, node); by_id = node; break; } } } node = by_id; /* Good default */ if (by_id == by_name) { /* Nothing to do if they match (both NULL counts) */ crm_trace("Consistent: %p for %s/%s", by_id, id, uname); } else if (by_id == NULL && by_name) { crm_trace("Only one: %p for %s/%s", by_name, id, uname); if (id) { node = NULL; } else { node = by_name; } } else if (by_name == NULL && by_id) { crm_trace("Only one: %p for %s/%s", by_id, id, uname); if (uname) { node = NULL; } } else if ((uname != NULL) && (by_id->name != NULL) && pcmk__str_eq(uname, by_id->name, pcmk__str_casei)) { /* Multiple nodes have the same uname in the CIB. * Return by_id. */ } else if ((id != NULL) && (by_name->xml_id != NULL) && pcmk__str_eq(id, by_name->xml_id, pcmk__str_casei)) { /* Multiple nodes have the same id in the CIB. * Return by_name. */ node = by_name; } else { node = NULL; } if (node == NULL) { crm_debug("Couldn't find node%s%s%s%s", id? " " : "", id? id : "", uname? " with name " : "", uname? uname : ""); } return node; } static void cluster_node_cib_cache_refresh_helper(xmlNode *xml_node, void *user_data) { const char *id = crm_element_value(xml_node, PCMK_XA_ID); const char *uname = crm_element_value(xml_node, PCMK_XA_UNAME); pcmk__node_status_t * node = NULL; CRM_CHECK(id != NULL && uname !=NULL, return); node = find_cib_cluster_node(id, uname); if (node == NULL) { char *uniqueid = crm_generate_uuid(); node = pcmk__assert_alloc(1, sizeof(pcmk__node_status_t)); node->name = pcmk__str_copy(uname); node->xml_id = pcmk__str_copy(id); g_hash_table_replace(cluster_node_cib_cache, uniqueid, node); } else if (pcmk_is_set(node->flags, pcmk__node_status_dirty)) { pcmk__str_update(&node->name, uname); /* Node is in cache and hasn't been updated already, so mark it clean */ clear_peer_flags(node, pcmk__node_status_dirty); } } static void refresh_cluster_node_cib_cache(xmlNode *cib) { pcmk__cluster_init_node_caches(); g_hash_table_foreach(cluster_node_cib_cache, mark_dirty, NULL); crm_foreach_xpath_result(cib, PCMK__XP_MEMBER_NODE_CONFIG, cluster_node_cib_cache_refresh_helper, NULL); // Remove all old cache entries that weren't seen in the CIB g_hash_table_foreach_remove(cluster_node_cib_cache, is_dirty, NULL); } void pcmk__refresh_node_caches_from_cib(xmlNode *cib) { refresh_remote_nodes(cib); refresh_cluster_node_cib_cache(cib); } // Deprecated functions kept only for backward API compatibility // LCOV_EXCL_START #include void crm_peer_init(void) { pcmk__cluster_init_node_caches(); } // LCOV_EXCL_STOP // End deprecated API