diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c index 58db5d8cf7..db24ec59d7 100644 --- a/daemons/attrd/attrd_elections.c +++ b/daemons/attrd/attrd_elections.c @@ -1,185 +1,184 @@ /* * Copyright 2013-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include "pacemaker-attrd.h" static char *peer_writer = NULL; -static pcmk__election_t *writer = NULL; static gboolean attrd_election_cb(gpointer user_data) { attrd_declare_winner(); /* Update the peers after an election */ attrd_peer_sync(NULL); /* After winning an election, update the CIB with the values of all * attributes as the winner knows them. */ attrd_write_attributes(attrd_write_all); return G_SOURCE_REMOVE; } void attrd_election_init(void) { - writer = election_init(pcmk_ipc_attrd, attrd_cluster->priv->node_name, - attrd_election_cb); + election_init(attrd_cluster, pcmk_ipc_attrd, attrd_cluster->priv->node_name, + attrd_election_cb); } void attrd_election_fini(void) { - election_fini(writer); + election_fini(attrd_cluster); } void attrd_start_election_if_needed(void) { if ((peer_writer == NULL) - && (election_state(writer) != election_in_progress) + && (election_state(attrd_cluster) != election_in_progress) && !attrd_shutting_down(false)) { crm_info("Starting an election to determine the writer"); - election_vote(writer); + election_vote(attrd_cluster); } } bool attrd_election_won(void) { - return (election_state(writer) == election_won); + return (election_state(attrd_cluster) == election_won); } void attrd_handle_election_op(const pcmk__node_status_t *peer, xmlNode *xml) { enum election_result rc = 0; - enum election_result previous = election_state(writer); + enum election_result previous = election_state(attrd_cluster); crm_xml_add(xml, PCMK__XA_SRC, 
peer->name); // Don't become writer if we're shutting down - rc = election_count_vote(writer, xml, !attrd_shutting_down(false)); + rc = election_count_vote(attrd_cluster, xml, !attrd_shutting_down(false)); switch(rc) { case election_start: crm_debug("Unsetting writer (was %s) and starting new election", peer_writer? peer_writer : "unset"); free(peer_writer); peer_writer = NULL; - election_vote(writer); + election_vote(attrd_cluster); break; case election_lost: /* The election API should really distinguish between "we just lost * to this peer" and "we already lost previously, and we are * discarding this vote for some reason", but it doesn't. * * In the first case, we want to tentatively set the peer writer to * this peer, even though another peer may eventually win (which we * will learn via attrd_check_for_new_writer()), so * attrd_start_election_if_needed() doesn't start a new election. * * Approximate a test for that case as best as possible. */ if ((peer_writer == NULL) || (previous != election_lost)) { pcmk__str_update(&peer_writer, peer->name); crm_debug("Election lost, presuming %s is writer for now", peer_writer); } break; case election_in_progress: - election_check(writer); + election_check(attrd_cluster); break; default: crm_info("Ignoring election op from %s due to error", peer->name); break; } } bool attrd_check_for_new_writer(const pcmk__node_status_t *peer, const xmlNode *xml) { int peer_state = 0; crm_element_value_int(xml, PCMK__XA_ATTR_WRITER, &peer_state); if (peer_state == election_won) { - if ((election_state(writer) == election_won) + if ((election_state(attrd_cluster) == election_won) && !pcmk__str_eq(peer->name, attrd_cluster->priv->node_name, pcmk__str_casei)) { crm_notice("Detected another attribute writer (%s), starting new " "election", peer->name); - election_vote(writer); + election_vote(attrd_cluster); } else if (!pcmk__str_eq(peer->name, peer_writer, pcmk__str_casei)) { crm_notice("Recorded new attribute writer: %s (was %s)", 
peer->name, pcmk__s(peer_writer, "unset")); pcmk__str_update(&peer_writer, peer->name); } } return (peer_state == election_won); } void attrd_declare_winner(void) { crm_notice("Recorded local node as attribute writer (was %s)", (peer_writer? peer_writer : "unset")); pcmk__str_update(&peer_writer, attrd_cluster->priv->node_name); } void attrd_remove_voter(const pcmk__node_status_t *peer) { - election_remove(writer, peer->name); + election_remove(attrd_cluster, peer->name); if ((peer_writer != NULL) && pcmk__str_eq(peer->name, peer_writer, pcmk__str_casei)) { free(peer_writer); peer_writer = NULL; crm_notice("Lost attribute writer %s", peer->name); /* Clear any election dampening in effect. Otherwise, if the lost writer * had just won, the election could fizzle out with no new writer. */ - election_clear_dampening(writer); + election_clear_dampening(attrd_cluster); /* If the writer received attribute updates during its shutdown, it will * not have written them to the CIB. Ensure we get a new writer so they * are written out. This means that every node that sees the writer * leave will start a new election, but that's better than losing * attributes. */ attrd_start_election_if_needed(); /* If an election is in progress, we need to call election_check(), in case * this lost peer is the only one that hasn't voted, otherwise the election * would be pending until it's timed out. 
*/ - } else if (election_state(writer) == election_in_progress) { + } else if (election_state(attrd_cluster) == election_in_progress) { crm_debug("Checking election status upon loss of voter %s", peer->name); - election_check(writer); + election_check(attrd_cluster); } } void attrd_xml_add_writer(xmlNode *xml) { - crm_xml_add_int(xml, PCMK__XA_ATTR_WRITER, election_state(writer)); + crm_xml_add_int(xml, PCMK__XA_ATTR_WRITER, election_state(attrd_cluster)); } diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c index ff9fb68444..83d176c551 100644 --- a/daemons/controld/controld_control.c +++ b/daemons/controld/controld_control.c @@ -1,698 +1,699 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include static qb_ipcs_service_t *ipcs = NULL; static crm_trigger_t *config_read_trigger = NULL; #if SUPPORT_COROSYNC extern gboolean crm_connect_corosync(pcmk_cluster_t *cluster); #endif static void crm_shutdown(int nsig); static gboolean crm_read_options(gpointer user_data); /* A_HA_CONNECT */ void do_ha_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean registered = FALSE; - static pcmk_cluster_t *cluster = NULL; - if (cluster == NULL) { - cluster = pcmk_cluster_new(); + if (controld_globals.cluster == NULL) { + controld_globals.cluster = pcmk_cluster_new(); } if (action & A_HA_DISCONNECT) { - pcmk_cluster_disconnect(cluster); + pcmk_cluster_disconnect(controld_globals.cluster); crm_info("Disconnected from the cluster"); controld_set_fsa_input_flags(R_HA_DISCONNECTED); } if (action & A_HA_CONNECT) { 
pcmk__cluster_set_status_callback(&peer_update_callback); pcmk__cluster_set_autoreap(false); #if SUPPORT_COROSYNC if (pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync) { - registered = crm_connect_corosync(cluster); + registered = crm_connect_corosync(controld_globals.cluster); } #endif // SUPPORT_COROSYNC if (registered) { pcmk__node_status_t *node = - pcmk__get_node(cluster->priv->node_id, cluster->priv->node_name, - NULL, pcmk__node_search_cluster_member); + pcmk__get_node(controld_globals.cluster->priv->node_id, + controld_globals.cluster->priv->node_name, NULL, + pcmk__node_search_cluster_member); - controld_election_init(cluster->priv->node_name); - controld_globals.our_nodename = cluster->priv->node_name; + controld_election_init(controld_globals.cluster->priv->node_name); + controld_globals.our_nodename = + controld_globals.cluster->priv->node_name; free(controld_globals.our_uuid); controld_globals.our_uuid = pcmk__str_copy(pcmk__cluster_node_uuid(node)); if (controld_globals.our_uuid == NULL) { crm_err("Could not obtain local uuid"); registered = FALSE; } } if (!registered) { controld_set_fsa_input_flags(R_HA_DISCONNECTED); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } populate_cib_nodes(node_update_none, __func__); controld_clear_fsa_input_flags(R_HA_DISCONNECTED); crm_info("Connected to the cluster"); } if (action & ~(A_HA_CONNECT | A_HA_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __func__); } } /* A_SHUTDOWN */ void do_shutdown(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { /* just in case */ controld_set_fsa_input_flags(R_SHUTDOWN); controld_disconnect_fencer(FALSE); } /* A_SHUTDOWN_REQ */ void do_shutdown_req(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { xmlNode *msg = NULL; controld_set_fsa_input_flags(R_SHUTDOWN); 
//controld_set_fsa_input_flags(R_STAYDOWN); crm_info("Sending shutdown request to all peers (DC is %s)", pcmk__s(controld_globals.dc_name, "not set")); msg = pcmk__new_request(pcmk_ipc_controld, CRM_SYSTEM_CRMD, NULL, CRM_SYSTEM_CRMD, CRM_OP_SHUTDOWN_REQ, NULL); if (!pcmk__cluster_send_message(NULL, pcmk_ipc_controld, msg)) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } pcmk__xml_free(msg); } void crmd_fast_exit(crm_exit_t exit_code) { if (pcmk_is_set(controld_globals.fsa_input_register, R_STAYDOWN)) { crm_warn("Inhibiting respawn " QB_XS " remapping exit code %d to %d", exit_code, CRM_EX_FATAL); exit_code = CRM_EX_FATAL; } else if ((exit_code == CRM_EX_OK) && pcmk_is_set(controld_globals.fsa_input_register, R_IN_RECOVERY)) { crm_err("Could not recover from internal error"); exit_code = CRM_EX_ERROR; } if (controld_globals.logger_out != NULL) { controld_globals.logger_out->finish(controld_globals.logger_out, exit_code, true, NULL); pcmk__output_free(controld_globals.logger_out); controld_globals.logger_out = NULL; } crm_exit(exit_code); } crm_exit_t crmd_exit(crm_exit_t exit_code) { GMainLoop *mloop = controld_globals.mainloop; static bool in_progress = FALSE; if (in_progress && (exit_code == CRM_EX_OK)) { crm_debug("Exit is already in progress"); return exit_code; } else if(in_progress) { crm_notice("Error during shutdown process, exiting now with status %d (%s)", exit_code, crm_exit_str(exit_code)); crm_write_blackbox(SIGTRAP, NULL); crmd_fast_exit(exit_code); } in_progress = TRUE; crm_trace("Preparing to exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); /* Suppress secondary errors resulting from us disconnecting everything */ controld_set_fsa_input_flags(R_HA_DISCONNECTED); /* Close all IPC servers and clients to ensure any and all shared memory files are cleaned up */ if(ipcs) { crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; } controld_close_attrd_ipc(); controld_shutdown_schedulerd_ipc(); 
controld_disconnect_fencer(TRUE); if ((exit_code == CRM_EX_OK) && (controld_globals.mainloop == NULL)) { crm_debug("No mainloop detected"); exit_code = CRM_EX_ERROR; } /* On an error, just get out. * * Otherwise, make the effort to have mainloop exit gracefully so * that it (mostly) cleans up after itself and valgrind has less * to report on - allowing real errors stand out */ if (exit_code != CRM_EX_OK) { crm_notice("Forcing immediate exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); crm_write_blackbox(SIGTRAP, NULL); crmd_fast_exit(exit_code); } /* Clean up as much memory as possible for valgrind */ for (GList *iter = controld_globals.fsa_message_queue; iter != NULL; iter = iter->next) { fsa_data_t *fsa_data = (fsa_data_t *) iter->data; crm_info("Dropping %s: [ state=%s cause=%s origin=%s ]", fsa_input2string(fsa_data->fsa_input), fsa_state2string(controld_globals.fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); delete_fsa_input(fsa_data); } controld_clear_fsa_input_flags(R_MEMBERSHIP); g_list_free(controld_globals.fsa_message_queue); controld_globals.fsa_message_queue = NULL; controld_free_node_pending_timers(); controld_election_fini(); /* Tear down the CIB manager connection, but don't free it yet -- it could * be used when we drain the mainloop later. 
*/ controld_disconnect_cib_manager(); verify_stopped(controld_globals.fsa_state, LOG_WARNING); controld_clear_fsa_input_flags(R_LRM_CONNECTED); lrm_state_destroy_all(); mainloop_destroy_trigger(config_read_trigger); config_read_trigger = NULL; controld_destroy_fsa_trigger(); controld_destroy_transition_trigger(); pcmk__client_cleanup(); pcmk__cluster_destroy_node_caches(); controld_free_fsa_timers(); te_cleanup_stonith_history_sync(NULL, TRUE); controld_free_sched_timer(); free(controld_globals.our_nodename); controld_globals.our_nodename = NULL; free(controld_globals.our_uuid); controld_globals.our_uuid = NULL; free(controld_globals.dc_name); controld_globals.dc_name = NULL; free(controld_globals.dc_version); controld_globals.dc_version = NULL; free(controld_globals.cluster_name); controld_globals.cluster_name = NULL; free(controld_globals.te_uuid); controld_globals.te_uuid = NULL; free_max_generation(); controld_destroy_failed_sync_table(); controld_destroy_outside_events_table(); mainloop_destroy_signal(SIGPIPE); mainloop_destroy_signal(SIGUSR1); mainloop_destroy_signal(SIGTERM); mainloop_destroy_signal(SIGTRAP); /* leave SIGCHLD engaged as we might still want to drain some service-actions */ if (mloop) { GMainContext *ctx = g_main_loop_get_context(controld_globals.mainloop); /* Don't re-enter this block */ controld_globals.mainloop = NULL; /* no signals on final draining anymore */ mainloop_destroy_signal(SIGCHLD); crm_trace("Draining mainloop %d %d", g_main_loop_is_running(mloop), g_main_context_pending(ctx)); { int lpc = 0; while((g_main_context_pending(ctx) && lpc < 10)) { lpc++; crm_trace("Iteration %d", lpc); g_main_context_dispatch(ctx); } } crm_trace("Closing mainloop %d %d", g_main_loop_is_running(mloop), g_main_context_pending(ctx)); g_main_loop_quit(mloop); /* Won't do anything yet, since we're inside it now */ g_main_loop_unref(mloop); } else { mainloop_destroy_signal(SIGCHLD); } cib_delete(controld_globals.cib_conn); controld_globals.cib_conn = 
NULL; throttle_fini(); /* Graceful */ crm_trace("Done preparing for exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); return exit_code; } /* A_EXIT_0, A_EXIT_1 */ void do_exit(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_exit_t exit_code = CRM_EX_OK; if (pcmk_is_set(action, A_EXIT_1)) { exit_code = CRM_EX_ERROR; crm_err("Exiting now due to errors"); } verify_stopped(cur_state, LOG_ERR); crmd_exit(exit_code); } static void sigpipe_ignore(int nsig) { return; } /* A_STARTUP */ void do_startup(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_debug("Registering Signal Handlers"); mainloop_add_signal(SIGTERM, crm_shutdown); mainloop_add_signal(SIGPIPE, sigpipe_ignore); config_read_trigger = mainloop_add_trigger(G_PRIORITY_HIGH, crm_read_options, NULL); controld_init_fsa_trigger(); controld_init_transition_trigger(); crm_debug("Creating CIB manager and executor objects"); controld_globals.cib_conn = cib_new(); lrm_state_init_local(); if (controld_init_fsa_timers() == FALSE) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } // \return libqb error code (0 on success, -errno on error) static int32_t accept_controller_client(qb_ipcs_connection_t *c, uid_t uid, gid_t gid) { crm_trace("Accepting new IPC client connection"); if (pcmk__new_client(c, uid, gid) == NULL) { return -ENOMEM; } return 0; } // \return libqb error code (0 on success, -errno on error) static int32_t dispatch_controller_ipc(qb_ipcs_connection_t * c, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; pcmk__client_t *client = pcmk__find_client(c); xmlNode *msg = pcmk__client_data2xml(client, data, &id, &flags); if (msg == NULL) { pcmk__ipc_send_ack(client, id, flags, PCMK__XE_ACK, NULL, CRM_EX_PROTOCOL); return 0; } pcmk__ipc_send_ack(client, id, flags, PCMK__XE_ACK, NULL, 
CRM_EX_INDETERMINATE); CRM_ASSERT(client->user != NULL); pcmk__update_acl_user(msg, PCMK__XA_CRM_USER, client->user); crm_xml_add(msg, PCMK__XA_CRM_SYS_FROM, client->id); if (controld_authorize_ipc_message(msg, client, NULL)) { crm_trace("Processing IPC message from client %s", pcmk__client_name(client)); route_message(C_IPC_MESSAGE, msg); } controld_trigger_fsa(); pcmk__xml_free(msg); return 0; } static int32_t ipc_client_disconnected(qb_ipcs_connection_t *c) { pcmk__client_t *client = pcmk__find_client(c); if (client) { crm_trace("Disconnecting %sregistered client %s (%p/%p)", (client->userdata? "" : "un"), pcmk__client_name(client), c, client); free(client->userdata); pcmk__free_client(client); controld_trigger_fsa(); } return 0; } static void ipc_connection_destroyed(qb_ipcs_connection_t *c) { crm_trace("Connection %p", c); ipc_client_disconnected(c); } /* A_STOP */ void do_stop(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } /* A_STARTED */ void do_started(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { static struct qb_ipcs_service_handlers crmd_callbacks = { .connection_accept = accept_controller_client, .connection_created = NULL, .msg_process = dispatch_controller_ipc, .connection_closed = ipc_client_disconnected, .connection_destroyed = ipc_connection_destroyed }; if (cur_state != S_STARTING) { crm_err("Start cancelled... 
%s", fsa_state2string(cur_state)); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_MEMBERSHIP)) { crm_info("Delaying start, no membership data (%.16llx)", R_MEMBERSHIP); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_LRM_CONNECTED)) { crm_info("Delaying start, not connected to executor (%.16llx)", R_LRM_CONNECTED); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_CIB_CONNECTED)) { crm_info("Delaying start, CIB not connected (%.16llx)", R_CIB_CONNECTED); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_READ_CONFIG)) { crm_info("Delaying start, Config not read (%.16llx)", R_READ_CONFIG); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_PEER_DATA)) { crm_info("Delaying start, No peer data (%.16llx)", R_PEER_DATA); crmd_fsa_stall(TRUE); return; } crm_debug("Init server comms"); ipcs = pcmk__serve_controld_ipc(&crmd_callbacks); if (ipcs == NULL) { crm_err("Failed to create IPC server: shutting down and inhibiting respawn"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } else { crm_notice("Pacemaker controller successfully started and accepting connections"); } controld_set_fsa_input_flags(R_ST_REQUIRED); controld_timer_fencer_connect(GINT_TO_POINTER(TRUE)); controld_clear_fsa_input_flags(R_STARTING); register_fsa_input(msg_data->fsa_cause, I_PENDING, NULL); } /* A_RECOVER */ void do_recover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { controld_set_fsa_input_flags(R_IN_RECOVERY); crm_warn("Fast-tracking shutdown in response to errors"); register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } static void config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { const char *value = NULL; GHashTable *config_hash = NULL; crm_time_t *now = 
crm_time_new(NULL); xmlNode *crmconfig = NULL; xmlNode *alerts = NULL; if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query resulted in an error: %s", pcmk_strerror(rc)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); if (rc == -EACCES || rc == -pcmk_err_schema_validation) { crm_err("The cluster is mis-configured - shutting down and staying down"); controld_set_fsa_input_flags(R_STAYDOWN); } goto bail; } crmconfig = output; if ((crmconfig != NULL) && !pcmk__xe_is(crmconfig, PCMK_XE_CRM_CONFIG)) { crmconfig = pcmk__xe_first_child(crmconfig, PCMK_XE_CRM_CONFIG, NULL, NULL); } if (!crmconfig) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query for " PCMK_XE_CRM_CONFIG " section failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); goto bail; } crm_debug("Call %d : Parsing CIB options", call_id); config_hash = pcmk__strkey_table(free, free); pe_unpack_nvpairs(crmconfig, crmconfig, PCMK_XE_CLUSTER_PROPERTY_SET, NULL, config_hash, PCMK_VALUE_CIB_BOOTSTRAP_OPTIONS, FALSE, now, NULL); // Validate all options, and use defaults if not already present in hash pcmk__validate_cluster_options(config_hash); /* Validate the watchdog timeout in the context of the local node * environment. If invalid, the controller will exit with a fatal error. * * We do this via a wrapper in the controller, so that we call * pcmk__valid_stonith_watchdog_timeout() only if watchdog fencing is * enabled for the local node. Otherwise, we may exit unnecessarily. * * A validator function in libcrmcommon can't act as such a wrapper, because * it doesn't have a stonith API connection or the local node name. 
*/ value = g_hash_table_lookup(config_hash, PCMK_OPT_STONITH_WATCHDOG_TIMEOUT); controld_verify_stonith_watchdog_timeout(value); value = g_hash_table_lookup(config_hash, PCMK_OPT_NO_QUORUM_POLICY); if (pcmk__str_eq(value, PCMK_VALUE_FENCE_LEGACY, pcmk__str_casei) && (pcmk__locate_sbd() != 0)) { controld_set_global_flags(controld_no_quorum_suicide); } value = g_hash_table_lookup(config_hash, PCMK_OPT_SHUTDOWN_LOCK); if (crm_is_true(value)) { controld_set_global_flags(controld_shutdown_lock_enabled); } else { controld_clear_global_flags(controld_shutdown_lock_enabled); } value = g_hash_table_lookup(config_hash, PCMK_OPT_SHUTDOWN_LOCK_LIMIT); pcmk_parse_interval_spec(value, &controld_globals.shutdown_lock_limit); controld_globals.shutdown_lock_limit /= 1000; value = g_hash_table_lookup(config_hash, PCMK_OPT_NODE_PENDING_TIMEOUT); pcmk_parse_interval_spec(value, &controld_globals.node_pending_timeout); controld_globals.node_pending_timeout /= 1000; value = g_hash_table_lookup(config_hash, PCMK_OPT_CLUSTER_NAME); pcmk__str_update(&(controld_globals.cluster_name), value); // Let subcomponents initialize their own static variables controld_configure_election(config_hash); controld_configure_fencing(config_hash); controld_configure_fsa_timers(config_hash); controld_configure_throttle(config_hash); alerts = pcmk__xe_first_child(output, PCMK_XE_ALERTS, NULL, NULL); crmd_unpack_alerts(alerts); controld_set_fsa_input_flags(R_READ_CONFIG); controld_trigger_fsa(); g_hash_table_destroy(config_hash); bail: crm_time_free(now); } /*! 
* \internal * \brief Trigger read and processing of the configuration * * \param[in] fn Calling function name * \param[in] line Line number where call occurred */ void controld_trigger_config_as(const char *fn, int line) { if (config_read_trigger != NULL) { crm_trace("%s:%d - Triggered config processing", fn, line); mainloop_set_trigger(config_read_trigger); } } gboolean crm_read_options(gpointer user_data) { cib_t *cib_conn = controld_globals.cib_conn; int call_id = cib_conn->cmds->query(cib_conn, "//" PCMK_XE_CRM_CONFIG " | //" PCMK_XE_ALERTS, NULL, cib_xpath); fsa_register_cib_callback(call_id, NULL, config_query_callback); crm_trace("Querying the CIB... call %d", call_id); return TRUE; } /* A_READCONFIG */ void do_read_config(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { throttle_init(); controld_trigger_config(); } static void crm_shutdown(int nsig) { const char *value = NULL; guint default_period_ms = 0; if ((controld_globals.mainloop == NULL) || !g_main_loop_is_running(controld_globals.mainloop)) { crmd_exit(CRM_EX_OK); return; } if (pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) { crm_err("Escalating shutdown"); register_fsa_input_before(C_SHUTDOWN, I_ERROR, NULL); return; } controld_set_fsa_input_flags(R_SHUTDOWN); register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL); /* If shutdown timer doesn't have a period set, use the default * * @TODO: Evaluate whether this is still necessary. As long as * config_query_callback() has been run at least once, it doesn't look like * anything could have changed the timer period since then. 
*/ value = pcmk__cluster_option(NULL, PCMK_OPT_SHUTDOWN_ESCALATION); pcmk_parse_interval_spec(value, &default_period_ms); controld_shutdown_start_countdown(default_period_ms); } diff --git a/daemons/controld/controld_election.c b/daemons/controld/controld_election.c index 4f6208c90c..3a0cad1cb0 100644 --- a/daemons/controld/controld_election.c +++ b/daemons/controld/controld_election.c @@ -1,290 +1,289 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include -static pcmk__election_t *fsa_election = NULL; - static gboolean election_win_cb(gpointer data) { register_fsa_input(C_FSA_INTERNAL, I_ELECTION_DC, NULL); return FALSE; } void controld_election_init(const char *uname) { - fsa_election = election_init(pcmk_ipc_controld, uname, election_win_cb); + election_init(controld_globals.cluster, pcmk_ipc_controld, uname, + election_win_cb); } /*! * \internal * \brief Configure election options based on the CIB * * \param[in,out] options Name/value pairs for configured options */ void controld_configure_election(GHashTable *options) { const char *value = g_hash_table_lookup(options, PCMK_OPT_ELECTION_TIMEOUT); guint interval_ms = 0U; pcmk_parse_interval_spec(value, &interval_ms); - election_timeout_set_period(fsa_election, interval_ms); + election_timeout_set_period(controld_globals.cluster, interval_ms); } void controld_remove_voter(const char *uname) { - election_remove(fsa_election, uname); + election_remove(controld_globals.cluster, uname); if (pcmk__str_eq(uname, controld_globals.dc_name, pcmk__str_casei)) { /* Clear any election dampening in effect. Otherwise, if the lost DC had * just won, an immediate new election could fizzle out with no new DC. 
*/ - election_clear_dampening(fsa_election); + election_clear_dampening(controld_globals.cluster); } } void controld_election_fini(void) { - election_fini(fsa_election); - fsa_election = NULL; + election_fini(controld_globals.cluster); } void controld_stop_current_election_timeout(void) { - election_timeout_stop(fsa_election); + election_timeout_stop(controld_globals.cluster); } /* A_ELECTION_VOTE */ void do_election_vote(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean not_voting = FALSE; /* don't vote if we're in one of these states or wanting to shut down */ switch (cur_state) { case S_STARTING: case S_RECOVERY: case S_STOPPING: case S_TERMINATE: crm_warn("Not voting in election, we're in state %s", fsa_state2string(cur_state)); not_voting = TRUE; break; case S_ELECTION: case S_INTEGRATION: case S_RELEASE_DC: break; default: crm_err("Broken? Voting in state %s", fsa_state2string(cur_state)); break; } if (not_voting == FALSE) { if (pcmk_is_set(controld_globals.fsa_input_register, R_STARTING)) { not_voting = TRUE; } } if (not_voting) { if (AM_I_DC) { register_fsa_input(C_FSA_INTERNAL, I_RELEASE_DC, NULL); } else { register_fsa_input(C_FSA_INTERNAL, I_PENDING, NULL); } return; } - election_vote(fsa_election); + election_vote(controld_globals.cluster); return; } void do_election_check(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { if (controld_globals.fsa_state == S_ELECTION) { - election_check(fsa_election); + election_check(controld_globals.cluster); } else { crm_debug("Ignoring election check because we are not in an election"); } } /* A_ELECTION_COUNT */ void do_election_count_vote(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { enum election_result rc = 0; ha_msg_input_t *vote = 
fsa_typed_data(fsa_dt_ha_msg); if (pcmk__peer_cache == NULL) { if (!pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) { crm_err("Internal error, no peer cache"); } return; } - rc = election_count_vote(fsa_election, vote->msg, cur_state != S_STARTING); + rc = election_count_vote(controld_globals.cluster, vote->msg, + (cur_state != S_STARTING)); switch(rc) { case election_start: - election_reset(fsa_election); + election_reset(controld_globals.cluster); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); break; case election_lost: update_dc(NULL); if (pcmk_is_set(controld_globals.fsa_input_register, R_THE_DC)) { cib_t *cib_conn = controld_globals.cib_conn; register_fsa_input(C_FSA_INTERNAL, I_RELEASE_DC, NULL); cib_conn->cmds->set_secondary(cib_conn, cib_none); } else if (cur_state != S_STARTING) { register_fsa_input(C_FSA_INTERNAL, I_PENDING, NULL); } break; default: crm_trace("Election message resulted in state %d", rc); } } static void feature_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; crm_notice("Feature update failed: %s " QB_XS " rc=%d", pcmk_strerror(rc), rc); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } /*! 
* \internal * \brief Update a node attribute in the CIB during a DC takeover * * \param[in] name Name of attribute to update * \param[in] value New attribute value */ #define dc_takeover_update_attr(name, value) do { \ cib__update_node_attr(controld_globals.logger_out, \ controld_globals.cib_conn, cib_none, \ PCMK_XE_CRM_CONFIG, NULL, NULL, NULL, NULL, \ name, value, NULL, NULL); \ } while (0) /* A_DC_TAKEOVER */ void do_dc_takeover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { xmlNode *cib = NULL; const enum pcmk_cluster_layer cluster_layer = pcmk_get_cluster_layer(); const char *cluster_layer_s = pcmk_cluster_layer_text(cluster_layer); pid_t watchdog = pcmk__locate_sbd(); crm_info("Taking over DC status for this partition"); controld_set_fsa_input_flags(R_THE_DC); execute_stonith_cleanup(); - election_reset(fsa_election); + election_reset(controld_globals.cluster); controld_set_fsa_input_flags(R_JOIN_OK|R_INVOKE_PE); controld_globals.cib_conn->cmds->set_primary(controld_globals.cib_conn, cib_none); cib = pcmk__xe_create(NULL, PCMK_XE_CIB); crm_xml_add(cib, PCMK_XA_CRM_FEATURE_SET, CRM_FEATURE_SET); controld_update_cib(PCMK_XE_CIB, cib, cib_none, feature_update_callback); dc_takeover_update_attr(PCMK_OPT_HAVE_WATCHDOG, pcmk__btoa(watchdog)); dc_takeover_update_attr(PCMK_OPT_DC_VERSION, PACEMAKER_VERSION "-" BUILD_VERSION); dc_takeover_update_attr(PCMK_OPT_CLUSTER_INFRASTRUCTURE, cluster_layer_s); #if SUPPORT_COROSYNC if ((controld_globals.cluster_name == NULL) && (pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync)) { char *cluster_name = pcmk__corosync_cluster_name(); if (cluster_name != NULL) { dc_takeover_update_attr(PCMK_OPT_CLUSTER_NAME, cluster_name); } free(cluster_name); } #endif controld_trigger_config(); pcmk__xml_free(cib); } /* A_DC_RELEASE */ void do_dc_release(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input 
current_input, fsa_data_t * msg_data) { if (action & A_DC_RELEASE) { crm_debug("Releasing the role of DC"); controld_clear_fsa_input_flags(R_THE_DC); controld_expect_sched_reply(NULL); } else if (action & A_DC_RELEASED) { crm_info("DC role released"); if (pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) { xmlNode *update = NULL; pcmk__node_status_t *node = pcmk__get_node(0, controld_globals.our_nodename, NULL, pcmk__node_search_cluster_member); pcmk__update_peer_expected(__func__, node, CRMD_JOINSTATE_DOWN); update = create_node_state_update(node, node_update_expected, NULL, __func__); /* Don't need a based response because controld will stop. */ fsa_cib_anon_update_discard_reply(PCMK_XE_STATUS, update); pcmk__xml_free(update); } register_fsa_input(C_FSA_INTERNAL, I_RELEASE_SUCCESS, NULL); } else { crm_err("Unknown DC action %s", fsa_action2string(action)); } crm_trace("Am I still the DC? %s", pcmk__btoa(AM_I_DC)); } diff --git a/daemons/controld/controld_globals.h b/daemons/controld/controld_globals.h index 9cf105f40a..f2b0166ff0 100644 --- a/daemons/controld/controld_globals.h +++ b/daemons/controld/controld_globals.h @@ -1,151 +1,156 @@ /* * Copyright 2022-2023 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef CONTROLD_GLOBALS__H # define CONTROLD_GLOBALS__H #include // pcmk__output_t, etc. #include // uint32_t, uint64_t #include // GList, GMainLoop #include // cib_t #include // pcmk__graph_t #include // enum crmd_fsa_state typedef struct { - // Booleans - - //! Group of \p controld_flags values + // Group of \p controld_flags values uint32_t flags; - // Controller FSA + /* Controller finite state automaton */ - //! FSA state + // FSA state enum crmd_fsa_state fsa_state; - //! 
FSA actions (group of \p A_* flags) + // FSA actions (group of \p A_* flags) uint64_t fsa_actions; - //! FSA input register contents (group of \p R_* flags) + // FSA input register contents (group of \p R_* flags) uint64_t fsa_input_register; - //! FSA message queue + // FSA message queue GList *fsa_message_queue; - // CIB + /* CIB */ - //! Connection to the CIB + // Connection to the CIB cib_t *cib_conn; - // Scheduler + /* Scheduler */ - //! Reference of the scheduler request being waited on + // Reference of the scheduler request being waited on char *fsa_pe_ref; - // Transitioner + /* Transitioner */ - //! Transitioner UUID + // Transitioner UUID char *te_uuid; - //! Graph of transition currently being processed + // Graph of transition currently being processed pcmk__graph_t *transition_graph; - // Logging + /* Logging */ - //! Output object for controller log messages + // Output object for controller log messages pcmk__output_t *logger_out; - // Other + /* Cluster layer */ - //! Cluster name + // Cluster name char *cluster_name; - //! Designated controller name + // Cluster connection + pcmk_cluster_t *cluster; + + /* @TODO Figure out, document, and clean up the code involving + * controld_globals.membership_id, controld_globals.peer_seq, and + * highest_seq. It's convoluted with no comments. It has something to do + * with corosync quorum notifications and the current ring ID, but it's + * unclear why we need three separate variables for it. + */ + // Last saved cluster communication layer membership ID + unsigned long long membership_id; + + unsigned long long peer_seq; + + + /* Other */ + + // Designated controller name char *dc_name; - //! Designated controller's Pacemaker version + // Designated controller's Pacemaker version char *dc_version; - //! Local node's node name + // Local node's node name + // @TODO Use controld_globals.cluster->priv->node_name instead char *our_nodename; - //! Local node's UUID + // Local node's UUID char *our_uuid; - //! 
Last saved cluster communication layer membership ID - unsigned long long membership_id; - - /* @TODO Figure out, document, and clean up the code involving - * controld_peer_seq, controld_globals.membership_id, and highest_seq. It's - * convoluted with no comments. It has something to do with corosync quorum - * notifications and the current ring ID, but it's unclear why we need three - * separate variables for it. - */ - unsigned long long peer_seq; - - //! Max lifetime (in seconds) of a resource's shutdown lock to a node + // Max lifetime (in seconds) of a resource's shutdown lock to a node guint shutdown_lock_limit; - //! Node pending timeout + // Node pending timeout guint node_pending_timeout; - //! Main event loop + // Main event loop GMainLoop *mainloop; } controld_globals_t; extern controld_globals_t controld_globals; /*! * \internal * \enum controld_flags * \brief Bit flags to store various controller state and configuration info */ enum controld_flags { //! The DC left in a membership change that is being processed controld_dc_left = (1 << 0), //! The FSA is stalled waiting for further input controld_fsa_is_stalled = (1 << 1), //! The local node has been in a quorate partition at some point controld_ever_had_quorum = (1 << 2), //! The local node is currently in a quorate partition controld_has_quorum = (1 << 3), //! Panic the local node if it loses quorum controld_no_quorum_suicide = (1 << 4), //! 
Lock resources to the local node when it shuts down cleanly controld_shutdown_lock_enabled = (1 << 5), }; # define controld_set_global_flags(flags_to_set) do { \ controld_globals.flags = pcmk__set_flags_as(__func__, __LINE__, \ LOG_TRACE, \ "Global", "controller", \ controld_globals.flags, \ (flags_to_set), \ #flags_to_set); \ } while (0) # define controld_clear_global_flags(flags_to_clear) do { \ controld_globals.flags \ = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, "Global", \ "controller", controld_globals.flags, \ (flags_to_clear), #flags_to_clear); \ } while (0) #endif // ifndef CONTROLD_GLOBALS__H diff --git a/include/crm/cluster/election_internal.h b/include/crm/cluster/election_internal.h index 5d62620d3a..bc9e9835bc 100644 --- a/include/crm/cluster/election_internal.h +++ b/include/crm/cluster/election_internal.h @@ -1,94 +1,93 @@ /* * Copyright 2009-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef PCMK__CRM_CLUSTER_ELECTION_INTERNAL__H #define PCMK__CRM_CLUSTER_ELECTION_INTERNAL__H #include // bool #include // guint, GSourceFunc #include // xmlNode #include // enum pcmk_ipc_server +#include // pcmk_cluster_t #ifdef __cplusplus extern "C" { #endif /** * \file * \brief Functions for conducting elections * * An election is useful for a daemon that runs on all nodes but needs any one * instance to perform a special role. * * Elections are closely tied to the cluster peer cache. Peers in the cache that * are active members are eligible to vote. Elections are named for logging * purposes, but only one election may exist at any time, so typically an * election would be created at daemon start-up and freed at shutdown. 
* * Pacemaker's election procedure has been heavily adapted from the * Invitation Algorithm variant of the Garcia-Molina Bully Algorithm: * * https://en.wikipedia.org/wiki/Bully_algorithm * * Elections are conducted via cluster messages. There are two types of * messages: a "vote" is a declaration of the voting node's candidacy, and is * always broadcast; a "no-vote" is a concession by the responding node, and is * always a reply to the preferred node's vote. (These correspond to "invite" * and "accept" in the traditional algorithm.) * * A vote together with any no-vote replies to it is considered an election * round. Rounds are numbered with a simple counter unique to each node * (this would be the group number in the traditional algorithm). Concurrent * election rounds are possible. * * An election round is started when any node broadcasts a vote. When a node * receives another node's vote, it compares itself against the sending node * according to certain metrics, and either starts a new round (if it prefers * itself) or replies to the other node with a no-vote (if it prefers that * node). * * If a node receives no-votes from all other active nodes, it declares itself * the winner. The library API does not notify other nodes of this; callers * must implement that if desired. */ -typedef struct pcmk__election pcmk__election_t; - /*! Possible election states */ enum election_result { election_start = 0, /*! new election needed */ election_in_progress, /*! election started but not all peers have voted */ election_lost, /*! local node lost most recent election */ election_won, /*! local node won most recent election */ election_error, /*! 
election message or election object invalid */ }; -void election_fini(pcmk__election_t *e); -void election_reset(pcmk__election_t *e); -pcmk__election_t *election_init(enum pcmk_ipc_server, const char *uname, - GSourceFunc cb); +void election_fini(pcmk_cluster_t *cluster); +void election_reset(pcmk_cluster_t *cluster); +void election_init(pcmk_cluster_t *cluster, enum pcmk_ipc_server, + const char *uname, GSourceFunc cb); -void election_timeout_set_period(pcmk__election_t *e, guint period_ms); -void election_timeout_stop(pcmk__election_t *e); +void election_timeout_set_period(pcmk_cluster_t *cluster, guint period_ms); +void election_timeout_stop(pcmk_cluster_t *cluster); -void election_vote(pcmk__election_t *e); -bool election_check(pcmk__election_t *e); -void election_remove(pcmk__election_t *e, const char *uname); -enum election_result election_state(const pcmk__election_t *e); -enum election_result election_count_vote(pcmk__election_t *e, +void election_vote(pcmk_cluster_t *cluster); +bool election_check(pcmk_cluster_t *cluster); +void election_remove(pcmk_cluster_t *cluster, const char *uname); +enum election_result election_state(const pcmk_cluster_t *cluster); +enum election_result election_count_vote(pcmk_cluster_t *cluster, const xmlNode *message, bool can_win); -void election_clear_dampening(pcmk__election_t *e); +void election_clear_dampening(pcmk_cluster_t *cluster); #ifdef __cplusplus } #endif #endif // PCMK__CRM_CLUSTER_ELECTION_INTERNAL__H diff --git a/include/crm/cluster/internal.h b/include/crm/cluster/internal.h index 591be0e88d..13943d085b 100644 --- a/include/crm/cluster/internal.h +++ b/include/crm/cluster/internal.h @@ -1,321 +1,325 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
*/ #ifndef PCMK__CRM_CLUSTER_INTERNAL__H #define PCMK__CRM_CLUSTER_INTERNAL__H #include #include // uint32_t, uint64_t #include // gboolean #include // xmlNode #include // enum crm_ipc_server #include #if SUPPORT_COROSYNC #include // cpg_name, cpg_handle_t #endif #ifdef __cplusplus extern "C" { #endif // @TODO Replace this with a pcmk__node_status_flags value enum crm_proc_flag { crm_proc_none = 0x00000001, // Cluster layers crm_proc_cpg = 0x04000000, }; /*! * \internal * \enum pcmk__node_status_flags * \brief Boolean flags for a \c pcmk__node_status_t object * * Some flags may not be related to status specifically. However, we keep these * separate from enum pcmk__node_flags because they're used with * different object types. */ enum pcmk__node_status_flags { /*! * Node is a Pacemaker Remote node and should not be considered for cluster * membership */ pcmk__node_status_remote = (UINT32_C(1) << 0), //! Node's cache entry is dirty pcmk__node_status_dirty = (UINT32_C(1) << 1), }; // Used with node cache search functions enum pcmk__node_search_flags { //! Does not affect search pcmk__node_search_none = 0, //! Search for cluster nodes from membership cache pcmk__node_search_cluster_member = (1 << 0), //! Search for remote nodes pcmk__node_search_remote = (1 << 1), //! Search for cluster member nodes and remote nodes pcmk__node_search_any = pcmk__node_search_cluster_member |pcmk__node_search_remote, //! Search for cluster nodes from CIB (as of last cache refresh) pcmk__node_search_cluster_cib = (1 << 2), }; /*! * \internal * \enum pcmk__node_update * \brief Type of update to a \c pcmk__node_status_t object */ enum pcmk__node_update { pcmk__node_update_name, //!< Node name updated pcmk__node_update_state, //!< Node connection state updated pcmk__node_update_processes, //!< Node process group membership updated }; +typedef struct pcmk__election pcmk__election_t; + //! 
Implementation of pcmk__cluster_private_t struct pcmk__cluster_private { enum pcmk_ipc_server server; //!< Server this connection is for (if any) // @TODO Drop and replace with per-daemon cluster-layer ID global variables? uint32_t node_id; //!< Local node ID at cluster layer // @TODO Drop and replace with per-daemon node name global variables? char *node_name; //!< Local node name at cluster layer + pcmk__election_t *election; //!< Election state (if election is needed) + #if SUPPORT_COROSYNC /* @TODO Make these members a separate struct and use void *cluster_data * here instead, to abstract the cluster layer further. */ struct cpg_name group; //!< Corosync CPG name cpg_handle_t cpg_handle; //!< Corosync CPG handle #endif // SUPPORT_COROSYNC }; //! Node status data (may be a cluster node or a Pacemaker Remote node) typedef struct pcmk__node_status { //! Node name as known to cluster layer, or Pacemaker Remote node name char *name; /* @COMPAT This is less than ideal since the value is not a valid XML ID * (for Corosync, it's the string equivalent of the node's numeric node ID, * but XML IDs can't start with a number) and the three elements should have * different IDs. * * Ideally, we would use something like node-NODEID, node_state-NODEID, and * transient_attributes-NODEID as the element IDs. Unfortunately changing it * would be impractical due to backward compatibility; older nodes in a * rolling upgrade will always write and expect the value in the old format. */ /*! * Value of the PCMK_XA_ID XML attribute to use with the node's * PCMK_XE_NODE, PCMK_XE_NODE_STATE, and PCMK_XE_TRANSIENT_ATTRIBUTES * XML elements in the CIB */ char *xml_id; char *state; // @TODO change to enum //! Group of enum pcmk__node_status_flags uint32_t flags; /*! 
* Most recent cluster membership in which node was seen (0 for Pacemaker * Remote nodes) */ uint64_t membership_id; uint32_t processes; // @TODO most not needed, merge into flags /* @TODO When we can break public API compatibility, we can make the rest of * these members separate structs and use void *cluster_data and * void *user_data here instead, to abstract the cluster layer further. */ //! Arbitrary data (must be freeable by \c free()) void *user_data; char *expected; time_t peer_lost; char *conn_host; time_t when_member; // Since when node has been a cluster member time_t when_online; // Since when peer has been online in CPG /* @TODO The following are currently needed only by the Corosync stack. * Eventually consider moving them to a cluster-layer-specific data object. */ uint32_t cluster_layer_id; //!< Cluster-layer numeric node ID time_t when_lost; //!< When CPG membership was last lost } pcmk__node_status_t; /*! * \internal * \brief Return the process bit corresponding to the current cluster stack * * \return Process flag if detectable, otherwise 0 */ static inline uint32_t crm_get_cluster_proc(void) { switch (pcmk_get_cluster_layer()) { case pcmk_cluster_layer_corosync: return crm_proc_cpg; default: break; } return crm_proc_none; } /*! 
* \internal * \brief Get log-friendly string description of a Corosync return code * * \param[in] error Corosync return code * * \return Log-friendly string description corresponding to \p error */ static inline const char * pcmk__cs_err_str(int error) { # if SUPPORT_COROSYNC switch (error) { case CS_OK: return "OK"; case CS_ERR_LIBRARY: return "Library error"; case CS_ERR_VERSION: return "Version error"; case CS_ERR_INIT: return "Initialization error"; case CS_ERR_TIMEOUT: return "Timeout"; case CS_ERR_TRY_AGAIN: return "Try again"; case CS_ERR_INVALID_PARAM: return "Invalid parameter"; case CS_ERR_NO_MEMORY: return "No memory"; case CS_ERR_BAD_HANDLE: return "Bad handle"; case CS_ERR_BUSY: return "Busy"; case CS_ERR_ACCESS: return "Access error"; case CS_ERR_NOT_EXIST: return "Doesn't exist"; case CS_ERR_NAME_TOO_LONG: return "Name too long"; case CS_ERR_EXIST: return "Exists"; case CS_ERR_NO_SPACE: return "No space"; case CS_ERR_INTERRUPT: return "Interrupt"; case CS_ERR_NAME_NOT_FOUND: return "Name not found"; case CS_ERR_NO_RESOURCES: return "No resources"; case CS_ERR_NOT_SUPPORTED: return "Not supported"; case CS_ERR_BAD_OPERATION: return "Bad operation"; case CS_ERR_FAILED_OPERATION: return "Failed operation"; case CS_ERR_MESSAGE_ERROR: return "Message error"; case CS_ERR_QUEUE_FULL: return "Queue full"; case CS_ERR_QUEUE_NOT_AVAILABLE: return "Queue not available"; case CS_ERR_BAD_FLAGS: return "Bad flags"; case CS_ERR_TOO_BIG: return "Too big"; case CS_ERR_NO_SECTIONS: return "No sections"; } # endif return "Corosync error"; } # if SUPPORT_COROSYNC #if 0 /* This is the new way to do it, but we still support all Corosync 2 versions, * and this isn't always available. A better alternative here would be to check * for support in the configure script and enable this conditionally. 
*/ #define pcmk__init_cmap(handle) cmap_initialize_map((handle), CMAP_MAP_ICMAP) #else #define pcmk__init_cmap(handle) cmap_initialize(handle) #endif char *pcmk__corosync_cluster_name(void); bool pcmk__corosync_add_nodes(xmlNode *xml_parent); void pcmk__cpg_confchg_cb(cpg_handle_t handle, const struct cpg_name *group_name, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries); char *pcmk__cpg_message_data(cpg_handle_t handle, uint32_t sender_id, uint32_t pid, void *content, const char **from); # endif const char *pcmk__cluster_node_uuid(pcmk__node_status_t *node); char *pcmk__cluster_node_name(uint32_t nodeid); const char *pcmk__cluster_local_node_name(void); const char *pcmk__node_name_from_uuid(const char *uuid); pcmk__node_status_t *crm_update_peer_proc(const char *source, pcmk__node_status_t *peer, uint32_t flag, const char *status); pcmk__node_status_t *pcmk__update_peer_state(const char *source, pcmk__node_status_t *node, const char *state, uint64_t membership); void pcmk__update_peer_expected(const char *source, pcmk__node_status_t *node, const char *expected); void pcmk__reap_unseen_nodes(uint64_t ring_id); void pcmk__corosync_quorum_connect(gboolean (*dispatch)(unsigned long long, gboolean), void (*destroy) (gpointer)); bool pcmk__cluster_send_message(const pcmk__node_status_t *node, enum pcmk_ipc_server service, const xmlNode *data); // Membership extern GHashTable *pcmk__peer_cache; extern GHashTable *pcmk__remote_peer_cache; bool pcmk__cluster_has_quorum(void); void pcmk__cluster_init_node_caches(void); void pcmk__cluster_destroy_node_caches(void); void pcmk__cluster_set_autoreap(bool enable); void pcmk__cluster_set_status_callback(void (*dispatch)(enum pcmk__node_update, pcmk__node_status_t *, const void *)); bool pcmk__cluster_is_node_active(const pcmk__node_status_t *node); unsigned int 
pcmk__cluster_num_active_nodes(void); unsigned int pcmk__cluster_num_remote_nodes(void); pcmk__node_status_t *pcmk__cluster_lookup_remote_node(const char *node_name); void pcmk__cluster_forget_cluster_node(uint32_t id, const char *node_name); void pcmk__cluster_forget_remote_node(const char *node_name); pcmk__node_status_t *pcmk__search_node_caches(unsigned int id, const char *uname, uint32_t flags); void pcmk__purge_node_from_cache(const char *node_name, uint32_t node_id); void pcmk__refresh_node_caches_from_cib(xmlNode *cib); pcmk__node_status_t *pcmk__get_node(unsigned int id, const char *uname, const char *uuid, uint32_t flags); #ifdef __cplusplus } #endif #endif // PCMK__CRM_CLUSTER_INTERNAL__H diff --git a/lib/cluster/election.c b/lib/cluster/election.c index d0b3f80334..0910b71bf1 100644 --- a/lib/cluster/election.c +++ b/lib/cluster/election.c @@ -1,720 +1,731 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
*/ #include #include #include #include #include #include #include #include #define STORM_INTERVAL 2 /* in seconds */ struct pcmk__election { enum pcmk_ipc_server server; // For message type enum election_result state; guint count; // How many times local node has voted char *uname; // Local node's name GSourceFunc cb; // Function to call if election is won GHashTable *voted; // Key = node name, value = how node voted mainloop_timer_t *timeout; // When to abort if all votes not received int election_wins; // Track wins, for storm detection bool wrote_blackbox; // Write a storm blackbox at most once time_t expires; // When storm detection period ends time_t last_election_loss; // When dampening period ends }; static void -election_complete(pcmk__election_t *e) +election_complete(pcmk_cluster_t *cluster) { - e->state = election_won; - if (e->cb != NULL) { - e->cb(e); + CRM_ASSERT((cluster != NULL) && (cluster->priv->election != NULL)); + cluster->priv->election->state = election_won; + if (cluster->priv->election->cb != NULL) { + cluster->priv->election->cb(cluster); } - election_reset(e); + election_reset(cluster); } static gboolean election_timer_cb(gpointer user_data) { - pcmk__election_t *e = user_data; + pcmk_cluster_t *cluster = user_data; crm_info("Declaring local node as winner after election timed out"); - election_complete(e); + election_complete(cluster); return FALSE; } /*! * \brief Get current state of an election * - * \param[in] e Election object + * \param[in] cluster Cluster with election * - * \return Current state of \e + * \return Current state of the election tracked in \p cluster */ enum election_result -election_state(const pcmk__election_t *e) +election_state(const pcmk_cluster_t *cluster) { - return (e == NULL)? election_error : e->state; + if ((cluster == NULL) || (cluster->priv->election == NULL)) { + return election_error; + } + return cluster->priv->election->state; } /* The local node will be declared the winner if missing votes are not received * within this time. 
The value is chosen to be the same as the default for the * election-timeout cluster option. */ #define ELECTION_TIMEOUT_MS 120000 /*! - * \brief Create a new election object + * \brief Track election state in a cluster * - * Every node that wishes to participate in an election must create an election - * object. Typically, this should be done once, at start-up. A caller should - * only create a single election object. + * Every node that wishes to participate in an election must initialize the + * election once, typically at start-up. * + * \param[in,out] cluster Cluster that election is for * \param[in] server Server to use for message type in election messages * \param[in] uname Local node's name * \param[in] cb Function to call if local node wins election * - * \return Newly allocated election object on success, NULL on error - * \note The caller is responsible for freeing the returned value using + * \note The caller is responsible for freeing the new election using * election_fini(). */ -pcmk__election_t * -election_init(enum pcmk_ipc_server server, const char *uname, GSourceFunc cb) +void +election_init(pcmk_cluster_t *cluster, enum pcmk_ipc_server server, + const char *uname, GSourceFunc cb) { - pcmk__election_t *e = NULL; const char *name = pcmk__s(crm_system_name, "election"); CRM_ASSERT(uname != NULL); - e = pcmk__assert_alloc(1, sizeof(pcmk__election_t)); - e->server = server; - e->uname = pcmk__str_copy(uname); - e->cb = cb; - e->timeout = mainloop_timer_add(name, ELECTION_TIMEOUT_MS, FALSE, - election_timer_cb, e); - return e; + CRM_CHECK(cluster->priv->election == NULL, return); + + cluster->priv->election = pcmk__assert_alloc(1, sizeof(pcmk__election_t)); + cluster->priv->election->server = server; + cluster->priv->election->uname = pcmk__str_copy(uname); + cluster->priv->election->cb = cb; + cluster->priv->election->timeout = mainloop_timer_add(name, + ELECTION_TIMEOUT_MS, + FALSE, + election_timer_cb, + cluster); } /*! 
* \brief Disregard any previous vote by specified peer * * This discards any recorded vote from a specified peer. Election users should * call this whenever a voting peer becomes inactive. * - * \param[in,out] e Election object - * \param[in] uname Name of peer to disregard + * \param[in,out] cluster Cluster with election + * \param[in] uname Name of peer to disregard */ void -election_remove(pcmk__election_t *e, const char *uname) +election_remove(pcmk_cluster_t *cluster, const char *uname) { - if ((e != NULL) && (uname != NULL) && (e->voted != NULL)) { + if ((cluster != NULL) && (cluster->priv->election != NULL) + && (uname != NULL) && (cluster->priv->election->voted != NULL)) { crm_trace("Discarding (no-)vote from lost peer %s", uname); - g_hash_table_remove(e->voted, uname); + g_hash_table_remove(cluster->priv->election->voted, uname); } } /*! * \brief Stop election timer and disregard all votes * - * \param[in,out] e Election object + * \param[in,out] cluster Cluster with election */ void -election_reset(pcmk__election_t *e) +election_reset(pcmk_cluster_t *cluster) { - if (e != NULL) { + if ((cluster != NULL) && (cluster->priv->election != NULL)) { crm_trace("Resetting election"); - mainloop_timer_stop(e->timeout); - if (e->voted) { - crm_trace("Destroying voted cache with %d members", g_hash_table_size(e->voted)); - g_hash_table_destroy(e->voted); - e->voted = NULL; + mainloop_timer_stop(cluster->priv->election->timeout); + if (cluster->priv->election->voted != NULL) { + g_hash_table_destroy(cluster->priv->election->voted); + cluster->priv->election->voted = NULL; } } } /*! * \brief Free an election object * * Free all memory associated with an election object, stopping its * election timer (if running). 
* - * \param[in,out] e Election object + * \param[in,out] cluster Cluster with election */ void -election_fini(pcmk__election_t *e) +election_fini(pcmk_cluster_t *cluster) { - if (e != NULL) { - election_reset(e); + if ((cluster != NULL) && (cluster->priv->election != NULL)) { + election_reset(cluster); crm_trace("Destroying election"); - mainloop_timer_del(e->timeout); - free(e->uname); - free(e); + mainloop_timer_del(cluster->priv->election->timeout); + free(cluster->priv->election->uname); + free(cluster->priv->election); + cluster->priv->election = NULL; } } static void -election_timeout_start(pcmk__election_t *e) +election_timeout_start(pcmk_cluster_t *cluster) { - if (e != NULL) { - mainloop_timer_start(e->timeout); - } + mainloop_timer_start(cluster->priv->election->timeout); } /*! * \brief Stop an election's timer, if running * - * \param[in,out] e Election object + * \param[in,out] cluster Cluster with election */ void -election_timeout_stop(pcmk__election_t *e) +election_timeout_stop(pcmk_cluster_t *cluster) { - if (e != NULL) { - mainloop_timer_stop(e->timeout); + if ((cluster != NULL) && (cluster->priv->election != NULL)) { + mainloop_timer_stop(cluster->priv->election->timeout); } } /*! 
* \brief Change an election's timeout (restarting timer if running) * - * \param[in,out] e Election object - * \param[in] period New timeout + * \param[in,out] cluster Cluster with election + * \param[in] period New timeout (in milliseconds) */ void -election_timeout_set_period(pcmk__election_t *e, guint period) +election_timeout_set_period(pcmk_cluster_t *cluster, guint period) { - if (e != NULL) { - mainloop_timer_set_period(e->timeout, period); - } else { - crm_err("No election defined"); - } + CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL), return); + mainloop_timer_set_period(cluster->priv->election->timeout, period); } static int get_uptime(struct timeval *output) { static time_t expires = 0; static struct rusage info; time_t tm_now = time(NULL); if (expires < tm_now) { int rc = 0; info.ru_utime.tv_sec = 0; info.ru_utime.tv_usec = 0; rc = getrusage(RUSAGE_SELF, &info); output->tv_sec = 0; output->tv_usec = 0; if (rc < 0) { crm_perror(LOG_ERR, "Could not calculate the current uptime"); expires = 0; return -1; } crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec, (long)info.ru_utime.tv_usec); } expires = tm_now + STORM_INTERVAL; /* N seconds after the last _access_ */ output->tv_sec = info.ru_utime.tv_sec; output->tv_usec = info.ru_utime.tv_usec; return 1; } static int compare_age(struct timeval your_age) { struct timeval our_age; get_uptime(&our_age); /* If an error occurred, our_age will be compared as {0,0} */ if (our_age.tv_sec > your_age.tv_sec) { crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec); return 1; } else if (our_age.tv_sec < your_age.tv_sec) { crm_debug("Lose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec); return -1; } else if (our_age.tv_usec > your_age.tv_usec) { crm_debug("Win: %ld.%06ld vs %ld.%06ld (usec)", (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec); return 1; } else if (our_age.tv_usec < your_age.tv_usec) { 
crm_debug("Lose: %ld.%06ld vs %ld.%06ld (usec)", (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec); return -1; } return 0; } /*! * \brief Start a new election by offering local node's candidacy * * Broadcast a "vote" election message containing the local node's ID, * (incremented) election counter, and uptime, and start the election timer. * - * \param[in,out] e Election object + * \param[in,out] cluster Cluster with election * * \note Any nodes agreeing to the candidacy will send a "no-vote" reply, and if * all active peers do so, or if the election times out, the local node * wins the election. (If we lose to any peer vote, we will stop the * timer, so a timeout means we did not lose -- either some peer did not * vote, or we did not call election_check() in time.) */ void -election_vote(pcmk__election_t *e) +election_vote(pcmk_cluster_t *cluster) { struct timeval age; xmlNode *vote = NULL; pcmk__node_status_t *our_node = NULL; const char *message_type = NULL; - if (e == NULL) { - crm_trace("Election vote requested, but no election available"); - return; - } + CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL), return); - our_node = pcmk__get_node(0, e->uname, NULL, + our_node = pcmk__get_node(0, cluster->priv->election->uname, NULL, pcmk__node_search_cluster_member); if (!pcmk__cluster_is_node_active(our_node)) { crm_trace("Cannot vote yet: local node not connected to cluster"); return; } - election_reset(e); - e->state = election_in_progress; - message_type = pcmk__server_message_type(e->server); + election_reset(cluster); + cluster->priv->election->state = election_in_progress; + message_type = pcmk__server_message_type(cluster->priv->election->server); /* @COMPAT We use message_type as the sender and recipient system for * backward compatibility (see T566). 
*/ - vote = pcmk__new_request(e->server, message_type, NULL, - message_type, CRM_OP_VOTE, NULL); + vote = pcmk__new_request(cluster->priv->election->server, message_type, + NULL, message_type, CRM_OP_VOTE, NULL); - e->count++; + cluster->priv->election->count++; crm_xml_add(vote, PCMK__XA_ELECTION_OWNER, our_node->xml_id); - crm_xml_add_int(vote, PCMK__XA_ELECTION_ID, e->count); + crm_xml_add_int(vote, PCMK__XA_ELECTION_ID, cluster->priv->election->count); // Warning: PCMK__XA_ELECTION_AGE_NANO_SEC value is actually microseconds get_uptime(&age); crm_xml_add_timeval(vote, PCMK__XA_ELECTION_AGE_SEC, PCMK__XA_ELECTION_AGE_NANO_SEC, &age); - pcmk__cluster_send_message(NULL, e->server, vote); + pcmk__cluster_send_message(NULL, cluster->priv->election->server, vote); pcmk__xml_free(vote); - crm_debug("Started election round %d", e->count); - election_timeout_start(e); + crm_debug("Started election round %d", cluster->priv->election->count); + election_timeout_start(cluster); return; } /*! * \brief Check whether local node has won an election * * If all known peers have sent no-vote messages, stop the election timer, set * the election state to won, and call any registered win callback. * - * \param[in,out] e Election object + * \param[in,out] cluster Cluster with election * * \return TRUE if local node has won, FALSE otherwise * \note If all known peers have sent no-vote messages, but the election owner * does not call this function, the election will not be won (and the * callback will not be called) until the election times out. * \note This should be called when election_count_vote() returns * \c election_in_progress. 
*/ bool -election_check(pcmk__election_t *e) +election_check(pcmk_cluster_t *cluster) { int voted_size = 0; int num_members = 0; - if (e == NULL) { - crm_trace("Election check requested, but no election available"); - return FALSE; - } - if (e->voted == NULL) { + CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL), + return false); + + if (cluster->priv->election->voted == NULL) { crm_trace("Election check requested, but no votes received yet"); return FALSE; } - voted_size = g_hash_table_size(e->voted); + voted_size = g_hash_table_size(cluster->priv->election->voted); num_members = pcmk__cluster_num_active_nodes(); /* in the case of #voted > #members, it is better to * wait for the timeout and give the cluster time to * stabilize */ if (voted_size >= num_members) { /* we won and everyone has voted */ - election_timeout_stop(e); + election_timeout_stop(cluster); if (voted_size > num_members) { GHashTableIter gIter; const pcmk__node_status_t *node = NULL; char *key = NULL; crm_warn("Received too many votes in election"); g_hash_table_iter_init(&gIter, pcmk__peer_cache); while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) { if (pcmk__cluster_is_node_active(node)) { crm_warn("* expected vote: %s", node->name); } } - g_hash_table_iter_init(&gIter, e->voted); + g_hash_table_iter_init(&gIter, cluster->priv->election->voted); while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) { crm_warn("* actual vote: %s", key); } } crm_info("Election won by local node"); - election_complete(e); + election_complete(cluster); return TRUE; } else { crm_debug("Election still waiting on %d of %d vote%s", num_members - voted_size, num_members, pcmk__plural_s(num_members)); } return FALSE; } #define LOSS_DAMPEN 2 /* in seconds */ struct vote { const char *op; const char *from; const char *version; const char *election_owner; int election_id; struct timeval age; }; /*! 
* \brief Unpack an election message * * \param[in] message Election message XML * \param[out] vote Parsed fields from message * * \return TRUE if election message and election are valid, FALSE otherwise * \note The parsed struct's pointer members are valid only for the lifetime of * the message argument. */ static bool parse_election_message(const xmlNode *message, struct vote *vote) { CRM_CHECK(message && vote, return FALSE); vote->election_id = -1; vote->age.tv_sec = -1; vote->age.tv_usec = -1; vote->op = crm_element_value(message, PCMK__XA_CRM_TASK); vote->from = crm_element_value(message, PCMK__XA_SRC); vote->version = crm_element_value(message, PCMK_XA_VERSION); vote->election_owner = crm_element_value(message, PCMK__XA_ELECTION_OWNER); crm_element_value_int(message, PCMK__XA_ELECTION_ID, &(vote->election_id)); if ((vote->op == NULL) || (vote->from == NULL) || (vote->version == NULL) || (vote->election_owner == NULL) || (vote->election_id < 0)) { crm_warn("Invalid %s message from %s", pcmk__s(vote->op, "election"), pcmk__s(vote->from, "unspecified node")); crm_log_xml_trace(message, "bad-vote"); return FALSE; } // Op-specific validation if (pcmk__str_eq(vote->op, CRM_OP_VOTE, pcmk__str_none)) { /* Only vote ops have uptime. Warning: PCMK__XA_ELECTION_AGE_NANO_SEC value is in microseconds. */ crm_element_value_timeval(message, PCMK__XA_ELECTION_AGE_SEC, PCMK__XA_ELECTION_AGE_NANO_SEC, &(vote->age)); if ((vote->age.tv_sec < 0) || (vote->age.tv_usec < 0)) { crm_warn("Cannot count election %s from %s " "because it is missing uptime", vote->op, vote->from); return FALSE; } } else if (!pcmk__str_eq(vote->op, CRM_OP_NOVOTE, pcmk__str_none)) { crm_info("Cannot process election message from %s " "because %s is not a known election op", vote->from, vote->op); return FALSE; } /* If the membership cache is NULL, we REALLY shouldn't be voting -- * the question is how we managed to get here. 
*/ if (pcmk__peer_cache == NULL) { crm_info("Cannot count election %s from %s " "because no peer information available", vote->op, vote->from); return FALSE; } return TRUE; } static void -record_vote(pcmk__election_t *e, struct vote *vote) +record_vote(pcmk_cluster_t *cluster, struct vote *vote) { - CRM_ASSERT(e && vote && vote->from && vote->op); + CRM_ASSERT((vote->from != NULL) && (vote->op != NULL)); - if (e->voted == NULL) { - e->voted = pcmk__strkey_table(free, free); + if (cluster->priv->election->voted == NULL) { + cluster->priv->election->voted = pcmk__strkey_table(free, free); } - pcmk__insert_dup(e->voted, vote->from, vote->op); + pcmk__insert_dup(cluster->priv->election->voted, vote->from, vote->op); } static void -send_no_vote(pcmk__election_t *e, pcmk__node_status_t *peer, struct vote *vote) +send_no_vote(pcmk_cluster_t *cluster, pcmk__node_status_t *peer, + struct vote *vote) { - const char *message_type = pcmk__server_message_type(e->server); - xmlNode *novote = pcmk__new_request(e->server, message_type, - vote->from, message_type, - CRM_OP_NOVOTE, NULL); + const char *message_type = NULL; + xmlNode *novote = NULL; + message_type = pcmk__server_message_type(cluster->priv->election->server); + novote = pcmk__new_request(cluster->priv->election->server, message_type, + vote->from, message_type, CRM_OP_NOVOTE, NULL); crm_xml_add(novote, PCMK__XA_ELECTION_OWNER, vote->election_owner); crm_xml_add_int(novote, PCMK__XA_ELECTION_ID, vote->election_id); - pcmk__cluster_send_message(peer, e->server, novote); + pcmk__cluster_send_message(peer, cluster->priv->election->server, novote); pcmk__xml_free(novote); } /*! 
* \brief Process an election message (vote or no-vote) from a peer * - * \param[in,out] e Election object + * \param[in,out] cluster Cluster with election * \param[in] message Election message XML from peer * \param[in] can_win Whether local node is eligible to win * * \return Election state after new vote is considered * \note If the peer message is a vote, and we prefer the peer to win, this will * send a no-vote reply to the peer. * \note The situations "we lost to this vote" and "this is a late no-vote * after we've already lost" both return election_lost. If a caller needs * to distinguish them, it should save the current state before calling * this function, and then compare the result. */ enum election_result -election_count_vote(pcmk__election_t *e, const xmlNode *message, bool can_win) +election_count_vote(pcmk_cluster_t *cluster, const xmlNode *message, + bool can_win) { int log_level = LOG_INFO; gboolean done = FALSE; gboolean we_lose = FALSE; const char *reason = "unknown"; bool we_are_owner = FALSE; pcmk__node_status_t *our_node = NULL; pcmk__node_status_t *your_node = NULL; time_t tm_now = time(NULL); struct vote vote; - CRM_CHECK((e != NULL) && (message != NULL), return election_error); + CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL) + && (message != NULL), return election_error); + if (!parse_election_message(message, &vote)) { return election_error; } your_node = pcmk__get_node(0, vote.from, NULL, pcmk__node_search_cluster_member); - our_node = pcmk__get_node(0, e->uname, NULL, + our_node = pcmk__get_node(0, cluster->priv->election->uname, NULL, pcmk__node_search_cluster_member); we_are_owner = (our_node != NULL) && pcmk__str_eq(our_node->xml_id, vote.election_owner, pcmk__str_none); if (!can_win) { reason = "Not eligible"; we_lose = TRUE; } else if (!pcmk__cluster_is_node_active(our_node)) { reason = "We are not part of the cluster"; log_level = LOG_ERR; we_lose = TRUE; - } else if (we_are_owner && (vote.election_id != 
e->count)) { + } else if (we_are_owner + && (vote.election_id != cluster->priv->election->count)) { log_level = LOG_TRACE; reason = "Superseded"; done = TRUE; } else if (!pcmk__cluster_is_node_active(your_node)) { /* Possibly we cached the message in the FSA queue at a point that it wasn't */ reason = "Peer is not part of our cluster"; log_level = LOG_WARNING; done = TRUE; } else if (pcmk__str_eq(vote.op, CRM_OP_NOVOTE, pcmk__str_none) - || pcmk__str_eq(vote.from, e->uname, pcmk__str_none)) { + || pcmk__str_eq(vote.from, cluster->priv->election->uname, + pcmk__str_casei)) { /* Receiving our own broadcast vote, or a no-vote from peer, is a vote * for us to win */ if (!we_are_owner) { crm_warn("Cannot count election round %d %s from %s " "because we did not start election (node ID %s did)", vote.election_id, vote.op, vote.from, vote.election_owner); return election_error; } - if (e->state != election_in_progress) { + if (cluster->priv->election->state != election_in_progress) { // Should only happen if we already lost crm_debug("Not counting election round %d %s from %s " "because no election in progress", vote.election_id, vote.op, vote.from); - return e->state; + return cluster->priv->election->state; } - record_vote(e, &vote); + record_vote(cluster, &vote); reason = "Recorded"; done = TRUE; } else { // A peer vote requires a comparison to determine which node is better int age_result = compare_age(vote.age); int version_result = compare_version(vote.version, CRM_FEATURE_SET); if (version_result < 0) { reason = "Version"; we_lose = TRUE; } else if (version_result > 0) { reason = "Version"; } else if (age_result < 0) { reason = "Uptime"; we_lose = TRUE; } else if (age_result > 0) { reason = "Uptime"; - } else if (strcasecmp(e->uname, vote.from) > 0) { + } else if (strcasecmp(cluster->priv->election->uname, vote.from) > 0) { reason = "Host name"; we_lose = TRUE; } else { reason = "Host name"; } } - if (e->expires < tm_now) { - e->election_wins = 0; - e->expires = 
tm_now + STORM_INTERVAL; + if (cluster->priv->election->expires < tm_now) { + cluster->priv->election->election_wins = 0; + cluster->priv->election->expires = tm_now + STORM_INTERVAL; } else if (done == FALSE && we_lose == FALSE) { int peers = 1 + g_hash_table_size(pcmk__peer_cache); /* If every node has to vote down every other node, that's N*(N-1) total elections. * Allow some leeway before _really_ complaining */ - e->election_wins++; - if (e->election_wins > (peers * peers)) { + cluster->priv->election->election_wins++; + if (cluster->priv->election->election_wins > (peers * peers)) { crm_warn("Election storm detected: %d wins in %d seconds", - e->election_wins, STORM_INTERVAL); - e->election_wins = 0; - e->expires = tm_now + STORM_INTERVAL; - if (e->wrote_blackbox == FALSE) { + cluster->priv->election->election_wins, STORM_INTERVAL); + cluster->priv->election->election_wins = 0; + cluster->priv->election->expires = tm_now + STORM_INTERVAL; + if (!(cluster->priv->election->wrote_blackbox)) { /* It's questionable whether a black box (from every node in the * cluster) would be truly helpful in diagnosing an election * storm. It's also highly doubtful a production environment * would get multiple election storms from distinct causes, so * saving one blackbox per process lifetime should be * sufficient. Alternatives would be to save a timestamp of the * last blackbox write instead of a boolean, and write a new one * if some amount of time has passed; or to save a storm count and * write a blackbox on every Nth occurrence. 
*/ crm_write_blackbox(0, NULL); - e->wrote_blackbox = TRUE; + cluster->priv->election->wrote_blackbox = true; } } } if (done) { do_crm_log(log_level + 1, "Processed election round %d %s (current round %d) " "from %s (%s)", - vote.election_id, vote.op, e->count, vote.from, reason); - return e->state; + vote.election_id, vote.op, cluster->priv->election->count, + vote.from, reason); + return cluster->priv->election->state; } else if (we_lose == FALSE) { /* We track the time of the last election loss to implement an election * dampening period, reducing the likelihood of an election storm. If * this node has lost within the dampening period, don't start a new * election, even if we win against a peer's vote -- the peer we lost to * should win again. * * @TODO This has a problem case: if an election winner immediately * leaves the cluster, and a new election is immediately called, all * nodes could lose, with no new winner elected. The ideal solution * would be to tie the election structure with the peer caches, which * would allow us to clear the dampening when the previous winner * leaves (and would allow other improvements as well). 
*/ - if ((e->last_election_loss == 0) - || ((tm_now - e->last_election_loss) > (time_t) LOSS_DAMPEN)) { + if ((cluster->priv->election->last_election_loss == 0) + || ((tm_now - cluster->priv->election->last_election_loss) + > (time_t) LOSS_DAMPEN)) { do_crm_log(log_level, "Election round %d (started by node ID %s) pass: " "%s from %s (%s)", vote.election_id, vote.election_owner, vote.op, vote.from, reason); - e->last_election_loss = 0; - election_timeout_stop(e); + cluster->priv->election->last_election_loss = 0; + election_timeout_stop(cluster); /* Start a new election by voting down this, and other, peers */ - e->state = election_start; - return e->state; + cluster->priv->election->state = election_start; + return cluster->priv->election->state; } else { - char *loss_time = ctime(&e->last_election_loss); + char *loss_time = NULL; + loss_time = ctime(&(cluster->priv->election->last_election_loss)); if (loss_time) { // Show only HH:MM:SS loss_time += 11; loss_time[8] = '\0'; } crm_info("Ignoring election round %d (started by node ID %s) pass " "vs %s because we lost less than %ds ago at %s", vote.election_id, vote.election_owner, vote.from, LOSS_DAMPEN, (loss_time? loss_time : "unknown")); } } - e->last_election_loss = tm_now; + cluster->priv->election->last_election_loss = tm_now; do_crm_log(log_level, "Election round %d (started by node ID %s) lost: " "%s from %s (%s)", vote.election_id, vote.election_owner, vote.op, vote.from, reason); - election_reset(e); - send_no_vote(e, your_node, &vote); - e->state = election_lost; - return e->state; + election_reset(cluster); + send_no_vote(cluster, your_node, &vote); + cluster->priv->election->state = election_lost; + return cluster->priv->election->state; } /*! 
* \brief Reset any election dampening currently in effect * - * \param[in,out] e Election object to clear + * \param[in,out] cluster Cluster with election */ void -election_clear_dampening(pcmk__election_t *e) +election_clear_dampening(pcmk_cluster_t *cluster) { - e->last_election_loss = 0; + if ((cluster != NULL) && (cluster->priv->election != NULL)) { + cluster->priv->election->last_election_loss = 0; + } }