diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c index a313f03a77..119dccd535 100644 --- a/daemons/controld/controld_control.c +++ b/daemons/controld/controld_control.c @@ -1,844 +1,843 @@ /* * Copyright 2004-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include static qb_ipcs_service_t *ipcs = NULL; #if SUPPORT_COROSYNC extern gboolean crm_connect_corosync(crm_cluster_t * cluster); #endif void crm_shutdown(int nsig); gboolean crm_read_options(gpointer user_data); -gboolean fsa_has_quorum = FALSE; crm_trigger_t *fsa_source = NULL; crm_trigger_t *config_read = NULL; bool no_quorum_suicide_escalation = FALSE; bool controld_shutdown_lock_enabled = false; /* A_HA_CONNECT */ void do_ha_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean registered = FALSE; static crm_cluster_t *cluster = NULL; if (cluster == NULL) { cluster = calloc(1, sizeof(crm_cluster_t)); } if (action & A_HA_DISCONNECT) { crm_cluster_disconnect(cluster); crm_info("Disconnected from the cluster"); controld_set_fsa_input_flags(R_HA_DISCONNECTED); } if (action & A_HA_CONNECT) { crm_set_status_callback(&peer_update_callback); crm_set_autoreap(FALSE); if (is_corosync_cluster()) { #if SUPPORT_COROSYNC registered = crm_connect_corosync(cluster); #endif } if (registered == TRUE) { controld_election_init(cluster->uname); fsa_our_uname = cluster->uname; fsa_our_uuid = cluster->uuid; if(cluster->uuid == NULL) { crm_err("Could not obtain local uuid"); registered = FALSE; } } if (registered == FALSE) { controld_set_fsa_input_flags(R_HA_DISCONNECTED); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } populate_cib_nodes(node_update_none, __func__); controld_clear_fsa_input_flags(R_HA_DISCONNECTED); crm_info("Connected to the cluster"); } if (action & ~(A_HA_CONNECT | A_HA_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __func__); } } /* A_SHUTDOWN */ void do_shutdown(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { /* just in case */ controld_set_fsa_input_flags(R_SHUTDOWN); controld_disconnect_fencer(FALSE); } /* A_SHUTDOWN_REQ */ void do_shutdown_req(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { xmlNode *msg = NULL; controld_set_fsa_input_flags(R_SHUTDOWN); //controld_set_fsa_input_flags(R_STAYDOWN); crm_info("Sending shutdown request to all peers (DC is %s)", (fsa_our_dc? fsa_our_dc : "not set")); msg = create_request(CRM_OP_SHUTDOWN_REQ, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); if (send_cluster_message(NULL, crm_msg_crmd, msg, TRUE) == FALSE) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } free_xml(msg); } extern pcmk__output_t *logger_out; void crmd_fast_exit(crm_exit_t exit_code) { if (pcmk_is_set(fsa_input_register, R_STAYDOWN)) { crm_warn("Inhibiting respawn "CRM_XS" remapping exit code %d to %d", exit_code, CRM_EX_FATAL); exit_code = CRM_EX_FATAL; } else if ((exit_code == CRM_EX_OK) && pcmk_is_set(fsa_input_register, R_IN_RECOVERY)) { crm_err("Could not recover from internal error"); exit_code = CRM_EX_ERROR; } if (logger_out != NULL) { logger_out->finish(logger_out, exit_code, true, NULL); pcmk__output_free(logger_out); logger_out = NULL; } crm_exit(exit_code); } crm_exit_t crmd_exit(crm_exit_t exit_code) { GList *gIter = NULL; GMainLoop *mloop = crmd_mainloop; static bool in_progress = FALSE; if (in_progress && (exit_code == CRM_EX_OK)) { crm_debug("Exit is already in progress"); return exit_code; } else if(in_progress) { crm_notice("Error during shutdown process, exiting now with status %d (%s)", exit_code, crm_exit_str(exit_code)); crm_write_blackbox(SIGTRAP, NULL); crmd_fast_exit(exit_code); } in_progress = TRUE; crm_trace("Preparing to exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); /* Suppress secondary errors resulting from us disconnecting everything */ controld_set_fsa_input_flags(R_HA_DISCONNECTED); /* Close all IPC servers and clients to ensure any and all shared memory files are cleaned up */ if(ipcs) { crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; } controld_close_attrd_ipc(); controld_shutdown_schedulerd_ipc(); controld_disconnect_fencer(TRUE); if ((exit_code == CRM_EX_OK) && (crmd_mainloop == NULL)) { crm_debug("No mainloop detected"); exit_code = CRM_EX_ERROR; } /* On an error, just get out. * * Otherwise, make the effort to have mainloop exit gracefully so * that it (mostly) cleans up after itself and valgrind has less * to report on - allowing real errors stand out */ if (exit_code != CRM_EX_OK) { crm_notice("Forcing immediate exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); crm_write_blackbox(SIGTRAP, NULL); crmd_fast_exit(exit_code); } /* Clean up as much memory as possible for valgrind */ for (gIter = fsa_message_queue; gIter != NULL; gIter = gIter->next) { fsa_data_t *fsa_data = gIter->data; crm_info("Dropping %s: [ state=%s cause=%s origin=%s ]", fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); delete_fsa_input(fsa_data); } controld_clear_fsa_input_flags(R_MEMBERSHIP); g_list_free(fsa_message_queue); fsa_message_queue = NULL; controld_election_fini(); /* Tear down the CIB manager connection, but don't free it yet -- it could * be used when we drain the mainloop later. */ controld_disconnect_cib_manager(); verify_stopped(fsa_state, LOG_WARNING); controld_clear_fsa_input_flags(R_LRM_CONNECTED); lrm_state_destroy_all(); /* This basically will not work, since mainloop has a reference to it */ mainloop_destroy_trigger(fsa_source); fsa_source = NULL; mainloop_destroy_trigger(config_read); config_read = NULL; mainloop_destroy_trigger(transition_trigger); transition_trigger = NULL; pcmk__client_cleanup(); crm_peer_destroy(); controld_free_fsa_timers(); te_cleanup_stonith_history_sync(NULL, TRUE); controld_free_sched_timer(); free(fsa_our_dc_version); fsa_our_dc_version = NULL; free(fsa_our_uname); fsa_our_uname = NULL; free(fsa_our_uuid); fsa_our_uuid = NULL; free(fsa_our_dc); fsa_our_dc = NULL; free(fsa_cluster_name); fsa_cluster_name = NULL; free(te_uuid); te_uuid = NULL; free(failed_stop_offset); failed_stop_offset = NULL; free(failed_start_offset); failed_start_offset = NULL; free_max_generation(); mainloop_destroy_signal(SIGPIPE); mainloop_destroy_signal(SIGUSR1); mainloop_destroy_signal(SIGTERM); mainloop_destroy_signal(SIGTRAP); /* leave SIGCHLD engaged as we might still want to drain some service-actions */ if (mloop) { GMainContext *ctx = g_main_loop_get_context(crmd_mainloop); /* Don't re-enter this block */ crmd_mainloop = NULL; /* no signals on final draining anymore */ mainloop_destroy_signal(SIGCHLD); crm_trace("Draining mainloop %d %d", g_main_loop_is_running(mloop), g_main_context_pending(ctx)); { int lpc = 0; while((g_main_context_pending(ctx) && lpc < 10)) { lpc++; crm_trace("Iteration %d", lpc); g_main_context_dispatch(ctx); } } crm_trace("Closing mainloop %d %d", g_main_loop_is_running(mloop), g_main_context_pending(ctx)); g_main_loop_quit(mloop); /* Won't do anything yet, since we're inside it now */ g_main_loop_unref(mloop); } else { mainloop_destroy_signal(SIGCHLD); } cib_delete(fsa_cib_conn); fsa_cib_conn = NULL; throttle_fini(); /* Graceful */ crm_trace("Done preparing for exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); return exit_code; } /* A_EXIT_0, A_EXIT_1 */ void do_exit(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_exit_t exit_code = CRM_EX_OK; int log_level = LOG_INFO; const char *exit_type = "gracefully"; if (action & A_EXIT_1) { log_level = LOG_ERR; exit_type = "forcefully"; exit_code = CRM_EX_ERROR; } verify_stopped(cur_state, LOG_ERR); do_crm_log(log_level, "Performing %s - %s exiting the controller", fsa_action2string(action), exit_type); crm_info("[%s] stopped (%d)", crm_system_name, exit_code); crmd_exit(exit_code); } static void sigpipe_ignore(int nsig) { return; } /* A_STARTUP */ void do_startup(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_debug("Registering Signal Handlers"); mainloop_add_signal(SIGTERM, crm_shutdown); mainloop_add_signal(SIGPIPE, sigpipe_ignore); fsa_source = mainloop_add_trigger(G_PRIORITY_HIGH, crm_fsa_trigger, NULL); config_read = mainloop_add_trigger(G_PRIORITY_HIGH, crm_read_options, NULL); transition_trigger = mainloop_add_trigger(G_PRIORITY_LOW, te_graph_trigger, NULL); crm_debug("Creating CIB manager and executor objects"); fsa_cib_conn = cib_new(); lrm_state_init_local(); if (controld_init_fsa_timers() == FALSE) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } // \return libqb error code (0 on success, -errno on error) static int32_t accept_controller_client(qb_ipcs_connection_t *c, uid_t uid, gid_t gid) { crm_trace("Accepting new IPC client connection"); if (pcmk__new_client(c, uid, gid) == NULL) { return -EIO; } return 0; } // \return libqb error code (0 on success, -errno on error) static int32_t dispatch_controller_ipc(qb_ipcs_connection_t * c, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; pcmk__client_t *client = pcmk__find_client(c); xmlNode *msg = pcmk__client_data2xml(client, data, &id, &flags); if (msg == NULL) { pcmk__ipc_send_ack(client, id, flags, "ack", NULL, CRM_EX_PROTOCOL); return 0; } pcmk__ipc_send_ack(client, id, flags, "ack", NULL, CRM_EX_INDETERMINATE); CRM_ASSERT(client->user != NULL); pcmk__update_acl_user(msg, F_CRM_USER, client->user); crm_xml_add(msg, F_CRM_SYS_FROM, client->id); if (controld_authorize_ipc_message(msg, client, NULL)) { crm_trace("Processing IPC message from client %s", pcmk__client_name(client)); route_message(C_IPC_MESSAGE, msg); } trigger_fsa(); free_xml(msg); return 0; } static int32_t ipc_client_disconnected(qb_ipcs_connection_t *c) { pcmk__client_t *client = pcmk__find_client(c); if (client) { crm_trace("Disconnecting %sregistered client %s (%p/%p)", (client->userdata? "" : "un"), pcmk__client_name(client), c, client); free(client->userdata); pcmk__free_client(client); trigger_fsa(); } return 0; } static void ipc_connection_destroyed(qb_ipcs_connection_t *c) { crm_trace("Connection %p", c); ipc_client_disconnected(c); } /* A_STOP */ void do_stop(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } /* A_STARTED */ void do_started(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { static struct qb_ipcs_service_handlers crmd_callbacks = { .connection_accept = accept_controller_client, .connection_created = NULL, .msg_process = dispatch_controller_ipc, .connection_closed = ipc_client_disconnected, .connection_destroyed = ipc_connection_destroyed }; if (cur_state != S_STARTING) { crm_err("Start cancelled... %s", fsa_state2string(cur_state)); return; } else if (!pcmk_is_set(fsa_input_register, R_MEMBERSHIP)) { crm_info("Delaying start, no membership data (%.16llx)", R_MEMBERSHIP); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(fsa_input_register, R_LRM_CONNECTED)) { crm_info("Delaying start, not connected to executor (%.16llx)", R_LRM_CONNECTED); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(fsa_input_register, R_CIB_CONNECTED)) { crm_info("Delaying start, CIB not connected (%.16llx)", R_CIB_CONNECTED); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(fsa_input_register, R_READ_CONFIG)) { crm_info("Delaying start, Config not read (%.16llx)", R_READ_CONFIG); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(fsa_input_register, R_PEER_DATA)) { crm_info("Delaying start, No peer data (%.16llx)", R_PEER_DATA); crmd_fsa_stall(TRUE); return; } crm_debug("Init server comms"); ipcs = pcmk__serve_controld_ipc(&crmd_callbacks); if (ipcs == NULL) { crm_err("Failed to create IPC server: shutting down and inhibiting respawn"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } else { crm_notice("Pacemaker controller successfully started and accepting connections"); } controld_trigger_fencer_connect(); controld_clear_fsa_input_flags(R_STARTING); register_fsa_input(msg_data->fsa_cause, I_PENDING, NULL); } /* A_RECOVER */ void do_recover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { controld_set_fsa_input_flags(R_IN_RECOVERY); crm_warn("Fast-tracking shutdown in response to errors"); register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } static pcmk__cluster_option_t controller_options[] = { /* name, old name, type, allowed values, * default value, validator, * short description, * long description */ { "dc-version", NULL, "string", NULL, PCMK__VALUE_NONE, NULL, N_("Pacemaker version on cluster node elected Designated Controller (DC)"), N_("Includes a hash which identifies the exact changeset the code was " "built from. Used for diagnostic purposes.") }, { "cluster-infrastructure", NULL, "string", NULL, "corosync", NULL, N_("The messaging stack on which Pacemaker is currently running"), N_("Used for informational and diagnostic purposes.") }, { "cluster-name", NULL, "string", NULL, NULL, NULL, N_("An arbitrary name for the cluster"), N_("This optional value is mostly for users' convenience as desired " "in administration, but may also be used in Pacemaker " "configuration rules via the #cluster-name node attribute, and " "by higher-level tools and resource agents.") }, { XML_CONFIG_ATTR_DC_DEADTIME, NULL, "time", NULL, "20s", pcmk__valid_interval_spec, N_("How long to wait for a response from other nodes during start-up"), N_("The optimal value will depend on the speed and load of your network " "and the type of switches used.") }, { XML_CONFIG_ATTR_RECHECK, NULL, "time", N_("Zero disables polling, while positive values are an interval in seconds" "(unless other units are specified, for example \"5min\")"), "15min", pcmk__valid_interval_spec, N_("Polling interval to recheck cluster state and evaluate rules " "with date specifications"), N_("Pacemaker is primarily event-driven, and looks ahead to know when to " "recheck cluster state for failure timeouts and most time-based " "rules. However, it will also recheck the cluster after this " "amount of inactivity, to evaluate rules with date specifications " "and serve as a fail-safe for certain types of scheduler bugs.") }, { "load-threshold", NULL, "percentage", NULL, "80%", pcmk__valid_percentage, N_("Maximum amount of system load that should be used by cluster nodes"), N_("The cluster will slow down its recovery process when the amount of " "system resources used (currently CPU) approaches this limit"), }, { "node-action-limit", NULL, "integer", NULL, "0", pcmk__valid_number, N_("Maximum number of jobs that can be scheduled per node " "(defaults to 2x cores)") }, { XML_CONFIG_ATTR_FENCE_REACTION, NULL, "string", NULL, "stop", NULL, N_("How a cluster node should react if notified of its own fencing"), N_("A cluster node may receive notification of its own fencing if fencing " "is misconfigured, or if fabric fencing is in use that doesn't cut " "cluster communication. Allowed values are \"stop\" to attempt to " "immediately stop Pacemaker and stay stopped, or \"panic\" to attempt " "to immediately reboot the local node, falling back to stop on failure.") }, { XML_CONFIG_ATTR_ELECTION_FAIL, NULL, "time", NULL, "2min", pcmk__valid_interval_spec, "*** Advanced Use Only ***", N_("Declare an election failed if it is not decided within this much " "time. If you need to adjust this value, it probably indicates " "the presence of a bug.") }, { XML_CONFIG_ATTR_FORCE_QUIT, NULL, "time", NULL, "20min", pcmk__valid_interval_spec, "*** Advanced Use Only ***", N_("Exit immediately if shutdown does not complete within this much " "time. If you need to adjust this value, it probably indicates " "the presence of a bug.") }, { "join-integration-timeout", "crmd-integration-timeout", "time", NULL, "3min", pcmk__valid_interval_spec, "*** Advanced Use Only ***", N_("If you need to adjust this value, it probably indicates " "the presence of a bug.") }, { "join-finalization-timeout", "crmd-finalization-timeout", "time", NULL, "30min", pcmk__valid_interval_spec, "*** Advanced Use Only ***", N_("If you need to adjust this value, it probably indicates " "the presence of a bug.") }, { "transition-delay", "crmd-transition-delay", "time", NULL, "0s", pcmk__valid_interval_spec, N_("*** Advanced Use Only *** Enabling this option will slow down " "cluster recovery under all conditions"), N_("Delay cluster recovery for this much time to allow for additional " "events to occur. Useful if your configuration is sensitive to " "the order in which ping updates arrive.") }, { "stonith-watchdog-timeout", NULL, "time", NULL, "0", controld_verify_stonith_watchdog_timeout, N_("How long before nodes can be assumed to be safely down when " "watchdog-based self-fencing via SBD is in use"), N_("If this is set to a positive value, lost nodes are assumed to " "self-fence using watchdog-based SBD within this much time. This " "does not require a fencing resource to be explicitly configured, " "though a fence_watchdog resource can be configured, to limit use " "to specific nodes. If this is set to 0 (the default), the cluster " "will never assume watchdog-based self-fencing. If this is set to a " "negative value, the cluster will use twice the local value of the " "`SBD_WATCHDOG_TIMEOUT` environment variable if that is positive, " "or otherwise treat this as 0. WARNING: When used, this timeout " "must be larger than `SBD_WATCHDOG_TIMEOUT` on all nodes that use " "watchdog-based SBD, and Pacemaker will refuse to start on any of " "those nodes where this is not true for the local value or SBD is " "not active. When this is set to a negative value, " "`SBD_WATCHDOG_TIMEOUT` must be set to the same value on all nodes " "that use SBD, otherwise data corruption or loss could occur.") }, { "stonith-max-attempts", NULL, "integer", NULL, "10", pcmk__valid_positive_number, N_("How many times fencing can fail before it will no longer be " "immediately re-attempted on a target") }, // Already documented in libpe_status (other values must be kept identical) { "no-quorum-policy", NULL, "select", "stop, freeze, ignore, demote, suicide", "stop", pcmk__valid_quorum, "What to do when the cluster does not have quorum", NULL }, { XML_CONFIG_ATTR_SHUTDOWN_LOCK, NULL, "boolean", NULL, "false", pcmk__valid_boolean, "Whether to lock resources to a cleanly shut down node", "When true, resources active on a node when it is cleanly shut down " "are kept \"locked\" to that node (not allowed to run elsewhere) " "until they start again on that node after it rejoins (or for at " "most shutdown-lock-limit, if set). Stonith resources and " "Pacemaker Remote connections are never locked. Clone and bundle " "instances and the promoted role of promotable clones are currently" " never locked, though support could be added in a future release." }, }; void crmd_metadata(void) { const char *desc_short = "Pacemaker controller options"; const char *desc_long = "Cluster options used by Pacemaker's controller"; gchar *s = pcmk__format_option_metadata("pacemaker-controld", desc_short, desc_long, controller_options, PCMK__NELEM(controller_options)); printf("%s", s); g_free(s); } static const char * controller_option(GHashTable *options, const char *name) { return pcmk__cluster_option(options, controller_options, PCMK__NELEM(controller_options), name); } static void config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { const char *value = NULL; GHashTable *config_hash = NULL; crm_time_t *now = crm_time_new(NULL); xmlNode *crmconfig = NULL; xmlNode *alerts = NULL; if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query resulted in an error: %s", pcmk_strerror(rc)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); if (rc == -EACCES || rc == -pcmk_err_schema_validation) { crm_err("The cluster is mis-configured - shutting down and staying down"); controld_set_fsa_input_flags(R_STAYDOWN); } goto bail; } crmconfig = output; if ((crmconfig) && (crm_element_name(crmconfig)) && (strcmp(crm_element_name(crmconfig), XML_CIB_TAG_CRMCONFIG) != 0)) { crmconfig = first_named_child(crmconfig, XML_CIB_TAG_CRMCONFIG); } if (!crmconfig) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query for " XML_CIB_TAG_CRMCONFIG " section failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); goto bail; } crm_debug("Call %d : Parsing CIB options", call_id); config_hash = pcmk__strkey_table(free, free); pe_unpack_nvpairs(crmconfig, crmconfig, XML_CIB_TAG_PROPSET, NULL, config_hash, CIB_OPTIONS_FIRST, FALSE, now, NULL); pcmk__validate_cluster_options(config_hash, controller_options, PCMK__NELEM(controller_options)); value = controller_option(config_hash, XML_CONFIG_ATTR_DC_DEADTIME); election_timer->period_ms = crm_parse_interval_spec(value); value = controller_option(config_hash, "node-action-limit"); throttle_update_job_max(value); value = controller_option(config_hash, "load-threshold"); if(value) { throttle_set_load_target(strtof(value, NULL) / 100.0); } value = controller_option(config_hash, "no-quorum-policy"); if (pcmk__str_eq(value, "suicide", pcmk__str_casei) && pcmk__locate_sbd()) { no_quorum_suicide_escalation = TRUE; } set_fence_reaction(controller_option(config_hash, XML_CONFIG_ATTR_FENCE_REACTION)); value = controller_option(config_hash, "stonith-max-attempts"); update_stonith_max_attempts(value); value = controller_option(config_hash, XML_CONFIG_ATTR_FORCE_QUIT); shutdown_escalation_timer->period_ms = crm_parse_interval_spec(value); crm_debug("Shutdown escalation occurs if DC has not responded to request in %ums", shutdown_escalation_timer->period_ms); value = controller_option(config_hash, XML_CONFIG_ATTR_ELECTION_FAIL); controld_set_election_period(value); value = controller_option(config_hash, XML_CONFIG_ATTR_RECHECK); recheck_interval_ms = crm_parse_interval_spec(value); crm_debug("Re-run scheduler after %dms of inactivity", recheck_interval_ms); value = controller_option(config_hash, "transition-delay"); transition_timer->period_ms = crm_parse_interval_spec(value); value = controller_option(config_hash, "join-integration-timeout"); integration_timer->period_ms = crm_parse_interval_spec(value); value = controller_option(config_hash, "join-finalization-timeout"); finalization_timer->period_ms = crm_parse_interval_spec(value); value = controller_option(config_hash, XML_CONFIG_ATTR_SHUTDOWN_LOCK); controld_shutdown_lock_enabled = crm_is_true(value); free(fsa_cluster_name); fsa_cluster_name = NULL; value = g_hash_table_lookup(config_hash, "cluster-name"); if (value) { fsa_cluster_name = strdup(value); } alerts = first_named_child(output, XML_CIB_TAG_ALERTS); crmd_unpack_alerts(alerts); controld_set_fsa_input_flags(R_READ_CONFIG); crm_trace("Triggering FSA: %s", __func__); mainloop_set_trigger(fsa_source); g_hash_table_destroy(config_hash); bail: crm_time_free(now); } gboolean crm_read_options(gpointer user_data) { int call_id = fsa_cib_conn->cmds->query(fsa_cib_conn, "//" XML_CIB_TAG_CRMCONFIG " | //" XML_CIB_TAG_ALERTS, NULL, cib_xpath | cib_scope_local); fsa_register_cib_callback(call_id, FALSE, NULL, config_query_callback); crm_trace("Querying the CIB... call %d", call_id); return TRUE; } /* A_READCONFIG */ void do_read_config(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { throttle_init(); mainloop_set_trigger(config_read); } void crm_shutdown(int nsig) { if ((crmd_mainloop == NULL) || !g_main_loop_is_running(crmd_mainloop)) { crmd_exit(CRM_EX_OK); return; } if (pcmk_is_set(fsa_input_register, R_SHUTDOWN)) { crm_err("Escalating shutdown"); register_fsa_input_before(C_SHUTDOWN, I_ERROR, NULL); return; } controld_set_fsa_input_flags(R_SHUTDOWN); register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL); if (shutdown_escalation_timer->period_ms == 0) { const char *value = controller_option(NULL, XML_CONFIG_ATTR_FORCE_QUIT); shutdown_escalation_timer->period_ms = crm_parse_interval_spec(value); } crm_notice("Initiating controller shutdown sequence " CRM_XS " limit=%ums", shutdown_escalation_timer->period_ms); controld_start_timer(shutdown_escalation_timer); } diff --git a/daemons/controld/controld_membership.c b/daemons/controld/controld_membership.c index 53a4c21699..6e2a84ecd2 100644 --- a/daemons/controld/controld_membership.c +++ b/daemons/controld/controld_membership.c @@ -1,444 +1,451 @@ /* * Copyright 2004-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ /* put these first so that uuid_t is defined without conflicts */ #include #include #include #include #include #include #include #include void post_cache_update(int instance); extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); static void reap_dead_nodes(gpointer key, gpointer value, gpointer user_data) { crm_node_t *node = value; if (crm_is_peer_active(node) == FALSE) { crm_update_peer_join(__func__, node, crm_join_none); if(node && node->uname) { if (pcmk__str_eq(fsa_our_uname, node->uname, pcmk__str_casei)) { crm_err("We're not part of the cluster anymore"); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); } else if (AM_I_DC == FALSE && pcmk__str_eq(node->uname, fsa_our_dc, pcmk__str_casei)) { crm_warn("Our DC node (%s) left the cluster", node->uname); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); } } if (fsa_state == S_INTEGRATION || fsa_state == S_FINALIZE_JOIN) { check_join_state(fsa_state, __func__); } if(node && node->uuid) { fail_incompletable_actions(transition_graph, node->uuid); } } } void post_cache_update(int instance) { xmlNode *no_op = NULL; crm_peer_seq = instance; crm_debug("Updated cache after membership event %d.", instance); g_hash_table_foreach(crm_peer_cache, reap_dead_nodes, NULL); controld_set_fsa_input_flags(R_MEMBERSHIP); if (AM_I_DC) { populate_cib_nodes(node_update_quick | node_update_cluster | node_update_peer | node_update_expected, __func__); } /* * If we lost nodes, we should re-check the election status * Safe to call outside of an election */ controld_set_fsa_action_flags(A_ELECTION_CHECK); trigger_fsa(); /* Membership changed, remind everyone we're here. * This will aid detection of duplicate DCs */ no_op = create_request(CRM_OP_NOOP, NULL, NULL, CRM_SYSTEM_CRMD, AM_I_DC ? CRM_SYSTEM_DC : CRM_SYSTEM_CRMD, NULL); send_cluster_message(NULL, crm_msg_crmd, no_op, FALSE); free_xml(no_op); } static void crmd_node_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if (rc == pcmk_ok) { crm_trace("Node update %d complete", call_id); } else if(call_id < pcmk_ok) { crm_err("Node update failed: %s (%d)", pcmk_strerror(call_id), call_id); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } else { crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } /*! * \internal * \brief Create an XML node state tag with updates * * \param[in,out] node Node whose state will be used for update * \param[in] flags Bitmask of node_update_flags indicating what to update * \param[in,out] parent XML node to contain update (or NULL) * \param[in] source Who requested the update (only used for logging) * * \return Pointer to created node state tag */ xmlNode * create_node_state_update(crm_node_t *node, int flags, xmlNode *parent, const char *source) { const char *value = NULL; xmlNode *node_state; if (!node->state) { crm_info("Node update for %s cancelled: no state, not seen yet", node->uname); return NULL; } node_state = create_xml_node(parent, XML_CIB_TAG_STATE); if (pcmk_is_set(node->flags, crm_remote_node)) { pcmk__xe_set_bool_attr(node_state, XML_NODE_IS_REMOTE, true); } set_uuid(node_state, XML_ATTR_UUID, node); if (crm_element_value(node_state, XML_ATTR_UUID) == NULL) { crm_info("Node update for %s cancelled: no id", node->uname); free_xml(node_state); return NULL; } crm_xml_add(node_state, XML_ATTR_UNAME, node->uname); if ((flags & node_update_cluster) && node->state) { pcmk__xe_set_bool_attr(node_state, XML_NODE_IN_CLUSTER, pcmk__str_eq(node->state, CRM_NODE_MEMBER, pcmk__str_casei)); } if (!pcmk_is_set(node->flags, crm_remote_node)) { if (flags & node_update_peer) { value = OFFLINESTATUS; if (pcmk_is_set(node->processes, crm_get_cluster_proc())) { value = ONLINESTATUS; } crm_xml_add(node_state, XML_NODE_IS_PEER, value); } if (flags & node_update_join) { if (node->join <= crm_join_none) { value = CRMD_JOINSTATE_DOWN; } else { value = CRMD_JOINSTATE_MEMBER; } crm_xml_add(node_state, XML_NODE_JOIN_STATE, value); } if (flags & node_update_expected) { crm_xml_add(node_state, XML_NODE_EXPECTED, node->expected); } } crm_xml_add(node_state, XML_ATTR_ORIGIN, source); return node_state; } static void remove_conflicting_node_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { char *node_uuid = user_data; do_crm_log_unlikely(rc == 0 ? LOG_DEBUG : LOG_NOTICE, "Deletion of the unknown conflicting node \"%s\": %s (rc=%d)", node_uuid, pcmk_strerror(rc), rc); } static void search_conflicting_node_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { char *new_node_uuid = user_data; xmlNode *node_xml = NULL; if (rc != pcmk_ok) { if (rc != -ENXIO) { crm_notice("Searching conflicting nodes for %s failed: %s (%d)", new_node_uuid, pcmk_strerror(rc), rc); } return; } else if (output == NULL) { return; } if (pcmk__str_eq(crm_element_name(output), XML_CIB_TAG_NODE, pcmk__str_casei)) { node_xml = output; } else { node_xml = pcmk__xml_first_child(output); } for (; node_xml != NULL; node_xml = pcmk__xml_next(node_xml)) { const char *node_uuid = NULL; const char *node_uname = NULL; GHashTableIter iter; crm_node_t *node = NULL; gboolean known = FALSE; if (!pcmk__str_eq(crm_element_name(node_xml), XML_CIB_TAG_NODE, pcmk__str_casei)) { continue; } node_uuid = crm_element_value(node_xml, XML_ATTR_ID); node_uname = crm_element_value(node_xml, XML_ATTR_UNAME); if (node_uuid == NULL || node_uname == NULL) { continue; } g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (node->uuid && pcmk__str_eq(node->uuid, node_uuid, pcmk__str_casei) && node->uname && pcmk__str_eq(node->uname, node_uname, pcmk__str_casei)) { known = TRUE; break; } } if (known == FALSE) { int delete_call_id = 0; xmlNode *node_state_xml = NULL; crm_notice("Deleting unknown node %s/%s which has conflicting uname with %s", node_uuid, node_uname, new_node_uuid); delete_call_id = fsa_cib_conn->cmds->remove(fsa_cib_conn, XML_CIB_TAG_NODES, node_xml, cib_scope_local | cib_quorum_override); fsa_register_cib_callback(delete_call_id, FALSE, strdup(node_uuid), remove_conflicting_node_callback); node_state_xml = create_xml_node(NULL, XML_CIB_TAG_STATE); crm_xml_add(node_state_xml, XML_ATTR_ID, node_uuid); crm_xml_add(node_state_xml, XML_ATTR_UNAME, node_uname); delete_call_id = fsa_cib_conn->cmds->remove(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state_xml, cib_scope_local | cib_quorum_override); fsa_register_cib_callback(delete_call_id, FALSE, strdup(node_uuid), remove_conflicting_node_callback); free_xml(node_state_xml); } } } static void node_list_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if(call_id < pcmk_ok) { crm_err("Node list update failed: %s (%d)", pcmk_strerror(call_id), call_id); crm_log_xml_debug(msg, "update:failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } else if(rc < pcmk_ok) { crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "update:failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } void populate_cib_nodes(enum node_update_flags flags, const char *source) { int call_id = 0; gboolean from_hashtable = TRUE; int call_options = cib_scope_local | cib_quorum_override; xmlNode *node_list = create_xml_node(NULL, XML_CIB_TAG_NODES); #if SUPPORT_COROSYNC if (!pcmk_is_set(flags, node_update_quick) && is_corosync_cluster()) { from_hashtable = pcmk__corosync_add_nodes(node_list); } #endif if (from_hashtable) { GHashTableIter iter; crm_node_t *node = NULL; GString *xpath = NULL; g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { xmlNode *new_node = NULL; if ((node->uuid != NULL) && (node->uname != NULL)) { crm_trace("Creating node entry for %s/%s", node->uname, node->uuid); if (xpath == NULL) { xpath = g_string_sized_new(512); } else { g_string_truncate(xpath, 0); } /* We need both to be valid */ new_node = create_xml_node(node_list, XML_CIB_TAG_NODE); crm_xml_add(new_node, XML_ATTR_ID, node->uuid); crm_xml_add(new_node, XML_ATTR_UNAME, node->uname); /* Search and remove unknown nodes with the conflicting uname from CIB */ pcmk__g_strcat(xpath, "/" XML_TAG_CIB "/" XML_CIB_TAG_CONFIGURATION "/" XML_CIB_TAG_NODES "/" XML_CIB_TAG_NODE "[@" XML_ATTR_UNAME "='", node->uname, "']" "[@" XML_ATTR_ID "!='", node->uuid, "']", NULL); call_id = fsa_cib_conn->cmds->query(fsa_cib_conn, (const char *) xpath->str, NULL, cib_scope_local | cib_xpath); fsa_register_cib_callback(call_id, FALSE, strdup(node->uuid), search_conflicting_node_callback); } } if (xpath != NULL) { g_string_free(xpath, TRUE); } } crm_trace("Populating section from %s", from_hashtable ? "hashtable" : "cluster"); fsa_cib_update(XML_CIB_TAG_NODES, node_list, call_options, call_id, NULL); fsa_register_cib_callback(call_id, FALSE, NULL, node_list_update_callback); free_xml(node_list); if (call_id >= pcmk_ok && crm_peer_cache != NULL && AM_I_DC) { /* * There is no need to update the local CIB with our values if * we've not seen valid membership data */ GHashTableIter iter; crm_node_t *node = NULL; node_list = create_xml_node(NULL, XML_CIB_TAG_STATUS); g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { create_node_state_update(node, flags, node_list, source); } if (crm_remote_peer_cache) { g_hash_table_iter_init(&iter, crm_remote_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { create_node_state_update(node, flags, node_list, source); } } fsa_cib_update(XML_CIB_TAG_STATUS, node_list, call_options, call_id, NULL); fsa_register_cib_callback(call_id, FALSE, NULL, crmd_node_update_complete); free_xml(node_list); } } static void cib_quorum_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if (rc == pcmk_ok) { crm_trace("Quorum update %d complete", call_id); } else { crm_err("Quorum update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } void crm_update_quorum(gboolean quorum, gboolean force_update) { if (quorum) { controld_globals.flags |= controld_ever_had_quorum; } else if (pcmk_is_set(controld_globals.flags, controld_ever_had_quorum) && no_quorum_suicide_escalation) { pcmk__panic(__func__); } - if (AM_I_DC && (force_update || fsa_has_quorum != quorum)) { + if (AM_I_DC + && ((pcmk_is_set(controld_globals.flags, controld_has_quorum) != quorum) + || force_update)) { int call_id = 0; xmlNode *update = NULL; int call_options = cib_scope_local | cib_quorum_override; update = create_xml_node(NULL, XML_TAG_CIB); crm_xml_add_int(update, XML_ATTR_HAVE_QUORUM, quorum); crm_xml_add(update, XML_ATTR_DC_UUID, fsa_our_uuid); fsa_cib_update(XML_TAG_CIB, update, call_options, call_id, NULL); crm_debug("Updating quorum status to %s (call=%d)", pcmk__btoa(quorum), call_id); fsa_register_cib_callback(call_id, FALSE, NULL, cib_quorum_update_complete); free_xml(update); /* Quorum changes usually cause a new transition via other activity: * quorum gained via a node joining will abort via the node join, * and quorum lost via a node leaving will usually abort via resource * activity and/or fencing. * * However, it is possible that nothing else causes a transition (e.g. * someone forces quorum via corosync-cmaptcl, or quorum is lost due to * a node in standby shutting down cleanly), so here ensure a new * transition is triggered. */ if (quorum) { /* If quorum was gained, abort after a short delay, in case multiple * nodes are joining around the same time, so the one that brings us * to quorum doesn't cause all the remaining ones to be fenced. */ abort_after_delay(INFINITY, pcmk__graph_restart, "Quorum gained", 5000); } else { abort_transition(INFINITY, pcmk__graph_restart, "Quorum lost", NULL); } } - fsa_has_quorum = quorum; + + if (quorum) { + controld_globals.flags |= controld_has_quorum; + } else { + controld_globals.flags &= ~controld_has_quorum; + } } diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c index af38b196fb..87be5bcc27 100644 --- a/daemons/controld/controld_messages.c +++ b/daemons/controld/controld_messages.c @@ -1,1289 +1,1291 @@ /* * Copyright 2004-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include GList *fsa_message_queue = NULL; extern void crm_shutdown(int nsig); static enum crmd_fsa_input handle_message(xmlNode *msg, enum crmd_fsa_cause cause); static void handle_response(xmlNode *stored_msg); static enum crmd_fsa_input handle_request(xmlNode *stored_msg, enum crmd_fsa_cause cause); static enum crmd_fsa_input handle_shutdown_request(xmlNode *stored_msg); static void send_msg_via_ipc(xmlNode * msg, const char *sys); /* debug only, can wrap all it likes */ static int last_data_id = 0; void register_fsa_error_adv(enum crmd_fsa_cause cause, enum crmd_fsa_input input, fsa_data_t * cur_data, void *new_data, const char *raised_from) { /* save the current actions if any */ if (fsa_actions != A_NOTHING) { register_fsa_input_adv(cur_data ? cur_data->fsa_cause : C_FSA_INTERNAL, I_NULL, cur_data ? cur_data->data : NULL, fsa_actions, TRUE, __func__); } /* reset the action list */ crm_info("Resetting the current action list"); fsa_dump_actions(fsa_actions, "Drop"); fsa_actions = A_NOTHING; /* register the error */ register_fsa_input_adv(cause, input, new_data, A_NOTHING, TRUE, raised_from); } void register_fsa_input_adv(enum crmd_fsa_cause cause, enum crmd_fsa_input input, void *data, uint64_t with_actions, gboolean prepend, const char *raised_from) { unsigned old_len = g_list_length(fsa_message_queue); fsa_data_t *fsa_data = NULL; if (raised_from == NULL) { raised_from = ""; } if (input == I_NULL && with_actions == A_NOTHING /* && data == NULL */ ) { /* no point doing anything */ crm_err("Cannot add entry to queue: no input and no action"); return; } if (input == I_WAIT_FOR_EVENT) { controld_globals.flags |= controld_fsa_is_stalled; crm_debug("Stalling the FSA pending further input: source=%s cause=%s data=%p queue=%d", raised_from, fsa_cause2string(cause), data, old_len); if (old_len > 0) { fsa_dump_queue(LOG_TRACE); prepend = FALSE; } if (data == NULL) { controld_set_fsa_action_flags(with_actions); fsa_dump_actions(with_actions, "Restored"); return; } /* Store everything in the new event and reset fsa_actions */ with_actions |= fsa_actions; fsa_actions = A_NOTHING; } last_data_id++; crm_trace("%s %s FSA input %d (%s) due to %s, %s data", raised_from, (prepend? "prepended" : "appended"), last_data_id, fsa_input2string(input), fsa_cause2string(cause), (data? "with" : "without")); fsa_data = calloc(1, sizeof(fsa_data_t)); fsa_data->id = last_data_id; fsa_data->fsa_input = input; fsa_data->fsa_cause = cause; fsa_data->origin = raised_from; fsa_data->data = NULL; fsa_data->data_type = fsa_dt_none; fsa_data->actions = with_actions; if (with_actions != A_NOTHING) { crm_trace("Adding actions %.16llx to input", (unsigned long long) with_actions); } if (data != NULL) { switch (cause) { case C_FSA_INTERNAL: case C_CRMD_STATUS_CALLBACK: case C_IPC_MESSAGE: case C_HA_MESSAGE: CRM_CHECK(((ha_msg_input_t *) data)->msg != NULL, crm_err("Bogus data from %s", raised_from)); crm_trace("Copying %s data from %s as cluster message data", fsa_cause2string(cause), raised_from); fsa_data->data = copy_ha_msg_input(data); fsa_data->data_type = fsa_dt_ha_msg; break; case C_LRM_OP_CALLBACK: crm_trace("Copying %s data from %s as lrmd_event_data_t", fsa_cause2string(cause), raised_from); fsa_data->data = lrmd_copy_event((lrmd_event_data_t *) data); fsa_data->data_type = fsa_dt_lrm; break; case C_TIMER_POPPED: case C_SHUTDOWN: case C_UNKNOWN: case C_STARTUP: crm_crit("Copying %s data (from %s) is not yet implemented", fsa_cause2string(cause), raised_from); crmd_exit(CRM_EX_SOFTWARE); break; } } /* make sure to free it properly later */ if (prepend) { fsa_message_queue = g_list_prepend(fsa_message_queue, fsa_data); } else { fsa_message_queue = g_list_append(fsa_message_queue, fsa_data); } crm_trace("FSA message queue length is %d", g_list_length(fsa_message_queue)); /* fsa_dump_queue(LOG_TRACE); */ if (old_len == g_list_length(fsa_message_queue)) { crm_err("Couldn't add message to the queue"); } if (fsa_source && input != I_WAIT_FOR_EVENT) { crm_trace("Triggering FSA"); mainloop_set_trigger(fsa_source); } } void fsa_dump_queue(int log_level) { int offset = 0; GList *lpc = NULL; for (lpc = fsa_message_queue; lpc != NULL; lpc = lpc->next) { fsa_data_t *data = (fsa_data_t *) lpc->data; do_crm_log_unlikely(log_level, "queue[%d.%d]: input %s raised by %s(%p.%d)\t(cause=%s)", offset++, data->id, fsa_input2string(data->fsa_input), data->origin, data->data, data->data_type, fsa_cause2string(data->fsa_cause)); } } ha_msg_input_t * copy_ha_msg_input(ha_msg_input_t * orig) { ha_msg_input_t *copy = calloc(1, sizeof(ha_msg_input_t)); CRM_ASSERT(copy != NULL); copy->msg = (orig && orig->msg)? copy_xml(orig->msg) : NULL; copy->xml = get_message_xml(copy->msg, F_CRM_DATA); return copy; } void delete_fsa_input(fsa_data_t * fsa_data) { lrmd_event_data_t *op = NULL; xmlNode *foo = NULL; if (fsa_data == NULL) { return; } crm_trace("About to free %s data", fsa_cause2string(fsa_data->fsa_cause)); if (fsa_data->data != NULL) { switch (fsa_data->data_type) { case fsa_dt_ha_msg: delete_ha_msg_input(fsa_data->data); break; case fsa_dt_xml: foo = fsa_data->data; free_xml(foo); break; case fsa_dt_lrm: op = (lrmd_event_data_t *) fsa_data->data; lrmd_free_event(op); break; case fsa_dt_none: if (fsa_data->data != NULL) { crm_err("Don't know how to free %s data from %s", fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); crmd_exit(CRM_EX_SOFTWARE); } break; } crm_trace("%s data freed", fsa_cause2string(fsa_data->fsa_cause)); } free(fsa_data); } /* returns the next message */ fsa_data_t * get_message(void) { fsa_data_t *message = g_list_nth_data(fsa_message_queue, 0); fsa_message_queue = g_list_remove(fsa_message_queue, message); crm_trace("Processing input %d", message->id); return message; } void * fsa_typed_data_adv(fsa_data_t * fsa_data, enum fsa_data_type a_type, const char *caller) { void *ret_val = NULL; if (fsa_data == NULL) { crm_err("%s: No FSA data available", caller); } else if (fsa_data->data == NULL) { crm_err("%s: No message data available. Origin: %s", caller, fsa_data->origin); } else if (fsa_data->data_type != a_type) { crm_crit("%s: Message data was the wrong type! %d vs. requested=%d. Origin: %s", caller, fsa_data->data_type, a_type, fsa_data->origin); CRM_ASSERT(fsa_data->data_type == a_type); } else { ret_val = fsa_data->data; } return ret_val; } /* A_MSG_ROUTE */ void do_msg_route(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg); route_message(msg_data->fsa_cause, input->msg); } void route_message(enum crmd_fsa_cause cause, xmlNode * input) { ha_msg_input_t fsa_input; enum crmd_fsa_input result = I_NULL; fsa_input.msg = input; CRM_CHECK(cause == C_IPC_MESSAGE || cause == C_HA_MESSAGE, return); /* try passing the buck first */ if (relay_message(input, cause == C_IPC_MESSAGE)) { return; } /* handle locally */ result = handle_message(input, cause); /* done or process later? */ switch (result) { case I_NULL: case I_CIB_OP: case I_ROUTER: case I_NODE_JOIN: case I_JOIN_REQUEST: case I_JOIN_RESULT: break; default: /* Defering local processing of message */ register_fsa_input_later(cause, result, &fsa_input); return; } if (result != I_NULL) { /* add to the front of the queue */ register_fsa_input(cause, result, &fsa_input); } } gboolean relay_message(xmlNode * msg, gboolean originated_locally) { int dest = 1; bool is_for_dc = false; bool is_for_dcib = false; bool is_for_te = false; bool is_for_crm = false; bool is_for_cib = false; bool is_local = false; const char *host_to = crm_element_value(msg, F_CRM_HOST_TO); const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO); const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); const char *type = crm_element_value(msg, F_TYPE); const char *task = crm_element_value(msg, F_CRM_TASK); const char *ref = crm_element_value(msg, XML_ATTR_REFERENCE); if (ref == NULL) { ref = "without reference ID"; } if (msg == NULL) { crm_warn("Cannot route empty message"); return TRUE; } else if (pcmk__str_eq(task, CRM_OP_HELLO, pcmk__str_casei)) { crm_trace("No routing needed for hello message %s", ref); return TRUE; } else if (!pcmk__str_eq(type, T_CRM, pcmk__str_casei)) { crm_warn("Received invalid message %s: type '%s' not '" T_CRM "'", ref, pcmk__s(type, "")); crm_log_xml_warn(msg, "[bad message type]"); return TRUE; } else if (sys_to == NULL) { crm_warn("Received invalid message %s: no subsystem", ref); crm_log_xml_warn(msg, "[no subsystem]"); return TRUE; } is_for_dc = (strcasecmp(CRM_SYSTEM_DC, sys_to) == 0); is_for_dcib = (strcasecmp(CRM_SYSTEM_DCIB, sys_to) == 0); is_for_te = (strcasecmp(CRM_SYSTEM_TENGINE, sys_to) == 0); is_for_cib = (strcasecmp(CRM_SYSTEM_CIB, sys_to) == 0); is_for_crm = (strcasecmp(CRM_SYSTEM_CRMD, sys_to) == 0); is_local = false; if (pcmk__str_empty(host_to)) { if (is_for_dc || is_for_te) { is_local = false; } else if (is_for_crm) { if (pcmk__strcase_any_of(task, CRM_OP_NODE_INFO, PCMK__CONTROLD_CMD_NODES, NULL)) { /* Node info requests do not specify a host, which is normally * treated as "all hosts", because the whole point is that the * client may not know the local node name. Always handle these * requests locally. */ is_local = true; } else { is_local = !originated_locally; } } else { is_local = true; } } else if (pcmk__str_eq(fsa_our_uname, host_to, pcmk__str_casei)) { is_local = true; } else if (is_for_crm && pcmk__str_eq(task, CRM_OP_LRM_DELETE, pcmk__str_casei)) { xmlNode *msg_data = get_message_xml(msg, F_CRM_DATA); const char *mode = crm_element_value(msg_data, PCMK__XA_MODE); if (pcmk__str_eq(mode, XML_TAG_CIB, pcmk__str_casei)) { // Local delete of an offline node's resource history is_local = true; } } if (is_for_dc || is_for_dcib || is_for_te) { if (AM_I_DC && is_for_te) { crm_trace("Route message %s locally as transition request", ref); send_msg_via_ipc(msg, sys_to); } else if (AM_I_DC) { crm_trace("Route message %s locally as DC request", ref); return FALSE; // More to be done by caller } else if (originated_locally && !pcmk__strcase_any_of(sys_from, CRM_SYSTEM_PENGINE, CRM_SYSTEM_TENGINE, NULL)) { #if SUPPORT_COROSYNC if (is_corosync_cluster()) { dest = text2msg_type(sys_to); } #endif crm_trace("Relay message %s to DC", ref); send_cluster_message(host_to ? crm_get_peer(0, host_to) : NULL, dest, msg, TRUE); } else { /* Neither the TE nor the scheduler should be sending messages * to DCs on other nodes. By definition, if we are no longer the DC, * then the scheduler's or TE's data should be discarded. */ crm_trace("Discard message %s because we are not DC", ref); } } else if (is_local && (is_for_crm || is_for_cib)) { crm_trace("Route message %s locally as controller request", ref); return FALSE; // More to be done by caller } else if (is_local) { crm_trace("Relay message %s locally to %s", ref, (sys_to? sys_to : "unknown client")); crm_log_xml_trace(msg, "[IPC relay]"); send_msg_via_ipc(msg, sys_to); } else { crm_node_t *node_to = NULL; #if SUPPORT_COROSYNC if (is_corosync_cluster()) { dest = text2msg_type(sys_to); if (dest == crm_msg_none || dest > crm_msg_stonith_ng) { dest = crm_msg_crmd; } } #endif if (host_to) { node_to = pcmk__search_cluster_node_cache(0, host_to); if (node_to == NULL) { crm_warn("Cannot route message %s: Unknown node %s", ref, host_to); return TRUE; } crm_trace("Relay message %s to %s", ref, (node_to->uname? node_to->uname : "peer")); } else { crm_trace("Broadcast message %s to all peers", ref); } send_cluster_message(host_to ? node_to : NULL, dest, msg, TRUE); } return TRUE; // No further processing of message is needed } // Return true if field contains a positive integer static bool authorize_version(xmlNode *message_data, const char *field, const char *client_name, const char *ref, const char *uuid) { const char *version = crm_element_value(message_data, field); long long version_num; if ((pcmk__scan_ll(version, &version_num, -1LL) != pcmk_rc_ok) || (version_num < 0LL)) { crm_warn("Rejected IPC hello from %s: '%s' is not a valid protocol %s " CRM_XS " ref=%s uuid=%s", client_name, ((version == NULL)? "" : version), field, (ref? ref : "none"), uuid); return false; } return true; } /*! * \internal * \brief Check whether a client IPC message is acceptable * * If a given client IPC message is a hello, "authorize" it by ensuring it has * valid information such as a protocol version, and return false indicating * that nothing further needs to be done with the message. If the message is not * a hello, just return true to indicate it needs further processing. * * \param[in] client_msg XML of IPC message * \param[in,out] curr_client If IPC is not proxied, client that sent message * \param[in] proxy_session If IPC is proxied, the session ID * * \return true if message needs further processing, false if it doesn't */ bool controld_authorize_ipc_message(const xmlNode *client_msg, pcmk__client_t *curr_client, const char *proxy_session) { xmlNode *message_data = NULL; const char *client_name = NULL; const char *op = crm_element_value(client_msg, F_CRM_TASK); const char *ref = crm_element_value(client_msg, XML_ATTR_REFERENCE); const char *uuid = (curr_client? curr_client->id : proxy_session); if (uuid == NULL) { crm_warn("IPC message from client rejected: No client identifier " CRM_XS " ref=%s", (ref? ref : "none")); goto rejected; } if (!pcmk__str_eq(CRM_OP_HELLO, op, pcmk__str_casei)) { // Only hello messages need to be authorized return true; } message_data = get_message_xml(client_msg, F_CRM_DATA); client_name = crm_element_value(message_data, "client_name"); if (pcmk__str_empty(client_name)) { crm_warn("IPC hello from client rejected: No client name", CRM_XS " ref=%s uuid=%s", (ref? ref : "none"), uuid); goto rejected; } if (!authorize_version(message_data, "major_version", client_name, ref, uuid)) { goto rejected; } if (!authorize_version(message_data, "minor_version", client_name, ref, uuid)) { goto rejected; } crm_trace("Validated IPC hello from client %s", client_name); if (curr_client) { curr_client->userdata = strdup(client_name); } mainloop_set_trigger(fsa_source); return false; rejected: if (curr_client) { qb_ipcs_disconnect(curr_client->ipcs); } return false; } static enum crmd_fsa_input handle_message(xmlNode *msg, enum crmd_fsa_cause cause) { const char *type = NULL; CRM_CHECK(msg != NULL, return I_NULL); type = crm_element_value(msg, F_CRM_MSG_TYPE); if (pcmk__str_eq(type, XML_ATTR_REQUEST, pcmk__str_none)) { return handle_request(msg, cause); } else if (pcmk__str_eq(type, XML_ATTR_RESPONSE, pcmk__str_none)) { handle_response(msg); return I_NULL; } crm_err("Unknown message type: %s", type); return I_NULL; } static enum crmd_fsa_input handle_failcount_op(xmlNode * stored_msg) { const char *rsc = NULL; const char *uname = NULL; const char *op = NULL; char *interval_spec = NULL; guint interval_ms = 0; gboolean is_remote_node = FALSE; xmlNode *xml_op = get_message_xml(stored_msg, F_CRM_DATA); if (xml_op) { xmlNode *xml_rsc = first_named_child(xml_op, XML_CIB_TAG_RESOURCE); xmlNode *xml_attrs = first_named_child(xml_op, XML_TAG_ATTRS); if (xml_rsc) { rsc = ID(xml_rsc); } if (xml_attrs) { op = crm_element_value(xml_attrs, CRM_META "_" XML_RSC_ATTR_CLEAR_OP); crm_element_value_ms(xml_attrs, CRM_META "_" XML_RSC_ATTR_CLEAR_INTERVAL, &interval_ms); } } uname = crm_element_value(xml_op, XML_LRM_ATTR_TARGET); if ((rsc == NULL) || (uname == NULL)) { crm_log_xml_warn(stored_msg, "invalid failcount op"); return I_NULL; } if (crm_element_value(xml_op, XML_LRM_ATTR_ROUTER_NODE)) { is_remote_node = TRUE; } if (interval_ms) { interval_spec = crm_strdup_printf("%ums", interval_ms); } update_attrd_clear_failures(uname, rsc, op, interval_spec, is_remote_node); free(interval_spec); lrm_clear_last_failure(rsc, uname, op, interval_ms); return I_NULL; } static enum crmd_fsa_input handle_lrm_delete(xmlNode *stored_msg) { const char *mode = NULL; xmlNode *msg_data = get_message_xml(stored_msg, F_CRM_DATA); CRM_CHECK(msg_data != NULL, return I_NULL); /* CRM_OP_LRM_DELETE has two distinct modes. The default behavior is to * relay the operation to the affected node, which will unregister the * resource from the local executor, clear the resource's history from the * CIB, and do some bookkeeping in the controller. * * However, if the affected node is offline, the client will specify * mode="cib" which means the controller receiving the operation should * clear the resource's history from the CIB and nothing else. This is used * to clear shutdown locks. */ mode = crm_element_value(msg_data, PCMK__XA_MODE); if ((mode == NULL) || strcmp(mode, XML_TAG_CIB)) { // Relay to affected node crm_xml_add(stored_msg, F_CRM_SYS_TO, CRM_SYSTEM_LRMD); return I_ROUTER; } else { // Delete CIB history locally (compare with do_lrm_delete()) const char *from_sys = NULL; const char *user_name = NULL; const char *rsc_id = NULL; const char *node = NULL; xmlNode *rsc_xml = NULL; int rc = pcmk_rc_ok; rsc_xml = first_named_child(msg_data, XML_CIB_TAG_RESOURCE); CRM_CHECK(rsc_xml != NULL, return I_NULL); rsc_id = ID(rsc_xml); from_sys = crm_element_value(stored_msg, F_CRM_SYS_FROM); node = crm_element_value(msg_data, XML_LRM_ATTR_TARGET); user_name = pcmk__update_acl_user(stored_msg, F_CRM_USER, NULL); crm_debug("Handling " CRM_OP_LRM_DELETE " for %s on %s locally%s%s " "(clearing CIB resource history only)", rsc_id, node, (user_name? " for user " : ""), (user_name? user_name : "")); rc = controld_delete_resource_history(rsc_id, node, user_name, cib_dryrun|cib_sync_call); if (rc == pcmk_rc_ok) { rc = controld_delete_resource_history(rsc_id, node, user_name, crmd_cib_smart_opt()); } //Notify client and tengine.(Only notify tengine if mode = "cib" and CRM_OP_LRM_DELETE.) if (from_sys) { lrmd_event_data_t *op = NULL; const char *from_host = crm_element_value(stored_msg, F_CRM_HOST_FROM); const char *transition; if (strcmp(from_sys, CRM_SYSTEM_TENGINE)) { transition = crm_element_value(msg_data, XML_ATTR_TRANSITION_KEY); } else { transition = crm_element_value(stored_msg, XML_ATTR_TRANSITION_KEY); } crm_info("Notifying %s on %s that %s was%s deleted", from_sys, (from_host? from_host : "local node"), rsc_id, ((rc == pcmk_rc_ok)? "" : " not")); op = lrmd_new_event(rsc_id, CRMD_ACTION_DELETE, 0); op->type = lrmd_event_exec_complete; op->user_data = strdup(transition? transition : FAKE_TE_ID); op->params = pcmk__strkey_table(free, free); g_hash_table_insert(op->params, strdup(XML_ATTR_CRM_VERSION), strdup(CRM_FEATURE_SET)); controld_rc2event(op, rc); controld_ack_event_directly(from_host, from_sys, NULL, op, rsc_id); lrmd_free_event(op); controld_trigger_delete_refresh(from_sys, rsc_id); } return I_NULL; } } /*! * \brief Handle a CRM_OP_REMOTE_STATE message by updating remote peer cache * * \param[in] msg Message XML * * \return Next FSA input */ static enum crmd_fsa_input handle_remote_state(const xmlNode *msg) { const char *remote_uname = ID(msg); crm_node_t *remote_peer; bool remote_is_up = false; int rc = pcmk_rc_ok; rc = pcmk__xe_get_bool_attr(msg, XML_NODE_IN_CLUSTER, &remote_is_up); CRM_CHECK(remote_uname && rc == pcmk_rc_ok, return I_NULL); remote_peer = crm_remote_peer_get(remote_uname); CRM_CHECK(remote_peer, return I_NULL); pcmk__update_peer_state(__func__, remote_peer, remote_is_up ? CRM_NODE_MEMBER : CRM_NODE_LOST, 0); return I_NULL; } /*! * \brief Handle a CRM_OP_PING message * * \param[in] msg Message XML * * \return Next FSA input */ static enum crmd_fsa_input handle_ping(const xmlNode *msg) { const char *value = NULL; xmlNode *ping = NULL; xmlNode *reply = NULL; // Build reply ping = create_xml_node(NULL, XML_CRM_TAG_PING); value = crm_element_value(msg, F_CRM_SYS_TO); crm_xml_add(ping, XML_PING_ATTR_SYSFROM, value); // Add controller state value = fsa_state2string(fsa_state); crm_xml_add(ping, XML_PING_ATTR_CRMDSTATE, value); crm_notice("Current ping state: %s", value); // CTS needs this // Add controller health // @TODO maybe do some checks to determine meaningful status crm_xml_add(ping, XML_PING_ATTR_STATUS, "ok"); // Send reply reply = create_reply(msg, ping); free_xml(ping); if (reply != NULL) { (void) relay_message(reply, TRUE); free_xml(reply); } // Nothing further to do return I_NULL; } /*! * \brief Handle a PCMK__CONTROLD_CMD_NODES message * * \param[in] request Message XML * * \return Next FSA input */ static enum crmd_fsa_input handle_node_list(const xmlNode *request) { GHashTableIter iter; crm_node_t *node = NULL; xmlNode *reply = NULL; xmlNode *reply_data = NULL; // Create message data for reply reply_data = create_xml_node(NULL, XML_CIB_TAG_NODES); g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & node)) { xmlNode *xml = create_xml_node(reply_data, XML_CIB_TAG_NODE); crm_xml_add_ll(xml, XML_ATTR_ID, (long long) node->id); // uint32_t crm_xml_add(xml, XML_ATTR_UNAME, node->uname); crm_xml_add(xml, XML_NODE_IN_CLUSTER, node->state); } // Create and send reply reply = create_reply(request, reply_data); free_xml(reply_data); if (reply) { (void) relay_message(reply, TRUE); free_xml(reply); } // Nothing further to do return I_NULL; } /*! * \brief Handle a CRM_OP_NODE_INFO request * * \param[in] msg Message XML * * \return Next FSA input */ static enum crmd_fsa_input handle_node_info_request(const xmlNode *msg) { const char *value = NULL; crm_node_t *node = NULL; int node_id = 0; xmlNode *reply = NULL; xmlNode *reply_data = NULL; // Build reply reply_data = create_xml_node(NULL, XML_CIB_TAG_NODE); crm_xml_add(reply_data, XML_PING_ATTR_SYSFROM, CRM_SYSTEM_CRMD); // Add whether current partition has quorum - pcmk__xe_set_bool_attr(reply_data, XML_ATTR_HAVE_QUORUM, fsa_has_quorum); + pcmk__xe_set_bool_attr(reply_data, XML_ATTR_HAVE_QUORUM, + pcmk_is_set(controld_globals.flags, + controld_has_quorum)); // Check whether client requested node info by ID and/or name crm_element_value_int(msg, XML_ATTR_ID, &node_id); if (node_id < 0) { node_id = 0; } value = crm_element_value(msg, XML_ATTR_UNAME); // Default to local node if none given if ((node_id == 0) && (value == NULL)) { value = fsa_our_uname; } node = pcmk__search_node_caches(node_id, value, CRM_GET_PEER_ANY); if (node) { crm_xml_add_int(reply_data, XML_ATTR_ID, node->id); crm_xml_add(reply_data, XML_ATTR_UUID, node->uuid); crm_xml_add(reply_data, XML_ATTR_UNAME, node->uname); crm_xml_add(reply_data, XML_NODE_IS_PEER, node->state); pcmk__xe_set_bool_attr(reply_data, XML_NODE_IS_REMOTE, pcmk_is_set(node->flags, crm_remote_node)); } // Send reply reply = create_reply(msg, reply_data); free_xml(reply_data); if (reply != NULL) { (void) relay_message(reply, TRUE); free_xml(reply); } // Nothing further to do return I_NULL; } static void verify_feature_set(xmlNode *msg) { const char *dc_version = crm_element_value(msg, XML_ATTR_CRM_VERSION); if (dc_version == NULL) { /* All we really know is that the DC feature set is older than 3.1.0, * but that's also all that really matters. */ dc_version = "3.0.14"; } if (feature_set_compatible(dc_version, CRM_FEATURE_SET)) { crm_trace("Local feature set (%s) is compatible with DC's (%s)", CRM_FEATURE_SET, dc_version); } else { crm_err("Local feature set (%s) is incompatible with DC's (%s)", CRM_FEATURE_SET, dc_version); // Nothing is likely to improve without administrator involvement controld_set_fsa_input_flags(R_STAYDOWN); crmd_exit(CRM_EX_FATAL); } } // DC gets own shutdown all-clear static enum crmd_fsa_input handle_shutdown_self_ack(xmlNode *stored_msg) { const char *host_from = crm_element_value(stored_msg, F_CRM_HOST_FROM); if (pcmk_is_set(fsa_input_register, R_SHUTDOWN)) { // The expected case -- we initiated own shutdown sequence crm_info("Shutting down controller"); return I_STOP; } if (pcmk__str_eq(host_from, fsa_our_dc, pcmk__str_casei)) { // Must be logic error -- DC confirming its own unrequested shutdown crm_err("Shutting down controller immediately due to " "unexpected shutdown confirmation"); return I_TERMINATE; } if (fsa_state != S_STOPPING) { // Shouldn't happen -- non-DC confirming unrequested shutdown crm_err("Starting new DC election because %s is " "confirming shutdown we did not request", (host_from? host_from : "another node")); return I_ELECTION; } // Shouldn't happen, but we are already stopping anyway crm_debug("Ignoring unexpected shutdown confirmation from %s", (host_from? host_from : "another node")); return I_NULL; } // Non-DC gets shutdown all-clear from DC static enum crmd_fsa_input handle_shutdown_ack(xmlNode *stored_msg) { const char *host_from = crm_element_value(stored_msg, F_CRM_HOST_FROM); if (host_from == NULL) { crm_warn("Ignoring shutdown request without origin specified"); return I_NULL; } if ((fsa_our_dc == NULL) || (strcmp(host_from, fsa_our_dc) == 0)) { if (pcmk_is_set(fsa_input_register, R_SHUTDOWN)) { crm_info("Shutting down controller after confirmation from %s", host_from); } else { crm_err("Shutting down controller after unexpected " "shutdown request from %s", host_from); controld_set_fsa_input_flags(R_STAYDOWN); } return I_STOP; } crm_warn("Ignoring shutdown request from %s because DC is %s", host_from, fsa_our_dc); return I_NULL; } static enum crmd_fsa_input handle_request(xmlNode *stored_msg, enum crmd_fsa_cause cause) { xmlNode *msg = NULL; const char *op = crm_element_value(stored_msg, F_CRM_TASK); /* Optimize this for the DC - it has the most to do */ if (op == NULL) { crm_log_xml_warn(stored_msg, "[request without " F_CRM_TASK "]"); return I_NULL; } if (strcmp(op, CRM_OP_SHUTDOWN_REQ) == 0) { const char *from = crm_element_value(stored_msg, F_CRM_HOST_FROM); crm_node_t *node = pcmk__search_cluster_node_cache(0, from); pcmk__update_peer_expected(__func__, node, CRMD_JOINSTATE_DOWN); if(AM_I_DC == FALSE) { return I_NULL; /* Done */ } } /*========== DC-Only Actions ==========*/ if (AM_I_DC) { if (strcmp(op, CRM_OP_JOIN_ANNOUNCE) == 0) { return I_NODE_JOIN; } else if (strcmp(op, CRM_OP_JOIN_REQUEST) == 0) { return I_JOIN_REQUEST; } else if (strcmp(op, CRM_OP_JOIN_CONFIRM) == 0) { return I_JOIN_RESULT; } else if (strcmp(op, CRM_OP_SHUTDOWN) == 0) { return handle_shutdown_self_ack(stored_msg); } else if (strcmp(op, CRM_OP_SHUTDOWN_REQ) == 0) { // Another controller wants to shut down its node return handle_shutdown_request(stored_msg); } else if (strcmp(op, CRM_OP_REMOTE_STATE) == 0) { /* a remote connection host is letting us know the node state */ return handle_remote_state(stored_msg); } } /*========== common actions ==========*/ if (strcmp(op, CRM_OP_NOVOTE) == 0) { ha_msg_input_t fsa_input; fsa_input.msg = stored_msg; register_fsa_input_adv(C_HA_MESSAGE, I_NULL, &fsa_input, A_ELECTION_COUNT | A_ELECTION_CHECK, FALSE, __func__); } else if (strcmp(op, CRM_OP_THROTTLE) == 0) { throttle_update(stored_msg); if (AM_I_DC && transition_graph != NULL) { if (!transition_graph->complete) { crm_debug("The throttle changed. Trigger a graph."); trigger_graph(); } } return I_NULL; } else if (strcmp(op, CRM_OP_CLEAR_FAILCOUNT) == 0) { return handle_failcount_op(stored_msg); } else if (strcmp(op, CRM_OP_VOTE) == 0) { /* count the vote and decide what to do after that */ ha_msg_input_t fsa_input; fsa_input.msg = stored_msg; register_fsa_input_adv(C_HA_MESSAGE, I_NULL, &fsa_input, A_ELECTION_COUNT | A_ELECTION_CHECK, FALSE, __func__); /* Sometimes we _must_ go into S_ELECTION */ if (fsa_state == S_HALT) { crm_debug("Forcing an election from S_HALT"); return I_ELECTION; #if 0 } else if (AM_I_DC) { /* This is the old way of doing things but what is gained? */ return I_ELECTION; #endif } } else if (strcmp(op, CRM_OP_JOIN_OFFER) == 0) { verify_feature_set(stored_msg); crm_debug("Raising I_JOIN_OFFER: join-%s", crm_element_value(stored_msg, F_CRM_JOIN_ID)); return I_JOIN_OFFER; } else if (strcmp(op, CRM_OP_JOIN_ACKNAK) == 0) { crm_debug("Raising I_JOIN_RESULT: join-%s", crm_element_value(stored_msg, F_CRM_JOIN_ID)); return I_JOIN_RESULT; } else if (strcmp(op, CRM_OP_LRM_DELETE) == 0) { return handle_lrm_delete(stored_msg); } else if ((strcmp(op, CRM_OP_LRM_FAIL) == 0) || (strcmp(op, CRM_OP_LRM_REFRESH) == 0) // @COMPAT || (strcmp(op, CRM_OP_REPROBE) == 0)) { crm_xml_add(stored_msg, F_CRM_SYS_TO, CRM_SYSTEM_LRMD); return I_ROUTER; } else if (strcmp(op, CRM_OP_NOOP) == 0) { return I_NULL; } else if (strcmp(op, CRM_OP_LOCAL_SHUTDOWN) == 0) { crm_shutdown(SIGTERM); /*return I_SHUTDOWN; */ return I_NULL; } else if (strcmp(op, CRM_OP_PING) == 0) { return handle_ping(stored_msg); } else if (strcmp(op, CRM_OP_NODE_INFO) == 0) { return handle_node_info_request(stored_msg); } else if (strcmp(op, CRM_OP_RM_NODE_CACHE) == 0) { int id = 0; const char *name = NULL; crm_element_value_int(stored_msg, XML_ATTR_ID, &id); name = crm_element_value(stored_msg, XML_ATTR_UNAME); if(cause == C_IPC_MESSAGE) { msg = create_request(CRM_OP_RM_NODE_CACHE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); if (send_cluster_message(NULL, crm_msg_crmd, msg, TRUE) == FALSE) { crm_err("Could not instruct peers to remove references to node %s/%u", name, id); } else { crm_notice("Instructing peers to remove references to node %s/%u", name, id); } free_xml(msg); } else { reap_crm_member(id, name); /* If we're forgetting this node, also forget any failures to fence * it, so we don't carry that over to any node added later with the * same name. */ st_fail_count_reset(name); } } else if (strcmp(op, CRM_OP_MAINTENANCE_NODES) == 0) { xmlNode *xml = get_message_xml(stored_msg, F_CRM_DATA); remote_ra_process_maintenance_nodes(xml); } else if (strcmp(op, PCMK__CONTROLD_CMD_NODES) == 0) { return handle_node_list(stored_msg); /*========== (NOT_DC)-Only Actions ==========*/ } else if (!AM_I_DC) { if (strcmp(op, CRM_OP_SHUTDOWN) == 0) { return handle_shutdown_ack(stored_msg); } } else { crm_err("Unexpected request (%s) sent to %s", op, AM_I_DC ? "the DC" : "non-DC node"); crm_log_xml_err(stored_msg, "Unexpected"); } return I_NULL; } static void handle_response(xmlNode *stored_msg) { const char *op = crm_element_value(stored_msg, F_CRM_TASK); if (op == NULL) { crm_log_xml_err(stored_msg, "Bad message"); } else if (AM_I_DC && strcmp(op, CRM_OP_PECALC) == 0) { // Check whether scheduler answer been superseded by subsequent request const char *msg_ref = crm_element_value(stored_msg, XML_ATTR_REFERENCE); if (msg_ref == NULL) { crm_err("%s - Ignoring calculation with no reference", op); } else if (pcmk__str_eq(msg_ref, fsa_pe_ref, pcmk__str_casei)) { ha_msg_input_t fsa_input; controld_stop_sched_timer(); fsa_input.msg = stored_msg; register_fsa_input_later(C_IPC_MESSAGE, I_PE_SUCCESS, &fsa_input); } else { crm_info("%s calculation %s is obsolete", op, msg_ref); } } else if (strcmp(op, CRM_OP_VOTE) == 0 || strcmp(op, CRM_OP_SHUTDOWN_REQ) == 0 || strcmp(op, CRM_OP_SHUTDOWN) == 0) { } else { const char *host_from = crm_element_value(stored_msg, F_CRM_HOST_FROM); crm_err("Unexpected response (op=%s, src=%s) sent to the %s", op, host_from, AM_I_DC ? "DC" : "controller"); } } static enum crmd_fsa_input handle_shutdown_request(xmlNode * stored_msg) { /* handle here to avoid potential version issues * where the shutdown message/procedure may have * been changed in later versions. * * This way the DC is always in control of the shutdown */ char *now_s = NULL; const char *host_from = crm_element_value(stored_msg, F_CRM_HOST_FROM); if (host_from == NULL) { /* we're shutting down and the DC */ host_from = fsa_our_uname; } crm_info("Creating shutdown request for %s (state=%s)", host_from, fsa_state2string(fsa_state)); crm_log_xml_trace(stored_msg, "message"); now_s = pcmk__ttoa(time(NULL)); update_attrd(host_from, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, FALSE); free(now_s); /* will be picked up by the TE as long as its running */ return I_NULL; } static void send_msg_via_ipc(xmlNode * msg, const char *sys) { pcmk__client_t *client_channel = NULL; CRM_CHECK(sys != NULL, return); client_channel = pcmk__find_client_by_id(sys); if (crm_element_value(msg, F_CRM_HOST_FROM) == NULL) { crm_xml_add(msg, F_CRM_HOST_FROM, fsa_our_uname); } if (client_channel != NULL) { /* Transient clients such as crmadmin */ pcmk__ipc_send_xml(client_channel, 0, msg, crm_ipc_server_event); } else if (pcmk__str_eq(sys, CRM_SYSTEM_TENGINE, pcmk__str_none)) { xmlNode *data = get_message_xml(msg, F_CRM_DATA); process_te_message(msg, data); } else if (pcmk__str_eq(sys, CRM_SYSTEM_LRMD, pcmk__str_none)) { fsa_data_t fsa_data; ha_msg_input_t fsa_input; fsa_input.msg = msg; fsa_input.xml = get_message_xml(msg, F_CRM_DATA); fsa_data.id = 0; fsa_data.actions = 0; fsa_data.data = &fsa_input; fsa_data.fsa_input = I_MESSAGE; fsa_data.fsa_cause = C_IPC_MESSAGE; fsa_data.origin = __func__; fsa_data.data_type = fsa_dt_ha_msg; do_lrm_invoke(A_LRM_INVOKE, C_IPC_MESSAGE, fsa_state, I_MESSAGE, &fsa_data); } else if (crmd_is_proxy_session(sys)) { crmd_proxy_send(sys, msg); } else { crm_info("Received invalid request: unknown subsystem '%s'", sys); } } void delete_ha_msg_input(ha_msg_input_t * orig) { if (orig == NULL) { return; } free_xml(orig->msg); free(orig); } /*! * \internal * \brief Notify the DC of a remote node state change * * \param[in] node_name Node's name * \param[in] node_up TRUE if node is up, FALSE if down */ void send_remote_state_message(const char *node_name, gboolean node_up) { /* If we don't have a DC, or the message fails, we have a failsafe: * the DC will eventually pick up the change via the CIB node state. * The message allows it to happen sooner if possible. */ if (fsa_our_dc) { xmlNode *msg = create_request(CRM_OP_REMOTE_STATE, NULL, fsa_our_dc, CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL); crm_info("Notifying DC %s of Pacemaker Remote node %s %s", fsa_our_dc, node_name, (node_up? "coming up" : "going down")); crm_xml_add(msg, XML_ATTR_ID, node_name); pcmk__xe_set_bool_attr(msg, XML_NODE_IN_CLUSTER, node_up); send_cluster_message(crm_get_peer(0, fsa_our_dc), crm_msg_crmd, msg, TRUE); free_xml(msg); } else { crm_debug("No DC to notify of Pacemaker Remote node %s %s", node_name, (node_up? "coming up" : "going down")); } } diff --git a/daemons/controld/controld_schedulerd.c b/daemons/controld/controld_schedulerd.c index 8c9e0bbffd..363e0e333c 100644 --- a/daemons/controld/controld_schedulerd.c +++ b/daemons/controld/controld_schedulerd.c @@ -1,494 +1,498 @@ /* * Copyright 2004-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include /* pid_t, sleep, ssize_t */ #include #include #include #include #include #include #include #include #include static void handle_disconnect(void); static pcmk_ipc_api_t *schedulerd_api = NULL; /*! * \internal * \brief Close any scheduler connection and free associated memory */ void controld_shutdown_schedulerd_ipc(void) { controld_clear_fsa_input_flags(R_PE_REQUIRED); pcmk_disconnect_ipc(schedulerd_api); handle_disconnect(); pcmk_free_ipc_api(schedulerd_api); schedulerd_api = NULL; } /*! * \internal * \brief Save CIB query result to file, raising FSA error * * \param[in] msg Ignored * \param[in] call_id Call ID of CIB query * \param[in] rc Return code of CIB query * \param[in,out] output Result of CIB query * \param[in] user_data Unique identifier for filename * * \note This is intended to be called after a scheduler connection fails. */ static void save_cib_contents(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { const char *id = user_data; register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, NULL, NULL, __func__); CRM_CHECK(id != NULL, return); if (rc == pcmk_ok) { char *filename = crm_strdup_printf(PE_STATE_DIR "/pe-core-%s.bz2", id); if (write_xml_file(output, filename, TRUE) < 0) { crm_err("Could not save Cluster Information Base to %s after scheduler crash", filename); } else { crm_notice("Saved Cluster Information Base to %s after scheduler crash", filename); } free(filename); } } /*! * \internal * \brief Respond to scheduler connection failure */ static void handle_disconnect(void) { // If we aren't connected to the scheduler, we can't expect a reply controld_expect_sched_reply(NULL); if (pcmk_is_set(fsa_input_register, R_PE_REQUIRED)) { int rc = pcmk_ok; char *uuid_str = crm_generate_uuid(); crm_crit("Connection to the scheduler failed " CRM_XS " uuid=%s", uuid_str); /* * The scheduler died... * * Save the current CIB so that we have a chance of * figuring out what killed it. * * Delay raising the I_ERROR until the query below completes or * 5s is up, whichever comes first. * */ rc = fsa_cib_conn->cmds->query(fsa_cib_conn, NULL, NULL, cib_scope_local); fsa_register_cib_callback(rc, FALSE, uuid_str, save_cib_contents); } else { crm_info("Connection to the scheduler released"); } controld_clear_fsa_input_flags(R_PE_CONNECTED); mainloop_set_trigger(fsa_source); return; } static void handle_reply(pcmk_schedulerd_api_reply_t *reply) { const char *msg_ref = NULL; if (!AM_I_DC) { return; } msg_ref = reply->data.graph.reference; if (msg_ref == NULL) { crm_err("%s - Ignoring calculation with no reference", CRM_OP_PECALC); } else if (pcmk__str_eq(msg_ref, fsa_pe_ref, pcmk__str_none)) { ha_msg_input_t fsa_input; xmlNode *crm_data_node; controld_stop_sched_timer(); /* do_te_invoke (which will eventually process the fsa_input we are constructing * here) requires that fsa_input.xml be non-NULL. That will only happen if * copy_ha_msg_input (which is called by register_fsa_input_adv) sees the * fsa_input.msg that it is expecting. The scheduler's IPC dispatch function * gave us the values we need, we just need to put them into XML. * * The name of the top level element here is irrelevant. Nothing checks it. */ fsa_input.msg = create_xml_node(NULL, "dummy-reply"); crm_xml_add(fsa_input.msg, XML_ATTR_REFERENCE, msg_ref); crm_xml_add(fsa_input.msg, F_CRM_TGRAPH_INPUT, reply->data.graph.input); crm_data_node = create_xml_node(fsa_input.msg, F_CRM_DATA); add_node_copy(crm_data_node, reply->data.graph.tgraph); register_fsa_input_later(C_IPC_MESSAGE, I_PE_SUCCESS, &fsa_input); free_xml(fsa_input.msg); } else { crm_info("%s calculation %s is obsolete", CRM_OP_PECALC, msg_ref); } } static void scheduler_event_callback(pcmk_ipc_api_t *api, enum pcmk_ipc_event event_type, crm_exit_t status, void *event_data, void *user_data) { pcmk_schedulerd_api_reply_t *reply = event_data; switch (event_type) { case pcmk_ipc_event_disconnect: handle_disconnect(); break; case pcmk_ipc_event_reply: handle_reply(reply); break; default: break; } } static bool new_schedulerd_ipc_connection(void) { int rc; controld_set_fsa_input_flags(R_PE_REQUIRED); if (schedulerd_api == NULL) { rc = pcmk_new_ipc_api(&schedulerd_api, pcmk_ipc_schedulerd); if (rc != pcmk_rc_ok) { crm_err("Error connecting to the scheduler: %s", pcmk_rc_str(rc)); return false; } } pcmk_register_ipc_callback(schedulerd_api, scheduler_event_callback, NULL); rc = pcmk_connect_ipc(schedulerd_api, pcmk_ipc_dispatch_main); if (rc != pcmk_rc_ok) { crm_err("Error connecting to the scheduler: %s", pcmk_rc_str(rc)); return false; } controld_set_fsa_input_flags(R_PE_CONNECTED); return true; } static void do_pe_invoke_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data); /* A_PE_START, A_PE_STOP, O_PE_RESTART */ void do_pe_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { if (action & A_PE_STOP) { controld_clear_fsa_input_flags(R_PE_REQUIRED); pcmk_disconnect_ipc(schedulerd_api); handle_disconnect(); } if ((action & A_PE_START) && !pcmk_is_set(fsa_input_register, R_PE_CONNECTED)) { if (cur_state == S_STOPPING) { crm_info("Ignoring request to connect to scheduler while shutting down"); } else if (!new_schedulerd_ipc_connection()) { crm_warn("Could not connect to scheduler"); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); } } } static int fsa_pe_query = 0; char *fsa_pe_ref = NULL; static mainloop_timer_t *controld_sched_timer = NULL; // @TODO Make this a configurable cluster option if there's demand for it #define SCHED_TIMEOUT_MS (120000) /*! * \internal * \brief Handle a timeout waiting for scheduler reply * * \param[in] user_data Ignored * * \return FALSE (indicating that timer should not be restarted) */ static gboolean controld_sched_timeout(gpointer user_data) { if (AM_I_DC) { /* If this node is the DC but can't communicate with the scheduler, just * exit (and likely get fenced) so this node doesn't interfere with any * further DC elections. * * @TODO We could try something less drastic first, like disconnecting * and reconnecting to the scheduler, but something is likely going * seriously wrong, so perhaps it's better to just fail as quickly as * possible. */ crmd_exit(CRM_EX_FATAL); } return FALSE; } void controld_stop_sched_timer(void) { if (controld_sched_timer && fsa_pe_ref) { crm_trace("Stopping timer for scheduler reply %s", fsa_pe_ref); } mainloop_timer_stop(controld_sched_timer); } /*! * \internal * \brief Set the scheduler request currently being waited on * * \param[in] ref Request to expect reply to (or NULL for none) * * \note This function takes ownership of \p ref. */ void controld_expect_sched_reply(char *ref) { if (ref) { if (controld_sched_timer == NULL) { controld_sched_timer = mainloop_timer_add("scheduler_reply_timer", SCHED_TIMEOUT_MS, FALSE, controld_sched_timeout, NULL); } mainloop_timer_start(controld_sched_timer); } else { controld_stop_sched_timer(); } free(fsa_pe_ref); fsa_pe_ref = ref; } /*! * \internal * \brief Free the scheduler reply timer */ void controld_free_sched_timer(void) { if (controld_sched_timer != NULL) { mainloop_timer_del(controld_sched_timer); controld_sched_timer = NULL; } } /* A_PE_INVOKE */ void do_pe_invoke(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { if (AM_I_DC == FALSE) { crm_err("Not invoking scheduler because not DC: %s", fsa_action2string(action)); return; } if (!pcmk_is_set(fsa_input_register, R_PE_CONNECTED)) { if (pcmk_is_set(fsa_input_register, R_SHUTDOWN)) { crm_err("Cannot shut down gracefully without the scheduler"); register_fsa_input_before(C_FSA_INTERNAL, I_TERMINATE, NULL); } else { crm_info("Waiting for the scheduler to connect"); crmd_fsa_stall(FALSE); controld_set_fsa_action_flags(A_PE_START); trigger_fsa(); } return; } if (cur_state != S_POLICY_ENGINE) { crm_notice("Not invoking scheduler because in state %s", fsa_state2string(cur_state)); return; } if (!pcmk_is_set(fsa_input_register, R_HAVE_CIB)) { crm_err("Attempted to invoke scheduler without consistent Cluster Information Base!"); /* start the join from scratch */ register_fsa_input_before(C_FSA_INTERNAL, I_ELECTION, NULL); return; } fsa_pe_query = fsa_cib_conn->cmds->query(fsa_cib_conn, NULL, NULL, cib_scope_local); crm_debug("Query %d: Requesting the current CIB: %s", fsa_pe_query, fsa_state2string(fsa_state)); controld_expect_sched_reply(NULL); fsa_register_cib_callback(fsa_pe_query, FALSE, NULL, do_pe_invoke_callback); } static void force_local_option(xmlNode *xml, const char *attr_name, const char *attr_value) { int max = 0; int lpc = 0; const char *xpath_base = NULL; char *xpath_string = NULL; xmlXPathObjectPtr xpathObj = NULL; xpath_base = pcmk_cib_xpath_for(XML_CIB_TAG_CRMCONFIG); if (xpath_base == NULL) { crm_err(XML_CIB_TAG_CRMCONFIG " CIB element not known (bug?)"); return; } xpath_string = crm_strdup_printf("%s//%s//nvpair[@name='%s']", xpath_base, XML_CIB_TAG_PROPSET, attr_name); xpathObj = xpath_search(xml, xpath_string); max = numXpathResults(xpathObj); free(xpath_string); for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpathObj, lpc); crm_trace("Forcing %s/%s = %s", ID(match), attr_name, attr_value); crm_xml_add(match, XML_NVPAIR_ATTR_VALUE, attr_value); } if(max == 0) { xmlNode *configuration = NULL; xmlNode *crm_config = NULL; xmlNode *cluster_property_set = NULL; crm_trace("Creating %s-%s for %s=%s", CIB_OPTIONS_FIRST, attr_name, attr_name, attr_value); configuration = pcmk__xe_match(xml, XML_CIB_TAG_CONFIGURATION, NULL, NULL); if (configuration == NULL) { configuration = create_xml_node(xml, XML_CIB_TAG_CONFIGURATION); } crm_config = pcmk__xe_match(configuration, XML_CIB_TAG_CRMCONFIG, NULL, NULL); if (crm_config == NULL) { crm_config = create_xml_node(configuration, XML_CIB_TAG_CRMCONFIG); } cluster_property_set = pcmk__xe_match(crm_config, XML_CIB_TAG_PROPSET, NULL, NULL); if (cluster_property_set == NULL) { cluster_property_set = create_xml_node(crm_config, XML_CIB_TAG_PROPSET); crm_xml_add(cluster_property_set, XML_ATTR_ID, CIB_OPTIONS_FIRST); } xml = create_xml_node(cluster_property_set, XML_CIB_TAG_NVPAIR); crm_xml_set_id(xml, "%s-%s", CIB_OPTIONS_FIRST, attr_name); crm_xml_add(xml, XML_NVPAIR_ATTR_NAME, attr_name); crm_xml_add(xml, XML_NVPAIR_ATTR_VALUE, attr_value); } freeXpathObject(xpathObj); } static void do_pe_invoke_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { char *ref = NULL; pid_t watchdog = pcmk__locate_sbd(); if (rc != pcmk_ok) { crm_err("Could not retrieve the Cluster Information Base: %s " CRM_XS " rc=%d call=%d", pcmk_strerror(rc), rc, call_id); register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, NULL, NULL, __func__); return; } else if (call_id != fsa_pe_query) { crm_trace("Skipping superseded CIB query: %d (current=%d)", call_id, fsa_pe_query); return; } else if (!AM_I_DC || !pcmk_is_set(fsa_input_register, R_PE_CONNECTED)) { crm_debug("No need to invoke the scheduler anymore"); return; } else if (fsa_state != S_POLICY_ENGINE) { crm_debug("Discarding scheduler request in state: %s", fsa_state2string(fsa_state)); return; /* this callback counts as 1 */ } else if (num_cib_op_callbacks() > 1) { crm_debug("Re-asking for the CIB: %d other peer updates still pending", (num_cib_op_callbacks() - 1)); sleep(1); controld_set_fsa_action_flags(A_PE_INVOKE); trigger_fsa(); return; } CRM_LOG_ASSERT(output != NULL); /* Refresh the remote node cache and the known node cache when the * scheduler is invoked */ pcmk__refresh_node_caches_from_cib(output); crm_xml_add(output, XML_ATTR_DC_UUID, fsa_our_uuid); - crm_xml_add_int(output, XML_ATTR_HAVE_QUORUM, fsa_has_quorum); + pcmk__xe_set_bool_attr(output, XML_ATTR_HAVE_QUORUM, + pcmk_is_set(controld_globals.flags, + controld_has_quorum)); force_local_option(output, XML_ATTR_HAVE_WATCHDOG, pcmk__btoa(watchdog)); if (pcmk_is_set(controld_globals.flags, controld_ever_had_quorum) && !crm_have_quorum) { crm_xml_add_int(output, XML_ATTR_QUORUM_PANIC, 1); } rc = pcmk_rc2legacy(pcmk_schedulerd_api_graph(schedulerd_api, output, &ref)); if (rc < 0) { crm_err("Could not contact the scheduler: %s " CRM_XS " rc=%d", pcmk_strerror(rc), rc); register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, NULL, NULL, __func__); } else { CRM_ASSERT(ref != NULL); controld_expect_sched_reply(ref); - crm_debug("Invoking the scheduler: query=%d, ref=%s, seq=%llu, quorate=%d", - fsa_pe_query, fsa_pe_ref, crm_peer_seq, fsa_has_quorum); + crm_debug("Invoking the scheduler: query=%d, ref=%s, seq=%llu, " + "quorate=%s", fsa_pe_query, fsa_pe_ref, crm_peer_seq, + pcmk__btoa(pcmk_is_set(controld_globals.flags, + controld_has_quorum))); } } diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h index 7e188a87cb..2f74748453 100644 --- a/daemons/controld/controld_utils.h +++ b/daemons/controld/controld_utils.h @@ -1,127 +1,126 @@ /* * Copyright 2004-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef CRMD_UTILS__H # define CRMD_UTILS__H # include # include # include // PCMK__CIB_REQUEST_MODIFY # include // fsa_cib_conn # include # define FAKE_TE_ID "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" # define fsa_cib_update(section, data, options, call_id, user_name) \ if(fsa_cib_conn != NULL) { \ call_id = cib_internal_op( \ fsa_cib_conn, PCMK__CIB_REQUEST_MODIFY, NULL, section, data, \ NULL, options, user_name); \ \ } else { \ crm_err("No CIB manager connection available"); \ } static inline void fsa_cib_anon_update(const char *section, xmlNode *data) { if (fsa_cib_conn == NULL) { crm_err("No CIB connection available"); } else { int opts = cib_scope_local | cib_quorum_override | cib_can_create; fsa_cib_conn->cmds->modify(fsa_cib_conn, section, data, opts); } } static inline void fsa_cib_anon_update_discard_reply(const char *section, xmlNode *data) { if (fsa_cib_conn == NULL) { crm_err("No CIB connection available"); } else { int opts = cib_scope_local | cib_quorum_override | cib_can_create | cib_discard_reply; fsa_cib_conn->cmds->modify(fsa_cib_conn, section, data, opts); } } -extern gboolean fsa_has_quorum; extern bool controld_shutdown_lock_enabled; extern int last_resource_update; enum node_update_flags { node_update_none = 0x0000, node_update_quick = 0x0001, node_update_cluster = 0x0010, node_update_peer = 0x0020, node_update_join = 0x0040, node_update_expected = 0x0100, node_update_all = node_update_cluster|node_update_peer|node_update_join|node_update_expected, }; crm_exit_t crmd_exit(crm_exit_t exit_code); _Noreturn void crmd_fast_exit(crm_exit_t exit_code); void controld_shutdown_schedulerd_ipc(void); void controld_stop_sched_timer(void); void controld_free_sched_timer(void); void controld_expect_sched_reply(char *ref); void fsa_dump_actions(uint64_t action, const char *text); void fsa_dump_inputs(int log_level, const char *text, long long input_register); gboolean update_dc(xmlNode * msg); void crm_update_peer_join(const char *source, crm_node_t * node, enum crm_join_phase phase); xmlNode *create_node_state_update(crm_node_t *node, int flags, xmlNode *parent, const char *source); void populate_cib_nodes(enum node_update_flags flags, const char *source); void crm_update_quorum(gboolean quorum, gboolean force_update); void controld_close_attrd_ipc(void); void update_attrd(const char *host, const char *name, const char *value, const char *user_name, gboolean is_remote_node); void update_attrd_list(GList *attrs, uint32_t opts); void update_attrd_remote_node_removed(const char *host, const char *user_name); void update_attrd_clear_failures(const char *host, const char *rsc, const char *op, const char *interval_spec, gboolean is_remote_node); int crmd_join_phase_count(enum crm_join_phase phase); void crmd_join_phase_log(int level); void crmd_peer_down(crm_node_t *peer, bool full); unsigned int cib_op_timeout(void); bool feature_set_compatible(const char *dc_version, const char *join_version); bool controld_action_is_recordable(const char *action); // Subsections of node_state enum controld_section_e { controld_section_lrm, controld_section_lrm_unlocked, controld_section_attrs, controld_section_all, controld_section_all_unlocked }; void controld_delete_node_state(const char *uname, enum controld_section_e section, int options); int controld_delete_resource_history(const char *rsc_id, const char *node, const char *user_name, int call_options); const char *get_node_id(xmlNode *lrm_rsc_op); /* Convenience macro for registering a CIB callback * (assumes that data can be freed with free()) */ # define fsa_register_cib_callback(id, flag, data, fn) do { \ CRM_ASSERT(fsa_cib_conn); \ fsa_cib_conn->cmds->register_callback_full( \ fsa_cib_conn, id, cib_op_timeout(), \ flag, data, #fn, fn, free); \ } while(0) #endif diff --git a/daemons/controld/pacemaker-controld.h b/daemons/controld/pacemaker-controld.h index 76e87d09fb..99b388ea3f 100644 --- a/daemons/controld/pacemaker-controld.h +++ b/daemons/controld/pacemaker-controld.h @@ -1,63 +1,66 @@ /* * Copyright 2004-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef CRMD__H # define CRMD__H #include #include #include #include #include #include #include #include #include #include #include #include typedef struct { // Booleans //! Group of \p controld_flags values uint32_t flags; } controld_globals_t; extern GMainLoop *crmd_mainloop; extern bool no_quorum_suicide_escalation; extern controld_globals_t controld_globals; /*! * \internal * \enum controld_flags * \brief Bit flags to store various controller state and configuration info */ enum controld_flags { //! The DC left in a membership change that is being processed controld_dc_left = (1 << 0), //! The FSA is stalled waiting for further input controld_fsa_is_stalled = (1 << 1), //! The local node has been in a quorate partition at some point controld_ever_had_quorum = (1 << 2), + + //! The local node is currently in a quorate partition + controld_has_quorum = (1 << 3), }; void do_cib_updated(const char *event, xmlNode * msg); void do_cib_replaced(const char *event, xmlNode * msg); void crmd_metadata(void); void controld_election_init(const char *uname); void controld_remove_voter(const char *uname); void controld_election_fini(void); void controld_set_election_period(const char *value); void controld_stop_current_election_timeout(void); void controld_disconnect_cib_manager(void); #endif