diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c index d578adc932..e3b68f6906 100644 --- a/daemons/controld/controld_callbacks.c +++ b/daemons/controld/controld_callbacks.c @@ -1,367 +1,369 @@ /* * Copyright 2004-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include /* From join_dc... */ extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); void crmd_ha_msg_filter(xmlNode * msg) { if (AM_I_DC) { const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); if (pcmk__str_eq(sys_from, CRM_SYSTEM_DC, pcmk__str_casei)) { const char *from = crm_element_value(msg, F_ORIG); if (!pcmk__str_eq(from, controld_globals.our_nodename, pcmk__str_casei)) { int level = LOG_INFO; const char *op = crm_element_value(msg, F_CRM_TASK); /* make sure the election happens NOW */ if (controld_globals.fsa_state != S_ELECTION) { ha_msg_input_t new_input; level = LOG_WARNING; new_input.msg = msg; register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL, &new_input, __func__); } do_crm_log(level, "Another DC detected: %s (op=%s)", from, op); goto done; } } } else { const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO); if (pcmk__str_eq(sys_to, CRM_SYSTEM_DC, pcmk__str_casei)) { return; } } /* crm_log_xml_trace(msg, "HA[inbound]"); */ route_message(C_HA_MESSAGE, msg); done: controld_trigger_fsa(); } /*! * \internal * \brief Check whether a node is online * * \param[in] node Node to check * * \retval -1 if completely dead * \retval 0 if partially alive * \retval 1 if completely alive */ static int node_alive(const crm_node_t *node) { if (pcmk_is_set(node->flags, crm_remote_node)) { // Pacemaker Remote nodes can't be partially alive return pcmk__str_eq(node->state, CRM_NODE_MEMBER, pcmk__str_casei) ? 1: -1; } else if (crm_is_peer_active(node)) { // Completely up cluster node: both cluster member and peer return 1; } else if (!pcmk_is_set(node->processes, crm_get_cluster_proc()) && !pcmk__str_eq(node->state, CRM_NODE_MEMBER, pcmk__str_casei)) { // Completely down cluster node: neither cluster member nor peer return -1; } // Partially up cluster node: only cluster member or only peer return 0; } #define state_text(state) ((state)? (const char *)(state) : "in unknown state") void peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data) { uint32_t old = 0; bool appeared = FALSE; bool is_remote = pcmk_is_set(node->flags, crm_remote_node); + controld_node_pending_timer(node); + /* The controller waits to receive some information from the membership * layer before declaring itself operational. If this is being called for a * cluster node, indicate that we have it. 
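 *
 * The controld_node_pending_timer() call above starts or cancels this
 * node's "pending" timer on every membership event, so that the DC
 * eventually reacts (by aborting the current transition) if a node that
 * is a cluster-layer member never joins the controller's process group
 * within the configured node pending timeout.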
*/ if (!is_remote) { controld_set_fsa_input_flags(R_PEER_DATA); } if (type == crm_status_processes && pcmk_is_set(node->processes, crm_get_cluster_proc()) && !AM_I_DC && !is_remote) { /* * This is a hack until we can send to a nodeid and/or we fix node name lookups * These messages are ignored in crmd_ha_msg_filter() */ xmlNode *query = create_request(CRM_OP_HELLO, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); crm_debug("Sending hello to node %u so that it learns our node name", node->id); send_cluster_message(node, crm_msg_crmd, query, FALSE); free_xml(query); } if (node->uname == NULL) { return; } switch (type) { case crm_status_uname: /* If we've never seen the node, then it also won't be in the status section */ crm_info("%s node %s is now %s", (is_remote? "Remote" : "Cluster"), node->uname, state_text(node->state)); return; case crm_status_nstate: /* This callback should not be called unless the state actually * changed, but here's a failsafe just in case. */ CRM_CHECK(!pcmk__str_eq(data, node->state, pcmk__str_casei), return); crm_info("%s node %s is now %s (was %s)", (is_remote? "Remote" : "Cluster"), node->uname, state_text(node->state), state_text(data)); if (pcmk__str_eq(CRM_NODE_MEMBER, node->state, pcmk__str_casei)) { appeared = TRUE; if (!is_remote) { remove_stonith_cleanup(node->uname); } } else { controld_remove_failed_sync_node(node->uname); controld_remove_voter(node->uname); } crmd_alert_node_event(node); break; case crm_status_processes: CRM_CHECK(data != NULL, return); old = *(const uint32_t *)data; appeared = pcmk_is_set(node->processes, crm_get_cluster_proc()); { const char *dc_s = controld_globals.dc_name; if ((dc_s == NULL) && AM_I_DC) { dc_s = "true"; } crm_info("Node %s is %s a peer " CRM_XS " DC=%s old=%#07x new=%#07x", node->uname, (appeared? "now" : "no longer"), pcmk__s(dc_s, ""), old, node->processes); } if (!pcmk_is_set((node->processes ^ old), crm_get_cluster_proc())) { /* Peer status did not change. This should not be possible, * since we don't track process flags other than peer status. */ crm_trace("Process flag %#7x did not change from %#7x to %#7x", crm_get_cluster_proc(), old, node->processes); return; } if (!appeared) { node->peer_lost = time(NULL); controld_remove_failed_sync_node(node->uname); controld_remove_voter(node->uname); } if (!pcmk_is_set(controld_globals.fsa_input_register, R_CIB_CONNECTED)) { crm_trace("Ignoring peer status change because not connected to CIB"); return; } else if (controld_globals.fsa_state == S_STOPPING) { crm_trace("Ignoring peer status change because stopping"); return; } if (!appeared && pcmk__str_eq(node->uname, controld_globals.our_nodename, pcmk__str_casei)) { /* Did we get evicted? */ crm_notice("Our peer connection failed"); register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ERROR, NULL); } else if (pcmk__str_eq(node->uname, controld_globals.dc_name, pcmk__str_casei) && !crm_is_peer_active(node)) { /* Did the DC leave us? */ crm_notice("Our peer on the DC (%s) is dead", controld_globals.dc_name); register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL); /* @COMPAT DC < 1.1.13: If a DC shuts down normally, we don't * want to fence it. Newer DCs will send their shutdown request * to all peers, who will update the DC's expected state to * down, thus avoiding fencing. We can safely erase the DC's * transient attributes when it leaves in that case. However, * the only way to avoid fencing older DCs is to leave the * transient attributes intact until it rejoins. 
*/ if (compare_version(controld_globals.dc_version, "3.0.9") > 0) { controld_delete_node_state(node->uname, controld_section_attrs, cib_scope_local); } } else if (AM_I_DC || pcmk_is_set(controld_globals.flags, controld_dc_left) || (controld_globals.dc_name == NULL)) { /* This only needs to be done once, so normally the DC should do * it. However if there is no DC, every node must do it, since * there is no other way to ensure some one node does it. */ if (appeared) { te_trigger_stonith_history_sync(FALSE); } else { controld_delete_node_state(node->uname, controld_section_attrs, cib_scope_local); } } break; } if (AM_I_DC) { xmlNode *update = NULL; int flags = node_update_peer; int alive = node_alive(node); pcmk__graph_action_t *down = match_down_event(node->uuid); crm_trace("Alive=%d, appeared=%d, down=%d", alive, appeared, (down? down->id : -1)); if (appeared && (alive > 0) && !is_remote) { register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } if (down) { const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK); if (pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) { /* tengine_stonith_callback() confirms fence actions */ crm_trace("Updating CIB %s fencer reported fencing of %s complete", (pcmk_is_set(down->flags, pcmk__graph_action_confirmed)? "after" : "before"), node->uname); } else if (!appeared && pcmk__str_eq(task, CRM_OP_SHUTDOWN, pcmk__str_casei)) { // Shutdown actions are immediately confirmed (i.e. no_wait) if (!is_remote) { flags |= node_update_join | node_update_expected; crmd_peer_down(node, FALSE); check_join_state(controld_globals.fsa_state, __func__); } if (alive >= 0) { crm_info("%s of peer %s is in progress " CRM_XS " action=%d", task, node->uname, down->id); } else { crm_notice("%s of peer %s is complete " CRM_XS " action=%d", task, node->uname, down->id); pcmk__update_graph(controld_globals.transition_graph, down); trigger_graph(); } } else { crm_trace("Node %s is %s, was expected to %s (op %d)", node->uname, ((alive > 0)? "alive" : ((alive < 0)? "dead" : "partially alive")), task, down->id); } } else if (appeared == FALSE) { if ((controld_globals.transition_graph == NULL) || (controld_globals.transition_graph->id == -1)) { crm_info("Stonith/shutdown of node %s is unknown to the " "current DC", node->uname); } else { crm_warn("Stonith/shutdown of node %s was not expected", node->uname); } if (!is_remote) { crm_update_peer_join(__func__, node, crm_join_none); check_join_state(controld_globals.fsa_state, __func__); } abort_transition(INFINITY, pcmk__graph_restart, "Node failure", NULL); fail_incompletable_actions(controld_globals.transition_graph, node->uuid); } else { crm_trace("Node %s came up, was not expected to be down", node->uname); } if (is_remote) { /* A pacemaker_remote node won't have its cluster status updated * in the CIB by membership-layer callbacks, so do it here. 
*/ flags |= node_update_cluster; /* Trigger resource placement on newly integrated nodes */ if (appeared) { abort_transition(INFINITY, pcmk__graph_restart, "Pacemaker Remote node integrated", NULL); } } /* Update the CIB node state */ update = create_node_state_update(node, flags, NULL, __func__); if (update == NULL) { crm_debug("Node state update not yet possible for %s", node->uname); } else { fsa_cib_anon_update(XML_CIB_TAG_STATUS, update); } free_xml(update); } controld_trigger_fsa(); } gboolean crm_fsa_trigger(gpointer user_data) { crm_trace("Invoked (queue len: %d)", g_list_length(controld_globals.fsa_message_queue)); s_crmd_fsa(C_FSA_INTERNAL); crm_trace("Exited (queue len: %d)", g_list_length(controld_globals.fsa_message_queue)); return TRUE; } diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c index b78bf9c3e7..bfcd88f7b7 100644 --- a/daemons/controld/controld_control.c +++ b/daemons/controld/controld_control.c @@ -1,861 +1,862 @@ /* * Copyright 2004-2023 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include static qb_ipcs_service_t *ipcs = NULL; static crm_trigger_t *config_read_trigger = NULL; #if SUPPORT_COROSYNC extern gboolean crm_connect_corosync(crm_cluster_t * cluster); #endif void crm_shutdown(int nsig); static gboolean crm_read_options(gpointer user_data); /* A_HA_CONNECT */ void do_ha_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean registered = FALSE; static crm_cluster_t *cluster = NULL; if (cluster == NULL) { cluster = pcmk_cluster_new(); } if (action & A_HA_DISCONNECT) { crm_cluster_disconnect(cluster); crm_info("Disconnected from the cluster"); controld_set_fsa_input_flags(R_HA_DISCONNECTED); } if (action & A_HA_CONNECT) { crm_set_status_callback(&peer_update_callback); crm_set_autoreap(FALSE); #if SUPPORT_COROSYNC if (is_corosync_cluster()) { registered = crm_connect_corosync(cluster); } #endif // SUPPORT_COROSYNC if (registered) { controld_election_init(cluster->uname); controld_globals.our_nodename = cluster->uname; controld_globals.our_uuid = cluster->uuid; if(cluster->uuid == NULL) { crm_err("Could not obtain local uuid"); registered = FALSE; } } if (!registered) { controld_set_fsa_input_flags(R_HA_DISCONNECTED); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } populate_cib_nodes(node_update_none, __func__); controld_clear_fsa_input_flags(R_HA_DISCONNECTED); crm_info("Connected to the cluster"); } if (action & ~(A_HA_CONNECT | A_HA_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __func__); } } /* A_SHUTDOWN */ void do_shutdown(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { /* just in case */ controld_set_fsa_input_flags(R_SHUTDOWN); controld_disconnect_fencer(FALSE); } /* A_SHUTDOWN_REQ */ void do_shutdown_req(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { xmlNode *msg = NULL; controld_set_fsa_input_flags(R_SHUTDOWN); //controld_set_fsa_input_flags(R_STAYDOWN); crm_info("Sending shutdown request to all peers (DC is %s)", 
pcmk__s(controld_globals.dc_name, "not set")); msg = create_request(CRM_OP_SHUTDOWN_REQ, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); if (send_cluster_message(NULL, crm_msg_crmd, msg, TRUE) == FALSE) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } free_xml(msg); } void crmd_fast_exit(crm_exit_t exit_code) { if (pcmk_is_set(controld_globals.fsa_input_register, R_STAYDOWN)) { crm_warn("Inhibiting respawn "CRM_XS" remapping exit code %d to %d", exit_code, CRM_EX_FATAL); exit_code = CRM_EX_FATAL; } else if ((exit_code == CRM_EX_OK) && pcmk_is_set(controld_globals.fsa_input_register, R_IN_RECOVERY)) { crm_err("Could not recover from internal error"); exit_code = CRM_EX_ERROR; } if (controld_globals.logger_out != NULL) { controld_globals.logger_out->finish(controld_globals.logger_out, exit_code, true, NULL); pcmk__output_free(controld_globals.logger_out); controld_globals.logger_out = NULL; } crm_exit(exit_code); } crm_exit_t crmd_exit(crm_exit_t exit_code) { GMainLoop *mloop = controld_globals.mainloop; static bool in_progress = FALSE; if (in_progress && (exit_code == CRM_EX_OK)) { crm_debug("Exit is already in progress"); return exit_code; } else if(in_progress) { crm_notice("Error during shutdown process, exiting now with status %d (%s)", exit_code, crm_exit_str(exit_code)); crm_write_blackbox(SIGTRAP, NULL); crmd_fast_exit(exit_code); } in_progress = TRUE; crm_trace("Preparing to exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); /* Suppress secondary errors resulting from us disconnecting everything */ controld_set_fsa_input_flags(R_HA_DISCONNECTED); /* Close all IPC servers and clients to ensure any and all shared memory files are cleaned up */ if(ipcs) { crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; } controld_close_attrd_ipc(); controld_shutdown_schedulerd_ipc(); controld_disconnect_fencer(TRUE); if ((exit_code == CRM_EX_OK) && (controld_globals.mainloop == NULL)) { crm_debug("No mainloop detected"); exit_code = CRM_EX_ERROR; } /* On an error, just get out. * * Otherwise, make the effort to have mainloop exit gracefully so * that it (mostly) cleans up after itself and valgrind has less * to report on - allowing real errors stand out */ if (exit_code != CRM_EX_OK) { crm_notice("Forcing immediate exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); crm_write_blackbox(SIGTRAP, NULL); crmd_fast_exit(exit_code); } /* Clean up as much memory as possible for valgrind */ for (GList *iter = controld_globals.fsa_message_queue; iter != NULL; iter = iter->next) { fsa_data_t *fsa_data = (fsa_data_t *) iter->data; crm_info("Dropping %s: [ state=%s cause=%s origin=%s ]", fsa_input2string(fsa_data->fsa_input), fsa_state2string(controld_globals.fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); delete_fsa_input(fsa_data); } controld_clear_fsa_input_flags(R_MEMBERSHIP); g_list_free(controld_globals.fsa_message_queue); controld_globals.fsa_message_queue = NULL; + controld_free_node_pending_timers(); controld_election_fini(); /* Tear down the CIB manager connection, but don't free it yet -- it could * be used when we drain the mainloop later. 
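 *
 * (The controld_free_node_pending_timers() call above destroys the node
 * pending-timer table; its value destructor cancels any timers that are
 * still armed, so no stray timeout callbacks can fire during shutdown.)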
*/ controld_disconnect_cib_manager(); verify_stopped(controld_globals.fsa_state, LOG_WARNING); controld_clear_fsa_input_flags(R_LRM_CONNECTED); lrm_state_destroy_all(); mainloop_destroy_trigger(config_read_trigger); config_read_trigger = NULL; controld_destroy_fsa_trigger(); controld_destroy_transition_trigger(); pcmk__client_cleanup(); crm_peer_destroy(); controld_free_fsa_timers(); te_cleanup_stonith_history_sync(NULL, TRUE); controld_free_sched_timer(); free(controld_globals.our_nodename); controld_globals.our_nodename = NULL; free(controld_globals.our_uuid); controld_globals.our_uuid = NULL; free(controld_globals.dc_name); controld_globals.dc_name = NULL; free(controld_globals.dc_version); controld_globals.dc_version = NULL; free(controld_globals.cluster_name); controld_globals.cluster_name = NULL; free(controld_globals.te_uuid); controld_globals.te_uuid = NULL; free_max_generation(); controld_destroy_cib_replacements_table(); controld_destroy_failed_sync_table(); controld_destroy_outside_events_table(); mainloop_destroy_signal(SIGPIPE); mainloop_destroy_signal(SIGUSR1); mainloop_destroy_signal(SIGTERM); mainloop_destroy_signal(SIGTRAP); /* leave SIGCHLD engaged as we might still want to drain some service-actions */ if (mloop) { GMainContext *ctx = g_main_loop_get_context(controld_globals.mainloop); /* Don't re-enter this block */ controld_globals.mainloop = NULL; /* no signals on final draining anymore */ mainloop_destroy_signal(SIGCHLD); crm_trace("Draining mainloop %d %d", g_main_loop_is_running(mloop), g_main_context_pending(ctx)); { int lpc = 0; while((g_main_context_pending(ctx) && lpc < 10)) { lpc++; crm_trace("Iteration %d", lpc); g_main_context_dispatch(ctx); } } crm_trace("Closing mainloop %d %d", g_main_loop_is_running(mloop), g_main_context_pending(ctx)); g_main_loop_quit(mloop); /* Won't do anything yet, since we're inside it now */ g_main_loop_unref(mloop); } else { mainloop_destroy_signal(SIGCHLD); } cib_delete(controld_globals.cib_conn); controld_globals.cib_conn = NULL; throttle_fini(); /* Graceful */ crm_trace("Done preparing for exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); return exit_code; } /* A_EXIT_0, A_EXIT_1 */ void do_exit(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_exit_t exit_code = CRM_EX_OK; int log_level = LOG_INFO; const char *exit_type = "gracefully"; if (action & A_EXIT_1) { log_level = LOG_ERR; exit_type = "forcefully"; exit_code = CRM_EX_ERROR; } verify_stopped(cur_state, LOG_ERR); do_crm_log(log_level, "Performing %s - %s exiting the controller", fsa_action2string(action), exit_type); crm_info("[%s] stopped (%d)", crm_system_name, exit_code); crmd_exit(exit_code); } static void sigpipe_ignore(int nsig) { return; } /* A_STARTUP */ void do_startup(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_debug("Registering Signal Handlers"); mainloop_add_signal(SIGTERM, crm_shutdown); mainloop_add_signal(SIGPIPE, sigpipe_ignore); config_read_trigger = mainloop_add_trigger(G_PRIORITY_HIGH, crm_read_options, NULL); controld_init_fsa_trigger(); controld_init_transition_trigger(); crm_debug("Creating CIB manager and executor objects"); controld_globals.cib_conn = cib_new(); lrm_state_init_local(); if (controld_init_fsa_timers() == FALSE) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } // \return libqb error code (0 on success, -errno on error) static 
int32_t accept_controller_client(qb_ipcs_connection_t *c, uid_t uid, gid_t gid) { crm_trace("Accepting new IPC client connection"); if (pcmk__new_client(c, uid, gid) == NULL) { return -EIO; } return 0; } // \return libqb error code (0 on success, -errno on error) static int32_t dispatch_controller_ipc(qb_ipcs_connection_t * c, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; pcmk__client_t *client = pcmk__find_client(c); xmlNode *msg = pcmk__client_data2xml(client, data, &id, &flags); if (msg == NULL) { pcmk__ipc_send_ack(client, id, flags, "ack", NULL, CRM_EX_PROTOCOL); return 0; } pcmk__ipc_send_ack(client, id, flags, "ack", NULL, CRM_EX_INDETERMINATE); CRM_ASSERT(client->user != NULL); pcmk__update_acl_user(msg, F_CRM_USER, client->user); crm_xml_add(msg, F_CRM_SYS_FROM, client->id); if (controld_authorize_ipc_message(msg, client, NULL)) { crm_trace("Processing IPC message from client %s", pcmk__client_name(client)); route_message(C_IPC_MESSAGE, msg); } controld_trigger_fsa(); free_xml(msg); return 0; } static int32_t ipc_client_disconnected(qb_ipcs_connection_t *c) { pcmk__client_t *client = pcmk__find_client(c); if (client) { crm_trace("Disconnecting %sregistered client %s (%p/%p)", (client->userdata? "" : "un"), pcmk__client_name(client), c, client); free(client->userdata); pcmk__free_client(client); controld_trigger_fsa(); } return 0; } static void ipc_connection_destroyed(qb_ipcs_connection_t *c) { crm_trace("Connection %p", c); ipc_client_disconnected(c); } /* A_STOP */ void do_stop(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } /* A_STARTED */ void do_started(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { static struct qb_ipcs_service_handlers crmd_callbacks = { .connection_accept = accept_controller_client, .connection_created = NULL, .msg_process = dispatch_controller_ipc, .connection_closed = ipc_client_disconnected, .connection_destroyed = ipc_connection_destroyed }; if (cur_state != S_STARTING) { crm_err("Start cancelled... 
%s", fsa_state2string(cur_state)); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_MEMBERSHIP)) { crm_info("Delaying start, no membership data (%.16llx)", R_MEMBERSHIP); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_LRM_CONNECTED)) { crm_info("Delaying start, not connected to executor (%.16llx)", R_LRM_CONNECTED); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_CIB_CONNECTED)) { crm_info("Delaying start, CIB not connected (%.16llx)", R_CIB_CONNECTED); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_READ_CONFIG)) { crm_info("Delaying start, Config not read (%.16llx)", R_READ_CONFIG); crmd_fsa_stall(TRUE); return; } else if (!pcmk_is_set(controld_globals.fsa_input_register, R_PEER_DATA)) { crm_info("Delaying start, No peer data (%.16llx)", R_PEER_DATA); crmd_fsa_stall(TRUE); return; } crm_debug("Init server comms"); ipcs = pcmk__serve_controld_ipc(&crmd_callbacks); if (ipcs == NULL) { crm_err("Failed to create IPC server: shutting down and inhibiting respawn"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } else { crm_notice("Pacemaker controller successfully started and accepting connections"); } controld_trigger_fencer_connect(); controld_clear_fsa_input_flags(R_STARTING); register_fsa_input(msg_data->fsa_cause, I_PENDING, NULL); } /* A_RECOVER */ void do_recover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { controld_set_fsa_input_flags(R_IN_RECOVERY); crm_warn("Fast-tracking shutdown in response to errors"); register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } static pcmk__cluster_option_t controller_options[] = { /* name, old name, type, allowed values, * default value, validator, * short description, * long description */ { "dc-version", NULL, "string", NULL, PCMK__VALUE_NONE, NULL, N_("Pacemaker version on cluster node elected Designated Controller (DC)"), N_("Includes a hash which identifies the exact changeset the code was " "built from. Used for diagnostic purposes.") }, { "cluster-infrastructure", NULL, "string", NULL, "corosync", NULL, N_("The messaging stack on which Pacemaker is currently running"), N_("Used for informational and diagnostic purposes.") }, { "cluster-name", NULL, "string", NULL, NULL, NULL, N_("An arbitrary name for the cluster"), N_("This optional value is mostly for users' convenience as desired " "in administration, but may also be used in Pacemaker " "configuration rules via the #cluster-name node attribute, and " "by higher-level tools and resource agents.") }, { XML_CONFIG_ATTR_DC_DEADTIME, NULL, "time", NULL, "20s", pcmk__valid_interval_spec, N_("How long to wait for a response from other nodes during start-up"), N_("The optimal value will depend on the speed and load of your network " "and the type of switches used.") }, { XML_CONFIG_ATTR_RECHECK, NULL, "time", N_("Zero disables polling, while positive values are an interval in seconds" "(unless other units are specified, for example \"5min\")"), "15min", pcmk__valid_interval_spec, N_("Polling interval to recheck cluster state and evaluate rules " "with date specifications"), N_("Pacemaker is primarily event-driven, and looks ahead to know when to " "recheck cluster state for failure timeouts and most time-based " "rules. 
However, it will also recheck the cluster after this " "amount of inactivity, to evaluate rules with date specifications " "and serve as a fail-safe for certain types of scheduler bugs.") }, { "load-threshold", NULL, "percentage", NULL, "80%", pcmk__valid_percentage, N_("Maximum amount of system load that should be used by cluster nodes"), N_("The cluster will slow down its recovery process when the amount of " "system resources used (currently CPU) approaches this limit"), }, { "node-action-limit", NULL, "integer", NULL, "0", pcmk__valid_number, N_("Maximum number of jobs that can be scheduled per node " "(defaults to 2x cores)") }, { XML_CONFIG_ATTR_FENCE_REACTION, NULL, "string", NULL, "stop", NULL, N_("How a cluster node should react if notified of its own fencing"), N_("A cluster node may receive notification of its own fencing if fencing " "is misconfigured, or if fabric fencing is in use that doesn't cut " "cluster communication. Allowed values are \"stop\" to attempt to " "immediately stop Pacemaker and stay stopped, or \"panic\" to attempt " "to immediately reboot the local node, falling back to stop on failure.") }, { XML_CONFIG_ATTR_ELECTION_FAIL, NULL, "time", NULL, "2min", pcmk__valid_interval_spec, "*** Advanced Use Only ***", N_("Declare an election failed if it is not decided within this much " "time. If you need to adjust this value, it probably indicates " "the presence of a bug.") }, { XML_CONFIG_ATTR_FORCE_QUIT, NULL, "time", NULL, "20min", pcmk__valid_interval_spec, "*** Advanced Use Only ***", N_("Exit immediately if shutdown does not complete within this much " "time. If you need to adjust this value, it probably indicates " "the presence of a bug.") }, { "join-integration-timeout", "crmd-integration-timeout", "time", NULL, "3min", pcmk__valid_interval_spec, "*** Advanced Use Only ***", N_("If you need to adjust this value, it probably indicates " "the presence of a bug.") }, { "join-finalization-timeout", "crmd-finalization-timeout", "time", NULL, "30min", pcmk__valid_interval_spec, "*** Advanced Use Only ***", N_("If you need to adjust this value, it probably indicates " "the presence of a bug.") }, { "transition-delay", "crmd-transition-delay", "time", NULL, "0s", pcmk__valid_interval_spec, N_("*** Advanced Use Only *** Enabling this option will slow down " "cluster recovery under all conditions"), N_("Delay cluster recovery for this much time to allow for additional " "events to occur. Useful if your configuration is sensitive to " "the order in which ping updates arrive.") }, { "stonith-watchdog-timeout", NULL, "time", NULL, "0", controld_verify_stonith_watchdog_timeout, N_("How long before nodes can be assumed to be safely down when " "watchdog-based self-fencing via SBD is in use"), N_("If this is set to a positive value, lost nodes are assumed to " "self-fence using watchdog-based SBD within this much time. This " "does not require a fencing resource to be explicitly configured, " "though a fence_watchdog resource can be configured, to limit use " "to specific nodes. If this is set to 0 (the default), the cluster " "will never assume watchdog-based self-fencing. If this is set to a " "negative value, the cluster will use twice the local value of the " "`SBD_WATCHDOG_TIMEOUT` environment variable if that is positive, " "or otherwise treat this as 0. 
WARNING: When used, this timeout " "must be larger than `SBD_WATCHDOG_TIMEOUT` on all nodes that use " "watchdog-based SBD, and Pacemaker will refuse to start on any of " "those nodes where this is not true for the local value or SBD is " "not active. When this is set to a negative value, " "`SBD_WATCHDOG_TIMEOUT` must be set to the same value on all nodes " "that use SBD, otherwise data corruption or loss could occur.") }, { "stonith-max-attempts", NULL, "integer", NULL, "10", pcmk__valid_positive_number, N_("How many times fencing can fail before it will no longer be " "immediately re-attempted on a target") }, // Already documented in libpe_status (other values must be kept identical) { "no-quorum-policy", NULL, "select", "stop, freeze, ignore, demote, suicide", "stop", pcmk__valid_quorum, N_("What to do when the cluster does not have quorum"), NULL }, { XML_CONFIG_ATTR_SHUTDOWN_LOCK, NULL, "boolean", NULL, "false", pcmk__valid_boolean, N_("Whether to lock resources to a cleanly shut down node"), N_("When true, resources active on a node when it is cleanly shut down " "are kept \"locked\" to that node (not allowed to run elsewhere) " "until they start again on that node after it rejoins (or for at " "most shutdown-lock-limit, if set). Stonith resources and " "Pacemaker Remote connections are never locked. Clone and bundle " "instances and the promoted role of promotable clones are " "currently never locked, though support could be added in a future " "release.") }, { XML_CONFIG_ATTR_SHUTDOWN_LOCK_LIMIT, NULL, "time", NULL, "0", pcmk__valid_interval_spec, N_("Do not lock resources to a cleanly shut down node longer than " "this"), N_("If shutdown-lock is true and this is set to a nonzero time " "duration, shutdown locks will expire after this much time has " "passed since the shutdown was initiated, even if the node has not " "rejoined.") }, }; void crmd_metadata(void) { const char *desc_short = "Pacemaker controller options"; const char *desc_long = "Cluster options used by Pacemaker's controller"; gchar *s = pcmk__format_option_metadata("pacemaker-controld", desc_short, desc_long, controller_options, PCMK__NELEM(controller_options)); printf("%s", s); g_free(s); } static void config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { const char *value = NULL; GHashTable *config_hash = NULL; crm_time_t *now = crm_time_new(NULL); xmlNode *crmconfig = NULL; xmlNode *alerts = NULL; if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query resulted in an error: %s", pcmk_strerror(rc)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); if (rc == -EACCES || rc == -pcmk_err_schema_validation) { crm_err("The cluster is mis-configured - shutting down and staying down"); controld_set_fsa_input_flags(R_STAYDOWN); } goto bail; } crmconfig = output; if ((crmconfig) && (crm_element_name(crmconfig)) && (strcmp(crm_element_name(crmconfig), XML_CIB_TAG_CRMCONFIG) != 0)) { crmconfig = first_named_child(crmconfig, XML_CIB_TAG_CRMCONFIG); } if (!crmconfig) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query for " XML_CIB_TAG_CRMCONFIG " section failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); goto bail; } crm_debug("Call %d : Parsing CIB options", call_id); config_hash = pcmk__strkey_table(free, free); pe_unpack_nvpairs(crmconfig, crmconfig, XML_CIB_TAG_PROPSET, NULL, config_hash, CIB_OPTIONS_FIRST, FALSE, now, NULL); // Validate all options, and use defaults if not already present in hash pcmk__validate_cluster_options(config_hash, 
controller_options, PCMK__NELEM(controller_options)); value = g_hash_table_lookup(config_hash, "no-quorum-policy"); if (pcmk__str_eq(value, "suicide", pcmk__str_casei) && pcmk__locate_sbd()) { controld_set_global_flags(controld_no_quorum_suicide); } value = g_hash_table_lookup(config_hash, XML_CONFIG_ATTR_SHUTDOWN_LOCK); if (crm_is_true(value)) { controld_set_global_flags(controld_shutdown_lock_enabled); } else { controld_clear_global_flags(controld_shutdown_lock_enabled); } value = g_hash_table_lookup(config_hash, XML_CONFIG_ATTR_SHUTDOWN_LOCK_LIMIT); controld_globals.shutdown_lock_limit = crm_parse_interval_spec(value) / 1000; value = g_hash_table_lookup(config_hash, XML_CONFIG_ATTR_NODE_PENDING_TIMEOUT); controld_globals.node_pending_timeout = crm_parse_interval_spec(value) / 1000; value = g_hash_table_lookup(config_hash, "cluster-name"); pcmk__str_update(&(controld_globals.cluster_name), value); // Let subcomponents initialize their own static variables controld_configure_election(config_hash); controld_configure_fencing(config_hash); controld_configure_fsa_timers(config_hash); controld_configure_throttle(config_hash); alerts = first_named_child(output, XML_CIB_TAG_ALERTS); crmd_unpack_alerts(alerts); controld_set_fsa_input_flags(R_READ_CONFIG); controld_trigger_fsa(); g_hash_table_destroy(config_hash); bail: crm_time_free(now); } /*! * \internal * \brief Trigger read and processing of the configuration * * \param[in] fn Calling function name * \param[in] line Line number where call occurred */ void controld_trigger_config_as(const char *fn, int line) { if (config_read_trigger != NULL) { crm_trace("%s:%d - Triggered config processing", fn, line); mainloop_set_trigger(config_read_trigger); } } gboolean crm_read_options(gpointer user_data) { cib_t *cib_conn = controld_globals.cib_conn; int call_id = cib_conn->cmds->query(cib_conn, "//" XML_CIB_TAG_CRMCONFIG " | //" XML_CIB_TAG_ALERTS, NULL, cib_xpath|cib_scope_local); fsa_register_cib_callback(call_id, NULL, config_query_callback); crm_trace("Querying the CIB... call %d", call_id); return TRUE; } /* A_READCONFIG */ void do_read_config(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { throttle_init(); controld_trigger_config(); } void crm_shutdown(int nsig) { const char *value = NULL; guint default_period_ms = 0; if ((controld_globals.mainloop == NULL) || !g_main_loop_is_running(controld_globals.mainloop)) { crmd_exit(CRM_EX_OK); return; } if (pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) { crm_err("Escalating shutdown"); register_fsa_input_before(C_SHUTDOWN, I_ERROR, NULL); return; } controld_set_fsa_input_flags(R_SHUTDOWN); register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL); /* If shutdown timer doesn't have a period set, use the default * * @TODO: Evaluate whether this is still necessary. As long as * config_query_callback() has been run at least once, it doesn't look like * anything could have changed the timer period since then. 
*/ value = pcmk__cluster_option(NULL, controller_options, PCMK__NELEM(controller_options), XML_CONFIG_ATTR_FORCE_QUIT); default_period_ms = crm_parse_interval_spec(value); controld_shutdown_start_countdown(default_period_ms); } diff --git a/daemons/controld/controld_te_utils.c b/daemons/controld/controld_te_utils.c index 9009778171..0c1b8acf75 100644 --- a/daemons/controld/controld_te_utils.c +++ b/daemons/controld/controld_te_utils.c @@ -1,369 +1,501 @@ /* * Copyright 2004-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include //! Triggers transition graph processing static crm_trigger_t *transition_trigger = NULL; +static GHashTable *node_pending_timers = NULL; + gboolean stop_te_timer(pcmk__graph_action_t *action) { if (action == NULL) { return FALSE; } if (action->timer != 0) { crm_trace("Stopping action timer"); g_source_remove(action->timer); action->timer = 0; } else { crm_trace("Action timer was already stopped"); return FALSE; } return TRUE; } static gboolean te_graph_trigger(gpointer user_data) { if (controld_globals.transition_graph == NULL) { crm_debug("Nothing to do"); return TRUE; } crm_trace("Invoking graph %d in state %s", controld_globals.transition_graph->id, fsa_state2string(controld_globals.fsa_state)); switch (controld_globals.fsa_state) { case S_STARTING: case S_PENDING: case S_NOT_DC: case S_HALT: case S_ILLEGAL: case S_STOPPING: case S_TERMINATE: return TRUE; default: break; } if (!controld_globals.transition_graph->complete) { enum pcmk__graph_status graph_rc; int orig_limit = controld_globals.transition_graph->batch_limit; int throttled_limit = throttle_get_total_job_limit(orig_limit); controld_globals.transition_graph->batch_limit = throttled_limit; graph_rc = pcmk__execute_graph(controld_globals.transition_graph); controld_globals.transition_graph->batch_limit = orig_limit; if (graph_rc == pcmk__graph_active) { crm_trace("Transition not yet complete"); return TRUE; } else if (graph_rc == pcmk__graph_pending) { crm_trace("Transition not yet complete - no actions fired"); return TRUE; } if (graph_rc != pcmk__graph_complete) { crm_warn("Transition failed: %s", pcmk__graph_status2text(graph_rc)); pcmk__log_graph(LOG_NOTICE, controld_globals.transition_graph); } } crm_debug("Transition %d is now complete", controld_globals.transition_graph->id); controld_globals.transition_graph->complete = true; notify_crmd(controld_globals.transition_graph); return TRUE; } /*! * \internal * \brief Initialize transition trigger */ void controld_init_transition_trigger(void) { transition_trigger = mainloop_add_trigger(G_PRIORITY_LOW, te_graph_trigger, NULL); } /*! 
* \internal * \brief Destroy transition trigger */ void controld_destroy_transition_trigger(void) { mainloop_destroy_trigger(transition_trigger); transition_trigger = NULL; } void controld_trigger_graph_as(const char *fn, int line) { crm_trace("%s:%d - Triggered graph processing", fn, line); mainloop_set_trigger(transition_trigger); } static struct abort_timer_s { bool aborted; guint id; int priority; enum pcmk__graph_next action; const char *text; } abort_timer = { 0, }; static gboolean abort_timer_popped(gpointer data) { struct abort_timer_s *abort_timer = (struct abort_timer_s *) data; if (AM_I_DC && (abort_timer->aborted == FALSE)) { abort_transition(abort_timer->priority, abort_timer->action, abort_timer->text, NULL); } abort_timer->id = 0; return FALSE; // do not immediately reschedule timer } /*! * \internal * \brief Abort transition after delay, if not already aborted in that time * * \param[in] abort_text Must be literal string */ void abort_after_delay(int abort_priority, enum pcmk__graph_next abort_action, const char *abort_text, guint delay_ms) { if (abort_timer.id) { // Timer already in progress, stop and reschedule g_source_remove(abort_timer.id); } abort_timer.aborted = FALSE; abort_timer.priority = abort_priority; abort_timer.action = abort_action; abort_timer.text = abort_text; abort_timer.id = g_timeout_add(delay_ms, abort_timer_popped, &abort_timer); } +static void +free_node_pending_timer(gpointer data) +{ + struct abort_timer_s *node_pending_timer = (struct abort_timer_s *) data; + + if (node_pending_timer->id != 0) { + g_source_remove(node_pending_timer->id); + node_pending_timer->id = 0; + } + + free(node_pending_timer); +} + +static gboolean +node_pending_timer_popped(gpointer key) +{ + struct abort_timer_s *node_pending_timer = NULL; + + if (node_pending_timers == NULL) { + return FALSE; + } + + node_pending_timer = g_hash_table_lookup(node_pending_timers, key); + if (node_pending_timer == NULL) { + return FALSE; + } + + crm_warn("Node with id '%s' pending timed out (%us) on joining the process " + "group", + (const char *) key, controld_globals.node_pending_timeout); + + abort_timer_popped(node_pending_timer); + + g_hash_table_remove(node_pending_timers, key); + + return FALSE; // do not reschedule timer +} + +static void +init_node_pending_timer(const crm_node_t *node, guint timeout) +{ + struct abort_timer_s *node_pending_timer = NULL; + char *key = NULL; + + if (node->uuid == NULL) { + return; + } + + if (node_pending_timers == NULL) { + node_pending_timers = pcmk__strikey_table(free, + free_node_pending_timer); + + // The timer is somehow already existing + } else if (g_hash_table_lookup(node_pending_timers, node->uuid) != NULL) { + return; + } + + crm_notice("Waiting for pending %s with id '%s' to join the process " + "group (timeout=%us)", + node->uname ? 
node->uname : "node", node->uuid, + controld_globals.node_pending_timeout); + + node_pending_timer = calloc(1, sizeof(struct abort_timer_s)); + CRM_ASSERT(node_pending_timer != NULL); + + node_pending_timer->aborted = FALSE; + node_pending_timer->priority = INFINITY; + node_pending_timer->action = pcmk__graph_restart; + node_pending_timer->text = "Node pending timed out"; + + key = strdup(node->uuid); + CRM_ASSERT(key != NULL); + + g_hash_table_replace(node_pending_timers, key, node_pending_timer); + + node_pending_timer->id = g_timeout_add_seconds(timeout, + node_pending_timer_popped, + key); + CRM_ASSERT(node_pending_timer->id != 0); +} + +static void +remove_node_pending_timer(const char *node_uuid) +{ + if (node_pending_timers == NULL) { + return; + } + + g_hash_table_remove(node_pending_timers, node_uuid); +} + +void +controld_node_pending_timer(const crm_node_t *node) +{ + long long remaining_timeout = 0; + + /* Node is either not even a cluster member or it's as well online in CPG. + * Free any node pending timer of it. + */ + if (node->when_member <= 0 || node->when_online != 0) { + remove_node_pending_timer(node->uuid); + return; + } + // Node is a cluster member but offline in CPG + + remaining_timeout = node->when_member - time(NULL) + + controld_globals.node_pending_timeout; + + /* It already passed node pending timeout somehow. + * Free any node pending timer of it. + */ + if (remaining_timeout <= 0) { + remove_node_pending_timer(node->uuid); + return; + } + + init_node_pending_timer(node, remaining_timeout); +} + +void +controld_free_node_pending_timers(void) +{ + if (node_pending_timers == NULL) { + return; + } + + g_hash_table_destroy(node_pending_timers); + node_pending_timers = NULL; +} + static const char * abort2text(enum pcmk__graph_next abort_action) { switch (abort_action) { case pcmk__graph_done: return "done"; case pcmk__graph_wait: return "stop"; case pcmk__graph_restart: return "restart"; case pcmk__graph_shutdown: return "shutdown"; } return "unknown"; } static bool update_abort_priority(pcmk__graph_t *graph, int priority, enum pcmk__graph_next action, const char *abort_reason) { bool change = FALSE; if (graph == NULL) { return change; } if (graph->abort_priority < priority) { crm_debug("Abort priority upgraded from %d to %d", graph->abort_priority, priority); graph->abort_priority = priority; if (graph->abort_reason != NULL) { crm_debug("'%s' abort superseded by %s", graph->abort_reason, abort_reason); } graph->abort_reason = abort_reason; change = TRUE; } if (graph->completion_action < action) { crm_debug("Abort action %s superseded by %s: %s", abort2text(graph->completion_action), abort2text(action), abort_reason); graph->completion_action = action; change = TRUE; } return change; } void abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action, const char *abort_text, const xmlNode *reason, const char *fn, int line) { int add[] = { 0, 0, 0 }; int del[] = { 0, 0, 0 }; int level = LOG_INFO; const xmlNode *diff = NULL; const xmlNode *change = NULL; CRM_CHECK(controld_globals.transition_graph != NULL, return); switch (controld_globals.fsa_state) { case S_STARTING: case S_PENDING: case S_NOT_DC: case S_HALT: case S_ILLEGAL: case S_STOPPING: case S_TERMINATE: crm_info("Abort %s suppressed: state=%s (%scomplete)", abort_text, fsa_state2string(controld_globals.fsa_state), (controld_globals.transition_graph->complete? 
"" : "in")); return; default: break; } abort_timer.aborted = TRUE; controld_expect_sched_reply(NULL); if (!controld_globals.transition_graph->complete && update_abort_priority(controld_globals.transition_graph, abort_priority, abort_action, abort_text)) { level = LOG_NOTICE; } if (reason != NULL) { const xmlNode *search = NULL; for(search = reason; search; search = search->parent) { if (pcmk__str_eq(XML_TAG_DIFF, TYPE(search), pcmk__str_casei)) { diff = search; break; } } if(diff) { xml_patch_versions(diff, add, del); for(search = reason; search; search = search->parent) { if (pcmk__str_eq(XML_DIFF_CHANGE, TYPE(search), pcmk__str_casei)) { change = search; break; } } } } if (reason == NULL) { do_crm_log(level, "Transition %d aborted: %s " CRM_XS " source=%s:%d " "complete=%s", controld_globals.transition_graph->id, abort_text, fn, line, pcmk__btoa(controld_globals.transition_graph->complete)); } else if(change == NULL) { GString *local_path = pcmk__element_xpath(reason); CRM_ASSERT(local_path != NULL); do_crm_log(level, "Transition %d aborted by %s.%s: %s " CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s", controld_globals.transition_graph->id, TYPE(reason), ID(reason), abort_text, add[0], add[1], add[2], fn, line, (const char *) local_path->str, pcmk__btoa(controld_globals.transition_graph->complete)); g_string_free(local_path, TRUE); } else { const char *kind = NULL; const char *op = crm_element_value(change, XML_DIFF_OP); const char *path = crm_element_value(change, XML_DIFF_PATH); if(change == reason) { if(strcmp(op, "create") == 0) { reason = reason->children; } else if(strcmp(op, "modify") == 0) { reason = first_named_child(reason, XML_DIFF_RESULT); if(reason) { reason = reason->children; } } } kind = TYPE(reason); if(strcmp(op, "delete") == 0) { const char *shortpath = strrchr(path, '/'); do_crm_log(level, "Transition %d aborted by deletion of %s: %s " CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s", controld_globals.transition_graph->id, (shortpath? (shortpath + 1) : path), abort_text, add[0], add[1], add[2], fn, line, path, pcmk__btoa(controld_globals.transition_graph->complete)); } else if (pcmk__str_eq(XML_CIB_TAG_NVPAIR, kind, pcmk__str_none)) { do_crm_log(level, "Transition %d aborted by %s doing %s %s=%s: %s " CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s", controld_globals.transition_graph->id, crm_element_value(reason, XML_ATTR_ID), op, crm_element_value(reason, XML_NVPAIR_ATTR_NAME), crm_element_value(reason, XML_NVPAIR_ATTR_VALUE), abort_text, add[0], add[1], add[2], fn, line, path, pcmk__btoa(controld_globals.transition_graph->complete)); } else if (pcmk__str_eq(XML_LRM_TAG_RSC_OP, kind, pcmk__str_none)) { const char *magic = crm_element_value(reason, XML_ATTR_TRANSITION_MAGIC); do_crm_log(level, "Transition %d aborted by operation %s '%s' on %s: %s " CRM_XS " magic=%s cib=%d.%d.%d source=%s:%d complete=%s", controld_globals.transition_graph->id, crm_element_value(reason, XML_LRM_ATTR_TASK_KEY), op, crm_element_value(reason, XML_LRM_ATTR_TARGET), abort_text, magic, add[0], add[1], add[2], fn, line, pcmk__btoa(controld_globals.transition_graph->complete)); } else if (pcmk__str_any_of(kind, XML_CIB_TAG_STATE, XML_CIB_TAG_NODE, NULL)) { const char *uname = crm_peer_uname(ID(reason)); do_crm_log(level, "Transition %d aborted by %s '%s' on %s: %s " CRM_XS " cib=%d.%d.%d source=%s:%d complete=%s", controld_globals.transition_graph->id, kind, op, (uname? 
uname : ID(reason)), abort_text, add[0], add[1], add[2], fn, line, pcmk__btoa(controld_globals.transition_graph->complete)); } else { const char *id = ID(reason); do_crm_log(level, "Transition %d aborted by %s.%s '%s': %s " CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s", controld_globals.transition_graph->id, TYPE(reason), (id? id : ""), (op? op : "change"), abort_text, add[0], add[1], add[2], fn, line, path, pcmk__btoa(controld_globals.transition_graph->complete)); } } if (controld_globals.transition_graph->complete) { if (controld_get_period_transition_timer() > 0) { controld_stop_transition_timer(); controld_start_transition_timer(); } else { register_fsa_input(C_FSA_INTERNAL, I_PE_CALC, NULL); } return; } trigger_graph(); } diff --git a/daemons/controld/controld_transition.h b/daemons/controld/controld_transition.h index 2da4221901..0655bd9055 100644 --- a/daemons/controld/controld_transition.h +++ b/daemons/controld/controld_transition.h @@ -1,63 +1,65 @@ /* * Copyright 2004-2023 the Pacemaker project contributors * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef TENGINE__H # define TENGINE__H # include # include # include # include /* tengine */ pcmk__graph_action_t *match_down_event(const char *target); pcmk__graph_action_t *get_cancel_action(const char *id, const char *node); bool confirm_cancel_action(const char *id, const char *node_id); void controld_record_action_timeout(pcmk__graph_action_t *action); void controld_destroy_outside_events_table(void); void controld_remove_all_outside_events(void); gboolean fail_incompletable_actions(pcmk__graph_t *graph, const char *down_node); void process_graph_event(xmlNode *event, const char *event_node); /* utils */ pcmk__graph_action_t *controld_get_action(int id); gboolean stop_te_timer(pcmk__graph_action_t *action); const char *get_rsc_state(const char *task, enum pcmk_exec_status status); void process_te_message(xmlNode *msg, xmlNode *xml_data); void controld_register_graph_functions(void); void notify_crmd(pcmk__graph_t * graph); void cib_action_updated(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data); gboolean action_timer_callback(gpointer data); void te_update_diff(const char *event, xmlNode *msg); void controld_init_transition_trigger(void); void controld_destroy_transition_trigger(void); void controld_trigger_graph_as(const char *fn, int line); void abort_after_delay(int abort_priority, enum pcmk__graph_next abort_action, const char *abort_text, guint delay_ms); +void controld_node_pending_timer(const crm_node_t *node); +void controld_free_node_pending_timers(void); void abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action, const char *abort_text, const xmlNode *reason, const char *fn, int line); # define trigger_graph() controld_trigger_graph_as(__func__, __LINE__) # define abort_transition(pri, action, text, reason) \ abort_transition_graph(pri, action, text, reason,__func__,__LINE__); void te_action_confirmed(pcmk__graph_action_t *action, pcmk__graph_t *graph); void te_reset_job_counts(void); #endif
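
Note (reviewer illustration, not part of the patch): controld_node_pending_timer() arms a timer only while a node is a cluster-layer member (node->when_member > 0) that has not yet appeared in the controller's process group (node->when_online == 0), and it shortens the timeout by the time already spent as a member (when_member - time(NULL) + node_pending_timeout). The node_pending_timers table then ties each timer's lifetime to its hash-table entry: the value destructor cancels the GLib timeout source, so removing the entry (or destroying the table) is all that is needed to stop a timer. Below is a minimal, self-contained GLib sketch of that idiom with made-up names; it is an illustration only, not Pacemaker code.

#include <glib.h>

struct pending_timer {
    guint id;   /* GLib timeout source ID; 0 when not armed */
};

static GHashTable *timers = NULL;
static GMainLoop *loop = NULL;

/* Value destructor: cancelling the source here means that removing the
 * table entry (or destroying the whole table) also stops the timer.
 */
static void
free_pending_timer(gpointer data)
{
    struct pending_timer *t = data;

    if (t->id != 0) {
        g_source_remove(t->id);
        t->id = 0;
    }
    g_free(t);
}

static gboolean
timer_popped(gpointer key)
{
    struct pending_timer *t = g_hash_table_lookup(timers, key);

    g_print("timer for '%s' popped\n", (const char *) key);

    if (t != NULL) {
        t->id = 0;                        /* source is expiring on its own */
        g_hash_table_remove(timers, key); /* frees both key and value */
    }
    g_main_loop_quit(loop);
    return G_SOURCE_REMOVE;               /* do not reschedule */
}

int
main(void)
{
    char *key = g_strdup("node1");
    struct pending_timer *t = g_new0(struct pending_timer, 1);

    timers = g_hash_table_new_full(g_str_hash, g_str_equal,
                                   g_free, free_pending_timer);
    loop = g_main_loop_new(NULL, FALSE);

    g_hash_table_replace(timers, key, t);
    t->id = g_timeout_add_seconds(1, timer_popped, key);

    g_main_loop_run(loop);

    g_hash_table_destroy(timers);  /* cancels any timers still armed */
    g_main_loop_unref(loop);
    return 0;
}

(Build, for example, with: gcc demo.c $(pkg-config --cflags --libs glib-2.0).)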