diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c index e9dbbdc6c8..1a42983135 100644 --- a/daemons/controld/controld_callbacks.c +++ b/daemons/controld/controld_callbacks.c @@ -1,361 +1,361 @@ /* * Copyright 2004-2020 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include /* From join_dc... */ extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); void crmd_ha_msg_filter(xmlNode * msg) { if (AM_I_DC) { const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); if (pcmk__str_eq(sys_from, CRM_SYSTEM_DC, pcmk__str_casei)) { const char *from = crm_element_value(msg, F_ORIG); if (!pcmk__str_eq(from, fsa_our_uname, pcmk__str_casei)) { int level = LOG_INFO; const char *op = crm_element_value(msg, F_CRM_TASK); /* make sure the election happens NOW */ if (fsa_state != S_ELECTION) { ha_msg_input_t new_input; level = LOG_WARNING; new_input.msg = msg; register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL, &new_input, __func__); } do_crm_log(level, "Another DC detected: %s (op=%s)", from, op); goto done; } } } else { const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO); if (pcmk__str_eq(sys_to, CRM_SYSTEM_DC, pcmk__str_casei)) { return; } } /* crm_log_xml_trace("HA[inbound]", msg); */ route_message(C_HA_MESSAGE, msg); done: trigger_fsa(); } /*! * \internal * \brief Check whether a node is online * * \param[in] node Node to check * * \retval -1 if completely dead * \retval 0 if partially alive * \retval 1 if completely alive */ static int node_alive(const crm_node_t *node) { if (pcmk_is_set(node->flags, crm_remote_node)) { // Pacemaker Remote nodes can't be partially alive return pcmk__str_eq(node->state, CRM_NODE_MEMBER, pcmk__str_casei) ? 1: -1; } else if (crm_is_peer_active(node)) { // Completely up cluster node: both cluster member and peer return 1; } else if (!pcmk_is_set(node->processes, crm_get_cluster_proc()) && !pcmk__str_eq(node->state, CRM_NODE_MEMBER, pcmk__str_casei)) { // Completely down cluster node: neither cluster member nor peer return -1; } // Partially up cluster node: only cluster member or only peer return 0; } #define state_text(state) ((state)? (const char *)(state) : "in unknown state") bool controld_dc_left = false; void peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data) { uint32_t old = 0; bool appeared = FALSE; bool is_remote = pcmk_is_set(node->flags, crm_remote_node); /* The controller waits to receive some information from the membership * layer before declaring itself operational. If this is being called for a * cluster node, indicate that we have it. */ if (!is_remote) { controld_set_fsa_input_flags(R_PEER_DATA); } if (type == crm_status_processes && pcmk_is_set(node->processes, crm_get_cluster_proc()) && !AM_I_DC && !is_remote) { /* * This is a hack until we can send to a nodeid and/or we fix node name lookups * These messages are ignored in crmd_ha_msg_filter() */ xmlNode *query = create_request(CRM_OP_HELLO, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); crm_debug("Sending hello to node %u so that it learns our node name", node->id); send_cluster_message(node, crm_msg_crmd, query, FALSE); free_xml(query); } if (node->uname == NULL) { return; } switch (type) { case crm_status_uname: /* If we've never seen the node, then it also won't be in the status section */ crm_info("%s node %s is now %s", (is_remote? "Remote" : "Cluster"), node->uname, state_text(node->state)); return; case crm_status_nstate: /* This callback should not be called unless the state actually * changed, but here's a failsafe just in case. */ CRM_CHECK(!pcmk__str_eq(data, node->state, pcmk__str_casei), return); crm_info("%s node %s is now %s (was %s)", (is_remote? "Remote" : "Cluster"), node->uname, state_text(node->state), state_text(data)); if (pcmk__str_eq(CRM_NODE_MEMBER, node->state, pcmk__str_casei)) { appeared = TRUE; if (!is_remote) { remove_stonith_cleanup(node->uname); } } else { controld_remove_voter(node->uname); } crmd_alert_node_event(node); break; case crm_status_processes: CRM_CHECK(data != NULL, return); old = *(const uint32_t *)data; appeared = pcmk_is_set(node->processes, crm_get_cluster_proc()); crm_info("Node %s is %s a peer " CRM_XS " DC=%s old=0x%07x new=0x%07x", node->uname, (appeared? "now" : "no longer"), (AM_I_DC? "true" : (fsa_our_dc? fsa_our_dc : "")), old, node->processes); if (!pcmk_is_set((node->processes ^ old), crm_get_cluster_proc())) { /* Peer status did not change. This should not be possible, * since we don't track process flags other than peer status. */ crm_trace("Process flag 0x%7x did not change from 0x%7x to 0x%7x", crm_get_cluster_proc(), old, node->processes); return; } if (!appeared) { controld_remove_voter(node->uname); } if (!pcmk_is_set(fsa_input_register, R_CIB_CONNECTED)) { crm_trace("Ignoring peer status change because not connected to CIB"); return; } else if (fsa_state == S_STOPPING) { crm_trace("Ignoring peer status change because stopping"); return; } if (pcmk__str_eq(node->uname, fsa_our_uname, pcmk__str_casei) && !appeared) { /* Did we get evicted? */ crm_notice("Our peer connection failed"); register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ERROR, NULL); } else if (pcmk__str_eq(node->uname, fsa_our_dc, pcmk__str_casei) && crm_is_peer_active(node) == FALSE) { /* Did the DC leave us? */ crm_notice("Our peer on the DC (%s) is dead", fsa_our_dc); register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL); /* @COMPAT DC < 1.1.13: If a DC shuts down normally, we don't * want to fence it. Newer DCs will send their shutdown request * to all peers, who will update the DC's expected state to * down, thus avoiding fencing. We can safely erase the DC's * transient attributes when it leaves in that case. However, * the only way to avoid fencing older DCs is to leave the * transient attributes intact until it rejoins. */ if (compare_version(fsa_our_dc_version, "3.0.9") > 0) { controld_delete_node_state(node->uname, controld_section_attrs, cib_scope_local); } } else if (AM_I_DC || controld_dc_left || (fsa_our_dc == NULL)) { /* This only needs to be done once, so normally the DC should do * it. However if there is no DC, every node must do it, since * there is no other way to ensure some one node does it. */ if (appeared) { te_trigger_stonith_history_sync(FALSE); } else { controld_delete_node_state(node->uname, controld_section_attrs, cib_scope_local); } } break; } if (AM_I_DC) { xmlNode *update = NULL; int flags = node_update_peer; int alive = node_alive(node); crm_action_t *down = match_down_event(node->uuid); crm_trace("Alive=%d, appeared=%d, down=%d", alive, appeared, (down? down->id : -1)); if (appeared && (alive > 0) && !is_remote) { register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } if (down) { const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK); if (pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) { /* tengine_stonith_callback() confirms fence actions */ crm_trace("Updating CIB %s fencer reported fencing of %s complete", (down->confirmed? "after" : "before"), node->uname); } else if (!appeared && pcmk__str_eq(task, CRM_OP_SHUTDOWN, pcmk__str_casei)) { // Shutdown actions are immediately confirmed (i.e. no_wait) if (!is_remote) { flags |= node_update_join | node_update_expected; crmd_peer_down(node, FALSE); check_join_state(fsa_state, __func__); } if (alive >= 0) { crm_info("%s of peer %s is in progress " CRM_XS " action=%d", task, node->uname, down->id); } else { crm_notice("%s of peer %s is complete " CRM_XS " action=%d", task, node->uname, down->id); - update_graph(transition_graph, down); + pcmk__update_graph(transition_graph, down); trigger_graph(); } } else { crm_trace("Node %s is %s, was expected to %s (op %d)", node->uname, ((alive > 0)? "alive" : ((alive < 0)? "dead" : "partially alive")), task, down->id); } } else if (appeared == FALSE) { crm_warn("Stonith/shutdown of node %s was not expected", node->uname); if (!is_remote) { crm_update_peer_join(__func__, node, crm_join_none); check_join_state(fsa_state, __func__); } abort_transition(INFINITY, tg_restart, "Node failure", NULL); fail_incompletable_actions(transition_graph, node->uuid); } else { crm_trace("Node %s came up, was not expected to be down", node->uname); } if (is_remote) { /* A pacemaker_remote node won't have its cluster status updated * in the CIB by membership-layer callbacks, so do it here. */ flags |= node_update_cluster; /* Trigger resource placement on newly integrated nodes */ if (appeared) { abort_transition(INFINITY, tg_restart, "pacemaker_remote node integrated", NULL); } } /* Update the CIB node state */ update = create_node_state_update(node, flags, NULL, __func__); if (update == NULL) { crm_debug("Node state update not yet possible for %s", node->uname); } else { fsa_cib_anon_update(XML_CIB_TAG_STATUS, update); } free_xml(update); } trigger_fsa(); } void crmd_cib_connection_destroy(gpointer user_data) { CRM_CHECK(user_data == fsa_cib_conn,;); crm_trace("Invoked"); trigger_fsa(); fsa_cib_conn->state = cib_disconnected; if (!pcmk_is_set(fsa_input_register, R_CIB_CONNECTED)) { crm_info("Connection to the CIB manager terminated"); return; } // @TODO This should trigger a reconnect, not a shutdown crm_crit("Lost connection to the CIB manager, shutting down"); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); controld_clear_fsa_input_flags(R_CIB_CONNECTED); return; } gboolean crm_fsa_trigger(gpointer user_data) { crm_trace("Invoked (queue len: %d)", g_list_length(fsa_message_queue)); s_crmd_fsa(C_FSA_INTERNAL); crm_trace("Exited (queue len: %d)", g_list_length(fsa_message_queue)); return TRUE; } diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c index 6c2a6c5508..fd3f5b67f6 100644 --- a/daemons/controld/controld_fencing.c +++ b/daemons/controld/controld_fencing.c @@ -1,997 +1,997 @@ /* * Copyright 2004-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include static void tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event); /* * stonith failure counting * * We don't want to get stuck in a permanent fencing loop. Keep track of the * number of fencing failures for each target node, and the most we'll restart a * transition for. */ struct st_fail_rec { int count; }; static bool fence_reaction_panic = FALSE; static unsigned long int stonith_max_attempts = 10; static GHashTable *stonith_failures = NULL; void update_stonith_max_attempts(const char *value) { stonith_max_attempts = char2score(value); if (stonith_max_attempts < 1UL) { stonith_max_attempts = 10UL; } } void set_fence_reaction(const char *reaction_s) { if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) { fence_reaction_panic = TRUE; } else { if (!pcmk__str_eq(reaction_s, "stop", pcmk__str_casei)) { crm_warn("Invalid value '%s' for %s, using 'stop'", reaction_s, XML_CONFIG_ATTR_FENCE_REACTION); } fence_reaction_panic = FALSE; } } static gboolean too_many_st_failures(const char *target) { GHashTableIter iter; const char *key = NULL; struct st_fail_rec *value = NULL; if (stonith_failures == NULL) { return FALSE; } if (target == NULL) { g_hash_table_iter_init(&iter, stonith_failures); while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) { if (value->count >= stonith_max_attempts) { target = (const char*)key; goto too_many; } } } else { value = g_hash_table_lookup(stonith_failures, target); if ((value != NULL) && (value->count >= stonith_max_attempts)) { goto too_many; } } return FALSE; too_many: crm_warn("Too many failures (%d) to fence %s, giving up", value->count, target); return TRUE; } /*! * \internal * \brief Reset a stonith fail count * * \param[in] target Name of node to reset, or NULL for all */ void st_fail_count_reset(const char *target) { if (stonith_failures == NULL) { return; } if (target) { struct st_fail_rec *rec = NULL; rec = g_hash_table_lookup(stonith_failures, target); if (rec) { rec->count = 0; } } else { GHashTableIter iter; const char *key = NULL; struct st_fail_rec *rec = NULL; g_hash_table_iter_init(&iter, stonith_failures); while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &rec)) { rec->count = 0; } } } static void st_fail_count_increment(const char *target) { struct st_fail_rec *rec = NULL; if (stonith_failures == NULL) { stonith_failures = pcmk__strkey_table(free, free); } rec = g_hash_table_lookup(stonith_failures, target); if (rec) { rec->count++; } else { rec = malloc(sizeof(struct st_fail_rec)); if(rec == NULL) { return; } rec->count = 1; g_hash_table_insert(stonith_failures, strdup(target), rec); } } /* end stonith fail count functions */ static void cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { if (rc < pcmk_ok) { crm_err("Fencing update %d for %s: failed - %s (%d)", call_id, (char *)user_data, pcmk_strerror(rc), rc); crm_log_xml_warn(msg, "Failed update"); abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL); } else { crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data); } } static void send_stonith_update(crm_action_t *action, const char *target, const char *uuid) { int rc = pcmk_ok; crm_node_t *peer = NULL; /* We (usually) rely on the membership layer to do node_update_cluster, * and the peer status callback to do node_update_peer, because the node * might have already rejoined before we get the stonith result here. */ int flags = node_update_join | node_update_expected; /* zero out the node-status & remove all LRM status info */ xmlNode *node_state = NULL; CRM_CHECK(target != NULL, return); CRM_CHECK(uuid != NULL, return); /* Make sure the membership and join caches are accurate */ peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY); CRM_CHECK(peer != NULL, return); if (peer->state == NULL) { /* Usually, we rely on the membership layer to update the cluster state * in the CIB. However, if the node has never been seen, do it here, so * the node is not considered unclean. */ flags |= node_update_cluster; } if (peer->uuid == NULL) { crm_info("Recording uuid '%s' for node '%s'", uuid, target); peer->uuid = strdup(uuid); } crmd_peer_down(peer, TRUE); /* Generate a node state update for the CIB */ node_state = create_node_state_update(peer, flags, NULL, __func__); /* we have to mark whether or not remote nodes have already been fenced */ if (peer->flags & crm_remote_node) { char *now_s = pcmk__ttoa(time(NULL)); crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s); free(now_s); } /* Force our known ID */ crm_xml_add(node_state, XML_ATTR_UUID, uuid); rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state, cib_quorum_override | cib_scope_local | cib_can_create); /* Delay processing the trigger until the update completes */ crm_debug("Sending fencing update %d for %s", rc, target); fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated); /* Make sure it sticks */ /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local); */ controld_delete_node_state(peer->uname, controld_section_all, cib_scope_local); free_xml(node_state); return; } /*! * \internal * \brief Abort transition due to stonith failure * * \param[in] abort_action Whether to restart or stop transition * \param[in] target Don't restart if this (NULL for any) has too many failures * \param[in] reason Log this stonith action XML as abort reason (or NULL) */ static void abort_for_stonith_failure(enum transition_action abort_action, const char *target, xmlNode *reason) { /* If stonith repeatedly fails, we eventually give up on starting a new * transition for that reason. */ if ((abort_action != tg_stop) && too_many_st_failures(target)) { abort_action = tg_stop; } abort_transition(INFINITY, abort_action, "Stonith failed", reason); } /* * stonith cleanup list * * If the DC is shot, proper notifications might not go out. * The stonith cleanup list allows the cluster to (re-)send * notifications once a new DC is elected. */ static GList *stonith_cleanup_list = NULL; /*! * \internal * \brief Add a node to the stonith cleanup list * * \param[in] target Name of node to add */ void add_stonith_cleanup(const char *target) { stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target)); } /*! * \internal * \brief Remove a node from the stonith cleanup list * * \param[in] Name of node to remove */ void remove_stonith_cleanup(const char *target) { GList *iter = stonith_cleanup_list; while (iter != NULL) { GList *tmp = iter; char *iter_name = tmp->data; iter = iter->next; if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) { crm_trace("Removing %s from the cleanup list", iter_name); stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp); free(iter_name); } } } /*! * \internal * \brief Purge all entries from the stonith cleanup list */ void purge_stonith_cleanup() { if (stonith_cleanup_list) { GList *iter = NULL; for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { char *target = iter->data; crm_info("Purging %s from stonith cleanup list", target); free(target); } g_list_free(stonith_cleanup_list); stonith_cleanup_list = NULL; } } /*! * \internal * \brief Send stonith updates for all entries in cleanup list, then purge it */ void execute_stonith_cleanup() { GList *iter; for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { char *target = iter->data; crm_node_t *target_node = crm_get_peer(0, target); const char *uuid = crm_peer_uuid(target_node); crm_notice("Marking %s, target of a previous stonith action, as clean", target); send_stonith_update(NULL, target, uuid); free(target); } g_list_free(stonith_cleanup_list); stonith_cleanup_list = NULL; } /* end stonith cleanup list functions */ /* stonith API client * * Functions that need to interact directly with the fencer via its API */ static stonith_t *stonith_api = NULL; static crm_trigger_t *stonith_reconnect = NULL; static char *te_client_id = NULL; static gboolean fail_incompletable_stonith(crm_graph_t *graph) { GList *lpc = NULL; const char *task = NULL; xmlNode *last_action = NULL; if (graph == NULL) { return FALSE; } for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { GList *lpc2 = NULL; synapse_t *synapse = (synapse_t *) lpc->data; if (synapse->confirmed) { continue; } for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) { crm_action_t *action = (crm_action_t *) lpc2->data; if (action->type != action_type_crm || action->confirmed) { continue; } task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (task && pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) { action->failed = TRUE; last_action = action->xml; - update_graph(graph, action); + pcmk__update_graph(graph, action); crm_notice("Failing action %d (%s): fencer terminated", action->id, ID(action->xml)); } } } if (last_action != NULL) { crm_warn("Fencer failure resulted in unrunnable actions"); abort_for_stonith_failure(tg_restart, NULL, last_action); return TRUE; } return FALSE; } static void tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) { te_cleanup_stonith_history_sync(st, FALSE); if (pcmk_is_set(fsa_input_register, R_ST_REQUIRED)) { crm_crit("Fencing daemon connection failed"); mainloop_set_trigger(stonith_reconnect); } else { crm_info("Fencing daemon disconnected"); } if (stonith_api) { /* the client API won't properly reconnect notifications * if they are still in the table - so remove them */ if (stonith_api->state != stonith_disconnected) { stonith_api->cmds->disconnect(st); } stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT); stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE); stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED); } if (AM_I_DC) { fail_incompletable_stonith(transition_graph); trigger_graph(); } } static void tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) { if (te_client_id == NULL) { te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, (unsigned long) getpid()); } if (st_event == NULL) { crm_err("Notify data not found"); return; } crmd_alert_fencing_op(st_event); if ((st_event->result == pcmk_ok) && pcmk__str_eq("on", st_event->action, pcmk__str_casei)) { crm_notice("%s was successfully unfenced by %s (at the request of %s)", st_event->target, st_event->executioner? st_event->executioner : "", st_event->origin); /* TODO: Hook up st_event->device */ return; } else if (pcmk__str_eq("on", st_event->action, pcmk__str_casei)) { crm_err("Unfencing of %s by %s failed: %s (%d)", st_event->target, st_event->executioner? st_event->executioner : "", pcmk_strerror(st_event->result), st_event->result); return; } else if ((st_event->result == pcmk_ok) && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_none)) { /* We were notified of our own fencing. Most likely, either fencing was * misconfigured, or fabric fencing that doesn't cut cluster * communication is in use. * * Either way, shutting down the local host is a good idea, to require * administrator intervention. Also, other nodes would otherwise likely * set our status to lost because of the fencing callback and discard * our subsequent election votes as "not part of our cluster". */ crm_crit("We were allegedly just fenced by %s for %s!", st_event->executioner? st_event->executioner : "the cluster", st_event->origin); /* Dumps blackbox if enabled */ if (fence_reaction_panic) { pcmk__panic(__func__); } else { crm_exit(CRM_EX_FATAL); } return; } /* Update the count of stonith failures for this target, in case we become * DC later. The current DC has already updated its fail count in * tengine_stonith_callback(). */ if (!AM_I_DC && pcmk__str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { if (st_event->result == pcmk_ok) { st_fail_count_reset(st_event->target); } else { st_fail_count_increment(st_event->target); } } crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " CRM_XS " initiator=%s ref=%s", st_event->target, st_event->result == pcmk_ok ? "" : " not", st_event->action, st_event->executioner ? st_event->executioner : "", (st_event->client_origin? st_event->client_origin : ""), pcmk_strerror(st_event->result), st_event->origin, st_event->id); if (st_event->result == pcmk_ok) { crm_node_t *peer = pcmk__search_known_node_cache(0, st_event->target, CRM_GET_PEER_ANY); const char *uuid = NULL; gboolean we_are_executioner = pcmk__str_eq(st_event->executioner, fsa_our_uname, pcmk__str_casei); if (peer == NULL) { return; } uuid = crm_peer_uuid(peer); crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); if(AM_I_DC) { /* The DC always sends updates */ send_stonith_update(NULL, st_event->target, uuid); /* @TODO Ideally, at this point, we'd check whether the fenced node * hosted any guest nodes, and call remote_node_down() for them. * Unfortunately, the controller doesn't have a simple, reliable way * to map hosts to guests. It might be possible to track this in the * peer cache via crm_remote_peer_cache_refresh(). For now, we rely * on the scheduler creating fence pseudo-events for the guests. */ if (st_event->client_origin && !pcmk__str_eq(st_event->client_origin, te_client_id, pcmk__str_casei)) { /* Abort the current transition graph if it wasn't us * that invoked stonith to fence someone */ crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); } /* Assume it was our leader if we don't currently have one */ } else if (pcmk__str_eq(fsa_our_dc, st_event->target, pcmk__str_null_matches | pcmk__str_casei) && !pcmk_is_set(peer->flags, crm_remote_node)) { crm_notice("Fencing target %s %s our leader", st_event->target, (fsa_our_dc? "was" : "may have been")); /* Given the CIB resyncing that occurs around elections, * have one node update the CIB now and, if the new DC is different, * have them do so too after the election */ if (we_are_executioner) { send_stonith_update(NULL, st_event->target, uuid); } add_stonith_cleanup(st_event->target); } /* If the target is a remote node, and we host its connection, * immediately fail all monitors so it can be recovered quickly. * The connection won't necessarily drop when a remote node is fenced, * so the failure might not otherwise be detected until the next poke. */ if (pcmk_is_set(peer->flags, crm_remote_node)) { remote_ra_fail(st_event->target); } crmd_peer_down(peer, TRUE); } } /*! * \brief Connect to fencer * * \param[in] user_data If NULL, retry failures now, otherwise retry in main loop * * \return TRUE * \note If user_data is NULL, this will wait 2s between attempts, for up to * 30 attempts, meaning the controller could be blocked as long as 58s. */ static gboolean te_connect_stonith(gpointer user_data) { int rc = pcmk_ok; if (stonith_api == NULL) { stonith_api = stonith_api_new(); if (stonith_api == NULL) { crm_err("Could not connect to fencer: API memory allocation failed"); return TRUE; } } if (stonith_api->state != stonith_disconnected) { crm_trace("Already connected to fencer, no need to retry"); return TRUE; } if (user_data == NULL) { // Blocking (retry failures now until successful) rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30); if (rc != pcmk_ok) { crm_err("Could not connect to fencer in 30 attempts: %s " CRM_XS " rc=%d", pcmk_strerror(rc), rc); } } else { // Non-blocking (retry failures later in main loop) rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); if (rc != pcmk_ok) { if (pcmk_is_set(fsa_input_register, R_ST_REQUIRED)) { crm_notice("Fencer connection failed (will retry): %s " CRM_XS " rc=%d", pcmk_strerror(rc), rc); mainloop_set_trigger(stonith_reconnect); } else { crm_info("Fencer connection failed (ignoring because no longer required): %s " CRM_XS " rc=%d", pcmk_strerror(rc), rc); } return TRUE; } } if (rc == pcmk_ok) { stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT, tengine_stonith_connection_destroy); stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_FENCE, tengine_stonith_notify); stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED, tengine_stonith_history_synced); te_trigger_stonith_history_sync(TRUE); crm_notice("Fencer successfully connected"); } return TRUE; } /*! \internal \brief Schedule fencer connection attempt in main loop */ void controld_trigger_fencer_connect() { if (stonith_reconnect == NULL) { stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, te_connect_stonith, GINT_TO_POINTER(TRUE)); } controld_set_fsa_input_flags(R_ST_REQUIRED); mainloop_set_trigger(stonith_reconnect); } void controld_disconnect_fencer(bool destroy) { if (stonith_api) { // Prevent fencer connection from coming up again controld_clear_fsa_input_flags(R_ST_REQUIRED); if (stonith_api->state != stonith_disconnected) { stonith_api->cmds->disconnect(stonith_api); } stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT); stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE); stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED); } if (destroy) { if (stonith_api) { stonith_api->cmds->free(stonith_api); stonith_api = NULL; } if (stonith_reconnect) { mainloop_destroy_trigger(stonith_reconnect); stonith_reconnect = NULL; } if (te_client_id) { free(te_client_id); te_client_id = NULL; } } } static gboolean do_stonith_history_sync(gpointer user_data) { if (stonith_api && (stonith_api->state != stonith_disconnected)) { stonith_history_t *history = NULL; te_cleanup_stonith_history_sync(stonith_api, FALSE); stonith_api->cmds->history(stonith_api, st_opt_sync_call | st_opt_broadcast, NULL, &history, 5); stonith_history_free(history); return TRUE; } else { crm_info("Skip triggering stonith history-sync as stonith is disconnected"); return FALSE; } } static void tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) { char *uuid = NULL; int stonith_id = -1; int transition_id = -1; crm_action_t *action = NULL; int call_id = data->call_id; int rc = data->rc; char *userdata = data->userdata; CRM_CHECK(userdata != NULL, return); crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata, pcmk_strerror(rc), rc); if (AM_I_DC == FALSE) { return; } /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */ /* op->call_id, op->optype, op->node_name, op->op_result, */ /* (char *)op->node_list, op->private_data); */ /* filter out old STONITH actions */ CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL), goto bail); if (transition_graph->complete || stonith_id < 0 || !pcmk__str_eq(uuid, te_uuid, pcmk__str_casei) || transition_graph->id != transition_id) { crm_info("Ignoring STONITH action initiated outside of the current transition"); goto bail; } action = controld_get_action(stonith_id); if (action == NULL) { crm_err("Stonith action not matched"); goto bail; } stop_te_timer(action->timer); if (rc == pcmk_ok) { const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); const char *op = crm_meta_value(action->params, "stonith_action"); crm_info("Stonith operation %d for %s passed", call_id, target); if (action->confirmed == FALSE) { te_action_confirmed(action, NULL); if (pcmk__str_eq("on", op, pcmk__str_casei)) { const char *value = NULL; char *now = pcmk__ttoa(time(NULL)); gboolean is_remote_node = FALSE; /* This check is not 100% reliable, since this node is not * guaranteed to have the remote node cached. However, it * doesn't have to be reliable, since the attribute manager can * learn a node's "remoteness" by other means sooner or later. * This allows it to learn more quickly if this node does have * the information. */ if (g_hash_table_lookup(crm_remote_peer_cache, uuid) != NULL) { is_remote_node = TRUE; } update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, is_remote_node); free(now); value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL); update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, is_remote_node); value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE); update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, is_remote_node); } else if (action->sent_update == FALSE) { send_stonith_update(action, target, uuid); action->sent_update = TRUE; } } st_fail_count_reset(target); } else { const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); enum transition_action abort_action = tg_restart; action->failed = TRUE; crm_notice("Stonith operation %d for %s failed (%s): aborting transition.", call_id, target, pcmk_strerror(rc)); /* If no fence devices were available, there's no use in immediately * checking again, so don't start a new transition in that case. */ if (rc == -ENODEV) { crm_warn("No devices found in cluster to fence %s, giving up", target); abort_action = tg_stop; } /* Increment the fail count now, so abort_for_stonith_failure() can * check it. Non-DC nodes will increment it in tengine_stonith_notify(). */ st_fail_count_increment(target); abort_for_stonith_failure(abort_action, target, NULL); } - update_graph(transition_graph, action); + pcmk__update_graph(transition_graph, action); trigger_graph(); bail: free(userdata); free(uuid); return; } static int fence_with_delay(const char *target, const char *type, const char *delay) { uint32_t options = st_opt_none; // Group of enum stonith_call_options int timeout_sec = (int) (transition_graph->stonith_timeout / 1000); int delay_i; if (crmd_join_phase_count(crm_join_confirmed) == 1) { stonith__set_call_options(options, target, st_opt_allow_suicide); } pcmk__scan_min_int(delay, &delay_i, 0); return stonith_api->cmds->fence_with_delay(stonith_api, options, target, type, timeout_sec, 0, delay_i); } gboolean te_fence_node(crm_graph_t *graph, crm_action_t *action) { int rc = 0; const char *id = NULL; const char *uuid = NULL; const char *target = NULL; const char *type = NULL; char *transition_key = NULL; const char *priority_delay = NULL; gboolean invalid_action = FALSE; id = ID(action->xml); target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); type = crm_meta_value(action->params, "stonith_action"); CRM_CHECK(id != NULL, invalid_action = TRUE); CRM_CHECK(uuid != NULL, invalid_action = TRUE); CRM_CHECK(type != NULL, invalid_action = TRUE); CRM_CHECK(target != NULL, invalid_action = TRUE); if (invalid_action) { crm_log_xml_warn(action->xml, "BadAction"); return FALSE; } priority_delay = crm_meta_value(action->params, XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY); crm_notice("Requesting fencing (%s) of node %s " CRM_XS " action=%s timeout=%u%s%s", type, target, id, transition_graph->stonith_timeout, priority_delay ? " priority_delay=" : "", priority_delay ? priority_delay : ""); /* Passing NULL means block until we can connect... */ te_connect_stonith(NULL); rc = fence_with_delay(target, type, priority_delay); transition_key = pcmk__transition_key(transition_graph->id, action->id, 0, te_uuid), stonith_api->cmds->register_callback(stonith_api, rc, (int) (transition_graph->stonith_timeout / 1000), st_opt_timeout_updates, transition_key, "tengine_stonith_callback", tengine_stonith_callback); return TRUE; } bool controld_verify_stonith_watchdog_timeout(const char *value) { gboolean rv = TRUE; if (stonith_api && (stonith_api->state != stonith_disconnected) && stonith__watchdog_fencing_enabled_for_node_api(stonith_api, fsa_our_uname)) { rv = pcmk__valid_sbd_timeout(value); } return rv; } /* end stonith API client functions */ /* * stonith history synchronization * * Each node's fencer keeps track of a cluster-wide fencing history. When a node * joins or leaves, we need to synchronize the history across all nodes. */ static crm_trigger_t *stonith_history_sync_trigger = NULL; static mainloop_timer_t *stonith_history_sync_timer_short = NULL; static mainloop_timer_t *stonith_history_sync_timer_long = NULL; void te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers) { if (free_timers) { mainloop_timer_del(stonith_history_sync_timer_short); stonith_history_sync_timer_short = NULL; mainloop_timer_del(stonith_history_sync_timer_long); stonith_history_sync_timer_long = NULL; } else { mainloop_timer_stop(stonith_history_sync_timer_short); mainloop_timer_stop(stonith_history_sync_timer_long); } if (st) { st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY_SYNCED); } } static void tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event) { te_cleanup_stonith_history_sync(st, FALSE); crm_debug("Fence-history synced - cancel all timers"); } static gboolean stonith_history_sync_set_trigger(gpointer user_data) { mainloop_set_trigger(stonith_history_sync_trigger); return FALSE; } void te_trigger_stonith_history_sync(bool long_timeout) { /* trigger a sync in 5s to give more nodes the * chance to show up so that we don't create * unnecessary stonith-history-sync traffic * * the long timeout of 30s is there as a fallback * so that after a successful connection to fenced * we will wait for 30s for the DC to trigger a * history-sync * if this doesn't happen we trigger a sync locally * (e.g. fenced segfaults and is restarted by pacemakerd) */ /* as we are finally checking the stonith-connection * in do_stonith_history_sync we should be fine * leaving stonith_history_sync_time & stonith_history_sync_trigger * around */ if (stonith_history_sync_trigger == NULL) { stonith_history_sync_trigger = mainloop_add_trigger(G_PRIORITY_LOW, do_stonith_history_sync, NULL); } if (long_timeout) { if(stonith_history_sync_timer_long == NULL) { stonith_history_sync_timer_long = mainloop_timer_add("history_sync_long", 30000, FALSE, stonith_history_sync_set_trigger, NULL); } crm_info("Fence history will be synchronized cluster-wide within 30 seconds"); mainloop_timer_start(stonith_history_sync_timer_long); } else { if(stonith_history_sync_timer_short == NULL) { stonith_history_sync_timer_short = mainloop_timer_add("history_sync_short", 5000, FALSE, stonith_history_sync_set_trigger, NULL); } crm_info("Fence history will be synchronized cluster-wide within 5 seconds"); mainloop_timer_start(stonith_history_sync_timer_short); } } /* end stonith history synchronization functions */ diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c index 2204b6ef34..a492703983 100644 --- a/daemons/controld/controld_te_actions.c +++ b/daemons/controld/controld_te_actions.c @@ -1,643 +1,643 @@ /* * Copyright 2004-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include // lrmd_event_data_t, lrmd_free_event() #include #include #include #include #include char *te_uuid = NULL; GHashTable *te_targets = NULL; void send_rsc_command(crm_action_t * action); static void te_update_job_count(crm_action_t * action, int offset); static void te_start_action_timer(crm_graph_t * graph, crm_action_t * action) { action->timer = calloc(1, sizeof(crm_action_timer_t)); action->timer->timeout = action->timeout; action->timer->action = action; action->timer->source_id = g_timeout_add(action->timer->timeout + graph->network_delay, action_timer_callback, (void *)action->timer); CRM_ASSERT(action->timer->source_id != 0); } static gboolean te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo) { const char *task = crm_element_value(pseudo->xml, XML_LRM_ATTR_TASK); /* send to peers as well? */ if (pcmk__str_eq(task, CRM_OP_MAINTENANCE_NODES, pcmk__str_casei)) { GHashTableIter iter; crm_node_t *node = NULL; g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { xmlNode *cmd = NULL; if (pcmk__str_eq(fsa_our_uname, node->uname, pcmk__str_casei)) { continue; } cmd = create_request(task, pseudo->xml, node->uname, CRM_SYSTEM_CRMD, CRM_SYSTEM_TENGINE, NULL); send_cluster_message(node, crm_msg_crmd, cmd, FALSE); free_xml(cmd); } remote_ra_process_maintenance_nodes(pseudo->xml); } else { /* Check action for Pacemaker Remote node side effects */ remote_ra_process_pseudo(pseudo->xml); } crm_debug("Pseudo-action %d (%s) fired and confirmed", pseudo->id, crm_element_value(pseudo->xml, XML_LRM_ATTR_TASK_KEY)); te_action_confirmed(pseudo, graph); return TRUE; } static int get_target_rc(crm_action_t * action) { int exit_status; pcmk__scan_min_int(crm_meta_value(action->params, XML_ATTR_TE_TARGET_RC), &exit_status, 0); return exit_status; } static gboolean te_crm_command(crm_graph_t * graph, crm_action_t * action) { char *counter = NULL; xmlNode *cmd = NULL; gboolean is_local = FALSE; const char *id = NULL; const char *task = NULL; const char *value = NULL; const char *on_node = NULL; const char *router_node = NULL; gboolean rc = TRUE; gboolean no_wait = FALSE; id = ID(action->xml); task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); router_node = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); if (!router_node) { router_node = on_node; if (pcmk__str_eq(task, CRM_OP_LRM_DELETE, pcmk__str_casei)) { const char *mode = crm_element_value(action->xml, PCMK__XA_MODE); if (pcmk__str_eq(mode, XML_TAG_CIB, pcmk__str_casei)) { router_node = fsa_our_uname; } } } CRM_CHECK(on_node != NULL && strlen(on_node) != 0, crm_err("Corrupted command (id=%s) %s: no node", crm_str(id), crm_str(task)); return FALSE); if (pcmk__str_eq(router_node, fsa_our_uname, pcmk__str_casei)) { is_local = TRUE; } value = crm_meta_value(action->params, XML_ATTR_TE_NOWAIT); if (crm_is_true(value)) { no_wait = TRUE; } crm_info("Executing crm-event (%s)%s%s: %s on %s", crm_str(id), (is_local? " locally" : ""), (no_wait? " without waiting" : ""), crm_str(task), on_node); if (is_local && pcmk__str_eq(task, CRM_OP_SHUTDOWN, pcmk__str_casei)) { /* defer until everything else completes */ crm_info("crm-event (%s) is a local shutdown", crm_str(id)); graph->completion_action = tg_shutdown; graph->abort_reason = "local shutdown"; te_action_confirmed(action, graph); return TRUE; } else if (pcmk__str_eq(task, CRM_OP_SHUTDOWN, pcmk__str_casei)) { crm_node_t *peer = crm_get_peer(0, router_node); pcmk__update_peer_expected(__func__, peer, CRMD_JOINSTATE_DOWN); } cmd = create_request(task, action->xml, router_node, CRM_SYSTEM_CRMD, CRM_SYSTEM_TENGINE, NULL); counter = pcmk__transition_key(transition_graph->id, action->id, get_target_rc(action), te_uuid); crm_xml_add(cmd, XML_ATTR_TRANSITION_KEY, counter); rc = send_cluster_message(crm_get_peer(0, router_node), crm_msg_crmd, cmd, TRUE); free(counter); free_xml(cmd); if (rc == FALSE) { crm_err("Action %d failed: send", action->id); return FALSE; } else if (no_wait) { te_action_confirmed(action, graph); } else { if (action->timeout <= 0) { crm_err("Action %d: %s on %s had an invalid timeout (%dms). Using %ums instead", action->id, task, on_node, action->timeout, graph->network_delay); action->timeout = (int) graph->network_delay; } te_start_action_timer(graph, action); } return TRUE; } void controld_record_action_timeout(crm_action_t *action) { lrmd_event_data_t *op = NULL; xmlNode *state = NULL; xmlNode *rsc = NULL; xmlNode *xml_op = NULL; xmlNode *action_rsc = NULL; int rc = pcmk_ok; const char *rsc_id = NULL; const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); const char *task_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); const char *target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); int call_options = cib_quorum_override | cib_scope_local; int target_rc = get_target_rc(action); crm_warn("%s %d: %s on %s timed out", crm_element_name(action->xml), action->id, task_uuid, target); action_rsc = find_xml_node(action->xml, XML_CIB_TAG_RESOURCE, TRUE); if (action_rsc == NULL) { return; } rsc_id = ID(action_rsc); CRM_CHECK(rsc_id != NULL, crm_log_xml_err(action->xml, "Bad:action"); return); /* update the CIB */ state = create_xml_node(NULL, XML_CIB_TAG_STATE); crm_xml_add(state, XML_ATTR_UUID, target_uuid); crm_xml_add(state, XML_ATTR_UNAME, target); rsc = create_xml_node(state, XML_CIB_TAG_LRM); crm_xml_add(rsc, XML_ATTR_ID, target_uuid); rsc = create_xml_node(rsc, XML_LRM_TAG_RESOURCES); rsc = create_xml_node(rsc, XML_LRM_TAG_RESOURCE); crm_xml_add(rsc, XML_ATTR_ID, rsc_id); crm_copy_xml_element(action_rsc, rsc, XML_ATTR_TYPE); crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_CLASS); crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_PROVIDER); /* If the executor gets a timeout while waiting for the action to complete, * that will be reported via the usual callback. This timeout means that we * didn't hear from the executor or the controller that relayed the action * to the executor. * * @TODO Using PCMK_OCF_UNKNOWN_ERROR instead of PCMK_OCF_TIMEOUT is one way * to distinguish those situations, but perhaps PCMK_OCF_TIMEOUT would be * preferable anyway. */ op = convert_graph_action(NULL, action, PCMK_LRM_OP_TIMEOUT, PCMK_OCF_UNKNOWN_ERROR); op->call_id = -1; op->user_data = pcmk__transition_key(transition_graph->id, action->id, target_rc, te_uuid); xml_op = pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target, __func__, LOG_INFO); lrmd_free_event(op); crm_log_xml_trace(xml_op, "Action timeout"); rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, state, call_options); fsa_register_cib_callback(rc, FALSE, NULL, cib_action_updated); free_xml(state); crm_trace("Sent CIB update (call ID %d) for timeout of action %d (%s on %s)", rc, action->id, task_uuid, target); action->sent_update = TRUE; } static gboolean te_rsc_command(crm_graph_t * graph, crm_action_t * action) { /* never overwrite stop actions in the CIB with * anything other than completed results * * Writing pending stops makes it look like the * resource is running again */ xmlNode *cmd = NULL; xmlNode *rsc_op = NULL; gboolean rc = TRUE; gboolean no_wait = FALSE; gboolean is_local = FALSE; char *counter = NULL; const char *task = NULL; const char *value = NULL; const char *on_node = NULL; const char *router_node = NULL; const char *task_uuid = NULL; CRM_ASSERT(action != NULL); CRM_ASSERT(action->xml != NULL); action->executed = FALSE; on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); CRM_CHECK(on_node != NULL && strlen(on_node) != 0, crm_err("Corrupted command(id=%s) %s: no node", ID(action->xml), crm_str(task)); return FALSE); rsc_op = action->xml; task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); task_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); router_node = crm_element_value(rsc_op, XML_LRM_ATTR_ROUTER_NODE); if (!router_node) { router_node = on_node; } counter = pcmk__transition_key(transition_graph->id, action->id, get_target_rc(action), te_uuid); crm_xml_add(rsc_op, XML_ATTR_TRANSITION_KEY, counter); if (pcmk__str_eq(router_node, fsa_our_uname, pcmk__str_casei)) { is_local = TRUE; } value = crm_meta_value(action->params, XML_ATTR_TE_NOWAIT); if (crm_is_true(value)) { no_wait = TRUE; } crm_notice("Initiating %s operation %s%s on %s%s "CRM_XS" action %d", task, task_uuid, (is_local? " locally" : ""), on_node, (no_wait? " without waiting" : ""), action->id); cmd = create_request(CRM_OP_INVOKE_LRM, rsc_op, router_node, CRM_SYSTEM_LRMD, CRM_SYSTEM_TENGINE, NULL); if (is_local) { /* shortcut local resource commands */ ha_msg_input_t data = { .msg = cmd, .xml = rsc_op, }; fsa_data_t msg = { .id = 0, .data = &data, .data_type = fsa_dt_ha_msg, .fsa_input = I_NULL, .fsa_cause = C_FSA_INTERNAL, .actions = A_LRM_INVOKE, .origin = __func__, }; do_lrm_invoke(A_LRM_INVOKE, C_FSA_INTERNAL, fsa_state, I_NULL, &msg); } else { rc = send_cluster_message(crm_get_peer(0, router_node), crm_msg_lrmd, cmd, TRUE); } free(counter); free_xml(cmd); action->executed = TRUE; if (rc == FALSE) { crm_err("Action %d failed: send", action->id); return FALSE; } else if (no_wait) { crm_info("Action %d confirmed - no wait", action->id); action->confirmed = TRUE; /* Just mark confirmed. * Don't bump the job count only to immediately decrement it */ - update_graph(transition_graph, action); + pcmk__update_graph(transition_graph, action); trigger_graph(); } else if (action->confirmed == TRUE) { crm_debug("Action %d: %s %s on %s(timeout %dms) was already confirmed.", action->id, task, task_uuid, on_node, action->timeout); } else { if (action->timeout <= 0) { crm_err("Action %d: %s %s on %s had an invalid timeout (%dms). Using %ums instead", action->id, task, task_uuid, on_node, action->timeout, graph->network_delay); action->timeout = (int) graph->network_delay; } te_update_job_count(action, 1); te_start_action_timer(graph, action); } return TRUE; } struct te_peer_s { char *name; int jobs; int migrate_jobs; }; static void te_peer_free(gpointer p) { struct te_peer_s *peer = p; free(peer->name); free(peer); } void te_reset_job_counts(void) { GHashTableIter iter; struct te_peer_s *peer = NULL; if(te_targets == NULL) { te_targets = pcmk__strkey_table(NULL, te_peer_free); } g_hash_table_iter_init(&iter, te_targets); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & peer)) { peer->jobs = 0; peer->migrate_jobs = 0; } } static void te_update_job_count_on(const char *target, int offset, bool migrate) { struct te_peer_s *r = NULL; if(target == NULL || te_targets == NULL) { return; } r = g_hash_table_lookup(te_targets, target); if(r == NULL) { r = calloc(1, sizeof(struct te_peer_s)); r->name = strdup(target); g_hash_table_insert(te_targets, r->name, r); } r->jobs += offset; if(migrate) { r->migrate_jobs += offset; } crm_trace("jobs[%s] = %d", target, r->jobs); } static void te_update_job_count(crm_action_t * action, int offset) { const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); if (action->type != action_type_rsc || target == NULL) { /* No limit on these */ return; } /* if we have a router node, this means the action is performing * on a remote node. For now, we count all actions occurring on a * remote node against the job list on the cluster node hosting * the connection resources */ target = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); if ((target == NULL) && pcmk__strcase_any_of(task, CRMD_ACTION_MIGRATE, CRMD_ACTION_MIGRATED, NULL)) { const char *t1 = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_SOURCE); const char *t2 = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_TARGET); te_update_job_count_on(t1, offset, TRUE); te_update_job_count_on(t2, offset, TRUE); return; } else if (target == NULL) { target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); } te_update_job_count_on(target, offset, FALSE); } static gboolean te_should_perform_action_on(crm_graph_t * graph, crm_action_t * action, const char *target) { int limit = 0; struct te_peer_s *r = NULL; const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); const char *id = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); if(target == NULL) { /* No limit on these */ return TRUE; } else if(te_targets == NULL) { return FALSE; } r = g_hash_table_lookup(te_targets, target); limit = throttle_get_job_limit(target); if(r == NULL) { r = calloc(1, sizeof(struct te_peer_s)); r->name = strdup(target); g_hash_table_insert(te_targets, r->name, r); } if(limit <= r->jobs) { crm_trace("Peer %s is over their job limit of %d (%d): deferring %s", target, limit, r->jobs, id); return FALSE; } else if(graph->migration_limit > 0 && r->migrate_jobs >= graph->migration_limit) { if (pcmk__strcase_any_of(task, CRMD_ACTION_MIGRATE, CRMD_ACTION_MIGRATED, NULL)) { crm_trace("Peer %s is over their migration job limit of %d (%d): deferring %s", target, graph->migration_limit, r->migrate_jobs, id); return FALSE; } } crm_trace("Peer %s has not hit their limit yet. current jobs = %d limit= %d limit", target, r->jobs, limit); return TRUE; } static gboolean te_should_perform_action(crm_graph_t * graph, crm_action_t * action) { const char *target = NULL; const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (action->type != action_type_rsc) { /* No limit on these */ return TRUE; } /* if we have a router node, this means the action is performing * on a remote node. For now, we count all actions occurring on a * remote node against the job list on the cluster node hosting * the connection resources */ target = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); if ((target == NULL) && pcmk__strcase_any_of(task, CRMD_ACTION_MIGRATE, CRMD_ACTION_MIGRATED, NULL)) { target = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_SOURCE); if(te_should_perform_action_on(graph, action, target) == FALSE) { return FALSE; } target = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_TARGET); } else if (target == NULL) { target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); } return te_should_perform_action_on(graph, action, target); } /*! * \brief Confirm a graph action (and optionally update graph) * * \param[in] action Action to confirm * \param[in] graph Update and trigger this graph (if non-NULL) */ void te_action_confirmed(crm_action_t *action, crm_graph_t *graph) { if (action->confirmed == FALSE) { if ((action->type == action_type_rsc) && (crm_element_value(action->xml, XML_LRM_ATTR_TARGET) != NULL)) { te_update_job_count(action, -1); } action->confirmed = TRUE; } if (graph) { - update_graph(graph, action); + pcmk__update_graph(graph, action); trigger_graph(); } } crm_graph_functions_t te_graph_fns = { te_pseudo_action, te_rsc_command, te_crm_command, te_fence_node, te_should_perform_action, }; void notify_crmd(crm_graph_t * graph) { const char *type = "unknown"; enum crmd_fsa_input event = I_NULL; crm_debug("Processing transition completion in state %s", fsa_state2string(fsa_state)); if (graph->complete == FALSE) { CRM_CHECK(graph->complete,); graph->complete = TRUE; } switch (graph->completion_action) { case tg_stop: type = "stop"; if (fsa_state == S_TRANSITION_ENGINE) { event = I_TE_SUCCESS; } break; case tg_done: type = "done"; if (fsa_state == S_TRANSITION_ENGINE) { event = I_TE_SUCCESS; } break; case tg_restart: type = "restart"; if (fsa_state == S_TRANSITION_ENGINE) { if (transition_timer->period_ms > 0) { controld_stop_timer(transition_timer); controld_start_timer(transition_timer); } else { event = I_PE_CALC; } } else if (fsa_state == S_POLICY_ENGINE) { controld_set_fsa_action_flags(A_PE_INVOKE); trigger_fsa(); } break; case tg_shutdown: type = "shutdown"; if (pcmk_is_set(fsa_input_register, R_SHUTDOWN)) { event = I_STOP; } else { crm_err("We didn't ask to be shut down, yet the scheduler is telling us to"); event = I_TERMINATE; } } crm_debug("Transition %d status: %s - %s", graph->id, type, crm_str(graph->abort_reason)); graph->abort_reason = NULL; graph->completion_action = tg_done; controld_clear_fsa_input_flags(R_IN_TRANSITION); if (event != I_NULL) { register_fsa_input(C_FSA_INTERNAL, event, NULL); } else if (fsa_source) { mainloop_set_trigger(fsa_source); } } diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c index 03d84e50e6..186844992f 100644 --- a/daemons/controld/controld_te_events.c +++ b/daemons/controld/controld_te_events.c @@ -1,507 +1,507 @@ /* * Copyright 2004-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include char *failed_stop_offset = NULL; char *failed_start_offset = NULL; gboolean fail_incompletable_actions(crm_graph_t * graph, const char *down_node) { const char *target_uuid = NULL; const char *router = NULL; const char *router_uuid = NULL; xmlNode *last_action = NULL; GList *gIter = NULL; GList *gIter2 = NULL; if (graph == NULL || graph->complete) { return FALSE; } gIter = graph->synapses; for (; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t *) gIter->data; if (synapse->confirmed || synapse->failed) { /* We've already been here */ continue; } gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t *) gIter2->data; if (action->type == action_type_pseudo || action->confirmed) { continue; } else if (action->type == action_type_crm) { const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) { continue; } } target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); router = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); if (router) { crm_node_t *node = crm_get_peer(0, router); if (node) { router_uuid = node->uuid; } } if (pcmk__str_eq(target_uuid, down_node, pcmk__str_casei) || pcmk__str_eq(router_uuid, down_node, pcmk__str_casei)) { action->failed = TRUE; synapse->failed = TRUE; last_action = action->xml; stop_te_timer(action->timer); - update_graph(graph, action); + pcmk__update_graph(graph, action); if (synapse->executed) { crm_notice("Action %d (%s) was pending on %s (offline)", action->id, crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY), down_node); } else { crm_info("Action %d (%s) is scheduled for %s (offline)", action->id, crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY), down_node); } } } } if (last_action != NULL) { crm_info("Node %s shutdown resulted in un-runnable actions", down_node); abort_transition(INFINITY, tg_restart, "Node failure", last_action); return TRUE; } return FALSE; } /*! * \internal * \brief Update failure-related node attributes if warranted * * \param[in] event XML describing operation that (maybe) failed * \param[in] event_node_uuid Node that event occurred on * \param[in] rc Actual operation return code * \param[in] target_rc Expected operation return code * \param[in] do_update If TRUE, do update regardless of operation type * \param[in] ignore_failures If TRUE, update last failure but not fail count * * \return TRUE if this was not a direct nack, success or lrm status refresh */ static gboolean update_failcount(xmlNode * event, const char *event_node_uuid, int rc, int target_rc, gboolean do_update, gboolean ignore_failures) { guint interval_ms = 0; char *task = NULL; char *rsc_id = NULL; const char *value = NULL; const char *id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY); const char *on_uname = crm_peer_uname(event_node_uuid); const char *origin = crm_element_value(event, XML_ATTR_ORIGIN); // Nothing needs to be done for success or status refresh if (rc == target_rc) { return FALSE; } else if (pcmk__str_eq(origin, "build_active_RAs", pcmk__str_casei)) { crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh", id, rc, on_uname); return FALSE; } /* Sanity check */ CRM_CHECK(on_uname != NULL, return TRUE); CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval_ms), crm_err("Couldn't parse: %s", ID(event)); goto bail); /* Decide whether update is necessary and what value to use */ if ((interval_ms > 0) || pcmk__str_eq(task, CRMD_ACTION_PROMOTE, pcmk__str_casei) || pcmk__str_eq(task, CRMD_ACTION_DEMOTE, pcmk__str_casei)) { do_update = TRUE; } else if (pcmk__str_eq(task, CRMD_ACTION_START, pcmk__str_casei)) { do_update = TRUE; if (failed_start_offset == NULL) { failed_start_offset = strdup(CRM_INFINITY_S); } value = failed_start_offset; } else if (pcmk__str_eq(task, CRMD_ACTION_STOP, pcmk__str_casei)) { do_update = TRUE; if (failed_stop_offset == NULL) { failed_stop_offset = strdup(CRM_INFINITY_S); } value = failed_stop_offset; } /* Fail count will be either incremented or set to infinity */ if (!pcmk_str_is_infinity(value)) { value = XML_NVPAIR_ATTR_VALUE "++"; } if (do_update) { char *now = pcmk__ttoa(time(NULL)); char *attr_name = NULL; gboolean is_remote_node = FALSE; if (g_hash_table_lookup(crm_remote_peer_cache, event_node_uuid)) { is_remote_node = TRUE; } crm_info("Updating %s for %s on %s after failed %s: rc=%d (update=%s, time=%s)", (ignore_failures? "last failure" : "failcount"), rsc_id, on_uname, task, rc, value, now); /* Update the fail count, if we're not ignoring failures */ if (!ignore_failures) { attr_name = pcmk__failcount_name(rsc_id, task, interval_ms); update_attrd(on_uname, attr_name, value, NULL, is_remote_node); free(attr_name); } /* Update the last failure time (even if we're ignoring failures, * so that failure can still be detected and shown, e.g. by crm_mon) */ attr_name = pcmk__lastfailure_name(rsc_id, task, interval_ms); update_attrd(on_uname, attr_name, now, NULL, is_remote_node); free(attr_name); free(now); } bail: free(rsc_id); free(task); return TRUE; } crm_action_t * controld_get_action(int id) { for (GList *item = transition_graph->synapses; item; item = item->next) { synapse_t *synapse = (synapse_t *) item->data; for (GList *item2 = synapse->actions; item2; item2 = item2->next) { crm_action_t *action = (crm_action_t *) item2->data; if (action->id == id) { return action; } } } return NULL; } crm_action_t * get_cancel_action(const char *id, const char *node) { GList *gIter = NULL; GList *gIter2 = NULL; gIter = transition_graph->synapses; for (; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t *) gIter->data; gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { const char *task = NULL; const char *target = NULL; crm_action_t *action = (crm_action_t *) gIter2->data; task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (!pcmk__str_eq(CRMD_ACTION_CANCEL, task, pcmk__str_casei)) { continue; } task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); if (!pcmk__str_eq(task, id, pcmk__str_casei)) { crm_trace("Wrong key %s for %s on %s", task, id, node); continue; } target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); if (node && !pcmk__str_eq(target, node, pcmk__str_casei)) { crm_trace("Wrong node %s for %s on %s", target, id, node); continue; } crm_trace("Found %s on %s", id, node); return action; } } return NULL; } bool confirm_cancel_action(const char *id, const char *node_id) { const char *op_key = NULL; const char *node_name = NULL; crm_action_t *cancel = get_cancel_action(id, node_id); if (cancel == NULL) { return FALSE; } op_key = crm_element_value(cancel->xml, XML_LRM_ATTR_TASK_KEY); node_name = crm_element_value(cancel->xml, XML_LRM_ATTR_TARGET); stop_te_timer(cancel->timer); te_action_confirmed(cancel, transition_graph); crm_info("Cancellation of %s on %s confirmed (action %d)", op_key, node_name, cancel->id); return TRUE; } /* downed nodes are listed like: ... */ #define XPATH_DOWNED "//" XML_GRAPH_TAG_DOWNED \ "/" XML_CIB_TAG_NODE "[@" XML_ATTR_UUID "='%s']" /*! * \brief Find a transition event that would have made a specified node down * * \param[in] target UUID of node to match * * \return Matching event if found, NULL otherwise */ crm_action_t * match_down_event(const char *target) { crm_action_t *match = NULL; xmlXPathObjectPtr xpath_ret = NULL; GList *gIter, *gIter2; char *xpath = crm_strdup_printf(XPATH_DOWNED, target); for (gIter = transition_graph->synapses; gIter != NULL && match == NULL; gIter = gIter->next) { for (gIter2 = ((synapse_t*)gIter->data)->actions; gIter2 != NULL && match == NULL; gIter2 = gIter2->next) { match = (crm_action_t*)gIter2->data; if (match->executed) { xpath_ret = xpath_search(match->xml, xpath); if (numXpathResults(xpath_ret) < 1) { match = NULL; } freeXpathObject(xpath_ret); } else { // Only actions that were actually started can match match = NULL; } } } free(xpath); if (match != NULL) { crm_debug("Shutdown action %d (%s) found for node %s", match->id, crm_element_value(match->xml, XML_LRM_ATTR_TASK_KEY), target); } else { crm_debug("No reason to expect node %s to be down", target); } return match; } void process_graph_event(xmlNode *event, const char *event_node) { int rc = -1; // Actual result int target_rc = -1; // Expected result int status = -1; // Executor status int callid = -1; // Executor call ID int transition_num = -1; // Transition number int action_num = -1; // Action number within transition char *update_te_uuid = NULL; bool ignore_failures = FALSE; const char *id = NULL; const char *desc = NULL; const char *magic = NULL; const char *uname = NULL; CRM_ASSERT(event != NULL); /* */ magic = crm_element_value(event, XML_ATTR_TRANSITION_KEY); if (magic == NULL) { /* non-change */ return; } crm_element_value_int(event, XML_LRM_ATTR_OPSTATUS, &status); if (status == PCMK_LRM_OP_PENDING) { return; } id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY); crm_element_value_int(event, XML_LRM_ATTR_RC, &rc); crm_element_value_int(event, XML_LRM_ATTR_CALLID, &callid); rc = pcmk__effective_rc(rc); if (decode_transition_key(magic, &update_te_uuid, &transition_num, &action_num, &target_rc) == FALSE) { // decode_transition_key() already logged the bad key crm_err("Can't process action %s result: Incompatible versions? " CRM_XS " call-id=%d", id, callid); abort_transition(INFINITY, tg_restart, "Bad event", event); return; } if (transition_num == -1) { // E.g. crm_resource --fail desc = "initiated outside of the cluster"; abort_transition(INFINITY, tg_restart, "Unexpected event", event); } else if ((action_num < 0) || !pcmk__str_eq(update_te_uuid, te_uuid, pcmk__str_none)) { desc = "initiated by a different DC"; abort_transition(INFINITY, tg_restart, "Foreign event", event); } else if ((transition_graph->id != transition_num) || (transition_graph->complete)) { // Action is not from currently active transition guint interval_ms = 0; if (parse_op_key(id, NULL, NULL, &interval_ms) && (interval_ms != 0)) { /* Recurring actions have the transition number they were first * scheduled in. */ if (status == PCMK_LRM_OP_CANCELLED) { confirm_cancel_action(id, get_node_id(event)); goto bail; } desc = "arrived after initial scheduling"; abort_transition(INFINITY, tg_restart, "Change in recurring result", event); } else if (transition_graph->id != transition_num) { desc = "arrived really late"; abort_transition(INFINITY, tg_restart, "Old event", event); } else { desc = "arrived late"; abort_transition(INFINITY, tg_restart, "Inactive graph", event); } } else { // Event is result of an action from currently active transition crm_action_t *action = controld_get_action(action_num); if (action == NULL) { // Should never happen desc = "unknown"; abort_transition(INFINITY, tg_restart, "Unknown event", event); } else if (action->confirmed == TRUE) { /* Nothing further needs to be done if the action has already been * confirmed. This can happen e.g. when processing both an * "xxx_last_0" or "xxx_last_failure_0" record as well as the main * history record, which would otherwise result in incorrectly * bumping the fail count twice. */ crm_log_xml_debug(event, "Event already confirmed:"); goto bail; } else { /* An action result needs to be confirmed. * (This is the only case where desc == NULL.) */ if (pcmk__str_eq(crm_meta_value(action->params, XML_OP_ATTR_ON_FAIL), "ignore", pcmk__str_casei)) { ignore_failures = TRUE; } else if (rc != target_rc) { action->failed = TRUE; } stop_te_timer(action->timer); te_action_confirmed(action, transition_graph); if (action->failed) { abort_transition(action->synapse->priority + 1, tg_restart, "Event failed", event); } } } if (id == NULL) { id = "unknown action"; } uname = crm_element_value(event, XML_LRM_ATTR_TARGET); if (uname == NULL) { uname = "unknown node"; } if (status == PCMK_LRM_OP_INVALID) { // We couldn't attempt the action crm_info("Transition %d action %d (%s on %s): %s", transition_num, action_num, id, uname, services_lrm_status_str(status)); } else if (desc && update_failcount(event, event_node, rc, target_rc, (transition_num == -1), FALSE)) { crm_notice("Transition %d action %d (%s on %s): expected '%s' but got '%s' " CRM_XS " target-rc=%d rc=%d call-id=%d event='%s'", transition_num, action_num, id, uname, services_ocf_exitcode_str(target_rc), services_ocf_exitcode_str(rc), target_rc, rc, callid, desc); } else if (desc) { crm_info("Transition %d action %d (%s on %s): %s " CRM_XS " rc=%d target-rc=%d call-id=%d", transition_num, action_num, id, uname, desc, rc, target_rc, callid); } else if (rc == target_rc) { crm_info("Transition %d action %d (%s on %s) confirmed: %s " CRM_XS " rc=%d call-id=%d", transition_num, action_num, id, uname, services_ocf_exitcode_str(rc), rc, callid); } else { update_failcount(event, event_node, rc, target_rc, (transition_num == -1), ignore_failures); crm_notice("Transition %d action %d (%s on %s): expected '%s' but got '%s' " CRM_XS " target-rc=%d rc=%d call-id=%d", transition_num, action_num, id, uname, services_ocf_exitcode_str(target_rc), services_ocf_exitcode_str(rc), target_rc, rc, callid); } bail: free(update_te_uuid); } diff --git a/include/pcmki/pcmki_transition.h b/include/pcmki/pcmki_transition.h index a3ec811f13..bfa2f25e28 100644 --- a/include/pcmki/pcmki_transition.h +++ b/include/pcmki/pcmki_transition.h @@ -1,141 +1,141 @@ /* * Copyright 2004-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef CRM_TRANSITION__H # define CRM_TRANSITION__H #ifdef __cplusplus extern "C" { #endif #include #include #include #include typedef enum { action_type_pseudo, action_type_rsc, action_type_crm } action_type_e; typedef struct te_timer_s crm_action_timer_t; typedef struct crm_graph_s crm_graph_t; typedef struct synapse_s { int id; int priority; gboolean ready; gboolean failed; gboolean executed; gboolean confirmed; GList *actions; /* crm_action_t* */ GList *inputs; /* crm_action_t* */ } synapse_t; typedef struct crm_action_s { int id; int timeout; guint interval_ms; GHashTable *params; action_type_e type; crm_action_timer_t *timer; synapse_t *synapse; gboolean sent_update; /* sent to the CIB */ gboolean executed; /* sent to the CRM */ gboolean confirmed; gboolean failed; gboolean can_fail; //! \deprecated Will be removed in a future release xmlNode *xml; } crm_action_t; struct te_timer_s { int source_id; int timeout; crm_action_t *action; }; /* order matters here */ enum transition_action { tg_done, tg_stop, tg_restart, tg_shutdown, }; struct crm_graph_s { int id; char *source; int abort_priority; gboolean complete; const char *abort_reason; enum transition_action completion_action; int num_actions; int num_synapses; int batch_limit; guint network_delay; guint stonith_timeout; int fired; int pending; int skipped; int completed; int incomplete; GList *synapses; /* synapse_t* */ int migration_limit; }; typedef struct crm_graph_functions_s { gboolean(*pseudo) (crm_graph_t * graph, crm_action_t * action); gboolean(*rsc) (crm_graph_t * graph, crm_action_t * action); gboolean(*crmd) (crm_graph_t * graph, crm_action_t * action); gboolean(*stonith) (crm_graph_t * graph, crm_action_t * action); gboolean(*allowed) (crm_graph_t * graph, crm_action_t * action); } crm_graph_functions_t; enum transition_status { transition_active, transition_pending, /* active but no actions performed this time */ transition_complete, transition_stopped, transition_terminated, transition_action_failed, transition_failed, }; void set_graph_functions(crm_graph_functions_t * fns); crm_graph_t *unpack_graph(xmlNode * xml_graph, const char *reference); int run_graph(crm_graph_t * graph); -gboolean update_graph(crm_graph_t * graph, crm_action_t * action); +void pcmk__update_graph(crm_graph_t *graph, crm_action_t *action); void destroy_graph(crm_graph_t * graph); const char *transition_status(enum transition_status state); void print_graph(unsigned int log_level, crm_graph_t * graph); void print_action(int log_level, const char *prefix, crm_action_t * action); bool update_abort_priority(crm_graph_t * graph, int priority, enum transition_action action, const char *abort_reason); lrmd_event_data_t *convert_graph_action(xmlNode * resource, crm_action_t * action, int status, int rc); #ifdef __cplusplus } #endif #endif diff --git a/lib/pacemaker/pcmk_graph_consumer.c b/lib/pacemaker/pcmk_graph_consumer.c index 350901a4ae..e23e023262 100644 --- a/lib/pacemaker/pcmk_graph_consumer.c +++ b/lib/pacemaker/pcmk_graph_consumer.c @@ -1,758 +1,764 @@ /* * Copyright 2004-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include static crm_graph_functions_t *graph_fns = NULL; -static gboolean -update_synapse_ready(synapse_t *synapse, int action_id) -{ - GList *lpc = NULL; - gboolean updates = FALSE; - CRM_CHECK(synapse->executed == FALSE, return FALSE); - CRM_CHECK(synapse->confirmed == FALSE, return FALSE); +/* + * Functions for updating graph + */ - synapse->ready = TRUE; - for (lpc = synapse->inputs; lpc != NULL; lpc = lpc->next) { +/*! + * \internal + * \brief Update synapse after completed prerequisite + * + * A synapse is ready to be executed once all its prerequisite actions (inputs) + * complete. Given a completed action, check whether it is an input for a given + * synapse, and if so, mark the input as confirmed, and mark the synapse as + * ready if appropriate. + * + * \param[in] synapse Transition graph synapse to update + * \param[in] action_id ID of an action that completed + * + * \note The only substantial effect here is confirming synapse inputs. + * should_fire_synapse() will recalculate synapse->ready, so the only + * thing that uses the synapse->ready value from here is + * synapse_state_str(). + */ +static void +update_synapse_ready(synapse_t *synapse, int action_id) +{ + if (synapse->ready) { + return; // All inputs have already been confirmed + } + synapse->ready = TRUE; // Presume ready until proven otherwise + for (GList *lpc = synapse->inputs; lpc != NULL; lpc = lpc->next) { crm_action_t *prereq = (crm_action_t *) lpc->data; - crm_trace("Processing input %d", prereq->id); - if (prereq->id == action_id) { - crm_trace("Marking input %d of synapse %d confirmed", action_id, synapse->id); + crm_trace("Confirming input %d of synapse %d", + action_id, synapse->id); prereq->confirmed = TRUE; - updates = TRUE; - } else if (prereq->confirmed == FALSE) { + } else if (!(prereq->confirmed)) { synapse->ready = FALSE; + crm_trace("Synapse %d still not ready after action %d", + synapse->id, action_id); } - } - - if (updates) { - crm_trace("Updated synapse %d", synapse->id); + if (synapse->ready) { + crm_trace("Synapse %d is now ready to execute", synapse->id); } - return updates; } -static gboolean -update_synapse_confirmed(synapse_t * synapse, int action_id) +/*! + * \internal + * \brief Update action and synapse confirmation after action completion + * + * \param[in] synapse Transition graph synapse that action belongs to + * \param[in] action_id ID of action that completed + */ +static void +update_synapse_confirmed(synapse_t *synapse, int action_id) { - GList *lpc = NULL; - gboolean updates = FALSE; - gboolean is_confirmed = TRUE; - - CRM_CHECK(synapse->executed, return FALSE); - CRM_CHECK(synapse->confirmed == FALSE, return TRUE); + bool all_confirmed = true; - is_confirmed = TRUE; - for (lpc = synapse->actions; lpc != NULL; lpc = lpc->next) { + for (GList *lpc = synapse->actions; lpc != NULL; lpc = lpc->next) { crm_action_t *action = (crm_action_t *) lpc->data; - crm_trace("Processing action %d", action->id); - if (action->id == action_id) { - crm_trace("Confirmed: Action %d of Synapse %d", action_id, synapse->id); + crm_trace("Confirmed action %d of synapse %d", + action_id, synapse->id); action->confirmed = TRUE; - updates = TRUE; - } else if (action->confirmed == FALSE) { - is_confirmed = FALSE; - crm_trace("Synapse %d still not confirmed after action %d", synapse->id, action_id); + } else if (all_confirmed && !(action->confirmed)) { + all_confirmed = false; + crm_trace("Synapse %d still not confirmed after action %d", + synapse->id, action_id); } } - if (is_confirmed && synapse->confirmed == FALSE) { - crm_trace("Confirmed: Synapse %d", synapse->id); + if (all_confirmed && !(synapse->confirmed)) { + crm_trace("Confirmed synapse %d", synapse->id); synapse->confirmed = TRUE; - updates = TRUE; } - - if (updates) { - crm_trace("Updated synapse %d", synapse->id); - } - return updates; } -gboolean -update_graph(crm_graph_t * graph, crm_action_t * action) +/*! + * \internal + * \brief Update the transition graph with a completed action result + * + * \param[in,out] graph Transition graph to update + * \param[in] action Action that completed + */ +void +pcmk__update_graph(crm_graph_t *graph, crm_action_t *action) { - gboolean rc = FALSE; - gboolean updates = FALSE; - GList *lpc = NULL; - - for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { + for (GList *lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { synapse_t *synapse = (synapse_t *) lpc->data; if (synapse->confirmed || synapse->failed) { - crm_trace("Synapse complete"); + continue; // This synapse already completed } else if (synapse->executed) { - crm_trace("Synapse executed"); - rc = update_synapse_confirmed(synapse, action->id); + update_synapse_confirmed(synapse, action->id); - } else if (action->failed == FALSE || synapse->priority == INFINITY) { - rc = update_synapse_ready(synapse, action->id); + } else if (!(action->failed) || (synapse->priority == INFINITY)) { + update_synapse_ready(synapse, action->id); } - updates = updates || rc; - } - - if (updates) { - crm_trace("Updated graph with completed action %d", action->id); } - return updates; } static gboolean should_fire_synapse(crm_graph_t * graph, synapse_t * synapse) { GList *lpc = NULL; CRM_CHECK(synapse->executed == FALSE, return FALSE); CRM_CHECK(synapse->confirmed == FALSE, return FALSE); crm_trace("Checking pre-reqs for synapse %d", synapse->id); /* lookup prereqs */ synapse->ready = TRUE; for (lpc = synapse->inputs; lpc != NULL; lpc = lpc->next) { crm_action_t *prereq = (crm_action_t *) lpc->data; crm_trace("Processing input %d", prereq->id); if (prereq->confirmed == FALSE) { crm_trace("Input %d for synapse %d not satisfied: not confirmed", prereq->id, synapse->id); synapse->ready = FALSE; break; } else if(prereq->failed && prereq->can_fail == FALSE) { crm_trace("Input %d for synapse %d not satisfied: failed", prereq->id, synapse->id); synapse->ready = FALSE; break; } } for (lpc = synapse->actions; synapse->ready && lpc != NULL; lpc = lpc->next) { crm_action_t *a = (crm_action_t *) lpc->data; if (a->type == action_type_pseudo) { /* None of the below applies to pseudo ops */ } else if (synapse->priority < graph->abort_priority) { crm_trace("Skipping synapse %d: abort level %d", synapse->id, graph->abort_priority); graph->skipped++; return FALSE; } else if(graph_fns->allowed && graph_fns->allowed(graph, a) == FALSE) { crm_trace("Deferring synapse %d: allowed", synapse->id); return FALSE; } } return synapse->ready; } static gboolean initiate_action(crm_graph_t * graph, crm_action_t * action) { const char *id = NULL; CRM_CHECK(action->executed == FALSE, return FALSE); id = ID(action->xml); CRM_CHECK(id != NULL, return FALSE); action->executed = TRUE; if (action->type == action_type_pseudo) { crm_trace("Executing pseudo-event: %s (%d)", id, action->id); return graph_fns->pseudo(graph, action); } else if (action->type == action_type_rsc) { crm_trace("Executing rsc-event: %s (%d)", id, action->id); return graph_fns->rsc(graph, action); } else if (action->type == action_type_crm) { const char *task = NULL; task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); CRM_CHECK(task != NULL, return FALSE); if (pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) { crm_trace("Executing STONITH-event: %s (%d)", id, action->id); return graph_fns->stonith(graph, action); } crm_trace("Executing crm-event: %s (%d)", id, action->id); return graph_fns->crmd(graph, action); } crm_err("Failed on unsupported command type: %s (id=%s)", crm_element_name(action->xml), id); return FALSE; } static gboolean fire_synapse(crm_graph_t * graph, synapse_t * synapse) { GList *lpc = NULL; CRM_CHECK(synapse != NULL, return FALSE); CRM_CHECK(synapse->ready, return FALSE); CRM_CHECK(synapse->confirmed == FALSE, return TRUE); crm_trace("Synapse %d fired", synapse->id); synapse->executed = TRUE; for (lpc = synapse->actions; lpc != NULL; lpc = lpc->next) { crm_action_t *action = (crm_action_t *) lpc->data; /* allow some leeway */ gboolean passed = FALSE; /* Invoke the action and start the timer */ passed = initiate_action(graph, action); if (passed == FALSE) { crm_err("Failed initiating <%s id=%d> in synapse %d", crm_element_name(action->xml), action->id, synapse->id); synapse->confirmed = TRUE; action->confirmed = TRUE; action->failed = TRUE; return FALSE; } } return TRUE; } static gboolean pseudo_action_dummy(crm_graph_t * graph, crm_action_t * action) { static int fail = -1; if (fail < 0) { long long fail_ll; if ((pcmk__scan_ll(getenv("PE_fail"), &fail_ll, 0LL) == pcmk_rc_ok) && (fail_ll > 0LL) && (fail_ll <= INT_MAX)) { fail = (int) fail_ll; } else { fail = 0; } } crm_trace("Dummy event handler: action %d executed", action->id); if (action->id == fail) { crm_err("Dummy event handler: pretending action %d failed", action->id); action->failed = TRUE; graph->abort_priority = INFINITY; } action->confirmed = TRUE; - update_graph(graph, action); + pcmk__update_graph(graph, action); return TRUE; } static crm_graph_functions_t default_fns = { pseudo_action_dummy, pseudo_action_dummy, pseudo_action_dummy, pseudo_action_dummy }; int run_graph(crm_graph_t * graph) { GList *lpc = NULL; int stat_log_level = LOG_DEBUG; int pass_result = transition_active; const char *status = "In-progress"; if (graph_fns == NULL) { graph_fns = &default_fns; } if (graph == NULL) { return transition_complete; } graph->fired = 0; graph->pending = 0; graph->skipped = 0; graph->completed = 0; graph->incomplete = 0; crm_trace("Entering graph %d callback", graph->id); /* Pre-calculate the number of completed and in-flight operations */ for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { synapse_t *synapse = (synapse_t *) lpc->data; if (synapse->confirmed) { crm_trace("Synapse %d complete", synapse->id); graph->completed++; } else if (synapse->failed == FALSE && synapse->executed) { crm_trace("Synapse %d: confirmation pending", synapse->id); graph->pending++; } } /* Now check if there is work to do */ for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { synapse_t *synapse = (synapse_t *) lpc->data; if (graph->batch_limit > 0 && graph->pending >= graph->batch_limit) { crm_debug("Throttling output: batch limit (%d) reached", graph->batch_limit); break; } else if (synapse->failed) { graph->skipped++; continue; } else if (synapse->confirmed || synapse->executed) { /* Already handled */ continue; } if (should_fire_synapse(graph, synapse)) { crm_trace("Synapse %d fired", synapse->id); graph->fired++; if(fire_synapse(graph, synapse) == FALSE) { crm_err("Synapse %d failed to fire", synapse->id); stat_log_level = LOG_ERR; graph->abort_priority = INFINITY; graph->incomplete++; graph->fired--; } if (synapse->confirmed == FALSE) { graph->pending++; } } else { crm_trace("Synapse %d cannot fire", synapse->id); graph->incomplete++; } } if (graph->pending == 0 && graph->fired == 0) { graph->complete = TRUE; stat_log_level = LOG_NOTICE; pass_result = transition_complete; status = "Complete"; if (graph->incomplete != 0 && graph->abort_priority <= 0) { stat_log_level = LOG_WARNING; pass_result = transition_terminated; status = "Terminated"; } else if (graph->skipped != 0) { status = "Stopped"; } } else if (graph->fired == 0) { pass_result = transition_pending; } do_crm_log(stat_log_level, "Transition %d (Complete=%d, Pending=%d," " Fired=%d, Skipped=%d, Incomplete=%d, Source=%s): %s", graph->id, graph->completed, graph->pending, graph->fired, graph->skipped, graph->incomplete, graph->source, status); return pass_result; } static crm_action_t * unpack_action(synapse_t * parent, xmlNode * xml_action) { crm_action_t *action = NULL; const char *value = crm_element_value(xml_action, XML_ATTR_ID); if (value == NULL) { crm_err("Actions must have an id!"); crm_log_xml_trace(xml_action, "Action with missing id"); return NULL; } action = calloc(1, sizeof(crm_action_t)); if (action == NULL) { crm_perror(LOG_CRIT, "Cannot unpack action"); crm_log_xml_trace(xml_action, "Lost action"); return NULL; } pcmk__scan_min_int(value, &(action->id), -1); action->type = action_type_rsc; action->xml = copy_xml(xml_action); action->synapse = parent; if (pcmk__str_eq(crm_element_name(action->xml), XML_GRAPH_TAG_RSC_OP, pcmk__str_casei)) { action->type = action_type_rsc; } else if (pcmk__str_eq(crm_element_name(action->xml), XML_GRAPH_TAG_PSEUDO_EVENT, pcmk__str_casei)) { action->type = action_type_pseudo; } else if (pcmk__str_eq(crm_element_name(action->xml), XML_GRAPH_TAG_CRM_EVENT, pcmk__str_casei)) { action->type = action_type_crm; } action->params = xml2list(action->xml); value = g_hash_table_lookup(action->params, "CRM_meta_timeout"); pcmk__scan_min_int(value, &(action->timeout), 0); /* Take start-delay into account for the timeout of the action timer */ value = g_hash_table_lookup(action->params, "CRM_meta_start_delay"); { int start_delay; pcmk__scan_min_int(value, &start_delay, 0); action->timeout += start_delay; } if (pcmk__guint_from_hash(action->params, CRM_META "_" XML_LRM_ATTR_INTERVAL, 0, &(action->interval_ms)) != pcmk_rc_ok) { action->interval_ms = 0; } value = g_hash_table_lookup(action->params, "CRM_meta_can_fail"); if (value != NULL) { crm_str_to_boolean(value, &(action->can_fail)); #ifndef PCMK__COMPAT_2_0 if (action->can_fail) { crm_warn("Support for the can_fail meta-attribute is deprecated" " and will be removed in a future release"); } #endif } crm_trace("Action %d has timer set to %dms", action->id, action->timeout); return action; } static synapse_t * unpack_synapse(crm_graph_t * new_graph, xmlNode * xml_synapse) { const char *value = NULL; xmlNode *inputs = NULL; xmlNode *action_set = NULL; synapse_t *new_synapse = NULL; CRM_CHECK(xml_synapse != NULL, return NULL); crm_trace("looking in synapse %s", ID(xml_synapse)); new_synapse = calloc(1, sizeof(synapse_t)); pcmk__scan_min_int(ID(xml_synapse), &(new_synapse->id), 0); value = crm_element_value(xml_synapse, XML_CIB_ATTR_PRIORITY); pcmk__scan_min_int(value, &(new_synapse->priority), 0); CRM_CHECK(new_synapse->id >= 0, free(new_synapse); return NULL); new_graph->num_synapses++; crm_trace("look for actions in synapse %s", crm_element_value(xml_synapse, XML_ATTR_ID)); for (action_set = pcmk__xml_first_child(xml_synapse); action_set != NULL; action_set = pcmk__xml_next(action_set)) { if (pcmk__str_eq((const char *)action_set->name, "action_set", pcmk__str_none)) { xmlNode *action = NULL; for (action = pcmk__xml_first_child(action_set); action != NULL; action = pcmk__xml_next(action)) { crm_action_t *new_action = unpack_action(new_synapse, action); if (new_action == NULL) { continue; } new_graph->num_actions++; crm_trace("Adding action %d to synapse %d", new_action->id, new_synapse->id); new_synapse->actions = g_list_append(new_synapse->actions, new_action); } } } crm_trace("look for inputs in synapse %s", ID(xml_synapse)); for (inputs = pcmk__xml_first_child(xml_synapse); inputs != NULL; inputs = pcmk__xml_next(inputs)) { if (pcmk__str_eq((const char *)inputs->name, "inputs", pcmk__str_none)) { xmlNode *trigger = NULL; for (trigger = pcmk__xml_first_child(inputs); trigger != NULL; trigger = pcmk__xml_next(trigger)) { xmlNode *input = NULL; for (input = pcmk__xml_first_child(trigger); input != NULL; input = pcmk__xml_next(input)) { crm_action_t *new_input = unpack_action(new_synapse, input); if (new_input == NULL) { continue; } crm_trace("Adding input %d to synapse %d", new_input->id, new_synapse->id); new_synapse->inputs = g_list_append(new_synapse->inputs, new_input); } } } } return new_synapse; } crm_graph_t * unpack_graph(xmlNode * xml_graph, const char *reference) { /* id = -1; new_graph->abort_priority = 0; new_graph->network_delay = 0; new_graph->stonith_timeout = 0; new_graph->completion_action = tg_done; if (reference) { new_graph->source = strdup(reference); } else { new_graph->source = strdup("unknown"); } if (xml_graph != NULL) { t_id = crm_element_value(xml_graph, "transition_id"); CRM_CHECK(t_id != NULL, free(new_graph); return NULL); pcmk__scan_min_int(t_id, &(new_graph->id), -1); time = crm_element_value(xml_graph, "cluster-delay"); CRM_CHECK(time != NULL, free(new_graph); return NULL); new_graph->network_delay = crm_parse_interval_spec(time); time = crm_element_value(xml_graph, "stonith-timeout"); if (time == NULL) { new_graph->stonith_timeout = new_graph->network_delay; } else { new_graph->stonith_timeout = crm_parse_interval_spec(time); } // Use 0 (dynamic limit) as default/invalid, -1 (no limit) as minimum t_id = crm_element_value(xml_graph, "batch-limit"); if ((t_id == NULL) || (pcmk__scan_min_int(t_id, &(new_graph->batch_limit), -1) != pcmk_rc_ok)) { new_graph->batch_limit = 0; } t_id = crm_element_value(xml_graph, "migration-limit"); pcmk__scan_min_int(t_id, &(new_graph->migration_limit), -1); } for (synapse = pcmk__xml_first_child(xml_graph); synapse != NULL; synapse = pcmk__xml_next(synapse)) { if (pcmk__str_eq((const char *)synapse->name, "synapse", pcmk__str_none)) { synapse_t *new_synapse = unpack_synapse(new_graph, synapse); if (new_synapse != NULL) { new_graph->synapses = g_list_append(new_graph->synapses, new_synapse); } } } crm_debug("Unpacked transition %d: %d actions in %d synapses", new_graph->id, new_graph->num_actions, new_graph->num_synapses); return new_graph; } static void destroy_action(crm_action_t * action) { if (action->timer && action->timer->source_id != 0) { crm_warn("Cancelling timer for action %d (src=%d)", action->id, action->timer->source_id); g_source_remove(action->timer->source_id); } if (action->params) { g_hash_table_destroy(action->params); } free_xml(action->xml); free(action->timer); free(action); } static void destroy_synapse(synapse_t * synapse) { while (synapse->actions != NULL) { crm_action_t *action = g_list_nth_data(synapse->actions, 0); synapse->actions = g_list_remove(synapse->actions, action); destroy_action(action); } while (synapse->inputs != NULL) { crm_action_t *action = g_list_nth_data(synapse->inputs, 0); synapse->inputs = g_list_remove(synapse->inputs, action); destroy_action(action); } free(synapse); } void destroy_graph(crm_graph_t * graph) { if (graph == NULL) { return; } while (graph->synapses != NULL) { synapse_t *synapse = g_list_nth_data(graph->synapses, 0); graph->synapses = g_list_remove(graph->synapses, synapse); destroy_synapse(synapse); } free(graph->source); free(graph); } lrmd_event_data_t * convert_graph_action(xmlNode * resource, crm_action_t * action, int status, int rc) { xmlNode *xop = NULL; lrmd_event_data_t *op = NULL; GHashTableIter iter; const char *name = NULL; const char *value = NULL; xmlNode *action_resource = NULL; CRM_CHECK(action != NULL, return NULL); CRM_CHECK(action->type == action_type_rsc, return NULL); action_resource = first_named_child(action->xml, XML_CIB_TAG_RESOURCE); CRM_CHECK(action_resource != NULL, crm_log_xml_warn(action->xml, "Bad"); return NULL); op = lrmd_new_event(ID(action_resource), crm_element_value(action->xml, XML_LRM_ATTR_TASK), action->interval_ms); op->rc = rc; op->op_status = status; op->t_run = time(NULL); op->t_rcchange = op->t_run; op->params = pcmk__strkey_table(free, free); g_hash_table_iter_init(&iter, action->params); while (g_hash_table_iter_next(&iter, (void **)&name, (void **)&value)) { g_hash_table_insert(op->params, strdup(name), strdup(value)); } for (xop = pcmk__xml_first_child(resource); xop != NULL; xop = pcmk__xml_next(xop)) { int tmp = 0; crm_element_value_int(xop, XML_LRM_ATTR_CALLID, &tmp); crm_debug("Got call_id=%d for %s", tmp, ID(resource)); if (tmp > op->call_id) { op->call_id = tmp; } } op->call_id++; return op; } void set_graph_functions(crm_graph_functions_t * fns) { crm_info("Setting custom graph functions"); graph_fns = fns; CRM_ASSERT(graph_fns != NULL); CRM_ASSERT(graph_fns->rsc != NULL); CRM_ASSERT(graph_fns->crmd != NULL); CRM_ASSERT(graph_fns->pseudo != NULL); CRM_ASSERT(graph_fns->stonith != NULL); } static const char * abort2text(enum transition_action abort_action) { switch (abort_action) { case tg_done: return "done"; case tg_stop: return "stop"; case tg_restart: return "restart"; case tg_shutdown: return "shutdown"; } return "unknown"; } bool update_abort_priority(crm_graph_t * graph, int priority, enum transition_action action, const char *abort_reason) { bool change = FALSE; if (graph == NULL) { return change; } if (graph->abort_priority < priority) { crm_debug("Abort priority upgraded from %d to %d", graph->abort_priority, priority); graph->abort_priority = priority; if (graph->abort_reason != NULL) { crm_debug("'%s' abort superseded by %s", graph->abort_reason, abort_reason); } graph->abort_reason = abort_reason; change = TRUE; } if (graph->completion_action < action) { crm_debug("Abort action %s superseded by %s: %s", abort2text(graph->completion_action), abort2text(action), abort_reason); graph->completion_action = action; change = TRUE; } return change; } diff --git a/lib/pacemaker/pcmk_sched_transition.c b/lib/pacemaker/pcmk_sched_transition.c index f207a17cfe..5c306c0b8d 100644 --- a/lib/pacemaker/pcmk_sched_transition.c +++ b/lib/pacemaker/pcmk_sched_transition.c @@ -1,854 +1,854 @@ /* * Copyright 2009-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include // lrmd_event_data_t, lrmd_free_event() #include #include #include #include #include #include static pcmk__output_t *out = NULL; static cib_t *fake_cib = NULL; static GList *fake_resource_list = NULL; static GList *fake_op_fail_list = NULL; gboolean bringing_nodes_online = FALSE; #define STATUS_PATH_MAX 512 #define NEW_NODE_TEMPLATE "//"XML_CIB_TAG_NODE"[@uname='%s']" #define NODE_TEMPLATE "//"XML_CIB_TAG_STATE"[@uname='%s']" #define RSC_TEMPLATE "//"XML_CIB_TAG_STATE"[@uname='%s']//"XML_LRM_TAG_RESOURCE"[@id='%s']" static void inject_transient_attr(xmlNode * cib_node, const char *name, const char *value) { xmlNode *attrs = NULL; xmlNode *instance_attrs = NULL; const char *node_uuid = ID(cib_node); out->message(out, "inject-attr", name, value, cib_node); attrs = first_named_child(cib_node, XML_TAG_TRANSIENT_NODEATTRS); if (attrs == NULL) { attrs = create_xml_node(cib_node, XML_TAG_TRANSIENT_NODEATTRS); crm_xml_add(attrs, XML_ATTR_ID, node_uuid); } instance_attrs = first_named_child(attrs, XML_TAG_ATTR_SETS); if (instance_attrs == NULL) { instance_attrs = create_xml_node(attrs, XML_TAG_ATTR_SETS); crm_xml_add(instance_attrs, XML_ATTR_ID, node_uuid); } crm_create_nvpair_xml(instance_attrs, NULL, name, value); } static void update_failcounts(xmlNode * cib_node, const char *resource, const char *task, guint interval_ms, int rc) { if (rc == 0) { return; } else if ((rc == 7) && (interval_ms == 0)) { return; } else { char *name = NULL; char *now = pcmk__ttoa(time(NULL)); name = pcmk__failcount_name(resource, task, interval_ms); inject_transient_attr(cib_node, name, "value++"); free(name); name = pcmk__lastfailure_name(resource, task, interval_ms); inject_transient_attr(cib_node, name, now); free(name); free(now); } } static void create_node_entry(cib_t * cib_conn, const char *node) { int rc = pcmk_ok; char *xpath = crm_strdup_printf(NEW_NODE_TEMPLATE, node); rc = cib_conn->cmds->query(cib_conn, xpath, NULL, cib_xpath | cib_sync_call | cib_scope_local); if (rc == -ENXIO) { xmlNode *cib_object = create_xml_node(NULL, XML_CIB_TAG_NODE); crm_xml_add(cib_object, XML_ATTR_ID, node); // Use node name as ID crm_xml_add(cib_object, XML_ATTR_UNAME, node); cib_conn->cmds->create(cib_conn, XML_CIB_TAG_NODES, cib_object, cib_sync_call | cib_scope_local); /* Not bothering with subsequent query to see if it exists, we'll bomb out later in the call to query_node_uuid()... */ free_xml(cib_object); } free(xpath); } static lrmd_event_data_t * create_op(xmlNode *cib_resource, const char *task, guint interval_ms, int outcome) { lrmd_event_data_t *op = NULL; xmlNode *xop = NULL; op = lrmd_new_event(ID(cib_resource), task, interval_ms); op->rc = outcome; op->op_status = 0; op->params = NULL; /* TODO: Fill me in */ op->t_run = (unsigned int) time(NULL); op->t_rcchange = op->t_run; op->call_id = 0; for (xop = pcmk__xe_first_child(cib_resource); xop != NULL; xop = pcmk__xe_next(xop)) { int tmp = 0; crm_element_value_int(xop, XML_LRM_ATTR_CALLID, &tmp); if (tmp > op->call_id) { op->call_id = tmp; } } op->call_id++; return op; } static xmlNode * inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc) { return pcmk__create_history_xml(cib_resource, op, CRM_FEATURE_SET, target_rc, NULL, crm_system_name, LOG_TRACE); } static xmlNode * inject_node_state(cib_t * cib_conn, const char *node, const char *uuid) { int rc = pcmk_ok; xmlNode *cib_object = NULL; char *xpath = crm_strdup_printf(NODE_TEMPLATE, node); if (bringing_nodes_online) { create_node_entry(cib_conn, node); } rc = cib_conn->cmds->query(cib_conn, xpath, &cib_object, cib_xpath | cib_sync_call | cib_scope_local); if (cib_object && ID(cib_object) == NULL) { crm_err("Detected multiple node_state entries for xpath=%s, bailing", xpath); crm_log_xml_warn(cib_object, "Duplicates"); free(xpath); crm_exit(CRM_EX_SOFTWARE); return NULL; // not reached, but makes static analysis happy } if (rc == -ENXIO) { char *found_uuid = NULL; if (uuid == NULL) { query_node_uuid(cib_conn, node, &found_uuid, NULL); } else { found_uuid = strdup(uuid); } cib_object = create_xml_node(NULL, XML_CIB_TAG_STATE); crm_xml_add(cib_object, XML_ATTR_UUID, found_uuid); crm_xml_add(cib_object, XML_ATTR_UNAME, node); cib_conn->cmds->create(cib_conn, XML_CIB_TAG_STATUS, cib_object, cib_sync_call | cib_scope_local); free_xml(cib_object); free(found_uuid); rc = cib_conn->cmds->query(cib_conn, xpath, &cib_object, cib_xpath | cib_sync_call | cib_scope_local); crm_trace("injecting node state for %s. rc is %d", node, rc); } free(xpath); CRM_ASSERT(rc == pcmk_ok); return cib_object; } static xmlNode * modify_node(cib_t * cib_conn, char *node, gboolean up) { xmlNode *cib_node = inject_node_state(cib_conn, node, NULL); if (up) { crm_xml_add(cib_node, XML_NODE_IN_CLUSTER, XML_BOOLEAN_YES); crm_xml_add(cib_node, XML_NODE_IS_PEER, ONLINESTATUS); crm_xml_add(cib_node, XML_NODE_JOIN_STATE, CRMD_JOINSTATE_MEMBER); crm_xml_add(cib_node, XML_NODE_EXPECTED, CRMD_JOINSTATE_MEMBER); } else { crm_xml_add(cib_node, XML_NODE_IN_CLUSTER, XML_BOOLEAN_NO); crm_xml_add(cib_node, XML_NODE_IS_PEER, OFFLINESTATUS); crm_xml_add(cib_node, XML_NODE_JOIN_STATE, CRMD_JOINSTATE_DOWN); crm_xml_add(cib_node, XML_NODE_EXPECTED, CRMD_JOINSTATE_DOWN); } crm_xml_add(cib_node, XML_ATTR_ORIGIN, crm_system_name); return cib_node; } static xmlNode * find_resource_xml(xmlNode * cib_node, const char *resource) { xmlNode *match = NULL; const char *node = crm_element_value(cib_node, XML_ATTR_UNAME); char *xpath = crm_strdup_printf(RSC_TEMPLATE, node, resource); match = get_xpath_object(xpath, cib_node, LOG_TRACE); free(xpath); return match; } static xmlNode * inject_resource(xmlNode * cib_node, const char *resource, const char *lrm_name, const char *rclass, const char *rtype, const char *rprovider) { xmlNode *lrm = NULL; xmlNode *container = NULL; xmlNode *cib_resource = NULL; char *xpath = NULL; cib_resource = find_resource_xml(cib_node, resource); if (cib_resource != NULL) { /* If an existing LRM history entry uses the resource name, * continue using it, even if lrm_name is different. */ return cib_resource; } // Check for history entry under preferred name if (strcmp(resource, lrm_name)) { cib_resource = find_resource_xml(cib_node, lrm_name); if (cib_resource != NULL) { return cib_resource; } } /* One day, add query for class, provider, type */ if (rclass == NULL || rtype == NULL) { out->err(out, "Resource %s not found in the status section of %s." " Please supply the class and type to continue", resource, ID(cib_node)); return NULL; } else if (!pcmk__strcase_any_of(rclass, PCMK_RESOURCE_CLASS_OCF, PCMK_RESOURCE_CLASS_STONITH, PCMK_RESOURCE_CLASS_SERVICE, PCMK_RESOURCE_CLASS_UPSTART, PCMK_RESOURCE_CLASS_SYSTEMD, PCMK_RESOURCE_CLASS_LSB, NULL)) { out->err(out, "Invalid class for %s: %s", resource, rclass); return NULL; } else if (pcmk_is_set(pcmk_get_ra_caps(rclass), pcmk_ra_cap_provider) && (rprovider == NULL)) { out->err(out, "Please specify the provider for resource %s", resource); return NULL; } xpath = (char *)xmlGetNodePath(cib_node); crm_info("Injecting new resource %s into %s '%s'", lrm_name, xpath, ID(cib_node)); free(xpath); lrm = first_named_child(cib_node, XML_CIB_TAG_LRM); if (lrm == NULL) { const char *node_uuid = ID(cib_node); lrm = create_xml_node(cib_node, XML_CIB_TAG_LRM); crm_xml_add(lrm, XML_ATTR_ID, node_uuid); } container = first_named_child(lrm, XML_LRM_TAG_RESOURCES); if (container == NULL) { container = create_xml_node(lrm, XML_LRM_TAG_RESOURCES); } cib_resource = create_xml_node(container, XML_LRM_TAG_RESOURCE); // If we're creating a new entry, use the preferred name crm_xml_add(cib_resource, XML_ATTR_ID, lrm_name); crm_xml_add(cib_resource, XML_AGENT_ATTR_CLASS, rclass); crm_xml_add(cib_resource, XML_AGENT_ATTR_PROVIDER, rprovider); crm_xml_add(cib_resource, XML_ATTR_TYPE, rtype); return cib_resource; } #define XPATH_MAX 1024 static int find_ticket_state(cib_t * the_cib, const char *ticket_id, xmlNode ** ticket_state_xml) { int offset = 0; int rc = pcmk_ok; xmlNode *xml_search = NULL; char *xpath_string = NULL; CRM_ASSERT(ticket_state_xml != NULL); *ticket_state_xml = NULL; xpath_string = calloc(1, XPATH_MAX); offset += snprintf(xpath_string + offset, XPATH_MAX - offset, "%s", "/cib/status/tickets"); if (ticket_id) { offset += snprintf(xpath_string + offset, XPATH_MAX - offset, "/%s[@id=\"%s\"]", XML_CIB_TAG_TICKET_STATE, ticket_id); } CRM_LOG_ASSERT(offset > 0); rc = the_cib->cmds->query(the_cib, xpath_string, &xml_search, cib_sync_call | cib_scope_local | cib_xpath); if (rc != pcmk_ok) { goto bail; } crm_log_xml_debug(xml_search, "Match"); if (xml_has_children(xml_search)) { if (ticket_id) { out->err(out, "Multiple ticket_states match ticket_id=%s", ticket_id); } *ticket_state_xml = xml_search; } else { *ticket_state_xml = xml_search; } bail: free(xpath_string); return rc; } static int set_ticket_state_attr(const char *ticket_id, const char *attr_name, const char *attr_value, cib_t * cib, int cib_options) { int rc = pcmk_ok; xmlNode *xml_top = NULL; xmlNode *ticket_state_xml = NULL; rc = find_ticket_state(cib, ticket_id, &ticket_state_xml); if (rc == pcmk_ok) { crm_debug("Found a match state for ticket: id=%s", ticket_id); xml_top = ticket_state_xml; } else if (rc != -ENXIO) { return rc; } else { xmlNode *xml_obj = NULL; xml_top = create_xml_node(NULL, XML_CIB_TAG_STATUS); xml_obj = create_xml_node(xml_top, XML_CIB_TAG_TICKETS); ticket_state_xml = create_xml_node(xml_obj, XML_CIB_TAG_TICKET_STATE); crm_xml_add(ticket_state_xml, XML_ATTR_ID, ticket_id); } crm_xml_add(ticket_state_xml, attr_name, attr_value); crm_log_xml_debug(xml_top, "Update"); rc = cib->cmds->modify(cib, XML_CIB_TAG_STATUS, xml_top, cib_options); free_xml(xml_top); return rc; } void modify_configuration(pe_working_set_t * data_set, cib_t *cib, const char *quorum, const char *watchdog, GList *node_up, GList *node_down, GList *node_fail, GList *op_inject, GList *ticket_grant, GList *ticket_revoke, GList *ticket_standby, GList *ticket_activate) { int rc = pcmk_ok; GList *gIter = NULL; xmlNode *cib_op = NULL; xmlNode *cib_node = NULL; xmlNode *cib_resource = NULL; lrmd_event_data_t *op = NULL; out = data_set->priv; out->message(out, "inject-modify-config", quorum, watchdog); if (quorum) { xmlNode *top = create_xml_node(NULL, XML_TAG_CIB); /* crm_xml_add(top, XML_ATTR_DC_UUID, dc_uuid); */ crm_xml_add(top, XML_ATTR_HAVE_QUORUM, quorum); rc = cib->cmds->modify(cib, NULL, top, cib_sync_call | cib_scope_local); CRM_ASSERT(rc == pcmk_ok); } if (watchdog) { rc = update_attr_delegate(cib, cib_sync_call | cib_scope_local, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, XML_ATTR_HAVE_WATCHDOG, watchdog, FALSE, NULL, NULL); CRM_ASSERT(rc == pcmk_ok); } for (gIter = node_up; gIter != NULL; gIter = gIter->next) { char *node = (char *)gIter->data; out->message(out, "inject-modify-node", "Online", node); cib_node = modify_node(cib, node, TRUE); CRM_ASSERT(cib_node != NULL); rc = cib->cmds->modify(cib, XML_CIB_TAG_STATUS, cib_node, cib_sync_call | cib_scope_local); CRM_ASSERT(rc == pcmk_ok); free_xml(cib_node); } for (gIter = node_down; gIter != NULL; gIter = gIter->next) { char xpath[STATUS_PATH_MAX]; char *node = (char *)gIter->data; out->message(out, "inject-modify-node", "Offline", node); cib_node = modify_node(cib, node, FALSE); CRM_ASSERT(cib_node != NULL); rc = cib->cmds->modify(cib, XML_CIB_TAG_STATUS, cib_node, cib_sync_call | cib_scope_local); CRM_ASSERT(rc == pcmk_ok); free_xml(cib_node); snprintf(xpath, STATUS_PATH_MAX, "//node_state[@uname='%s']/%s", node, XML_CIB_TAG_LRM); cib->cmds->remove(cib, xpath, NULL, cib_xpath | cib_sync_call | cib_scope_local); snprintf(xpath, STATUS_PATH_MAX, "//node_state[@uname='%s']/%s", node, XML_TAG_TRANSIENT_NODEATTRS); cib->cmds->remove(cib, xpath, NULL, cib_xpath | cib_sync_call | cib_scope_local); } for (gIter = node_fail; gIter != NULL; gIter = gIter->next) { char *node = (char *)gIter->data; out->message(out, "inject-modify-node", "Failing", node); cib_node = modify_node(cib, node, TRUE); crm_xml_add(cib_node, XML_NODE_IN_CLUSTER, XML_BOOLEAN_NO); CRM_ASSERT(cib_node != NULL); rc = cib->cmds->modify(cib, XML_CIB_TAG_STATUS, cib_node, cib_sync_call | cib_scope_local); CRM_ASSERT(rc == pcmk_ok); free_xml(cib_node); } for (gIter = ticket_grant; gIter != NULL; gIter = gIter->next) { char *ticket_id = (char *)gIter->data; out->message(out, "inject-modify-ticket", "Granting", ticket_id); rc = set_ticket_state_attr(ticket_id, "granted", "true", cib, cib_sync_call | cib_scope_local); CRM_ASSERT(rc == pcmk_ok); } for (gIter = ticket_revoke; gIter != NULL; gIter = gIter->next) { char *ticket_id = (char *)gIter->data; out->message(out, "inject-modify-ticket", "Revoking", ticket_id); rc = set_ticket_state_attr(ticket_id, "granted", "false", cib, cib_sync_call | cib_scope_local); CRM_ASSERT(rc == pcmk_ok); } for (gIter = ticket_standby; gIter != NULL; gIter = gIter->next) { char *ticket_id = (char *)gIter->data; out->message(out, "inject-modify-ticket", "Standby", ticket_id); rc = set_ticket_state_attr(ticket_id, "standby", "true", cib, cib_sync_call | cib_scope_local); CRM_ASSERT(rc == pcmk_ok); } for (gIter = ticket_activate; gIter != NULL; gIter = gIter->next) { char *ticket_id = (char *)gIter->data; out->message(out, "inject-modify-ticket", "Activating", ticket_id); rc = set_ticket_state_attr(ticket_id, "standby", "false", cib, cib_sync_call | cib_scope_local); CRM_ASSERT(rc == pcmk_ok); } for (gIter = op_inject; gIter != NULL; gIter = gIter->next) { char *spec = (char *)gIter->data; int rc = 0; int outcome = 0; guint interval_ms = 0; char *key = NULL; char *node = NULL; char *task = NULL; char *resource = NULL; const char *rtype = NULL; const char *rclass = NULL; const char *rprovider = NULL; pe_resource_t *rsc = NULL; out->message(out, "inject-spec", spec); key = calloc(1, strlen(spec) + 1); node = calloc(1, strlen(spec) + 1); rc = sscanf(spec, "%[^@]@%[^=]=%d", key, node, &outcome); if (rc != 3) { out->err(out, "Invalid operation spec: %s. Only found %d fields", spec, rc); free(key); free(node); continue; } parse_op_key(key, &resource, &task, &interval_ms); rsc = pe_find_resource(data_set->resources, resource); if (rsc == NULL) { out->err(out, "Invalid resource name: %s", resource); } else { rclass = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); rtype = crm_element_value(rsc->xml, XML_ATTR_TYPE); rprovider = crm_element_value(rsc->xml, XML_AGENT_ATTR_PROVIDER); cib_node = inject_node_state(cib, node, NULL); CRM_ASSERT(cib_node != NULL); update_failcounts(cib_node, resource, task, interval_ms, outcome); cib_resource = inject_resource(cib_node, resource, resource, rclass, rtype, rprovider); CRM_ASSERT(cib_resource != NULL); op = create_op(cib_resource, task, interval_ms, outcome); CRM_ASSERT(op != NULL); cib_op = inject_op(cib_resource, op, 0); CRM_ASSERT(cib_op != NULL); lrmd_free_event(op); rc = cib->cmds->modify(cib, XML_CIB_TAG_STATUS, cib_node, cib_sync_call | cib_scope_local); CRM_ASSERT(rc == pcmk_ok); } free(task); free(node); free(key); } if (!out->is_quiet(out)) { out->end_list(out); } } static gboolean exec_pseudo_action(crm_graph_t * graph, crm_action_t * action) { const char *node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); action->confirmed = TRUE; out->message(out, "inject-pseudo-action", node, task); - update_graph(graph, action); + pcmk__update_graph(graph, action); return TRUE; } static gboolean exec_rsc_action(crm_graph_t * graph, crm_action_t * action) { int rc = 0; GList *gIter = NULL; lrmd_event_data_t *op = NULL; int target_outcome = 0; const char *rtype = NULL; const char *rclass = NULL; const char *resource = NULL; const char *rprovider = NULL; const char *lrm_name = NULL; const char *operation = crm_element_value(action->xml, "operation"); const char *target_rc_s = crm_meta_value(action->params, XML_ATTR_TE_TARGET_RC); xmlNode *cib_node = NULL; xmlNode *cib_resource = NULL; xmlNode *action_rsc = first_named_child(action->xml, XML_CIB_TAG_RESOURCE); char *node = crm_element_value_copy(action->xml, XML_LRM_ATTR_TARGET); char *uuid = crm_element_value_copy(action->xml, XML_LRM_ATTR_TARGET_UUID); const char *router_node = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); if (pcmk__strcase_any_of(operation, CRM_OP_PROBED, CRM_OP_REPROBE, NULL)) { crm_info("Skipping %s op for %s", operation, node); goto done; } if (action_rsc == NULL) { crm_log_xml_err(action->xml, "Bad"); free(node); free(uuid); return FALSE; } /* Look for the preferred name * If not found, try the expected 'local' name * If not found use the preferred name anyway */ resource = crm_element_value(action_rsc, XML_ATTR_ID); CRM_ASSERT(resource != NULL); // makes static analysis happy lrm_name = resource; // Preferred name when writing history if (pe_find_resource(fake_resource_list, resource) == NULL) { const char *longname = crm_element_value(action_rsc, XML_ATTR_ID_LONG); if (longname && pe_find_resource(fake_resource_list, longname)) { resource = longname; } } if (pcmk__strcase_any_of(operation, "delete", RSC_METADATA, NULL)) { out->message(out, "inject-rsc-action", resource, operation, node, (guint) 0); goto done; } rclass = crm_element_value(action_rsc, XML_AGENT_ATTR_CLASS); rtype = crm_element_value(action_rsc, XML_ATTR_TYPE); rprovider = crm_element_value(action_rsc, XML_AGENT_ATTR_PROVIDER); pcmk__scan_min_int(target_rc_s, &target_outcome, 0); CRM_ASSERT(fake_cib->cmds->query(fake_cib, NULL, NULL, cib_sync_call | cib_scope_local) == pcmk_ok); cib_node = inject_node_state(fake_cib, node, (router_node? node : uuid)); CRM_ASSERT(cib_node != NULL); cib_resource = inject_resource(cib_node, resource, lrm_name, rclass, rtype, rprovider); if (cib_resource == NULL) { crm_err("invalid resource in transition"); free(node); free(uuid); free_xml(cib_node); return FALSE; } op = convert_graph_action(cib_resource, action, 0, target_outcome); out->message(out, "inject-rsc-action", resource, op->op_type, node, op->interval_ms); for (gIter = fake_op_fail_list; gIter != NULL; gIter = gIter->next) { char *spec = (char *)gIter->data; char *key = NULL; const char *match_name = NULL; // Allow user to specify anonymous clone with or without instance number key = crm_strdup_printf(PCMK__OP_FMT "@%s=", resource, op->op_type, op->interval_ms, node); if (strncasecmp(key, spec, strlen(key)) == 0) { match_name = resource; } free(key); if ((match_name == NULL) && strcmp(resource, lrm_name)) { key = crm_strdup_printf(PCMK__OP_FMT "@%s=", lrm_name, op->op_type, op->interval_ms, node); if (strncasecmp(key, spec, strlen(key)) == 0) { match_name = lrm_name; } free(key); } if (match_name != NULL) { rc = sscanf(spec, "%*[^=]=%d", (int *) &op->rc); // ${match_name}_${task}_${interval_in_ms}@${node}=${rc} if (rc != 1) { out->err(out, "Invalid failed operation spec: %s. Result code must be integer", spec); continue; } action->failed = TRUE; graph->abort_priority = INFINITY; out->info(out, "Pretending action %d failed with rc=%d", action->id, op->rc); update_failcounts(cib_node, match_name, op->op_type, op->interval_ms, op->rc); break; } } inject_op(cib_resource, op, target_outcome); lrmd_free_event(op); rc = fake_cib->cmds->modify(fake_cib, XML_CIB_TAG_STATUS, cib_node, cib_sync_call | cib_scope_local); CRM_ASSERT(rc == pcmk_ok); done: free(node); free(uuid); free_xml(cib_node); action->confirmed = TRUE; - update_graph(graph, action); + pcmk__update_graph(graph, action); return TRUE; } static gboolean exec_crmd_action(crm_graph_t * graph, crm_action_t * action) { const char *node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); xmlNode *rsc = first_named_child(action->xml, XML_CIB_TAG_RESOURCE); action->confirmed = TRUE; out->message(out, "inject-cluster-action", node, task, rsc); - update_graph(graph, action); + pcmk__update_graph(graph, action); return TRUE; } static gboolean exec_stonith_action(crm_graph_t * graph, crm_action_t * action) { const char *op = crm_meta_value(action->params, "stonith_action"); char *target = crm_element_value_copy(action->xml, XML_LRM_ATTR_TARGET); out->message(out, "inject-fencing-action", target, op); if(!pcmk__str_eq(op, "on", pcmk__str_casei)) { int rc = 0; char xpath[STATUS_PATH_MAX]; xmlNode *cib_node = modify_node(fake_cib, target, FALSE); crm_xml_add(cib_node, XML_ATTR_ORIGIN, __func__); CRM_ASSERT(cib_node != NULL); rc = fake_cib->cmds->replace(fake_cib, XML_CIB_TAG_STATUS, cib_node, cib_sync_call | cib_scope_local); CRM_ASSERT(rc == pcmk_ok); snprintf(xpath, STATUS_PATH_MAX, "//node_state[@uname='%s']/%s", target, XML_CIB_TAG_LRM); fake_cib->cmds->remove(fake_cib, xpath, NULL, cib_xpath | cib_sync_call | cib_scope_local); snprintf(xpath, STATUS_PATH_MAX, "//node_state[@uname='%s']/%s", target, XML_TAG_TRANSIENT_NODEATTRS); fake_cib->cmds->remove(fake_cib, xpath, NULL, cib_xpath | cib_sync_call | cib_scope_local); free_xml(cib_node); } action->confirmed = TRUE; - update_graph(graph, action); + pcmk__update_graph(graph, action); free(target); return TRUE; } int run_simulation(pe_working_set_t * data_set, cib_t *cib, GList *op_fail_list) { crm_graph_t *transition = NULL; enum transition_status graph_rc = -1; crm_graph_functions_t exec_fns = { exec_pseudo_action, exec_rsc_action, exec_crmd_action, exec_stonith_action, }; out = data_set->priv; fake_cib = cib; fake_op_fail_list = op_fail_list; if (!out->is_quiet(out)) { out->begin_list(out, NULL, NULL, "Executing Cluster Transition"); } set_graph_functions(&exec_fns); transition = unpack_graph(data_set->graph, crm_system_name); print_graph(LOG_DEBUG, transition); fake_resource_list = data_set->resources; do { graph_rc = run_graph(transition); } while (graph_rc == transition_active); fake_resource_list = NULL; if (graph_rc != transition_complete) { out->err(out, "Transition failed: %s", transition_status(graph_rc)); print_graph(LOG_ERR, transition); } destroy_graph(transition); if (graph_rc != transition_complete) { out->err(out, "An invalid transition was produced"); } if (!out->is_quiet(out)) { xmlNode *cib_object = NULL; int rc = fake_cib->cmds->query(fake_cib, NULL, &cib_object, cib_sync_call | cib_scope_local); CRM_ASSERT(rc == pcmk_ok); pe_reset_working_set(data_set); data_set->input = cib_object; out->end_list(out); } if (graph_rc != transition_complete) { return graph_rc; } return 0; }