diff --git a/crmd/callbacks.c b/crmd/callbacks.c index 38fb30b955..68e2d576b7 100644 --- a/crmd/callbacks.c +++ b/crmd/callbacks.c @@ -1,297 +1,284 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void crmd_ha_connection_destroy(gpointer user_data); /* From join_dc... */ extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); void crmd_ha_connection_destroy(gpointer user_data) { crm_trace("Invoked"); if (is_set(fsa_input_register, R_HA_DISCONNECTED)) { /* we signed out, so this is expected */ crm_info("Heartbeat disconnection complete"); return; } crm_crit("Lost connection to heartbeat service!"); register_fsa_input(C_HA_DISCONNECT, I_ERROR, NULL); trigger_fsa(fsa_source); } void crmd_ha_msg_filter(xmlNode * msg) { if (AM_I_DC) { const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); if (safe_str_eq(sys_from, CRM_SYSTEM_DC)) { const char *from = crm_element_value(msg, F_ORIG); if (safe_str_neq(from, fsa_our_uname)) { int level = LOG_INFO; const char *op = crm_element_value(msg, F_CRM_TASK); /* make sure the election happens NOW */ if (fsa_state != S_ELECTION) { ha_msg_input_t new_input; level = LOG_WARNING; new_input.msg = msg; register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL, &new_input, __FUNCTION__); } do_crm_log(level, "Another DC detected: %s (op=%s)", from, op); goto done; } } } else { const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO); if (safe_str_eq(sys_to, CRM_SYSTEM_DC)) { return; } } /* crm_log_xml_trace("HA[inbound]", msg); */ route_message(C_HA_MESSAGE, msg); done: trigger_fsa(fsa_source); } #define state_text(state) ((state)? (const char *)(state) : "in unknown state") void peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data) { uint32_t old = 0; uint32_t changed = 0; bool appeared = FALSE; const char *status = NULL; set_bit(fsa_input_register, R_PEER_DATA); if (node->uname == NULL) { return; } switch (type) { case crm_status_uname: /* If we've never seen the node, then it also wont be in the status section */ crm_info("%s is now %s", node->uname, state_text(node->state)); return; case crm_status_rstate: crm_info("Remote node %s is now %s (was %s)", node->uname, state_text(node->state), state_text(data)); /* Keep going */ case crm_status_nstate: crm_info("%s is now %s (was %s)", node->uname, state_text(node->state), state_text(data)); if (safe_str_eq(data, node->state)) { /* State did not change */ return; } else if(safe_str_eq(CRM_NODE_MEMBER, node->state)) { - GListPtr gIter = stonith_cleanup_list; - appeared = TRUE; - - while (gIter != NULL) { - GListPtr tmp = gIter; - char *target = tmp->data; - - gIter = gIter->next; - if(safe_str_eq(node->uname, target)) { - crm_trace("Removing %s from the cleanup list", target); - stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp); - free(target); - } - } + remove_stonith_cleanup(node->uname); } crmd_notify_node_event(node); break; case crm_status_processes: if (data) { old = *(const uint32_t *)data; changed = node->processes ^ old; } status = (node->processes & proc_flags) ? ONLINESTATUS : OFFLINESTATUS; crm_info("Client %s/%s now has status [%s] (DC=%s, changed=%6x)", node->uname, peer2text(proc_flags), status, AM_I_DC ? "true" : crm_str(fsa_our_dc), changed); if ((changed & proc_flags) == 0) { /* Peer process did not change */ crm_trace("No change %6x %6x %6x", old, node->processes, proc_flags); return; } else if (is_not_set(fsa_input_register, R_CIB_CONNECTED)) { crm_trace("Not connected"); return; } else if (fsa_state == S_STOPPING) { crm_trace("Stopping"); return; } appeared = (node->processes & proc_flags) != 0; if (safe_str_eq(node->uname, fsa_our_uname) && (node->processes & proc_flags) == 0) { /* Did we get evicted? */ crm_notice("Our peer connection failed"); register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ERROR, NULL); } else if (safe_str_eq(node->uname, fsa_our_dc) && crm_is_peer_active(node) == FALSE) { /* Did the DC leave us? */ crm_notice("Our peer on the DC (%s) is dead", fsa_our_dc); register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL); if (compare_version(fsa_our_dc_version, "3.0.9") > 0) { erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); } } else if(AM_I_DC && appeared == FALSE) { crm_info("Peer %s left us", node->uname); erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); /* crm_update_peer_join(__FUNCTION__, node, crm_join_none); */ } break; } if (AM_I_DC) { xmlNode *update = NULL; int flags = node_update_peer; gboolean alive = crm_is_peer_active(node); crm_action_t *down = match_down_event(0, node->uuid, NULL, appeared); crm_trace("Alive=%d, appear=%d, down=%p", alive, appeared, down); if (alive && type == crm_status_processes) { register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } if (down) { const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK); if (alive && safe_str_eq(task, CRM_OP_FENCE)) { crm_info("Node return implies stonith of %s (action %d) completed", node->uname, down->id); st_fail_count_reset(node->uname); erase_status_tag(node->uname, XML_CIB_TAG_LRM, cib_scope_local); erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); /* down->confirmed = TRUE; Only stonith-ng returning should imply completion */ down->sent_update = TRUE; /* Prevent tengine_stonith_callback() from calling send_stonith_update() */ } else if (safe_str_eq(task, CRM_OP_FENCE)) { crm_trace("Waiting for stonithd to report the fencing of %s is complete", node->uname); /* via tengine_stonith_callback() */ } else if (alive == FALSE) { crm_notice("%s of %s (op %d) is complete", task, node->uname, down->id); /* down->confirmed = TRUE; Only stonith-ng returning should imply completion */ stop_te_timer(down->timer); flags |= node_update_join | node_update_expected; crmd_peer_down(node, FALSE); check_join_state(fsa_state, __FUNCTION__); update_graph(transition_graph, down); trigger_graph(); } else { crm_trace("Other %p", down); } } else if (appeared == FALSE) { crm_notice("Stonith/shutdown of %s not matched", node->uname); crm_update_peer_join(__FUNCTION__, node, crm_join_none); check_join_state(fsa_state, __FUNCTION__); abort_transition(INFINITY, tg_restart, "Node failure", NULL); fail_incompletable_actions(transition_graph, node->uuid); } else { crm_trace("Other %p", down); } update = do_update_node_cib(node, flags, NULL, __FUNCTION__); fsa_cib_anon_update(XML_CIB_TAG_STATUS, update, cib_scope_local | cib_quorum_override | cib_can_create); free_xml(update); } trigger_fsa(fsa_source); } void crmd_cib_connection_destroy(gpointer user_data) { CRM_CHECK(user_data == fsa_cib_conn,;); crm_trace("Invoked"); trigger_fsa(fsa_source); fsa_cib_conn->state = cib_disconnected; if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { crm_info("Connection to the CIB terminated..."); return; } /* eventually this will trigger a reconnect, not a shutdown */ crm_err("Connection to the CIB terminated..."); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); clear_bit(fsa_input_register, R_CIB_CONNECTED); return; } gboolean crm_fsa_trigger(gpointer user_data) { crm_trace("Invoked (queue len: %d)", g_list_length(fsa_message_queue)); s_crmd_fsa(C_FSA_INTERNAL); crm_trace("Exited (queue len: %d)", g_list_length(fsa_message_queue)); return TRUE; } diff --git a/crmd/election.c b/crmd/election.c index adab4e3a7c..dee475069b 100644 --- a/crmd/election.c +++ b/crmd/election.c @@ -1,279 +1,267 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #define STORM_INTERVAL 2 /* in seconds */ #define STORM_MULTIPLIER 5 /* multiplied by the number of nodes */ /* A_ELECTION_VOTE */ void do_election_vote(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean not_voting = FALSE; /* don't vote if we're in one of these states or wanting to shut down */ switch (cur_state) { case S_STARTING: case S_RECOVERY: case S_STOPPING: case S_TERMINATE: crm_warn("Not voting in election, we're in state %s", fsa_state2string(cur_state)); not_voting = TRUE; break; case S_ELECTION: case S_INTEGRATION: case S_RELEASE_DC: break; default: crm_err("Broken? Voting in state %s", fsa_state2string(cur_state)); break; } if (not_voting == FALSE) { if (is_set(fsa_input_register, R_STARTING)) { not_voting = TRUE; } } if (not_voting) { if (AM_I_DC) { register_fsa_input(C_FSA_INTERNAL, I_RELEASE_DC, NULL); } else { register_fsa_input(C_FSA_INTERNAL, I_PENDING, NULL); } return; } election_vote(fsa_election); return; } void do_election_check(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { if (fsa_state != S_ELECTION) { crm_debug("Ignore election check: we not in an election"); } else if(election_check(fsa_election)) { register_fsa_input(C_FSA_INTERNAL, I_ELECTION_DC, NULL); } return; } #define loss_dampen 2 /* in seconds */ /* A_ELECTION_COUNT */ void do_election_count_vote(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { enum election_result rc = 0; ha_msg_input_t *vote = fsa_typed_data(fsa_dt_ha_msg); if(crm_peer_cache == NULL) { if(is_not_set(fsa_input_register, R_SHUTDOWN)) { crm_err("Internal error, no peer cache"); } return; } rc = election_count_vote(fsa_election, vote->msg, cur_state != S_STARTING); switch(rc) { case election_start: election_reset(fsa_election); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); break; case election_lost: update_dc(NULL); if (fsa_input_register & R_THE_DC) { register_fsa_input(C_FSA_INTERNAL, I_RELEASE_DC, NULL); fsa_cib_conn->cmds->set_slave(fsa_cib_conn, cib_scope_local); } else if (cur_state != S_STARTING) { register_fsa_input(C_FSA_INTERNAL, I_PENDING, NULL); } break; case election_in_progress: break; default: crm_err("Unhandled election result: %d", rc); } } /* A_ELECT_TIMER_START, A_ELECTION_TIMEOUT */ /* we won */ void do_election_timer_ctrl(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { } static void feature_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; crm_notice("Update failed: %s (%d)", pcmk_strerror(rc), rc); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } /* A_DC_TAKEOVER */ void do_dc_takeover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int rc = pcmk_ok; xmlNode *cib = NULL; - GListPtr gIter = NULL; const char *cluster_type = name_for_cluster_type(get_cluster_type()); const char *watchdog = NULL; crm_info("Taking over DC status for this partition"); set_bit(fsa_input_register, R_THE_DC); - - for (gIter = stonith_cleanup_list; gIter != NULL; gIter = gIter->next) { - char *target = gIter->data; - crm_node_t *target_node = crm_get_peer(0, target); - const char *uuid = crm_peer_uuid(target_node); - - crm_notice("Marking %s, target of a previous stonith action, as clean", target); - send_stonith_update(NULL, target, uuid); - free(target); - } - g_list_free(stonith_cleanup_list); - stonith_cleanup_list = NULL; + execute_stonith_cleanup(); #if SUPPORT_COROSYNC if (is_classic_ais_cluster()) { send_cluster_text(crm_class_quorum, NULL, TRUE, NULL, crm_msg_ais); } #endif election_reset(fsa_election); set_bit(fsa_input_register, R_JOIN_OK); set_bit(fsa_input_register, R_INVOKE_PE); fsa_cib_conn->cmds->set_master(fsa_cib_conn, cib_scope_local); cib = create_xml_node(NULL, XML_TAG_CIB); crm_xml_add(cib, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET); fsa_cib_update(XML_TAG_CIB, cib, cib_quorum_override, rc, NULL); fsa_register_cib_callback(rc, FALSE, NULL, feature_update_callback); watchdog = daemon_option("watchdog"); if (watchdog) { update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, XML_ATTR_HAVE_WATCHDOG, watchdog, FALSE, NULL, NULL); } update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, "dc-version", PACEMAKER_VERSION "-" BUILD_VERSION, FALSE, NULL, NULL); update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, "cluster-infrastructure", cluster_type, FALSE, NULL, NULL); #if SUPPORT_COROSYNC # if !SUPPORT_PLUGIN if (fsa_cluster_name == NULL && is_corosync_cluster()) { char *cluster_name = corosync_cluster_name(); if (cluster_name) { update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, "cluster-name", cluster_name, FALSE, NULL, NULL); } free(cluster_name); } # endif #endif mainloop_set_trigger(config_read); free_xml(cib); } /* A_DC_RELEASE */ void do_dc_release(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { if (action & A_DC_RELEASE) { crm_debug("Releasing the role of DC"); clear_bit(fsa_input_register, R_THE_DC); } else if (action & A_DC_RELEASED) { crm_info("DC role released"); #if 0 if (are there errors) { /* we cant stay up if not healthy */ /* or perhaps I_ERROR and go to S_RECOVER? */ result = I_SHUTDOWN; } #endif if (is_set(fsa_input_register, R_SHUTDOWN)) { xmlNode *update = NULL; crm_node_t *node = crm_get_peer(0, fsa_our_uname); crm_update_peer_expected(__FUNCTION__, node, CRMD_JOINSTATE_DOWN); update = do_update_node_cib(node, node_update_expected, NULL, __FUNCTION__); fsa_cib_anon_update(XML_CIB_TAG_STATUS, update, cib_scope_local | cib_quorum_override | cib_can_create); free_xml(update); } register_fsa_input(C_FSA_INTERNAL, I_RELEASE_SUCCESS, NULL); } else { crm_err("Unknown action %s", fsa_action2string(action)); } crm_trace("Am I still the DC? %s", AM_I_DC ? XML_BOOLEAN_YES : XML_BOOLEAN_NO); } diff --git a/crmd/fsa.c b/crmd/fsa.c index cb71cad280..cf54d193e8 100644 --- a/crmd/fsa.c +++ b/crmd/fsa.c @@ -1,685 +1,675 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include char *fsa_our_dc = NULL; cib_t *fsa_cib_conn = NULL; char *fsa_our_dc_version = NULL; char *fsa_our_uuid = NULL; char *fsa_our_uname = NULL; char *fsa_cluster_name = NULL; #if SUPPORT_HEARTBEAT ll_cluster_t *fsa_cluster_conn; #endif election_t *fsa_election = NULL; fsa_timer_t *wait_timer = NULL; /* How long to wait before retrying to connect to the cib/lrmd/ccm */ fsa_timer_t *recheck_timer = NULL; /* Periodically re-run the PE to account for time based rules/preferences */ fsa_timer_t *election_trigger = NULL; /* How long to wait at startup, or after an election, for the DC to make contact */ fsa_timer_t *transition_timer = NULL; /* How long to delay the start of a new transition with the expectation something else might happen too */ fsa_timer_t *integration_timer = NULL; fsa_timer_t *finalization_timer = NULL; fsa_timer_t *shutdown_escalation_timer = NULL; /* How long to wait for the DC to stop all resources and give us the all-clear to shut down */ volatile gboolean do_fsa_stall = FALSE; volatile long long fsa_input_register = 0; volatile long long fsa_actions = A_NOTHING; volatile enum crmd_fsa_state fsa_state = S_STARTING; extern uint highest_born_on; extern uint num_join_invites; extern void initialize_join(gboolean before); #define DOT_PREFIX "actions:trace: " #define do_dot_log(fmt, args...) crm_trace( fmt, ##args) long long do_state_transition(long long actions, enum crmd_fsa_state cur_state, enum crmd_fsa_state next_state, fsa_data_t * msg_data); void dump_rsc_info(void); void dump_rsc_info_callback(const xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data); void ghash_print_node(gpointer key, gpointer value, gpointer user_data); void s_crmd_fsa_actions(fsa_data_t * fsa_data); void log_fsa_input(fsa_data_t * stored_msg); void init_dotfile(void); void init_dotfile(void) { do_dot_log(DOT_PREFIX "digraph \"g\" {"); do_dot_log(DOT_PREFIX " size = \"30,30\""); do_dot_log(DOT_PREFIX " graph ["); do_dot_log(DOT_PREFIX " fontsize = \"12\""); do_dot_log(DOT_PREFIX " fontname = \"Times-Roman\""); do_dot_log(DOT_PREFIX " fontcolor = \"black\""); do_dot_log(DOT_PREFIX " bb = \"0,0,398.922306,478.927856\""); do_dot_log(DOT_PREFIX " color = \"black\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX " node ["); do_dot_log(DOT_PREFIX " fontsize = \"12\""); do_dot_log(DOT_PREFIX " fontname = \"Times-Roman\""); do_dot_log(DOT_PREFIX " fontcolor = \"black\""); do_dot_log(DOT_PREFIX " shape = \"ellipse\""); do_dot_log(DOT_PREFIX " color = \"black\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX " edge ["); do_dot_log(DOT_PREFIX " fontsize = \"12\""); do_dot_log(DOT_PREFIX " fontname = \"Times-Roman\""); do_dot_log(DOT_PREFIX " fontcolor = \"black\""); do_dot_log(DOT_PREFIX " color = \"black\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX "// special nodes"); do_dot_log(DOT_PREFIX " \"S_PENDING\" "); do_dot_log(DOT_PREFIX " ["); do_dot_log(DOT_PREFIX " color = \"blue\""); do_dot_log(DOT_PREFIX " fontcolor = \"blue\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX " \"S_TERMINATE\" "); do_dot_log(DOT_PREFIX " ["); do_dot_log(DOT_PREFIX " color = \"red\""); do_dot_log(DOT_PREFIX " fontcolor = \"red\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX "// DC only nodes"); do_dot_log(DOT_PREFIX " \"S_INTEGRATION\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_POLICY_ENGINE\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_TRANSITION_ENGINE\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_RELEASE_DC\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_IDLE\" [ fontcolor = \"green\" ]"); } static void do_fsa_action(fsa_data_t * fsa_data, long long an_action, void (*function) (long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t * msg_data)) { fsa_actions &= ~an_action; crm_trace(DOT_PREFIX "\t// %s", fsa_action2string(an_action)); function(an_action, fsa_data->fsa_cause, fsa_state, fsa_data->fsa_input, fsa_data); } static long long startup_actions = A_STARTUP | A_CIB_START | A_LRM_CONNECT | A_CCM_CONNECT | A_HA_CONNECT | A_READCONFIG | A_STARTED | A_CL_JOIN_QUERY; enum crmd_fsa_state s_crmd_fsa(enum crmd_fsa_cause cause) { fsa_data_t *fsa_data = NULL; long long register_copy = fsa_input_register; long long new_actions = A_NOTHING; enum crmd_fsa_state last_state; crm_trace("FSA invoked with Cause: %s\tState: %s", fsa_cause2string(cause), fsa_state2string(fsa_state)); fsa_dump_actions(fsa_actions, "Initial"); do_fsa_stall = FALSE; if (is_message() == FALSE && fsa_actions != A_NOTHING) { /* fake the first message so we can get into the loop */ fsa_data = calloc(1, sizeof(fsa_data_t)); fsa_data->fsa_input = I_NULL; fsa_data->fsa_cause = C_FSA_INTERNAL; fsa_data->origin = __FUNCTION__; fsa_data->data_type = fsa_dt_none; fsa_message_queue = g_list_append(fsa_message_queue, fsa_data); fsa_data = NULL; } while (is_message() && do_fsa_stall == FALSE) { crm_trace("Checking messages (%d remaining)", g_list_length(fsa_message_queue)); fsa_data = get_message(); if(fsa_data == NULL) { continue; } log_fsa_input(fsa_data); /* add any actions back to the queue */ fsa_actions |= fsa_data->actions; fsa_dump_actions(fsa_data->actions, "Restored actions"); /* get the next batch of actions */ new_actions = crmd_fsa_actions[fsa_data->fsa_input][fsa_state]; fsa_actions |= new_actions; fsa_dump_actions(new_actions, "New actions"); if (fsa_data->fsa_input != I_NULL && fsa_data->fsa_input != I_ROUTER) { crm_debug("Processing %s: [ state=%s cause=%s origin=%s ]", fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); } /* logging : *before* the state is changed */ if (is_set(fsa_actions, A_ERROR)) { do_fsa_action(fsa_data, A_ERROR, do_log); } if (is_set(fsa_actions, A_WARN)) { do_fsa_action(fsa_data, A_WARN, do_log); } if (is_set(fsa_actions, A_LOG)) { do_fsa_action(fsa_data, A_LOG, do_log); } /* update state variables */ last_state = fsa_state; fsa_state = crmd_fsa_state[fsa_data->fsa_input][fsa_state]; /* * Remove certain actions during shutdown */ if (fsa_state == S_STOPPING || ((fsa_input_register & R_SHUTDOWN) == R_SHUTDOWN)) { clear_bit(fsa_actions, startup_actions); } /* * Hook for change of state. * Allows actions to be added or removed when entering a state */ if (last_state != fsa_state) { fsa_actions = do_state_transition(fsa_actions, last_state, fsa_state, fsa_data); } else { do_dot_log(DOT_PREFIX "\t// FSA input: State=%s \tCause=%s" " \tInput=%s \tOrigin=%s() \tid=%d", fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_input2string(fsa_data->fsa_input), fsa_data->origin, fsa_data->id); } /* start doing things... */ s_crmd_fsa_actions(fsa_data); delete_fsa_input(fsa_data); fsa_data = NULL; } if (g_list_length(fsa_message_queue) > 0 || fsa_actions != A_NOTHING || do_fsa_stall) { crm_debug("Exiting the FSA: queue=%d, fsa_actions=0x%llx, stalled=%s", g_list_length(fsa_message_queue), fsa_actions, do_fsa_stall ? "true" : "false"); } else { crm_trace("Exiting the FSA"); } /* cleanup inputs? */ if (register_copy != fsa_input_register) { long long same = register_copy & fsa_input_register; fsa_dump_inputs(LOG_DEBUG, "Added", fsa_input_register ^ same); fsa_dump_inputs(LOG_DEBUG, "Removed", register_copy ^ same); } fsa_dump_actions(fsa_actions, "Remaining"); fsa_dump_queue(LOG_DEBUG); return fsa_state; } void s_crmd_fsa_actions(fsa_data_t * fsa_data) { /* * Process actions in order of priority but do only one * action at a time to avoid complicating the ordering. */ CRM_CHECK(fsa_data != NULL, return); while (fsa_actions != A_NOTHING && do_fsa_stall == FALSE) { /* regular action processing in order of action priority * * Make sure all actions that connect to required systems * are performed first */ if (fsa_actions & A_ERROR) { do_fsa_action(fsa_data, A_ERROR, do_log); } else if (fsa_actions & A_WARN) { do_fsa_action(fsa_data, A_WARN, do_log); } else if (fsa_actions & A_LOG) { do_fsa_action(fsa_data, A_LOG, do_log); /* get out of here NOW! before anything worse happens */ } else if (fsa_actions & A_EXIT_1) { do_fsa_action(fsa_data, A_EXIT_1, do_exit); /* sub-system restart */ } else if ((fsa_actions & O_LRM_RECONNECT) == O_LRM_RECONNECT) { do_fsa_action(fsa_data, O_LRM_RECONNECT, do_lrm_control); } else if ((fsa_actions & O_CIB_RESTART) == O_CIB_RESTART) { do_fsa_action(fsa_data, O_CIB_RESTART, do_cib_control); } else if ((fsa_actions & O_PE_RESTART) == O_PE_RESTART) { do_fsa_action(fsa_data, O_PE_RESTART, do_pe_control); } else if ((fsa_actions & O_TE_RESTART) == O_TE_RESTART) { do_fsa_action(fsa_data, O_TE_RESTART, do_te_control); /* essential start tasks */ } else if (fsa_actions & A_STARTUP) { do_fsa_action(fsa_data, A_STARTUP, do_startup); } else if (fsa_actions & A_CIB_START) { do_fsa_action(fsa_data, A_CIB_START, do_cib_control); } else if (fsa_actions & A_HA_CONNECT) { do_fsa_action(fsa_data, A_HA_CONNECT, do_ha_control); } else if (fsa_actions & A_READCONFIG) { do_fsa_action(fsa_data, A_READCONFIG, do_read_config); /* sub-system start/connect */ } else if (fsa_actions & A_LRM_CONNECT) { do_fsa_action(fsa_data, A_LRM_CONNECT, do_lrm_control); } else if (fsa_actions & A_CCM_CONNECT) { #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { do_fsa_action(fsa_data, A_CCM_CONNECT, do_ccm_control); } #endif fsa_actions &= ~A_CCM_CONNECT; } else if (fsa_actions & A_TE_START) { do_fsa_action(fsa_data, A_TE_START, do_te_control); } else if (fsa_actions & A_PE_START) { do_fsa_action(fsa_data, A_PE_START, do_pe_control); /* Timers */ /* else if(fsa_actions & O_DC_TIMER_RESTART) { do_fsa_action(fsa_data, O_DC_TIMER_RESTART, do_timer_control) */ ; } else if (fsa_actions & A_DC_TIMER_STOP) { do_fsa_action(fsa_data, A_DC_TIMER_STOP, do_timer_control); } else if (fsa_actions & A_INTEGRATE_TIMER_STOP) { do_fsa_action(fsa_data, A_INTEGRATE_TIMER_STOP, do_timer_control); } else if (fsa_actions & A_INTEGRATE_TIMER_START) { do_fsa_action(fsa_data, A_INTEGRATE_TIMER_START, do_timer_control); } else if (fsa_actions & A_FINALIZE_TIMER_STOP) { do_fsa_action(fsa_data, A_FINALIZE_TIMER_STOP, do_timer_control); } else if (fsa_actions & A_FINALIZE_TIMER_START) { do_fsa_action(fsa_data, A_FINALIZE_TIMER_START, do_timer_control); /* * Highest priority actions */ } else if (fsa_actions & A_MSG_ROUTE) { do_fsa_action(fsa_data, A_MSG_ROUTE, do_msg_route); } else if (fsa_actions & A_RECOVER) { do_fsa_action(fsa_data, A_RECOVER, do_recover); } else if (fsa_actions & A_CL_JOIN_RESULT) { do_fsa_action(fsa_data, A_CL_JOIN_RESULT, do_cl_join_finalize_respond); } else if (fsa_actions & A_CL_JOIN_REQUEST) { do_fsa_action(fsa_data, A_CL_JOIN_REQUEST, do_cl_join_offer_respond); } else if (fsa_actions & A_SHUTDOWN_REQ) { do_fsa_action(fsa_data, A_SHUTDOWN_REQ, do_shutdown_req); } else if (fsa_actions & A_ELECTION_VOTE) { do_fsa_action(fsa_data, A_ELECTION_VOTE, do_election_vote); } else if (fsa_actions & A_ELECTION_COUNT) { do_fsa_action(fsa_data, A_ELECTION_COUNT, do_election_count_vote); } else if (fsa_actions & A_LRM_EVENT) { do_fsa_action(fsa_data, A_LRM_EVENT, do_lrm_event); /* * High priority actions */ } else if (fsa_actions & A_STARTED) { do_fsa_action(fsa_data, A_STARTED, do_started); } else if (fsa_actions & A_CL_JOIN_QUERY) { do_fsa_action(fsa_data, A_CL_JOIN_QUERY, do_cl_join_query); } else if (fsa_actions & A_DC_TIMER_START) { do_fsa_action(fsa_data, A_DC_TIMER_START, do_timer_control); /* * Medium priority actions * - Membership */ } else if (fsa_actions & A_DC_TAKEOVER) { do_fsa_action(fsa_data, A_DC_TAKEOVER, do_dc_takeover); } else if (fsa_actions & A_DC_RELEASE) { do_fsa_action(fsa_data, A_DC_RELEASE, do_dc_release); } else if (fsa_actions & A_DC_JOIN_FINAL) { do_fsa_action(fsa_data, A_DC_JOIN_FINAL, do_dc_join_final); } else if (fsa_actions & A_ELECTION_CHECK) { do_fsa_action(fsa_data, A_ELECTION_CHECK, do_election_check); } else if (fsa_actions & A_ELECTION_START) { do_fsa_action(fsa_data, A_ELECTION_START, do_election_vote); } else if (fsa_actions & A_DC_JOIN_OFFER_ALL) { do_fsa_action(fsa_data, A_DC_JOIN_OFFER_ALL, do_dc_join_offer_all); } else if (fsa_actions & A_DC_JOIN_OFFER_ONE) { do_fsa_action(fsa_data, A_DC_JOIN_OFFER_ONE, do_dc_join_offer_one); } else if (fsa_actions & A_DC_JOIN_PROCESS_REQ) { do_fsa_action(fsa_data, A_DC_JOIN_PROCESS_REQ, do_dc_join_filter_offer); } else if (fsa_actions & A_DC_JOIN_PROCESS_ACK) { do_fsa_action(fsa_data, A_DC_JOIN_PROCESS_ACK, do_dc_join_ack); } else if (fsa_actions & A_DC_JOIN_FINALIZE) { do_fsa_action(fsa_data, A_DC_JOIN_FINALIZE, do_dc_join_finalize); } else if (fsa_actions & A_CL_JOIN_ANNOUNCE) { do_fsa_action(fsa_data, A_CL_JOIN_ANNOUNCE, do_cl_join_announce); /* * Low(er) priority actions * Make sure the CIB is always updated before invoking the * PE, and the PE before the TE */ } else if (fsa_actions & A_TE_HALT) { do_fsa_action(fsa_data, A_TE_HALT, do_te_invoke); } else if (fsa_actions & A_TE_CANCEL) { do_fsa_action(fsa_data, A_TE_CANCEL, do_te_invoke); } else if (fsa_actions & A_LRM_INVOKE) { do_fsa_action(fsa_data, A_LRM_INVOKE, do_lrm_invoke); } else if (fsa_actions & A_PE_INVOKE) { do_fsa_action(fsa_data, A_PE_INVOKE, do_pe_invoke); } else if (fsa_actions & A_TE_INVOKE) { do_fsa_action(fsa_data, A_TE_INVOKE, do_te_invoke); /* Shutdown actions */ } else if (fsa_actions & A_DC_RELEASED) { do_fsa_action(fsa_data, A_DC_RELEASED, do_dc_release); } else if (fsa_actions & A_PE_STOP) { do_fsa_action(fsa_data, A_PE_STOP, do_pe_control); } else if (fsa_actions & A_TE_STOP) { do_fsa_action(fsa_data, A_TE_STOP, do_te_control); } else if (fsa_actions & A_SHUTDOWN) { do_fsa_action(fsa_data, A_SHUTDOWN, do_shutdown); } else if (fsa_actions & A_LRM_DISCONNECT) { do_fsa_action(fsa_data, A_LRM_DISCONNECT, do_lrm_control); } else if (fsa_actions & A_CCM_DISCONNECT) { #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { do_fsa_action(fsa_data, A_CCM_DISCONNECT, do_ccm_control); } #endif fsa_actions &= ~A_CCM_DISCONNECT; } else if (fsa_actions & A_HA_DISCONNECT) { do_fsa_action(fsa_data, A_HA_DISCONNECT, do_ha_control); } else if (fsa_actions & A_CIB_STOP) { do_fsa_action(fsa_data, A_CIB_STOP, do_cib_control); } else if (fsa_actions & A_STOP) { do_fsa_action(fsa_data, A_STOP, do_stop); /* exit gracefully */ } else if (fsa_actions & A_EXIT_0) { do_fsa_action(fsa_data, A_EXIT_0, do_exit); /* Error checking and reporting */ } else { crm_err("Action %s (0x%llx) not supported ", fsa_action2string(fsa_actions), fsa_actions); register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, fsa_data, NULL, __FUNCTION__); } } } void log_fsa_input(fsa_data_t * stored_msg) { CRM_ASSERT(stored_msg); crm_trace("Processing queued input %d", stored_msg->id); if (stored_msg->fsa_cause == C_CCM_CALLBACK) { crm_trace("FSA processing CCM callback from %s", stored_msg->origin); } else if (stored_msg->fsa_cause == C_LRM_OP_CALLBACK) { crm_trace("FSA processing LRM callback from %s", stored_msg->origin); } else if (stored_msg->data == NULL) { crm_trace("FSA processing input from %s", stored_msg->origin); } else { ha_msg_input_t *ha_input = fsa_typed_data_adv(stored_msg, fsa_dt_ha_msg, __FUNCTION__); crm_trace("FSA processing XML message from %s", stored_msg->origin); crm_log_xml_trace(ha_input->xml, "FSA message data"); } } long long do_state_transition(long long actions, enum crmd_fsa_state cur_state, enum crmd_fsa_state next_state, fsa_data_t * msg_data) { int level = LOG_INFO; long long tmp = actions; gboolean clear_recovery_bit = TRUE; enum crmd_fsa_cause cause = msg_data->fsa_cause; enum crmd_fsa_input current_input = msg_data->fsa_input; const char *state_from = fsa_state2string(cur_state); const char *state_to = fsa_state2string(next_state); const char *input = fsa_input2string(current_input); CRM_LOG_ASSERT(cur_state != next_state); do_dot_log(DOT_PREFIX "\t%s -> %s [ label=%s cause=%s origin=%s ]", state_from, state_to, input, fsa_cause2string(cause), msg_data->origin); if (cur_state == S_IDLE || next_state == S_IDLE) { level = LOG_NOTICE; } else if (cur_state == S_NOT_DC || next_state == S_NOT_DC) { level = LOG_NOTICE; } else if (cur_state == S_ELECTION) { level = LOG_NOTICE; } else if (cur_state == S_STARTING) { level = LOG_NOTICE; } else if (next_state == S_RECOVERY) { level = LOG_WARNING; } do_crm_log(level, "State transition %s -> %s [ input=%s cause=%s origin=%s ]", state_from, state_to, input, fsa_cause2string(cause), msg_data->origin); /* the last two clauses might cause trouble later */ if (next_state != S_ELECTION && cur_state != S_RELEASE_DC) { election_timeout_stop(fsa_election); /* } else { */ /* crm_timer_start(election_timeout); */ } #if 0 if ((fsa_input_register & R_SHUTDOWN)) { set_bit(tmp, A_DC_TIMER_STOP); } #endif if (next_state == S_INTEGRATION) { set_bit(tmp, A_INTEGRATE_TIMER_START); } else { set_bit(tmp, A_INTEGRATE_TIMER_STOP); } if (next_state == S_FINALIZE_JOIN) { set_bit(tmp, A_FINALIZE_TIMER_START); } else { set_bit(tmp, A_FINALIZE_TIMER_STOP); } if (next_state != S_PENDING) { set_bit(tmp, A_DC_TIMER_STOP); } if (next_state != S_ELECTION) { highest_born_on = 0; } if (next_state != S_IDLE) { crm_timer_stop(recheck_timer); } if (cur_state == S_FINALIZE_JOIN && next_state == S_POLICY_ENGINE) { populate_cib_nodes(node_update_quick|node_update_all, __FUNCTION__); } switch (next_state) { case S_PENDING: fsa_cib_conn->cmds->set_slave(fsa_cib_conn, cib_scope_local); /* fall through */ case S_ELECTION: crm_trace("Resetting our DC to NULL on transition to %s", fsa_state2string(next_state)); update_dc(NULL); break; case S_NOT_DC: election_trigger->counter = 0; - if (stonith_cleanup_list) { - GListPtr gIter = NULL; + purge_stonith_cleanup(); - for (gIter = stonith_cleanup_list; gIter != NULL; gIter = gIter->next) { - char *target = gIter->data; - - crm_info("Purging %s from stonith cleanup list", target); - free(target); - } - g_list_free(stonith_cleanup_list); - stonith_cleanup_list = NULL; - } if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_info("(Re)Issuing shutdown request now" " that we have a new DC"); set_bit(tmp, A_SHUTDOWN_REQ); } CRM_LOG_ASSERT(fsa_our_dc != NULL); if (fsa_our_dc == NULL) { crm_err("Reached S_NOT_DC without a DC" " being recorded"); } break; case S_RECOVERY: clear_recovery_bit = FALSE; break; case S_FINALIZE_JOIN: CRM_LOG_ASSERT(AM_I_DC); if (cause == C_TIMER_POPPED) { crm_warn("Progressed to state %s after %s", fsa_state2string(next_state), fsa_cause2string(cause)); } if (crmd_join_phase_count(crm_join_welcomed) > 0) { crm_warn("%u cluster nodes failed to respond" " to the join offer.", crmd_join_phase_count(crm_join_welcomed)); crmd_join_phase_log(LOG_NOTICE); } else { crm_debug("All %d cluster nodes responded to the join offer.", crmd_join_phase_count(crm_join_integrated)); } break; case S_POLICY_ENGINE: election_trigger->counter = 0; CRM_LOG_ASSERT(AM_I_DC); if (cause == C_TIMER_POPPED) { crm_info("Progressed to state %s after %s", fsa_state2string(next_state), fsa_cause2string(cause)); } if (crmd_join_phase_count(crm_join_finalized) > 0) { crm_err("%u cluster nodes failed to confirm their join.", crmd_join_phase_count(crm_join_finalized)); crmd_join_phase_log(LOG_NOTICE); } else if (crmd_join_phase_count(crm_join_confirmed) == crm_active_peers()) { crm_debug("All %u cluster nodes are" " eligible to run resources.", crm_active_peers()); } else if (crmd_join_phase_count(crm_join_confirmed) > crm_active_peers()) { crm_err("We have more confirmed nodes than our membership does: %d vs. %d", crmd_join_phase_count(crm_join_confirmed), crm_active_peers()); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); } else if (saved_ccm_membership_id != crm_peer_seq) { crm_info("Membership changed: %llu -> %llu - join restart", saved_ccm_membership_id, crm_peer_seq); register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } else { crm_warn("Only %u of %u cluster " "nodes are eligible to run resources - continue %d", crmd_join_phase_count(crm_join_confirmed), crm_active_peers(), crmd_join_phase_count(crm_join_welcomed)); } /* initialize_join(FALSE); */ break; case S_STOPPING: case S_TERMINATE: /* possibly redundant */ set_bit(fsa_input_register, R_SHUTDOWN); break; case S_IDLE: CRM_LOG_ASSERT(AM_I_DC); dump_rsc_info(); if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_info("(Re)Issuing shutdown request now" " that we are the DC"); set_bit(tmp, A_SHUTDOWN_REQ); } if (recheck_timer->period_ms > 0) { crm_debug("Starting %s", get_timer_desc(recheck_timer)); crm_timer_start(recheck_timer); } break; default: break; } if (clear_recovery_bit && next_state != S_PENDING) { tmp &= ~A_RECOVER; } else if (clear_recovery_bit == FALSE) { tmp |= A_RECOVER; } if (tmp != actions) { /* fsa_dump_actions(actions ^ tmp, "New actions"); */ actions = tmp; } return actions; } void dump_rsc_info(void) { } void ghash_print_node(gpointer key, gpointer value, gpointer user_data) { const char *text = user_data; const char *uname = key; const char *value_s = value; crm_info("%s: %s %s", text, uname, value_s); } diff --git a/crmd/te_utils.c b/crmd/te_utils.c index 22551ba0d3..d3d8f71a5a 100644 --- a/crmd/te_utils.c +++ b/crmd/te_utils.c @@ -1,548 +1,636 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include crm_trigger_t *stonith_reconnect = NULL; -GListPtr stonith_cleanup_list = NULL; + +/* + * stonith cleanup list + * + * If the DC is shot, proper notifications might not go out. + * The stonith cleanup list allows the cluster to (re-)send + * notifications once a new DC is elected. + */ + +static GListPtr stonith_cleanup_list = NULL; + +/*! + * \internal + * \brief Add a node to the stonith cleanup list + * + * \param[in] target Name of node to add + */ +void +add_stonith_cleanup(const char *target) { + stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target)); +} + +/*! + * \internal + * \brief Remove a node from the stonith cleanup list + * + * \param[in] Name of node to remove + */ +void +remove_stonith_cleanup(const char *target) +{ + GListPtr iter = stonith_cleanup_list; + + while (iter != NULL) { + GListPtr tmp = iter; + char *iter_name = tmp->data; + + iter = iter->next; + if (safe_str_eq(target, iter_name)) { + crm_trace("Removing %s from the cleanup list", iter_name); + stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp); + free(iter_name); + } + } +} + +/*! + * \internal + * \brief Purge all entries from the stonith cleanup list + */ +void +purge_stonith_cleanup() +{ + if (stonith_cleanup_list) { + GListPtr iter = NULL; + + for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { + char *target = iter->data; + + crm_info("Purging %s from stonith cleanup list", target); + free(target); + } + g_list_free(stonith_cleanup_list); + stonith_cleanup_list = NULL; + } +} + +/*! + * \internal + * \brief Send stonith updates for all entries in cleanup list, then purge it + */ +void +execute_stonith_cleanup() +{ + GListPtr iter; + + for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { + char *target = iter->data; + crm_node_t *target_node = crm_get_peer(0, target); + const char *uuid = crm_peer_uuid(target_node); + + crm_notice("Marking %s, target of a previous stonith action, as clean", target); + send_stonith_update(NULL, target, uuid); + free(target); + } + g_list_free(stonith_cleanup_list); + stonith_cleanup_list = NULL; +} + +/* end stonith cleanup list functions */ static gboolean fail_incompletable_stonith(crm_graph_t * graph) { GListPtr lpc = NULL; const char *task = NULL; xmlNode *last_action = NULL; if (graph == NULL) { return FALSE; } for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { GListPtr lpc2 = NULL; synapse_t *synapse = (synapse_t *) lpc->data; if (synapse->confirmed) { continue; } for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) { crm_action_t *action = (crm_action_t *) lpc2->data; if (action->type != action_type_crm || action->confirmed) { continue; } task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (task && safe_str_eq(task, CRM_OP_FENCE)) { action->failed = TRUE; last_action = action->xml; update_graph(graph, action); crm_notice("Failing action %d (%s): STONITHd terminated", action->id, ID(action->xml)); } } } if (last_action != NULL) { crm_warn("STONITHd failure resulted in un-runnable actions"); abort_transition(INFINITY, tg_restart, "Stonith failure", last_action); return TRUE; } return FALSE; } static void tengine_stonith_connection_destroy(stonith_t * st, stonith_event_t * e) { if (is_set(fsa_input_register, R_ST_REQUIRED)) { crm_crit("Fencing daemon connection failed"); mainloop_set_trigger(stonith_reconnect); } else { crm_info("Fencing daemon disconnected"); } /* cbchan will be garbage at this point, arrange for it to be reset */ if(stonith_api) { stonith_api->state = stonith_disconnected; } if (AM_I_DC) { fail_incompletable_stonith(transition_graph); trigger_graph(); } } #if SUPPORT_CMAN # include #endif char *te_client_id = NULL; #ifdef HAVE_SYS_REBOOT_H # include # include #endif static void tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event) { if(te_client_id == NULL) { te_client_id = crm_strdup_printf("%s.%d", crm_system_name, getpid()); } if (st_event == NULL) { crm_err("Notify data not found"); return; } crmd_notify_fencing_op(st_event); if (st_event->result == pcmk_ok && safe_str_eq("on", st_event->action)) { crm_notice("%s was successfully unfenced by %s (at the request of %s)", st_event->target, st_event->executioner ? st_event->executioner : "", st_event->origin); /* TODO: Hook up st_event->device */ return; } else if (safe_str_eq("on", st_event->action)) { crm_err("Unfencing of %s by %s failed: %s (%d)", st_event->target, st_event->executioner ? st_event->executioner : "", pcmk_strerror(st_event->result), st_event->result); return; } else if (st_event->result == pcmk_ok && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) { crm_crit("We were allegedly just fenced by %s for %s!", st_event->executioner ? st_event->executioner : "", st_event->origin); /* Dumps blackbox if enabled */ qb_log_fini(); /* Try to get the above log message to disk - somehow */ /* Get out ASAP and do not come back up. * * Triggering a reboot is also not the worst idea either since * the rest of the cluster thinks we're safely down */ #ifdef RB_HALT_SYSTEM reboot(RB_HALT_SYSTEM); #endif /* * If reboot() fails or is not supported, coming back up will * probably lead to a situation where the other nodes set our * status to 'lost' because of the fencing callback and will * discard subsequent election votes with: * * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster) * * So just stay dead, something is seriously messed up anyway. * */ exit(100); /* None of our wrappers since we already called qb_log_fini() */ return; } if (st_event->result == pcmk_ok && safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) { st_fail_count_reset(st_event->target); } crm_notice("Peer %s was%s terminated (%s) by %s for %s: %s (ref=%s) by client %s", st_event->target, st_event->result == pcmk_ok ? "" : " not", st_event->action, st_event->executioner ? st_event->executioner : "", st_event->origin, pcmk_strerror(st_event->result), st_event->id, st_event->client_origin ? st_event->client_origin : ""); #if SUPPORT_CMAN if (st_event->result == pcmk_ok && is_cman_cluster()) { int local_rc = 0; int confirm = 0; char *target_copy = strdup(st_event->target); /* In case fenced hasn't noticed yet * * Any fencing that has been inititated will be completed by way of the fence_pcmk redirect */ local_rc = fenced_external(target_copy); if (local_rc != 0) { crm_err("Could not notify CMAN that '%s' is now fenced: %d", st_event->target, local_rc); } else { crm_notice("Notified CMAN that '%s' is now fenced", st_event->target); } /* In case fenced is already trying to shoot it */ confirm = open("/var/run/cluster/fenced_override", O_NONBLOCK|O_WRONLY); if (confirm > 0) { int ignore = 0; int len = strlen(target_copy); errno = 0; local_rc = write(confirm, target_copy, len); ignore = write(confirm, "\n", 1); if(ignore < 0 && errno == EBADF) { crm_trace("CMAN not expecting %s to be fenced (yet)", st_event->target); } else if (local_rc < len) { crm_perror(LOG_ERR, "Confirmation of CMAN fencing event for '%s' failed: %d", st_event->target, local_rc); } else { fsync(confirm); crm_notice("Confirmed CMAN fencing event for '%s'", st_event->target); } close(confirm); } free(target_copy); } #endif if (st_event->result == pcmk_ok) { crm_node_t *peer = crm_find_peer_full(0, st_event->target, CRM_GET_PEER_REMOTE | CRM_GET_PEER_CLUSTER); const char *uuid = NULL; gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname); if (peer == NULL) { return; } uuid = crm_peer_uuid(peer); crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); if(AM_I_DC) { /* The DC always sends updates */ send_stonith_update(NULL, st_event->target, uuid); if (st_event->client_origin && safe_str_neq(st_event->client_origin, te_client_id)) { /* Abort the current transition graph if it wasn't us * that invoked stonith to fence someone */ crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); } /* Assume it was our leader if we dont currently have one */ } else if (fsa_our_dc == NULL || safe_str_eq(fsa_our_dc, st_event->target)) { crm_notice("Target %s our leader %s (recorded: %s)", fsa_our_dc ? "was" : "may have been", st_event->target, fsa_our_dc ? fsa_our_dc : ""); /* Given the CIB resyncing that occurs around elections, * have one node update the CIB now and, if the new DC is different, * have them do so too after the election */ if (we_are_executioner) { send_stonith_update(NULL, st_event->target, uuid); } - stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(st_event->target)); - + add_stonith_cleanup(st_event->target); } crmd_peer_down(peer, TRUE); } } gboolean te_connect_stonith(gpointer user_data) { int lpc = 0; int rc = pcmk_ok; if (stonith_api == NULL) { stonith_api = stonith_api_new(); } if (stonith_api->state != stonith_disconnected) { crm_trace("Still connected"); return TRUE; } for (lpc = 0; lpc < 30; lpc++) { crm_debug("Attempting connection to fencing daemon..."); sleep(1); rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); if (rc == pcmk_ok) { break; } if (user_data != NULL) { if (is_set(fsa_input_register, R_ST_REQUIRED)) { crm_err("Sign-in failed: triggered a retry"); mainloop_set_trigger(stonith_reconnect); } else { crm_info("Sign-in failed, but no longer required"); } return TRUE; } crm_err("Sign-in failed: pausing and trying again in 2s..."); sleep(1); } CRM_CHECK(rc == pcmk_ok, return TRUE); /* If not, we failed 30 times... just get out */ stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT, tengine_stonith_connection_destroy); stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_FENCE, tengine_stonith_notify); crm_trace("Connected"); return TRUE; } gboolean stop_te_timer(crm_action_timer_t * timer) { const char *timer_desc = "action timer"; if (timer == NULL) { return FALSE; } if (timer->reason == timeout_abort) { timer_desc = "global timer"; crm_trace("Stopping %s", timer_desc); } if (timer->source_id != 0) { crm_trace("Stopping %s", timer_desc); g_source_remove(timer->source_id); timer->source_id = 0; } else { crm_trace("%s was already stopped", timer_desc); return FALSE; } return TRUE; } gboolean te_graph_trigger(gpointer user_data) { enum transition_status graph_rc = -1; if (transition_graph == NULL) { crm_debug("Nothing to do"); return TRUE; } crm_trace("Invoking graph %d in state %s", transition_graph->id, fsa_state2string(fsa_state)); switch (fsa_state) { case S_STARTING: case S_PENDING: case S_NOT_DC: case S_HALT: case S_ILLEGAL: case S_STOPPING: case S_TERMINATE: return TRUE; break; default: break; } if (transition_graph->complete == FALSE) { int limit = transition_graph->batch_limit; transition_graph->batch_limit = throttle_get_total_job_limit(limit); graph_rc = run_graph(transition_graph); transition_graph->batch_limit = limit; /* Restore the configured value */ /* significant overhead... */ /* print_graph(LOG_DEBUG_3, transition_graph); */ if (graph_rc == transition_active) { crm_trace("Transition not yet complete"); return TRUE; } else if (graph_rc == transition_pending) { crm_trace("Transition not yet complete - no actions fired"); return TRUE; } if (graph_rc != transition_complete) { crm_warn("Transition failed: %s", transition_status(graph_rc)); print_graph(LOG_NOTICE, transition_graph); } } crm_debug("Transition %d is now complete", transition_graph->id); transition_graph->complete = TRUE; notify_crmd(transition_graph); return TRUE; } void trigger_graph_processing(const char *fn, int line) { crm_trace("%s:%d - Triggered graph processing", fn, line); mainloop_set_trigger(transition_trigger); } void abort_transition_graph(int abort_priority, enum transition_action abort_action, const char *abort_text, xmlNode * reason, const char *fn, int line) { int add[] = { 0, 0, 0 }; int del[] = { 0, 0, 0 }; int level = LOG_INFO; xmlNode *diff = NULL; xmlNode *change = NULL; CRM_CHECK(transition_graph != NULL, return); switch (fsa_state) { case S_STARTING: case S_PENDING: case S_NOT_DC: case S_HALT: case S_ILLEGAL: case S_STOPPING: case S_TERMINATE: crm_info("Abort %s suppressed: state=%s (complete=%d)", abort_text, fsa_state2string(fsa_state), transition_graph->complete); return; default: break; } /* Make sure any queued calculations are discarded ASAP */ free(fsa_pe_ref); fsa_pe_ref = NULL; if (transition_graph->complete == FALSE) { if(update_abort_priority(transition_graph, abort_priority, abort_action, abort_text)) { level = LOG_NOTICE; } } if(reason) { xmlNode *search = NULL; for(search = reason; search; search = search->parent) { if (safe_str_eq(XML_TAG_DIFF, TYPE(search))) { diff = search; break; } } if(diff) { xml_patch_versions(diff, add, del); for(search = reason; search; search = search->parent) { if (safe_str_eq(XML_DIFF_CHANGE, TYPE(search))) { change = search; break; } } } } if(reason == NULL) { do_crm_log(level, "Transition aborted: %s (source=%s:%d, %d)", abort_text, fn, line, transition_graph->complete); } else if(change == NULL) { char *local_path = xml_get_path(reason); do_crm_log(level, "Transition aborted by %s.%s: %s (cib=%d.%d.%d, source=%s:%d, path=%s, %d)", TYPE(reason), ID(reason), abort_text, add[0], add[1], add[2], fn, line, local_path, transition_graph->complete); free(local_path); } else { const char *kind = NULL; const char *op = crm_element_value(change, XML_DIFF_OP); const char *path = crm_element_value(change, XML_DIFF_PATH); if(change == reason) { if(strcmp(op, "create") == 0) { reason = reason->children; } else if(strcmp(op, "modify") == 0) { reason = first_named_child(reason, XML_DIFF_RESULT); if(reason) { reason = reason->children; } } } kind = TYPE(reason); if(strcmp(op, "delete") == 0) { const char *shortpath = strrchr(path, '/'); do_crm_log(level, "Transition aborted by deletion of %s: %s (cib=%d.%d.%d, source=%s:%d, path=%s, %d)", shortpath?shortpath+1:path, abort_text, add[0], add[1], add[2], fn, line, path, transition_graph->complete); } else if (safe_str_eq(XML_CIB_TAG_NVPAIR, kind)) { do_crm_log(level, "Transition aborted by %s, %s=%s: %s (%s cib=%d.%d.%d, source=%s:%d, path=%s, %d)", crm_element_value(reason, XML_ATTR_ID), crm_element_value(reason, XML_NVPAIR_ATTR_NAME), crm_element_value(reason, XML_NVPAIR_ATTR_VALUE), abort_text, op, add[0], add[1], add[2], fn, line, path, transition_graph->complete); } else if (safe_str_eq(XML_LRM_TAG_RSC_OP, kind)) { const char *magic = crm_element_value(reason, XML_ATTR_TRANSITION_MAGIC); do_crm_log(level, "Transition aborted by %s '%s' on %s: %s (magic=%s, cib=%d.%d.%d, source=%s:%d, %d)", crm_element_value(reason, XML_LRM_ATTR_TASK_KEY), op, crm_element_value(reason, XML_LRM_ATTR_TARGET), abort_text, magic, add[0], add[1], add[2], fn, line, transition_graph->complete); } else if (safe_str_eq(XML_CIB_TAG_STATE, kind) || safe_str_eq(XML_CIB_TAG_NODE, kind)) { const char *uname = crm_peer_uname(ID(reason)); do_crm_log(level, "Transition aborted by %s '%s' on %s: %s (cib=%d.%d.%d, source=%s:%d, %d)", kind, op, uname ? uname : ID(reason), abort_text, add[0], add[1], add[2], fn, line, transition_graph->complete); } else { do_crm_log(level, "Transition aborted by %s.%s '%s': %s (cib=%d.%d.%d, source=%s:%d, path=%s, %d)", TYPE(reason), ID(reason), op?op:"change", abort_text, add[0], add[1], add[2], fn, line, path, transition_graph->complete); } } if (transition_graph->complete) { if (transition_timer->period_ms > 0) { crm_timer_stop(transition_timer); crm_timer_start(transition_timer); } else { register_fsa_input(C_FSA_INTERNAL, I_PE_CALC, NULL); } return; } mainloop_set_trigger(transition_trigger); } diff --git a/crmd/tengine.h b/crmd/tengine.h index 1765dd439e..94a869314a 100644 --- a/crmd/tengine.h +++ b/crmd/tengine.h @@ -1,78 +1,83 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef TENGINE__H # define TENGINE__H # include # include # include # include extern stonith_t *stonith_api; -extern GListPtr stonith_cleanup_list; extern void send_stonith_update(crm_action_t * stonith_action, const char *target, const char *uuid); +/* stonith cleanup list */ +void add_stonith_cleanup(const char *target); +void remove_stonith_cleanup(const char *target); +void purge_stonith_cleanup(void); +void execute_stonith_cleanup(void); + /* tengine */ extern crm_action_t *match_down_event(int rc, const char *target, const char *filter, bool quiet); extern crm_action_t *get_cancel_action(const char *id, const char *node); extern gboolean cib_action_update(crm_action_t * action, int status, int op_rc); extern gboolean fail_incompletable_actions(crm_graph_t * graph, const char *down_node); extern gboolean process_graph_event(xmlNode * event, const char *event_node); /* utils */ extern crm_action_t *get_action(int id, gboolean confirmed); extern gboolean start_global_timer(crm_action_timer_t * timer, int timeout); extern gboolean stop_te_timer(crm_action_timer_t * timer); extern const char *get_rsc_state(const char *task, enum op_status status); /* unpack */ extern gboolean process_te_message(xmlNode * msg, xmlNode * xml_data); extern crm_graph_t *transition_graph; extern crm_trigger_t *transition_trigger; extern char *te_uuid; extern void notify_crmd(crm_graph_t * graph); # include extern void trigger_graph_processing(const char *fn, int line); extern void abort_transition_graph(int abort_priority, enum transition_action abort_action, const char *abort_text, xmlNode * reason, const char *fn, int line); # define trigger_graph() trigger_graph_processing(__FUNCTION__, __LINE__) # define abort_transition(pri, action, text, reason) \ abort_transition_graph(pri, action, text, reason,__FUNCTION__,__LINE__); extern gboolean te_connect_stonith(gpointer user_data); extern crm_trigger_t *transition_trigger; extern crm_trigger_t *stonith_reconnect; extern char *failed_stop_offset; extern char *failed_start_offset; extern int active_timeout; extern int stonith_op_active; void te_action_confirmed(crm_action_t * action); void te_reset_job_counts(void); #endif