diff --git a/crmd/callbacks.c b/crmd/callbacks.c index 6d0dffe437..8c8ba6a5cf 100644 --- a/crmd/callbacks.c +++ b/crmd/callbacks.c @@ -1,668 +1,670 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include void crmd_ha_connection_destroy(gpointer user_data); void crmd_ha_msg_filter(xmlNode *msg); /* From join_dc... */ extern gboolean check_join_state( enum crmd_fsa_state cur_state, const char *source); #define trigger_fsa(source) crm_debug_3("Triggering FSA: %s", __FUNCTION__); \ mainloop_set_trigger(source); #if SUPPORT_HEARTBEAT gboolean crmd_ha_msg_dispatch(ll_cluster_t *cluster_conn, gpointer user_data) { IPC_Channel *channel = NULL; gboolean stay_connected = TRUE; crm_debug_3("Invoked"); if(cluster_conn != NULL) { channel = cluster_conn->llc_ops->ipcchan(cluster_conn); } CRM_CHECK(cluster_conn != NULL, ;); CRM_CHECK(channel != NULL, ;); if(channel != NULL && IPC_ISRCONN(channel)) { if(cluster_conn->llc_ops->msgready(cluster_conn) == 0) { crm_debug_2("no message ready yet"); } /* invoke the callbacks but dont block */ cluster_conn->llc_ops->rcvmsg(cluster_conn, 0); } if (channel == NULL || channel->ch_status != IPC_CONNECT) { if(is_set(fsa_input_register, R_HA_DISCONNECTED) == FALSE) { crm_crit("Lost connection to heartbeat service."); } else { crm_info("Lost connection to heartbeat service."); } trigger_fsa(fsa_source); stay_connected = FALSE; } return stay_connected; } #endif void crmd_ha_connection_destroy(gpointer user_data) { crm_debug_3("Invoked"); if(is_set(fsa_input_register, R_HA_DISCONNECTED)) { /* we signed out, so this is expected */ crm_info("Heartbeat disconnection complete"); return; } crm_crit("Lost connection to heartbeat service!"); register_fsa_input(C_HA_DISCONNECT, I_ERROR, NULL); trigger_fsa(fsa_source); } void crmd_ha_msg_filter(xmlNode *msg) { if(AM_I_DC) { const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); if(safe_str_eq(sys_from, CRM_SYSTEM_DC)) { const char *from = crm_element_value(msg, F_ORIG); if(safe_str_neq(from, fsa_our_uname)) { int level = LOG_INFO; const char *op = crm_element_value(msg, F_CRM_TASK); /* make sure the election happens NOW */ if(fsa_state != S_ELECTION) { ha_msg_input_t new_input; level = LOG_ERR; new_input.msg = msg; register_fsa_error_adv( C_FSA_INTERNAL, I_ELECTION, NULL, &new_input, __FUNCTION__); } do_crm_log(level, "Another DC detected: %s (op=%s)", from, op); goto done; } } } else { const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO); if(safe_str_eq(sys_to, CRM_SYSTEM_DC)) { return; } } /* crm_log_xml(LOG_MSG, "HA[inbound]", msg); */ route_message(C_HA_MESSAGE, msg); done: trigger_fsa(fsa_source); } #if SUPPORT_HEARTBEAT void crmd_ha_msg_callback(HA_Message *hamsg, void* private_data) { int level = LOG_DEBUG; crm_node_t *from_node = NULL; xmlNode *msg = convert_ha_message(NULL, hamsg, __FUNCTION__); const char *from = crm_element_value(msg, F_ORIG); const char *op = crm_element_value(msg, F_CRM_TASK); const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); CRM_CHECK(from != NULL, crm_log_xml_err(msg, "anon"); goto bail); crm_debug_2("HA[inbound]: %s from %s", op, from); if(crm_peer_cache == NULL || crm_active_members() == 0) { crm_debug("Ignoring HA messages until we are" " connected to the CCM (%s op from %s)", op, from); crm_log_xml(LOG_MSG, "HA[inbound]: Ignore (No CCM)", msg); goto bail; } from_node = crm_get_peer(0, from); if(crm_is_member_active(from_node) == FALSE) { if(safe_str_eq(op, CRM_OP_VOTE)) { level = LOG_WARNING; } else if(AM_I_DC && safe_str_eq(op, CRM_OP_JOIN_ANNOUNCE)) { level = LOG_WARNING; } else if(safe_str_eq(sys_from, CRM_SYSTEM_DC)) { level = LOG_WARNING; } do_crm_log(level, "Ignoring HA message (op=%s) from %s: not in our" " membership list (size=%d)", op, from, crm_active_members()); crm_log_xml(LOG_MSG, "HA[inbound]: CCM Discard", msg); } else { crmd_ha_msg_filter(msg); } bail: free_xml(msg); return; } #endif /* * Apparently returning TRUE means "stay connected, keep doing stuff". * Returning FALSE means "we're all done, close the connection" */ gboolean crmd_ipc_msg_callback(IPC_Channel *client, gpointer user_data) { int lpc = 0; xmlNode *msg = NULL; crmd_client_t *curr_client = (crmd_client_t*)user_data; gboolean stay_connected = TRUE; crm_debug_2("Invoked: %s", curr_client->table_key); while(IPC_ISRCONN(client)) { if(client->ops->is_message_pending(client) == 0) { break; } msg = xmlfromIPC(client, MAX_IPC_DELAY); if (msg == NULL) { break; } #if ENABLE_ACL determine_request_user(&curr_client->user, client, msg, F_CRM_USER); #endif lpc++; crm_debug_2("Processing msg from %s", curr_client->table_key); crm_log_xml(LOG_DEBUG_2, "CRMd[inbound]", msg); if(crmd_authorize_message(msg, curr_client)) { route_message(C_IPC_MESSAGE, msg); } free_xml(msg); msg = NULL; if(client->ch_status != IPC_CONNECT) { break; } } crm_debug_2("Processed %d messages", lpc); if (client->ch_status != IPC_CONNECT) { stay_connected = FALSE; process_client_disconnect(curr_client); } trigger_fsa(fsa_source); return stay_connected; } extern GCHSource *lrm_source; gboolean lrm_dispatch(IPC_Channel *src_not_used, gpointer user_data) { /* ?? src == lrm_channel ?? */ ll_lrm_t *lrm = (ll_lrm_t*)user_data; IPC_Channel *lrm_channel = lrm->lrm_ops->ipcchan(lrm); lrm->lrm_ops->rcvmsg(lrm, FALSE); if(lrm_channel->ch_status != IPC_CONNECT) { lrm_connection_destroy(NULL); return FALSE; } return TRUE; } extern gboolean process_lrm_event(lrm_op_t *op); void lrm_op_callback(lrm_op_t* op) { CRM_CHECK(op != NULL, return); process_lrm_event(op); } static void crmd_peer_update(crm_node_t *member, enum crm_proc_flag client) { const char *status = NULL; CRM_CHECK(member != NULL, return); status = (member->processes&client)?ONLINESTATUS:OFFLINESTATUS; crm_notice("Status update: Client %s/%s now has status [%s] (DC=%s)", member->uname, peer2text(client), status, AM_I_DC?"true":crm_str(fsa_our_dc)); if((client & crm_proc_crmd) == 0) { return; } else if(is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { return; } else if(fsa_state == S_STOPPING) { return; } if(safe_str_eq(member->uname, fsa_our_dc) && crm_is_full_member(member) == FALSE){ /* Did the DC leave us? */ crm_info("Got client status callback - our DC is dead"); register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL); } else if(AM_I_DC) { xmlNode *update = NULL; update = create_node_state( member->uname, NULL, NULL, status, NULL, NULL, FALSE, __FUNCTION__); fsa_cib_anon_update( XML_CIB_TAG_STATUS, update, cib_scope_local|cib_quorum_override|cib_can_create); free_xml(update); if((member->processes & client) == 0) { erase_node_from_join(member->uname); check_join_state(fsa_state, __FUNCTION__); + fail_incompletable_actions(transition_graph, member->uuid); } else { register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } } trigger_fsa(fsa_source); } void ais_status_callback(enum crm_status_type type, crm_node_t *node, const void *data) { gboolean reset_status_entry = FALSE; uint32_t old = 0 ; set_bit_inplace(fsa_input_register, R_PEER_DATA); if(node->uname == NULL) { return; } switch(type) { case crm_status_uname: crm_info("status: %s is now %s", node->uname, node->state); /* reset_status_entry = TRUE; */ /* If we've never seen the node, then it also wont be in the status section */ break; case crm_status_nstate: crm_info("status: %s is now %s (was %s)", node->uname, node->state, (const char *)data); reset_status_entry = TRUE; break; case crm_status_processes: if (data) { old = *(const uint32_t *)data; } if( (node->processes ^ old) & crm_proc_crmd ) { crmd_peer_update(node, crm_proc_crmd); } break; } /* Can this be removed now that do_cl_join_finalize_respond() does the same thing? */ if(AM_I_DC && reset_status_entry && safe_str_eq(CRMD_STATE_ACTIVE, node->state)) { erase_status_tag(node->uname, XML_CIB_TAG_LRM, cib_scope_local); erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); /* TODO: potentially we also want to set XML_CIB_ATTR_JOINSTATE and XML_CIB_ATTR_EXPSTATE here */ } } void crmd_ha_status_callback(const char *node, const char *status, void *private) { xmlNode *update = NULL; crm_node_t *member = NULL; crm_notice("Status update: Node %s now has status [%s]", node, status); member = crm_get_peer(0, node); if(member == NULL || crm_is_member_active(member) == FALSE) { /* Make sure it is created so crm_update_peer_proc() succeeds */ const char *uuid = get_uuid(node); member = crm_update_peer(0, 0, 0, -1, 0, uuid, node, NULL, NULL); } if(safe_str_eq(status, PINGSTATUS)) { return; } if(safe_str_eq(status, DEADSTATUS)) { /* this node is toast */ crm_update_peer_proc(node, crm_proc_ais, OFFLINESTATUS); if(AM_I_DC) { update = create_node_state( node, DEADSTATUS, XML_BOOLEAN_NO, OFFLINESTATUS, CRMD_JOINSTATE_DOWN, NULL, TRUE, __FUNCTION__); } } else { crm_update_peer_proc(node, crm_proc_ais, ONLINESTATUS); if(AM_I_DC) { update = create_node_state( node, ACTIVESTATUS, NULL, NULL, CRMD_JOINSTATE_PENDING, NULL, FALSE, __FUNCTION__); } } trigger_fsa(fsa_source); if(update != NULL) { fsa_cib_anon_update( XML_CIB_TAG_STATUS, update, cib_scope_local|cib_quorum_override|cib_can_create); free_xml(update); } } void crmd_client_status_callback(const char * node, const char * client, const char * status, void * private) { const char *join = NULL; crm_node_t *member = NULL; gboolean clear_shutdown = FALSE; crm_debug_3("Invoked"); if(safe_str_neq(client, CRM_SYSTEM_CRMD)) { return; } if(safe_str_eq(status, JOINSTATUS)){ clear_shutdown = TRUE; status = ONLINESTATUS; join = CRMD_JOINSTATE_PENDING; } else if(safe_str_eq(status, LEAVESTATUS)){ status = OFFLINESTATUS; join = CRMD_JOINSTATE_DOWN; /* clear_shutdown = TRUE; */ } set_bit_inplace(fsa_input_register, R_PEER_DATA); crm_notice("Status update: Client %s/%s now has status [%s] (DC=%s)", node, client, status, AM_I_DC?"true":"false"); if(safe_str_eq(status, ONLINESTATUS)) { /* remove the cached value in case it changed */ crm_debug_2("Uncaching UUID for %s", node); unget_uuid(node); } member = crm_get_peer(0, node); if(member == NULL || crm_is_member_active(member) == FALSE) { /* Make sure it is created so crm_update_peer_proc() succeeds */ const char *uuid = get_uuid(node); member = crm_update_peer(0, 0, 0, -1, 0, uuid, node, NULL, NULL); } if(AM_I_DC) { xmlNode *update = NULL; crm_debug_3("Got client status callback"); update = create_node_state( node, NULL, NULL, status, join, NULL, clear_shutdown, __FUNCTION__); fsa_cib_anon_update( XML_CIB_TAG_STATUS, update, cib_scope_local|cib_quorum_override|cib_can_create); free_xml(update); } crm_update_peer_proc(node, crm_proc_crmd, status); } void crmd_ipc_connection_destroy(gpointer user_data) { GCHSource *source = NULL; crmd_client_t *client = user_data; /* Calling this function on an _active_ connection results in: * crmd_ipc_connection_destroy (callbacks.c:431) * -> G_main_del_IPC_Channel (GSource.c:478) * -> g_source_unref * -> G_CH_destroy_int (GSource.c:647) * -> crmd_ipc_connection_destroy (callbacks.c:437)\ * * A better alternative is to call G_main_del_IPC_Channel() directly */ if(client == NULL) { crm_debug_4("No client to delete"); return; } crm_debug_2("Disconnecting client %s (%p)", client->table_key, client); source = client->client_source; client->client_source = NULL; if(source != NULL) { crm_debug_3("Deleting %s (%p) from mainloop", client->table_key, source); G_main_del_IPC_Channel(source); } crm_free(client->table_key); crm_free(client->sub_sys); crm_free(client->uuid); crm_free(client->user); crm_free(client); return; } gboolean crmd_client_connect(IPC_Channel *client_channel, gpointer user_data) { crm_debug_3("Invoked"); if (client_channel == NULL) { crm_err("Channel was NULL"); } else if (client_channel->ch_status == IPC_DISCONNECT) { crm_err("Channel was disconnected"); } else { crmd_client_t *blank_client = NULL; crm_debug_3("Channel connected"); crm_malloc0(blank_client, sizeof(crmd_client_t)); CRM_ASSERT(blank_client != NULL); crm_debug_2("Created client: %p", blank_client); client_channel->ops->set_recv_qlen(client_channel, 1024); client_channel->ops->set_send_qlen(client_channel, 1024); blank_client->client_channel = client_channel; blank_client->sub_sys = NULL; blank_client->uuid = NULL; blank_client->table_key = NULL; blank_client->client_source = G_main_add_IPC_Channel( G_PRIORITY_LOW, client_channel, FALSE, crmd_ipc_msg_callback, blank_client, crmd_ipc_connection_destroy); } return TRUE; } #if SUPPORT_HEARTBEAT static gboolean fsa_have_quorum = FALSE; gboolean ccm_dispatch(int fd, gpointer user_data) { int rc = 0; oc_ev_t *ccm_token = (oc_ev_t*)user_data; gboolean was_error = FALSE; crm_debug_3("Invoked"); rc = oc_ev_handle_event(ccm_token); if(rc != 0) { if(is_set(fsa_input_register, R_CCM_DISCONNECTED) == FALSE) { /* we signed out, so this is expected */ register_fsa_input(C_CCM_CALLBACK, I_ERROR, NULL); crm_err("CCM connection appears to have failed: rc=%d.", rc); } was_error = TRUE; } trigger_fsa(fsa_source); return !was_error; } void crmd_ccm_msg_callback( oc_ed_t event, void *cookie, size_t size, const void *data) { gboolean update_cache = FALSE; const oc_ev_membership_t *membership = data; gboolean update_quorum = FALSE; crm_debug_3("Invoked"); CRM_ASSERT(data != NULL); crm_info("Quorum %s after event=%s (id=%d)", ccm_have_quorum(event)?"(re)attained":"lost", ccm_event_name(event), membership->m_instance); if(crm_peer_seq > membership->m_instance) { crm_err("Membership instance ID went backwards! %llu->%d", crm_peer_seq, membership->m_instance); CRM_ASSERT(crm_peer_seq <= membership->m_instance); return; } /* * OC_EV_MS_NEW_MEMBERSHIP: membership with quorum * OC_EV_MS_MS_INVALID: membership without quorum * OC_EV_MS_NOT_PRIMARY: previous membership no longer valid * OC_EV_MS_PRIMARY_RESTORED: previous membership restored * OC_EV_MS_EVICTED: the client is evicted from ccm. */ switch(event) { case OC_EV_MS_NEW_MEMBERSHIP: case OC_EV_MS_INVALID: update_cache = TRUE; update_quorum = TRUE; break; case OC_EV_MS_NOT_PRIMARY: break; case OC_EV_MS_PRIMARY_RESTORED: update_cache = TRUE; crm_peer_seq = membership->m_instance; break; case OC_EV_MS_EVICTED: update_quorum = TRUE; register_fsa_input(C_FSA_INTERNAL, I_STOP, NULL); crm_err("Shutting down after CCM event: %s", ccm_event_name(event)); break; default: crm_err("Unknown CCM event: %d", event); } if(update_quorum) { crm_have_quorum = ccm_have_quorum(event); crm_update_quorum(crm_have_quorum, FALSE); if(crm_have_quorum == FALSE) { /* did we just loose quorum? */ if(fsa_have_quorum) { crm_info("Quorum lost: %s", ccm_event_name(event)); } } } if(update_cache) { crm_debug_2("Updating cache after event %s", ccm_event_name(event)); do_ccm_update_cache(C_CCM_CALLBACK, fsa_state, event, data, NULL); } else if(event != OC_EV_MS_NOT_PRIMARY) { crm_peer_seq = membership->m_instance; register_fsa_action(A_TE_CANCEL); } oc_ev_callback_done(cookie); return; } #endif void crmd_cib_connection_destroy(gpointer user_data) { CRM_CHECK(user_data == fsa_cib_conn, ;); crm_debug_3("Invoked"); trigger_fsa(fsa_source); fsa_cib_conn->state = cib_disconnected; if(is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { crm_info("Connection to the CIB terminated..."); return; } /* eventually this will trigger a reconnect, not a shutdown */ crm_err("Connection to the CIB terminated..."); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); clear_bit_inplace(fsa_input_register, R_CIB_CONNECTED); return; } gboolean crm_fsa_trigger(gpointer user_data) { crm_debug_2("Invoked (queue len: %d)", g_list_length(fsa_message_queue)); s_crmd_fsa(C_FSA_INTERNAL); crm_debug_2("Exited (queue len: %d)", g_list_length(fsa_message_queue)); return TRUE; } diff --git a/crmd/ccm.c b/crmd/ccm.c index 1c355719b5..0c88b26196 100644 --- a/crmd/ccm.c +++ b/crmd/ccm.c @@ -1,479 +1,481 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* put these first so that uuid_t is defined without conflicts */ #include #if SUPPORT_HEARTBEAT #include #include #endif #include #include #include #include #include #include #include #include #include #include +#include gboolean membership_flux_hack = FALSE; void post_cache_update(int instance); #if SUPPORT_HEARTBEAT oc_ev_t *fsa_ev_token; void oc_ev_special(const oc_ev_t *, oc_ev_class_t , int ); void crmd_ccm_msg_callback( oc_ed_t event, void *cookie, size_t size, const void *data); #endif void ghash_update_cib_node(gpointer key, gpointer value, gpointer user_data); void check_dead_member(const char *uname, GHashTable *members); void reap_dead_ccm_nodes(gpointer key, gpointer value, gpointer user_data); #define CCM_EVENT_DETAIL 0 #define CCM_EVENT_DETAIL_PARTIAL 0 int num_ccm_register_fails = 0; int max_ccm_register_fails = 30; int last_peer_update = 0; extern GHashTable *voted; struct update_data_s { const char *state; const char *caller; xmlNode *updates; gboolean overwrite_join; }; void reap_dead_ccm_nodes(gpointer key, gpointer value, gpointer user_data) { crm_node_t *node = value; if(crm_is_member_active(node) == FALSE) { check_dead_member(node->uname, NULL); + fail_incompletable_actions(transition_graph, node->uuid); } } extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); void check_dead_member(const char *uname, GHashTable *members) { CRM_CHECK(uname != NULL, return); if(members != NULL && g_hash_table_lookup(members, uname) != NULL) { crm_err("%s didnt really leave the membership!", uname); return; } erase_node_from_join(uname); if(voted != NULL) { g_hash_table_remove(voted, uname); } if(safe_str_eq(fsa_our_uname, uname)) { crm_err("We're not part of the cluster anymore"); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); } else if(AM_I_DC == FALSE && safe_str_eq(uname, fsa_our_dc)) { crm_warn("Our DC node (%s) left the cluster", uname); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); } else if(fsa_state == S_INTEGRATION || fsa_state == S_FINALIZE_JOIN) { check_join_state(fsa_state, __FUNCTION__); } } /* A_CCM_CONNECT */ void do_ccm_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { #if SUPPORT_HEARTBEAT if(is_heartbeat_cluster()) { if(action & A_CCM_DISCONNECT){ set_bit_inplace(fsa_input_register, R_CCM_DISCONNECTED); oc_ev_unregister(fsa_ev_token); } if(action & A_CCM_CONNECT) { int ret; int fsa_ev_fd; gboolean did_fail = FALSE; crm_debug_3("Registering with CCM"); clear_bit_inplace(fsa_input_register, R_CCM_DISCONNECTED); ret = oc_ev_register(&fsa_ev_token); if (ret != 0) { crm_warn("CCM registration failed"); did_fail = TRUE; } if(did_fail == FALSE) { crm_debug_3("Setting up CCM callbacks"); ret = oc_ev_set_callback(fsa_ev_token, OC_EV_MEMB_CLASS, crmd_ccm_msg_callback, NULL); if (ret != 0) { crm_warn("CCM callback not set"); did_fail = TRUE; } } if(did_fail == FALSE) { oc_ev_special(fsa_ev_token, OC_EV_MEMB_CLASS, 0/*don't care*/); crm_debug_3("Activating CCM token"); ret = oc_ev_activate(fsa_ev_token, &fsa_ev_fd); if (ret != 0){ crm_warn("CCM Activation failed"); did_fail = TRUE; } } if(did_fail) { num_ccm_register_fails++; oc_ev_unregister(fsa_ev_token); if(num_ccm_register_fails < max_ccm_register_fails) { crm_warn("CCM Connection failed" " %d times (%d max)", num_ccm_register_fails, max_ccm_register_fails); crm_timer_start(wait_timer); crmd_fsa_stall(NULL); return; } else { crm_err("CCM Activation failed %d (max) times", num_ccm_register_fails); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); return; } } crm_info("CCM connection established..." " waiting for first callback"); G_main_add_fd(G_PRIORITY_HIGH, fsa_ev_fd, FALSE, ccm_dispatch, fsa_ev_token, default_ipc_connection_destroy); } } #endif if(action & ~(A_CCM_CONNECT|A_CCM_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__); } } #if SUPPORT_HEARTBEAT void ccm_event_detail(const oc_ev_membership_t *oc, oc_ed_t event) { int lpc; gboolean member = FALSE; member = FALSE; crm_debug_2("-----------------------"); crm_info("%s: trans=%d, nodes=%d, new=%d, lost=%d n_idx=%d, " "new_idx=%d, old_idx=%d", ccm_event_name(event), oc->m_instance, oc->m_n_member, oc->m_n_in, oc->m_n_out, oc->m_memb_idx, oc->m_in_idx, oc->m_out_idx); #if !CCM_EVENT_DETAIL_PARTIAL for(lpc=0; lpc < oc->m_n_member; lpc++) { crm_info("\tCURRENT: %s [nodeid=%d, born=%d]", oc->m_array[oc->m_memb_idx+lpc].node_uname, oc->m_array[oc->m_memb_idx+lpc].node_id, oc->m_array[oc->m_memb_idx+lpc].node_born_on); if(safe_str_eq(fsa_our_uname, oc->m_array[oc->m_memb_idx+lpc].node_uname)) { member = TRUE; } } if (member == FALSE) { crm_warn("MY NODE IS NOT IN CCM THE MEMBERSHIP LIST"); } #endif for(lpc=0; lpc<(int)oc->m_n_in; lpc++) { crm_info("\tNEW: %s [nodeid=%d, born=%d]", oc->m_array[oc->m_in_idx+lpc].node_uname, oc->m_array[oc->m_in_idx+lpc].node_id, oc->m_array[oc->m_in_idx+lpc].node_born_on); } for(lpc=0; lpc<(int)oc->m_n_out; lpc++) { crm_info("\tLOST: %s [nodeid=%d, born=%d]", oc->m_array[oc->m_out_idx+lpc].node_uname, oc->m_array[oc->m_out_idx+lpc].node_id, oc->m_array[oc->m_out_idx+lpc].node_born_on); } crm_debug_2("-----------------------"); } #endif gboolean ever_had_quorum = FALSE; void post_cache_update(int instance) { xmlNode *no_op = NULL; crm_peer_seq = instance; crm_debug("Updated cache after membership event %d.", instance); g_hash_table_foreach(crm_peer_cache, reap_dead_ccm_nodes, NULL); set_bit_inplace(fsa_input_register, R_CCM_DATA); if(AM_I_DC) { populate_cib_nodes(FALSE); do_update_cib_nodes(FALSE, __FUNCTION__); } /* * If we lost nodes, we should re-check the election status * Safe to call outside of an election */ register_fsa_action(A_ELECTION_CHECK); /* Membership changed, remind everyone we're here. * This will aid detection of duplicate DCs */ no_op = create_request( CRM_OP_NOOP, NULL, NULL, CRM_SYSTEM_CRMD, AM_I_DC?CRM_SYSTEM_DC:CRM_SYSTEM_CRMD, NULL); send_cluster_message(NULL, crm_msg_crmd, no_op, FALSE); free_xml(no_op); } /* A_CCM_UPDATE_CACHE */ /* * Take the opportunity to update the node status in the CIB as well */ #if SUPPORT_HEARTBEAT void do_ccm_update_cache( enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, oc_ed_t event, const oc_ev_membership_t *oc, xmlNode *xml) { unsigned long long instance = 0; unsigned int lpc = 0; if(is_heartbeat_cluster()) { CRM_ASSERT(oc != NULL); instance = oc->m_instance; } CRM_ASSERT(crm_peer_seq <= instance); switch(cur_state) { case S_STOPPING: case S_TERMINATE: case S_HALT: crm_debug("Ignoring %s CCM event %llu, we're in state %s", ccm_event_name(event), instance, fsa_state2string(cur_state)); return; case S_ELECTION: register_fsa_action(A_ELECTION_CHECK); break; default: break; } if(is_heartbeat_cluster()) { ccm_event_detail(oc, event); /*--*-- Recently Dead Member Nodes --*--*/ for(lpc=0; lpc < oc->m_n_out; lpc++) { crm_update_ccm_node(oc, lpc+oc->m_out_idx, CRM_NODE_LOST, instance); } /*--*-- All Member Nodes --*--*/ for(lpc=0; lpc < oc->m_n_member; lpc++) { crm_update_ccm_node(oc, lpc+oc->m_memb_idx, CRM_NODE_ACTIVE, instance); } } if(event == OC_EV_MS_EVICTED) { crm_update_peer( 0, 0, 0, -1, 0, fsa_our_uuid, fsa_our_uname, NULL, CRM_NODE_EVICTED); /* todo: drop back to S_PENDING instead */ /* get out... NOW! * * go via the error recovery process so that HA will * restart us if required */ register_fsa_error_adv(cause, I_ERROR, NULL, NULL, __FUNCTION__); } post_cache_update(instance); return; } #endif static void ccm_node_update_complete(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { fsa_data_t *msg_data = NULL; last_peer_update = 0; if(rc == cib_ok) { crm_debug_2("Node update %d complete", call_id); } else { crm_err("Node update %d failed", call_id); crm_log_xml(LOG_DEBUG, "failed", msg); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } void ghash_update_cib_node(gpointer key, gpointer value, gpointer user_data) { xmlNode *tmp1 = NULL; const char *join = NULL; crm_node_t *node = value; struct update_data_s* data = (struct update_data_s*)user_data; data->state = XML_BOOLEAN_NO; if(safe_str_eq(node->state, CRM_NODE_ACTIVE)) { data->state = XML_BOOLEAN_YES; } crm_debug("Updating %s: %s (overwrite=%s) hash_size=%d", node->uname, data->state, data->overwrite_join?"true":"false", g_hash_table_size(confirmed_nodes)); if(data->overwrite_join) { if((node->processes & crm_proc_crmd) == FALSE) { join = CRMD_JOINSTATE_DOWN; } else { const char *peer_member = g_hash_table_lookup( confirmed_nodes, node->uname); if(peer_member != NULL) { join = CRMD_JOINSTATE_MEMBER; } else { join = CRMD_JOINSTATE_PENDING; } } } tmp1 = create_node_state( node->uname, (node->processes&crm_proc_ais)?ACTIVESTATUS:DEADSTATUS, data->state, (node->processes&crm_proc_crmd)?ONLINESTATUS:OFFLINESTATUS, join, NULL, FALSE, data->caller); add_node_copy(data->updates, tmp1); free_xml(tmp1); } void do_update_cib_nodes(gboolean overwrite, const char *caller) { int call_id = 0; int call_options = cib_scope_local|cib_quorum_override; struct update_data_s update_data; xmlNode *fragment = NULL; if(crm_peer_cache == NULL) { /* We got a replace notification before being connected to * the CCM. * So there is no need to update the local CIB with our values * - since we have none. */ return; } else if(AM_I_DC == FALSE) { return; } fragment = create_xml_node(NULL, XML_CIB_TAG_STATUS); update_data.caller = caller; update_data.updates = fragment; update_data.overwrite_join = overwrite; g_hash_table_foreach(crm_peer_cache, ghash_update_cib_node, &update_data); fsa_cib_update(XML_CIB_TAG_STATUS, fragment, call_options, call_id, NULL); add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, ccm_node_update_complete); last_peer_update = call_id; free_xml(fragment); } static void cib_quorum_update_complete( xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { fsa_data_t *msg_data = NULL; if(rc == cib_ok) { crm_debug_2("Quorum update %d complete", call_id); } else { crm_err("Quorum update %d failed", call_id); crm_log_xml(LOG_DEBUG, "failed", msg); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } void crm_update_quorum(gboolean quorum, gboolean force_update) { ever_had_quorum |= quorum; if(AM_I_DC && (force_update || fsa_has_quorum != quorum)) { int call_id = 0; xmlNode *update = NULL; int call_options = cib_scope_local|cib_quorum_override; update = create_xml_node(NULL, XML_TAG_CIB); crm_xml_add_int(update, XML_ATTR_HAVE_QUORUM, quorum); set_uuid(update, XML_ATTR_DC_UUID, fsa_our_uname); fsa_cib_update(XML_TAG_CIB, update, call_options, call_id, NULL); crm_info("Updating quorum status to %s (call=%d)", quorum?"true":"false", call_id); add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, cib_quorum_update_complete); free_xml(update); } fsa_has_quorum = quorum; } diff --git a/crmd/te_events.c b/crmd/te_events.c index 456ff3b11d..fde33b5dd6 100644 --- a/crmd/te_events.c +++ b/crmd/te_events.c @@ -1,501 +1,505 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include char *failed_stop_offset = NULL; char *failed_start_offset = NULL; int match_graph_event(int action_id, xmlNode *event, const char *event_node, int op_status, int op_rc, int target_rc); gboolean fail_incompletable_actions(crm_graph_t *graph, const char *down_node) { const char *target = NULL; xmlNode *last_action = NULL; GListPtr gIter = NULL; GListPtr gIter2 = NULL; + if(graph == NULL || graph->complete) { + return FALSE; + } + gIter = graph->synapses; for(; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t*)gIter->data; if (synapse->confirmed) { continue; } gIter2 = synapse->actions; for(; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t*)gIter2->data; if(action->type == action_type_pseudo || action->confirmed) { continue; } else if(action->type == action_type_crm) { const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if(safe_str_eq(task, CRM_OP_FENCE)) { continue; } } target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); if(safe_str_eq(target, down_node)) { action->failed = TRUE; last_action = action->xml; update_graph(graph, action); crm_notice("Action %d (%s) is scheduled for %s (offline)", action->id, ID(action->xml), down_node); } } } if(last_action != NULL) { crm_warn("Node %s shutdown resulted in un-runnable actions", down_node); abort_transition(INFINITY, tg_restart, "Node failure", last_action); return TRUE; } return FALSE; } static gboolean update_failcount(xmlNode *event, const char *event_node, int rc, int target_rc, gboolean do_update) { int interval = 0; char *task = NULL; char *rsc_id = NULL; char *attr_name = NULL; const char *value = NULL; const char *id = ID(event); const char *on_uname = get_uname(event_node); if(rc == 99) { /* this is an internal code for "we're busy, try again" */ return FALSE; } else if(rc == target_rc) { return FALSE; } if(failed_stop_offset == NULL) { failed_stop_offset = crm_strdup(INFINITY_S); } if(failed_start_offset == NULL) { failed_start_offset = crm_strdup(INFINITY_S); } CRM_CHECK(on_uname != NULL, return TRUE); CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval), crm_err("Couldn't parse: %s", ID(event)); goto bail); CRM_CHECK(task != NULL, goto bail); CRM_CHECK(rsc_id != NULL, goto bail); if(do_update || interval > 0) { do_update = TRUE; } else if(safe_str_eq(task, CRMD_ACTION_START)) { do_update = TRUE; value = failed_start_offset; } else if(safe_str_eq(task, CRMD_ACTION_STOP)) { do_update = TRUE; value = failed_stop_offset; } else if(safe_str_eq(task, CRMD_ACTION_STOP)) { do_update = TRUE; value = failed_stop_offset; } else if(safe_str_eq(task, CRMD_ACTION_PROMOTE)) { do_update = TRUE; } else if(safe_str_eq(task, CRMD_ACTION_DEMOTE)) { do_update = TRUE; } if(value == NULL || safe_str_neq(value, INFINITY_S)) { value = XML_NVPAIR_ATTR_VALUE"++"; } if(do_update) { char *now = crm_itoa(time(NULL)); crm_warn("Updating failcount for %s on %s after failed %s:" " rc=%d (update=%s, time=%s)", rsc_id, on_uname, task, rc, value, now); attr_name = crm_concat("fail-count", rsc_id, '-'); update_attrd(on_uname, attr_name, value, NULL); crm_free(attr_name); attr_name = crm_concat("last-failure", rsc_id, '-'); update_attrd(on_uname, attr_name, now, NULL); crm_free(attr_name); crm_free(now); } bail: crm_free(rsc_id); crm_free(task); return TRUE; } static int status_from_rc(crm_action_t *action, int orig_status, int rc, int target_rc) { int status = orig_status; if(target_rc == rc) { crm_debug_2("Target rc: == %d", rc); if(status != LRM_OP_DONE) { crm_debug_2("Re-mapping op status to" " LRM_OP_DONE for rc=%d", rc); status = LRM_OP_DONE; } } else { status = LRM_OP_ERROR; } /* 99 is the code we use for direct nack's */ if(rc != 99 && status != LRM_OP_DONE) { const char *task, *uname; task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); uname = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); crm_warn("Action %d (%s) on %s failed (target: %d vs. rc: %d): %s", action->id, task, uname, target_rc, rc, op_status2text(status)); } return status; } /* * returns the ID of the action if a match is found * returns -1 if a match was not found * returns -2 if a match was found but the action failed (and was * not allowed to) */ int match_graph_event(int action_id, xmlNode *event, const char *event_node, int op_status, int op_rc, int target_rc) { const char *target = NULL; const char *allow_fail = NULL; const char *this_event = NULL; crm_action_t *action = NULL; action = get_action(action_id, FALSE); if(action == NULL) { return -1; } op_status = status_from_rc(action, op_status, op_rc, target_rc); if(op_status != LRM_OP_DONE) { update_failcount(event, event_node, op_rc, target_rc, FALSE); } /* Process OP status */ switch(op_status) { case LRM_OP_PENDING: crm_debug("Ignoring pending operation"); return action->id; break; case LRM_OP_DONE: break; case LRM_OP_ERROR: case LRM_OP_TIMEOUT: case LRM_OP_NOTSUPPORTED: action->failed = TRUE; break; case LRM_OP_CANCELLED: /* do nothing?? */ crm_err("Dont know what to do for cancelled ops yet"); break; default: action->failed = TRUE; crm_err("Unsupported action result: %d", op_status); } /* stop this event's timer if it had one */ stop_te_timer(action->timer); action->confirmed = TRUE; update_graph(transition_graph, action); trigger_graph(); if(action->failed) { allow_fail = crm_meta_value(action->params, XML_ATTR_TE_ALLOWFAIL); if(crm_is_true(allow_fail)) { action->failed = FALSE; } } if(action->failed) { abort_transition(action->synapse->priority+1, tg_restart, "Event failed", event); } this_event = ID(event); target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); te_log_action(LOG_INFO, "Action %s (%d) confirmed on %s (rc=%d)", crm_str(this_event), action->id, crm_str(target), op_status); return action->id; } crm_action_t * get_action(int id, gboolean confirmed) { GListPtr gIter = NULL; GListPtr gIter2 = NULL; gIter = transition_graph->synapses; for(; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t*) gIter->data; gIter2 = synapse->actions; for(; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t*)gIter2->data; if(action->id == id) { if(confirmed) { stop_te_timer(action->timer); action->confirmed = TRUE; } return action; } } } return NULL; } crm_action_t * get_cancel_action(const char *id, const char *node) { const char *task = NULL; const char *target = NULL; GListPtr gIter = NULL; GListPtr gIter2 = NULL; gIter = transition_graph->synapses; for(; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t*) gIter->data; gIter2 = synapse->actions; for(; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t*)gIter2->data; task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if(safe_str_neq(CRMD_ACTION_CANCEL, task)) { continue; } task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); if(safe_str_neq(task, id)) { continue; } target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); if(safe_str_neq(target, node)) { continue; } return action; } } return NULL; } crm_action_t * match_down_event(int id, const char *target, const char *filter) { const char *this_action = NULL; const char *this_node = NULL; crm_action_t *match = NULL; GListPtr gIter = NULL; GListPtr gIter2 = NULL; gIter = transition_graph->synapses; for(; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t*)gIter->data; /* lookup event */ gIter2 = synapse->actions; for(; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t*)gIter2->data; if(id > 0 && action->id == id) { match = action; break; } this_action = crm_element_value( action->xml, XML_LRM_ATTR_TASK); if(action->type != action_type_crm) { continue; } else if(safe_str_eq(this_action, CRM_OP_LRM_REFRESH)){ continue; } else if(filter != NULL && safe_str_neq(this_action, filter)) { continue; } this_node = crm_element_value( action->xml, XML_LRM_ATTR_TARGET_UUID); if(this_node == NULL) { crm_log_xml_err(action->xml, "No node uuid"); } if(safe_str_neq(this_node, target)) { crm_debug("Action %d : Node mismatch: %s", action->id, this_node); continue; } match = action; id = action->id; break; } if(match != NULL) { /* stop this event's timer if it had one */ break; } } if(match != NULL) { /* stop this event's timer if it had one */ crm_debug("Match found for action %d: %s on %s", id, crm_element_value(match->xml, XML_LRM_ATTR_TASK_KEY), target); } else if(id > 0) { crm_err("No match for action %d", id); } else { crm_warn("No match for shutdown action on %s", target); } return match; } gboolean process_graph_event(xmlNode *event, const char *event_node) { int rc = -1; int status = -1; int action = -1; int target_rc = -1; int transition_num = -1; char *update_te_uuid = NULL; gboolean stop_early = FALSE; gboolean passed = FALSE; const char *id = NULL; const char *magic = NULL; CRM_ASSERT(event != NULL); id = ID(event); magic = crm_element_value(event, XML_ATTR_TRANSITION_MAGIC); if(magic == NULL) { /* non-change */ return FALSE; } CRM_CHECK(decode_transition_magic( magic, &update_te_uuid, &transition_num, &action, &status, &rc, &target_rc), crm_err("Invalid event %s detected", id); abort_transition(INFINITY, tg_restart,"Bad event", event); return FALSE; ); if(status == LRM_OP_PENDING) { goto bail; } if(transition_num == -1) { crm_err("Action %s (%s) initiated outside of a transition", id, magic); abort_transition(INFINITY, tg_restart,"Unexpected event",event); } else if(action < 0 || crm_str_eq(update_te_uuid, te_uuid, TRUE) == FALSE) { crm_info("Action %s/%d (%s) initiated by a different transitioner", id, action, magic); abort_transition(INFINITY, tg_restart,"Foreign event", event); stop_early = TRUE; /* This could be an lrm status refresh */ } else if(transition_graph->id != transition_num) { crm_info("Detected action %s from a different transition:" " %d vs. %d", id, transition_num, transition_graph->id); abort_transition(INFINITY, tg_restart,"Old event", event); stop_early = TRUE; /* This could be an lrm status refresh */ } else if(transition_graph->complete) { crm_info("Action %s arrived after a completed transition", id); abort_transition(INFINITY, tg_restart, "Inactive graph", event); } else if(match_graph_event( action, event, event_node, status, rc, target_rc) < 0) { crm_err("Unknown graph action %s", id); abort_transition(INFINITY, tg_restart, "Unknown event", event); } else { passed = TRUE; crm_debug_2("Processed update to %s: %s", id, magic); } if(passed == FALSE) { if(update_failcount(event, event_node, rc, target_rc, transition_num == -1)) { /* Turns out this wasn't an lrm status refresh update aferall */ stop_early = FALSE; } } bail: crm_free(update_te_uuid); return stop_early; }