diff --git a/crm/crmd/callbacks.c b/crm/crmd/callbacks.c index c4106561d4..cbf6e51d98 100644 --- a/crm/crmd/callbacks.c +++ b/crm/crmd/callbacks.c @@ -1,589 +1,584 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include GHashTable *crmd_peer_state = NULL; crm_data_t *find_xml_in_hamessage(const HA_Message * msg); void crmd_ha_connection_destroy(gpointer user_data); /* From join_dc... */ extern gboolean check_join_state( enum crmd_fsa_state cur_state, const char *source); /* #define MAX_EMPTY_CALLBACKS 20 */ /* int empty_callbacks = 0; */ gboolean crmd_ha_msg_dispatch(IPC_Channel *channel, gpointer user_data) { int lpc = 0; ll_cluster_t *hb_cluster = (ll_cluster_t*)user_data; while(lpc < 2 && hb_cluster->llc_ops->msgready(hb_cluster)) { if(channel->ch_status != IPC_CONNECT) { /* there really is no point continuing */ break; } lpc++; /* invoke the callbacks but dont block */ hb_cluster->llc_ops->rcvmsg(hb_cluster, 0); } crm_debug_3("%d HA messages dispatched", lpc); G_main_set_trigger(fsa_source); if (channel && (channel->ch_status != IPC_CONNECT)) { crm_crit("Lost connection to heartbeat service."); return FALSE; } return TRUE; } void crmd_ha_msg_callback(const HA_Message * msg, void* private_data) { ha_msg_input_t *new_input = NULL; oc_node_t *from_node = NULL; const char *from = ha_msg_value(msg, F_ORIG); const char *seq = ha_msg_value(msg, F_SEQ); const char *op = ha_msg_value(msg, F_CRM_TASK); const char *sys_to = ha_msg_value(msg, F_CRM_SYS_TO); const char *sys_from = ha_msg_value(msg, F_CRM_SYS_FROM); CRM_DEV_ASSERT(from != NULL); if(fsa_membership_copy == NULL) { crm_debug("Ignoring HA messages until we are" " connected to the CCM (%s op from %s)", op, from); crm_log_message_adv( LOG_MSG, "HA[inbound]: Ignore (No CCM)", msg); return; } from_node = g_hash_table_lookup(fsa_membership_copy->members, from); if(from_node == NULL) { int level = LOG_DEBUG; if(safe_str_eq(op, CRM_OP_VOTE)) { level = LOG_WARNING; } else if(AM_I_DC && safe_str_eq(op, CRM_OP_JOIN_ANNOUNCE)) { level = LOG_WARNING; } else if(safe_str_eq(sys_from, CRM_SYSTEM_DC)) { level = LOG_WARNING; } do_crm_log(level, __FILE__, __FUNCTION__, "Ignoring HA message (op=%s) from %s: not in our" " membership list (size=%d)", op, from, g_hash_table_size(fsa_membership_copy->members)); crm_log_message_adv(LOG_MSG, "HA[inbound]: CCM Discard", msg); } else if(AM_I_DC && safe_str_eq(sys_from, CRM_SYSTEM_DC) && safe_str_neq(from, fsa_our_uname)) { crm_err("Another DC detected: %s (op=%s)", from, op); crm_log_message_adv( LOG_WARNING, "HA[inbound]: Duplicate DC", msg); new_input = new_ha_msg_input(msg); /* make sure the election happens NOW */ register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL, new_input, __FUNCTION__); #if 0 /* still thinking about this one... * could create a timing issue if we dont notice the * election before a new DC is elected. */ } else if(fsa_our_dc != NULL && safe_str_eq(sys_from, CRM_SYSTEM_DC) && safe_str_neq(from, fsa_our_dc)) { crm_warn("Ignoring message from wrong DC: %s vs. %s ", from, fsa_our_dc); crm_log_message_adv(LOG_WARNING, "HA[inbound]: wrong DC", msg); #endif } else if(safe_str_eq(sys_to, CRM_SYSTEM_DC) && AM_I_DC == FALSE) { crm_debug_2("Ignoring message for the DC [F_SEQ=%s]", seq); crm_log_message_adv(LOG_DEBUG_4, "HA[inbound]: ignore", msg); return; } else if(safe_str_eq(from, fsa_our_uname) && safe_str_eq(op, CRM_OP_VOTE)) { crm_log_message_adv(LOG_DEBUG_4, "HA[inbound]", msg); crm_debug_2("Ignoring our own vote [F_SEQ=%s]: own vote", seq); return; } else if(AM_I_DC && safe_str_eq(op, CRM_OP_HBEAT)) { crm_debug_2("Ignoring our own heartbeat [F_SEQ=%s]", seq); crm_log_message_adv(LOG_DEBUG_4, "HA[inbound]: own heartbeat", msg); return; } else { crm_debug_3("Processing message"); crm_log_message_adv(LOG_MSG, "HA[inbound]", msg); new_input = new_ha_msg_input(msg); register_fsa_input(C_HA_MESSAGE, I_ROUTER, new_input); } #if 0 if(ha_msg_value(msg, XML_ATTR_REFERENCE) == NULL) { ha_msg_add(new_input->msg, XML_ATTR_REFERENCE, seq); } #endif delete_ha_msg_input(new_input); return; } /* * Apparently returning TRUE means "stay connected, keep doing stuff". * Returning FALSE means "we're all done, close the connection" */ gboolean crmd_ipc_msg_callback(IPC_Channel *client, gpointer user_data) { int lpc = 0; IPC_Message *msg = NULL; ha_msg_input_t *new_input = NULL; crmd_client_t *curr_client = (crmd_client_t*)user_data; gboolean stay_connected = TRUE; crm_debug_2("Processing IPC message from %s", curr_client->table_key); while(lpc == 0 && client->ops->is_message_pending(client)) { if (client->ch_status != IPC_CONNECT) { /* The message which was pending for us is that * the IPC status is now IPC_DISCONNECT */ break; } if (client->ops->recv(client, &msg) != IPC_OK) { perror("Receive failure:"); crm_err("[%s] [receive failure]", curr_client->table_key); stay_connected = FALSE; break; } else if (msg == NULL) { crm_err("[%s] [no message this time]", curr_client->table_key); continue; } lpc++; new_input = new_ipc_msg_input(msg); msg->msg_done(msg); crm_debug_2("Processing msg from %s", curr_client->table_key); crm_log_message_adv(LOG_MSG, "CRMd[inbound]", new_input->msg); if(crmd_authorize_message(new_input, curr_client)) { register_fsa_input(C_IPC_MESSAGE, I_ROUTER, new_input); } delete_ha_msg_input(new_input); msg = NULL; new_input = NULL; } crm_debug_2("Processed %d messages", lpc); if (client->ch_status != IPC_CONNECT) { stay_connected = FALSE; process_client_disconnect(curr_client); } G_main_set_trigger(fsa_source); return stay_connected; } gboolean lrm_dispatch(IPC_Channel*src_not_used, gpointer user_data) { int num_msgs = 0; ll_lrm_t *lrm = (ll_lrm_t*)user_data; crm_debug_3("received callback"); num_msgs = lrm->lrm_ops->rcvmsg(lrm, FALSE); if(num_msgs < 1) { crm_err("lrm->lrm_ops->rcvmsg() failed, connection lost?"); clear_bit_inplace(fsa_input_register, R_LRM_CONNECTED); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); return FALSE; } return TRUE; } void lrm_op_callback(lrm_op_t* op) { CRM_DEV_ASSERT(op != NULL); if(crm_assert_failed) { return; } crm_debug("received callback: %s/%s (%s)", op->op_type, op->rsc_id, op_status2text(op->op_status)); /* Make sure the LRM events are received in order */ register_fsa_input_later(C_LRM_OP_CALLBACK, I_LRM_EVENT, op); } void crmd_ha_status_callback( const char *node, const char * status, void* private_data) { crm_data_t *update = NULL; crm_debug_3("received callback"); crm_notice("Status update: Node %s now has status [%s]",node,status); if(safe_str_neq(status, DEADSTATUS)) { crm_debug_3("nstatus callback was not for a dead node"); return; } /* this node is taost */ update = create_node_state( node, node, status, NULL, NULL, NULL, NULL, __FUNCTION__); set_xml_property_copy( update, XML_CIB_ATTR_CLEAR_SHUTDOWN, XML_BOOLEAN_TRUE); update_local_cib(create_cib_fragment(update, NULL)); G_main_set_trigger(fsa_source); free_xml(update); } void crmd_client_status_callback(const char * node, const char * client, const char * status, void * private) { const char *join = NULL; const char *extra = NULL; crm_data_t * update = NULL; crm_debug_3("received callback"); if(safe_str_neq(client, CRM_SYSTEM_CRMD)) { return; } if(safe_str_eq(status, JOINSTATUS)){ status = ONLINESTATUS; extra = XML_CIB_ATTR_CLEAR_SHUTDOWN; } else if(safe_str_eq(status, LEAVESTATUS)){ status = OFFLINESTATUS; join = CRMD_JOINSTATE_DOWN; extra = XML_CIB_ATTR_CLEAR_SHUTDOWN; } set_bit_inplace(fsa_input_register, R_PEER_DATA); g_hash_table_replace( crmd_peer_state, crm_strdup(node), crm_strdup(status)); if(fsa_state == S_STARTING || fsa_state == S_STOPPING) { return; } crm_notice("Status update: Client %s/%s now has status [%s]", node, client, status); if(safe_str_eq(node, fsa_our_dc) && safe_str_eq(status, OFFLINESTATUS)) { /* did our DC leave us */ crm_info("Got client status callback - our DC is dead"); register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL); } else { crm_data_t *fragment = NULL; crm_debug_3("Got client status callback"); update = create_node_state( node, node, NULL, NULL, status, join, NULL, __FUNCTION__); set_xml_property_copy(update, extra, XML_BOOLEAN_TRUE); fragment = create_cib_fragment(update, NULL); /* it is safe to keep these updates on the local node * each node updates their own CIB */ fsa_cib_conn->cmds->modify( fsa_cib_conn, XML_CIB_TAG_STATUS, fragment, NULL, cib_inhibit_bcast|cib_scope_local|cib_quorum_override); free_xml(fragment); free_xml(update); if(AM_I_DC && safe_str_eq(status, OFFLINESTATUS)) { g_hash_table_remove(confirmed_nodes, node); g_hash_table_remove(finalized_nodes, node); g_hash_table_remove(integrated_nodes, node); g_hash_table_remove(welcomed_nodes, node); check_join_state(fsa_state, __FUNCTION__); } } G_main_set_trigger(fsa_source); } void crmd_ha_connection_destroy(gpointer user_data) { crm_crit("Heartbeat has left us"); /* this is always an error */ /* feed this back into the FSA */ register_fsa_input(C_HA_DISCONNECT, I_ERROR, NULL); } gboolean crmd_client_connect(IPC_Channel *client_channel, gpointer user_data) { if (client_channel == NULL) { crm_err("Channel was NULL"); } else if (client_channel->ch_status == IPC_DISCONNECT) { crm_err("Channel was disconnected"); } else { crmd_client_t *blank_client = NULL; crm_debug_3("Channel connected"); crm_malloc0(blank_client, sizeof(crmd_client_t)); if (blank_client == NULL) { return FALSE; } client_channel->ops->set_recv_qlen(client_channel, 100); client_channel->ops->set_send_qlen(client_channel, 100); blank_client->client_channel = client_channel; blank_client->sub_sys = NULL; blank_client->uuid = NULL; blank_client->table_key = NULL; blank_client->client_source = G_main_add_IPC_Channel( G_PRIORITY_LOW, client_channel, FALSE, crmd_ipc_msg_callback, blank_client, default_ipc_connection_destroy); } return TRUE; } gboolean ccm_dispatch(int fd, gpointer user_data) { int rc = 0; oc_ev_t *ccm_token = (oc_ev_t*)user_data; gboolean was_error = FALSE; crm_debug_3("received callback"); rc = oc_ev_handle_event(ccm_token); if(rc != 0) { crm_err("CCM connection appears to have failed: rc=%d.", rc); register_fsa_input(C_CCM_CALLBACK, I_ERROR, NULL); was_error = TRUE; } G_main_set_trigger(fsa_source); return !was_error; } static gboolean fsa_have_quorum = FALSE; void crmd_ccm_msg_callback( oc_ed_t event, void *cookie, size_t size, const void *data) { int instance = -1; gboolean update_cache = FALSE; struct crmd_ccm_data_s *event_data = NULL; const oc_ev_membership_t *membership = data; - const oc_ev_membership_t *oc = data; gboolean update_quorum = FALSE; gboolean trigger_transition = FALSE; crm_debug_3("received callback"); if(data != NULL) { instance = membership->m_instance; } crm_info("Quorum %s after event=%s (id=%d)", ccm_have_quorum(event)?"(re)attained":"lost", ccm_event_name(event), instance); switch(event) { case OC_EV_MS_NEW_MEMBERSHIP: - CRM_DEV_ASSERT(oc->m_n_in != 0 || oc->m_n_out != 0); - CRM_DEV_ASSERT(oc->m_n_in == 0 || oc->m_n_out == 0); case OC_EV_MS_INVALID:/* fall through */ update_cache = TRUE; update_quorum = TRUE; if(AM_I_DC && need_transition(fsa_state)) { trigger_transition = TRUE; } break; case OC_EV_MS_NOT_PRIMARY: #if UNTESTED if(AM_I_DC == FALSE) { break; } /* tell the TE to pretend it completed and stop */ /* side effect: we'll end up in S_IDLE */ register_fsa_action(A_TE_HALT, TRUE); #endif break; case OC_EV_MS_PRIMARY_RESTORED: - CRM_DEV_ASSERT(oc->m_n_in != 0 || oc->m_n_out != 0); - CRM_DEV_ASSERT(oc->m_n_in == 0 || oc->m_n_out == 0); if(AM_I_DC && need_transition(fsa_state)) { fsa_membership_copy->id = instance; trigger_transition = TRUE; } break; case OC_EV_MS_EVICTED: update_quorum = TRUE; register_fsa_input(C_FSA_INTERNAL, I_STOP, NULL); break; default: crm_err("Unknown CCM event: %d", event); } if(update_quorum && ccm_have_quorum(event) == FALSE) { /* did we just loose quorum? */ if(fsa_have_quorum && need_transition(fsa_state)) { crm_info("Quorum lost: triggering transition (%s)", ccm_event_name(event)); trigger_transition = TRUE; } fsa_have_quorum = FALSE; } else if(update_quorum) { crm_debug("Updating quorum after event %s", ccm_event_name(event)); fsa_have_quorum = TRUE; } if(update_cache) { crm_debug("Updating cache after event %s", ccm_event_name(event)); crm_malloc0(event_data, sizeof(struct crmd_ccm_data_s)); if(event_data == NULL) { return; } event_data->event = event; if(data != NULL) { event_data->oc = copy_ccm_oc_data(data); } register_fsa_input_adv( C_CCM_CALLBACK, I_CCM_EVENT, event_data, trigger_transition?A_TE_CANCEL:A_NOTHING, FALSE, __FUNCTION__); if (event_data->oc) { crm_free(event_data->oc); event_data->oc = NULL; } crm_free(event_data); } else if(trigger_transition) { crm_debug("Scheduling transition after event %s", ccm_event_name(event)); register_fsa_action(A_TE_CANCEL); } oc_ev_callback_done(cookie); return; } void crmd_cib_connection_destroy(gpointer user_data) { if(is_set(fsa_input_register, R_SHUTDOWN)) { crm_info("Connection to the CIB terminated..."); return; } /* eventually this will trigger a reconnect, not a shutdown */ crm_err("Connection to the CIB terminated..."); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); clear_bit_inplace(fsa_input_register, R_CIB_CONNECTED); return; } longclock_t fsa_start = 0; longclock_t fsa_stop = 0; longclock_t fsa_diff = 0; gboolean crm_fsa_trigger(gpointer user_data) { unsigned int fsa_diff_ms = 0; if(fsa_diff_max_ms > 0) { fsa_start = time_longclock(); } s_crmd_fsa(C_FSA_INTERNAL); if(fsa_diff_max_ms > 0) { fsa_stop = time_longclock(); fsa_diff = sub_longclock(fsa_stop, fsa_start); fsa_diff_ms = longclockto_ms(fsa_diff); if(fsa_diff_ms > fsa_diff_max_ms) { crm_err("FSA took %dms to complete", fsa_diff_ms); } else if(fsa_diff_ms > fsa_diff_warn_ms) { crm_warn("FSA took %dms to complete", fsa_diff_ms); } } return TRUE; } diff --git a/crm/crmd/ccm.c b/crm/crmd/ccm.c index cd111ca245..5ea6a99ddf 100644 --- a/crm/crmd/ccm.c +++ b/crm/crmd/ccm.c @@ -1,671 +1,661 @@ -/* $Id: ccm.c,v 1.81 2005/05/23 16:24:21 andrew Exp $ */ +/* $Id: ccm.c,v 1.82 2005/05/23 20:28:26 andrew Exp $ */ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* put these first so that uuid_t is defined without conflicts */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include void oc_ev_special(const oc_ev_t *, oc_ev_class_t , int ); int register_with_ccm(ll_cluster_t *hb_cluster); void msg_ccm_join(const HA_Message *msg, void *foo); void crmd_ccm_msg_callback(oc_ed_t event, void *cookie, size_t size, const void *data); gboolean ghash_node_clfree(gpointer key, gpointer value, gpointer user_data); void ghash_update_cib_node(gpointer key, gpointer value, gpointer user_data); #define CCM_EVENT_DETAIL 0 #define CCM_EVENT_DETAIL_PARTIAL 1 oc_ev_t *fsa_ev_token; int num_ccm_register_fails = 0; int max_ccm_register_fails = 30; /* A_CCM_CONNECT */ enum crmd_fsa_input do_ccm_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { int ret; int fsa_ev_fd; gboolean did_fail = FALSE; if(action & A_CCM_DISCONNECT){ oc_ev_unregister(fsa_ev_token); } if(action & A_CCM_CONNECT) { crm_debug_3("Registering with CCM"); ret = oc_ev_register(&fsa_ev_token); if (ret != 0) { crm_warn("CCM registration failed"); did_fail = TRUE; } if(did_fail == FALSE) { crm_debug_3("Setting up CCM callbacks"); ret = oc_ev_set_callback(fsa_ev_token, OC_EV_MEMB_CLASS, crmd_ccm_msg_callback, NULL); if (ret != 0) { crm_warn("CCM callback not set"); did_fail = TRUE; } } if(did_fail == FALSE) { oc_ev_special(fsa_ev_token, OC_EV_MEMB_CLASS, 0/*don't care*/); crm_debug_3("Activating CCM token"); ret = oc_ev_activate(fsa_ev_token, &fsa_ev_fd); if (ret != 0){ crm_warn("CCM Activation failed"); did_fail = TRUE; } } if(did_fail) { num_ccm_register_fails++; oc_ev_unregister(fsa_ev_token); if(num_ccm_register_fails < max_ccm_register_fails) { crm_warn("CCM Connection failed" " %d times (%d max)", num_ccm_register_fails, max_ccm_register_fails); crm_timer_start(wait_timer); crmd_fsa_stall(NULL); return I_NULL; } else { crm_err("CCM Activation failed %d (max) times", num_ccm_register_fails); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); return I_NULL; } } crm_info("CCM Activation passed... all set to go!"); G_main_add_fd(G_PRIORITY_HIGH, fsa_ev_fd, FALSE, ccm_dispatch, fsa_ev_token, default_ipc_connection_destroy); } if(action & ~(A_CCM_CONNECT|A_CCM_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__); } return I_NULL; } /* A_CCM_EVENT */ enum crmd_fsa_input do_ccm_event(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { enum crmd_fsa_input return_input = I_NULL; oc_ed_t event; const oc_ev_membership_t *oc = NULL; struct crmd_ccm_data_s *ccm_data = fsa_typed_data(fsa_dt_ccm); if(ccm_data == NULL) { crm_err("No data provided to FSA function"); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); return I_NULL; } else if(msg_data->fsa_cause != C_CCM_CALLBACK) { crm_err("FSA function called in response to incorect input"); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); return I_NULL; } event = ccm_data->event; oc = ccm_data->oc; crm_info("event=%s", ccm_event_name(event)); if(CCM_EVENT_DETAIL /*constant condition*/ || CCM_EVENT_DETAIL_PARTIAL) { ccm_event_detail(oc, event); } if (OC_EV_MS_EVICTED == event) { /* todo: drop back to S_PENDING instead */ /* get out... NOW! * * go via the error recovery process so that HA will * restart us if required */ register_fsa_error(cause, I_ERROR, msg_data->data); return I_NULL; } - /* My understanding is that we will never get both - * node leaving *and* node joining callbacks at the - * same time. - * - * Gshi concurrs with this - * - * Various logic would need to change if this is not - * the case... so assert its true - */ CRM_DEV_ASSERT(oc->m_n_in != 0 || oc->m_n_out != 0); - CRM_DEV_ASSERT(oc->m_n_in == 0 || oc->m_n_out == 0); if(AM_I_DC) { /* Membership changed, remind everyone we're here. * This will aid detection of duplicate DCs */ HA_Message *no_op = create_request( CRM_OP_NOOP, NULL, NULL, CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL); send_msg_via_ha(fsa_cluster_conn, no_op); } else if(oc->m_n_out != 0) { /* Possibly move this logic to ghash_update_cib_node() */ unsigned lpc = 0; int offset = oc->m_out_idx; for(lpc=0; lpc < oc->m_n_out; lpc++) { const char *uname = oc->m_array[offset+lpc].node_uname; if(uname == NULL) { crm_err("CCM node had no name"); continue; } else if(safe_str_eq(uname, fsa_our_dc)) { crm_info("Our DC node (%s) left the cluster", uname); register_fsa_input(cause, I_ELECTION, NULL); } } } return return_input; } /* A_CCM_UPDATE_CACHE */ /* * Take the opportunity to update the node status in the CIB as well */ enum crmd_fsa_input do_ccm_update_cache(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { enum crmd_fsa_input next_input = I_NULL; int lpc, offset; GHashTable *members = NULL; oc_ed_t event; const oc_ev_membership_t *oc = NULL; oc_node_list_t *tmp = NULL, *membership_copy = NULL; struct crmd_ccm_data_s *ccm_data = fsa_typed_data(fsa_dt_ccm); if(ccm_data == NULL) { crm_err("No data provided to FSA function"); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); return I_NULL; } event = ccm_data->event; oc = ccm_data->oc; crm_info("Updating CCM cache after a \"%s\" event.", ccm_event_name(event)); crm_debug("instance=%d, nodes=%d, new=%d, lost=%d n_idx=%d, " "new_idx=%d, old_idx=%d", oc->m_instance, oc->m_n_member, oc->m_n_in, oc->m_n_out, oc->m_memb_idx, oc->m_in_idx, oc->m_out_idx); #define ALAN_DEBUG 1 #ifdef ALAN_DEBUG { /* * Size (Size + 2) / 2 * * 3 (3+2)/2 = 5 / 2 = 2 * 4 (4+2)/2 = 6 / 2 = 3 * 5 (5+2)/2 = 7 / 2 = 3 * 6 (6+2)/2 = 8 / 2 = 4 * 7 (7+2)/2 = 9 / 2 = 4 */ int clsize = (oc->m_out_idx - oc->m_n_member); int plsize = (clsize + 2)/2; gboolean plurality = (oc->m_n_member >= plsize); gboolean Q = ccm_have_quorum(event); if(clsize == 2) { if (!Q) { crm_err("2 nodes w/o quorum"); } } else if(Q && !plurality) { crm_err("Quorum w/o plurality (%d/%d nodes)", oc->m_n_member, clsize); } else if(plurality && !Q) { crm_err("Plurality w/o Quorum (%d/%d nodes)", oc->m_n_member, clsize); } else { crm_debug("Quorum(%s) and plurality (%d/%d) agree.", Q?"true":"false", oc->m_n_member, clsize); } } #endif crm_malloc0(membership_copy, sizeof(oc_node_list_t)); if(membership_copy == NULL) { crm_crit("Couldnt create membership copy - out of memory"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return I_NULL; } membership_copy->id = oc->m_instance; membership_copy->last_event = event; crm_debug_3("Copying members"); /*--*-- All Member Nodes --*--*/ offset = oc->m_memb_idx; membership_copy->members_size = oc->m_n_member; if(membership_copy->members_size > 0) { membership_copy->members = g_hash_table_new(g_str_hash, g_str_equal); members = membership_copy->members; for(lpc=0; lpc < membership_copy->members_size; lpc++) { oc_node_t *member = NULL; crm_debug_3("Copying member %d", lpc); crm_malloc0(member, sizeof(oc_node_t)); if(member == NULL) { continue; } member->node_id = oc->m_array[offset+lpc].node_id; member->node_born_on = oc->m_array[offset+lpc].node_born_on; member->node_uname = NULL; if(oc->m_array[offset+lpc].node_uname != NULL) { member->node_uname = crm_strdup(oc->m_array[offset+lpc].node_uname); } else { crm_err("Node %d had a NULL uname", member->node_id); } g_hash_table_insert( members, member->node_uname, member); } } else { membership_copy->members = NULL; } crm_debug_3("Copying new members"); /*--*-- New Member Nodes --*--*/ offset = oc->m_in_idx; membership_copy->new_members_size = oc->m_n_in; if(membership_copy->new_members_size > 0) { membership_copy->new_members = g_hash_table_new(g_str_hash, g_str_equal); members = membership_copy->new_members; for(lpc=0; lpc < membership_copy->new_members_size; lpc++) { oc_node_t *member = NULL; crm_malloc0(member, sizeof(oc_node_t)); if(member == NULL) { continue; } member->node_uname = NULL; member->node_id = oc->m_array[offset+lpc].node_id; member->node_born_on = oc->m_array[offset+lpc].node_born_on; if(oc->m_array[offset+lpc].node_uname != NULL) { member->node_uname = crm_strdup(oc->m_array[offset+lpc].node_uname); } else { crm_err("Node %d had a NULL uname", member->node_id); } g_hash_table_insert( members, member->node_uname, member); g_hash_table_insert(members, member->node_uname, member); } } else { membership_copy->new_members = NULL; } crm_debug_3("Copying dead members"); /*--*-- Recently Dead Member Nodes --*--*/ offset = oc->m_out_idx; membership_copy->dead_members_size = oc->m_n_out; if(membership_copy->dead_members_size > 0) { membership_copy->dead_members = g_hash_table_new(g_str_hash, g_str_equal); members = membership_copy->dead_members; for(lpc=0; lpc < membership_copy->dead_members_size; lpc++) { oc_node_t *member = NULL; crm_malloc0(member, sizeof(oc_node_t)); if(member == NULL) { continue; } member->node_id = oc->m_array[offset+lpc].node_id; member->node_born_on = oc->m_array[offset+lpc].node_born_on; member->node_uname = NULL; if(oc->m_array[offset+lpc].node_uname != NULL) { member->node_uname = crm_strdup(oc->m_array[offset+lpc].node_uname); } else { crm_err("Node %d had a NULL uname", member->node_id); } g_hash_table_insert( members, member->node_uname, member); g_hash_table_insert(members, member->node_uname, member); } } else { membership_copy->dead_members = NULL; } tmp = fsa_membership_copy; fsa_membership_copy = membership_copy; crm_info("Updated membership cache with %d (%d new, %d lost) members", g_hash_table_size(fsa_membership_copy->members), g_hash_table_size(fsa_membership_copy->new_members), g_hash_table_size(fsa_membership_copy->dead_members)); /* Free the old copy */ if(tmp != NULL) { if(tmp->members != NULL) g_hash_table_foreach_remove( tmp->members, ghash_node_clfree, NULL); if(tmp->new_members != NULL) g_hash_table_foreach_remove( tmp->new_members, ghash_node_clfree, NULL); if(tmp->dead_members != NULL) g_hash_table_foreach_remove( tmp->dead_members, ghash_node_clfree, NULL); crm_free(tmp); } crm_debug_3("Free'd old copies"); set_bit_inplace(fsa_input_register, R_CCM_DATA); if(cur_state != S_STARTING && cur_state != S_STOPPING) { crm_debug_3("Updating the CIB from CCM cache"); do_update_cib_nodes(NULL, FALSE); } return next_input; } void ccm_event_detail(const oc_ev_membership_t *oc, oc_ed_t event) { int lpc; gboolean member = FALSE; member = FALSE; crm_info("-----------------------"); crm_debug("trans=%d, nodes=%d, new=%d, lost=%d n_idx=%d, " "new_idx=%d, old_idx=%d", oc->m_instance, oc->m_n_member, oc->m_n_in, oc->m_n_out, oc->m_memb_idx, oc->m_in_idx, oc->m_out_idx); #if !CCM_EVENT_DETAIL_PARTIAL for(lpc=0; lpc < oc->m_n_member; lpc++) { crm_info("\tCURRENT: %s [nodeid=%d, born=%d]", oc->m_array[oc->m_memb_idx+lpc].node_uname, oc->m_array[oc->m_memb_idx+lpc].node_id, oc->m_array[oc->m_memb_idx+lpc].node_born_on); if(safe_str_eq(fsa_our_uname, oc->m_array[oc->m_memb_idx+lpc].node_uname)) { member = TRUE; } } if (member == FALSE) { crm_warn("MY NODE IS NOT IN CCM THE MEMBERSHIP LIST"); } #endif for(lpc=0; lpc<(int)oc->m_n_in; lpc++) { crm_info("\tNEW: %s [nodeid=%d, born=%d]", oc->m_array[oc->m_in_idx+lpc].node_uname, oc->m_array[oc->m_in_idx+lpc].node_id, oc->m_array[oc->m_in_idx+lpc].node_born_on); } for(lpc=0; lpc<(int)oc->m_n_out; lpc++) { crm_info("\tLOST: %s [nodeid=%d, born=%d]", oc->m_array[oc->m_out_idx+lpc].node_uname, oc->m_array[oc->m_out_idx+lpc].node_id, oc->m_array[oc->m_out_idx+lpc].node_born_on); if(fsa_our_uname != NULL && 0 == strcmp(fsa_our_uname, oc->m_array[oc->m_out_idx+lpc].node_uname)) { crm_err("We're not part of the cluster anymore"); } } crm_info("-----------------------"); } int register_with_ccm(ll_cluster_t *hb_cluster) { return 0; } void msg_ccm_join(const HA_Message *msg, void *foo) { crm_debug_2("###### Received ccm_join message..."); if (msg != NULL) { crm_debug_2("[type=%s]", ha_msg_value(msg, F_TYPE)); crm_debug_2("[orig=%s]", ha_msg_value(msg, F_ORIG)); crm_debug_2("[to=%s]", ha_msg_value(msg, F_TO)); crm_debug_2("[status=%s]", ha_msg_value(msg, F_STATUS)); crm_debug_2("[info=%s]", ha_msg_value(msg, F_COMMENT)); crm_debug_2("[rsc_hold=%s]", ha_msg_value(msg, F_RESOURCES)); crm_debug_2("[stable=%s]", ha_msg_value(msg, F_ISSTABLE)); crm_debug_2("[rtype=%s]", ha_msg_value(msg, F_RTYPE)); crm_debug_2("[ts=%s]", ha_msg_value(msg, F_TIME)); crm_debug_2("[seq=%s]", ha_msg_value(msg, F_SEQ)); crm_debug_2("[generation=%s]", ha_msg_value(msg, F_HBGENERATION)); /* crm_debug_2("[=%s]", ha_msg_value(msg, F_)); */ } return; } struct update_data_s { crm_data_t *updates; const char *state; const char *join; }; crm_data_t* do_update_cib_nodes(crm_data_t *updates, gboolean overwrite) { int call_options = cib_scope_local|cib_quorum_override; struct update_data_s update_data; crm_data_t *fragment = updates; crm_data_t *tmp = NULL; if(updates == NULL) { fragment = create_cib_fragment(NULL, NULL); set_xml_property_copy( fragment, XML_ATTR_SECTION, XML_CIB_TAG_STATUS); } tmp = find_xml_node(fragment, XML_TAG_CIB, TRUE); tmp = get_object_root(XML_CIB_TAG_STATUS, tmp); CRM_DEV_ASSERT(tmp != NULL); update_data.updates = tmp; update_data.state = XML_BOOLEAN_YES; update_data.join = NULL; if(overwrite) { crm_debug_2("Performing a join update based on CCM data"); update_data.join = CRMD_JOINSTATE_PENDING; if(fsa_membership_copy->members != NULL) { g_hash_table_foreach(fsa_membership_copy->members, ghash_update_cib_node, &update_data); } } else { call_options = call_options|cib_inhibit_bcast; crm_debug("Inhibiting bcast for CCM updates"); if(fsa_membership_copy->members != NULL) { g_hash_table_foreach(fsa_membership_copy->new_members, ghash_update_cib_node, &update_data); } update_data.state = XML_BOOLEAN_NO; update_data.join = CRMD_JOINSTATE_DOWN; if(fsa_membership_copy->dead_members != NULL) { g_hash_table_foreach(fsa_membership_copy->dead_members, ghash_update_cib_node, &update_data); } } if(update_data.updates != NULL) { fsa_cib_conn->cmds->modify(fsa_cib_conn, XML_CIB_TAG_STATUS, fragment, NULL, call_options); free_xml(fragment); } return NULL; } void ghash_update_cib_node(gpointer key, gpointer value, gpointer user_data) { crm_data_t *tmp1 = NULL; const char *node_uname = (const char*)key; struct update_data_s* data = (struct update_data_s*)user_data; if(safe_str_eq(data->state, XML_BOOLEAN_NO)) { crm_info("Node %s has left the cluster", node_uname); } crm_debug_2("%s processing %s (%s)", __FUNCTION__, node_uname, data->state); tmp1 = create_node_state(node_uname, node_uname, NULL, data->state, NULL, data->join, NULL, __FUNCTION__); add_node_copy(data->updates, tmp1); free_xml(tmp1); } gboolean ghash_node_clfree(gpointer key, gpointer value, gpointer user_data) { /* value->node_uname is free'd as "key" */ if(key != NULL) { crm_free(key); } if(value != NULL) { crm_free(value); } return TRUE; }