diff --git a/crm/crmd/callbacks.c b/crm/crmd/callbacks.c index 374662d650..c28bfe7758 100644 --- a/crm/crmd/callbacks.c +++ b/crm/crmd/callbacks.c @@ -1,599 +1,599 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include GHashTable *crmd_peer_state = NULL; crm_data_t *find_xml_in_hamessage(const HA_Message * msg); void crmd_ha_connection_destroy(gpointer user_data); /* From join_dc... 
*/ extern gboolean check_join_state( enum crmd_fsa_state cur_state, const char *source); /* #define MAX_EMPTY_CALLBACKS 20 */ /* int empty_callbacks = 0; */ gboolean crmd_ha_msg_dispatch(IPC_Channel *channel, gpointer user_data) { int lpc = 0; ll_cluster_t *hb_cluster = (ll_cluster_t*)user_data; while(lpc < 2 && hb_cluster->llc_ops->msgready(hb_cluster)) { if(channel->ch_status != IPC_CONNECT) { /* there really is no point continuing */ break; } lpc++; /* invoke the callbacks but dont block */ hb_cluster->llc_ops->rcvmsg(hb_cluster, 0); } crm_debug_3("%d HA messages dispatched", lpc); G_main_set_trigger(fsa_source); if (channel && (channel->ch_status != IPC_CONNECT)) { crm_crit("Lost connection to heartbeat service."); return FALSE; } return TRUE; } void crmd_ha_msg_callback(const HA_Message * msg, void* private_data) { ha_msg_input_t *new_input = NULL; oc_node_t *from_node = NULL; const char *from = ha_msg_value(msg, F_ORIG); const char *seq = ha_msg_value(msg, F_SEQ); const char *op = ha_msg_value(msg, F_CRM_TASK); const char *sys_to = ha_msg_value(msg, F_CRM_SYS_TO); const char *sys_from = ha_msg_value(msg, F_CRM_SYS_FROM); CRM_DEV_ASSERT(from != NULL); if(fsa_membership_copy == NULL) { crm_debug("Ignoring HA messages until we are" " connected to the CCM (%s op from %s)", op, from); crm_log_message_adv( LOG_MSG, "HA[inbound]: Ignore (No CCM)", msg); return; } from_node = g_hash_table_lookup(fsa_membership_copy->members, from); if(from_node == NULL) { int level = LOG_DEBUG; if(safe_str_eq(op, CRM_OP_VOTE)) { level = LOG_WARNING; } else if(AM_I_DC && safe_str_eq(op, CRM_OP_JOIN_ANNOUNCE)) { level = LOG_WARNING; } else if(safe_str_eq(sys_from, CRM_SYSTEM_DC)) { level = LOG_WARNING; } do_crm_log(level, __FILE__, __FUNCTION__, "Ignoring HA message (op=%s) from %s: not in our" " membership list (size=%d)", op, from, g_hash_table_size(fsa_membership_copy->members)); crm_log_message_adv(LOG_MSG, "HA[inbound]: CCM Discard", msg); } else if(AM_I_DC && 
safe_str_eq(sys_from, CRM_SYSTEM_DC)
		  && safe_str_neq(from, fsa_our_uname)) {
		/* we are the DC and another node also claims to be one */
		crm_err("Another DC detected: %s (op=%s)", from, op);
		crm_log_message_adv(
			LOG_WARNING, "HA[inbound]: Duplicate DC", msg);
		new_input = new_ha_msg_input(msg);
		/* make sure the election happens NOW */
		register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL,
				       new_input, __FUNCTION__);

#if 0
		/* still thinking about this one...
		 * could create a timing issue if we don't notice the
		 * election before a new DC is elected.
		 */
	} else if(fsa_our_dc != NULL
		  && safe_str_eq(sys_from, CRM_SYSTEM_DC)
		  && safe_str_neq(from, fsa_our_dc)) {
		crm_warn("Ignoring message from wrong DC: %s vs. %s ",
			 from, fsa_our_dc);
		crm_log_message_adv(LOG_WARNING, "HA[inbound]: wrong DC", msg);
#endif
	} else if(safe_str_eq(sys_to, CRM_SYSTEM_DC) && AM_I_DC == FALSE) {
		/* addressed to the DC and we are not it */
		crm_debug_2("Ignoring message for the DC [F_SEQ=%s]", seq);
		crm_log_message_adv(LOG_DEBUG_4, "HA[inbound]: ignore", msg);
		return;

	} else if(safe_str_eq(from, fsa_our_uname)
		  && safe_str_eq(op, CRM_OP_VOTE)) {
		/* our own vote coming back to us */
		crm_log_message_adv(LOG_DEBUG_4, "HA[inbound]", msg);
		crm_debug_2("Ignoring our own vote [F_SEQ=%s]: own vote", seq);
		return;

	} else if(AM_I_DC && safe_str_eq(op, CRM_OP_HBEAT)) {
		/* NOTE(review): this branch only tests AM_I_DC, not the
		 * sender; it relies on the duplicate-DC branch above having
		 * caught DC messages from other nodes - confirm
		 */
		crm_debug_2("Ignoring our own heartbeat [F_SEQ=%s]", seq);
		crm_log_message_adv(LOG_DEBUG_4, "HA[inbound]: own heartbeat", msg);
		return;

	} else {
		/* everything else is handed to the FSA for routing */
		crm_debug_3("Processing message");
		crm_log_message_adv(LOG_MSG, "HA[inbound]", msg);
		new_input = new_ha_msg_input(msg);
		register_fsa_input(C_HA_MESSAGE, I_ROUTER, new_input);
	}

#if 0
	if(ha_msg_value(msg, XML_ATTR_REFERENCE) == NULL) {
		ha_msg_add(new_input->msg, XML_ATTR_REFERENCE, seq);
	}
#endif

	/* new_input is still NULL here when the message was discarded above */
	delete_ha_msg_input(new_input);

	return;
}

/*
 * Apparently returning TRUE means "stay connected, keep doing stuff".
* Returning FALSE means "we're all done, close the connection" */ gboolean crmd_ipc_msg_callback(IPC_Channel *client, gpointer user_data) { int lpc = 0; IPC_Message *msg = NULL; ha_msg_input_t *new_input = NULL; crmd_client_t *curr_client = (crmd_client_t*)user_data; gboolean stay_connected = TRUE; crm_debug_2("Processing IPC message from %s", curr_client->table_key); while(lpc == 0 && client->ops->is_message_pending(client)) { if (client->ch_status == IPC_DISCONNECT) { /* The message which was pending for us is that * the IPC status is now IPC_DISCONNECT */ break; } if (client->ops->recv(client, &msg) != IPC_OK) { perror("Receive failure:"); crm_err("[%s] [receive failure]", curr_client->table_key); stay_connected = FALSE; break; } else if (msg == NULL) { crm_err("[%s] [no message this time]", curr_client->table_key); continue; } lpc++; new_input = new_ipc_msg_input(msg); msg->msg_done(msg); crm_debug_2("Processing msg from %s", curr_client->table_key); crm_log_message_adv(LOG_MSG, "CRMd[inbound]", new_input->msg); if(crmd_authorize_message(new_input, curr_client)) { register_fsa_input(C_IPC_MESSAGE, I_ROUTER, new_input); } delete_ha_msg_input(new_input); msg = NULL; new_input = NULL; } crm_debug_2("Processed %d messages", lpc); if (client->ch_status == IPC_DISCONNECT) { stay_connected = FALSE; process_client_disconnect(curr_client); } G_main_set_trigger(fsa_source); return stay_connected; } gboolean lrm_dispatch(IPC_Channel*src_not_used, gpointer user_data) { int num_msgs = 0; ll_lrm_t *lrm = (ll_lrm_t*)user_data; crm_debug_3("received callback"); num_msgs = lrm->lrm_ops->rcvmsg(lrm, FALSE); if(num_msgs < 1) { crm_err("lrm->lrm_ops->rcvmsg() failed, connection lost?"); clear_bit_inplace(fsa_input_register, R_LRM_CONNECTED); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); return FALSE; } return TRUE; } void lrm_op_callback(lrm_op_t* op) { CRM_DEV_ASSERT(op != NULL); if(crm_assert_failed) { return; } crm_debug("received callback: %s/%s (%s)", op->op_type, 
op->rsc_id, op_status2text(op->op_status));

	/* Make sure the LRM events are received in order */
	register_fsa_input_later(C_LRM_OP_CALLBACK, I_LRM_EVENT, op);
}

/*
 * Heartbeat node-status callback.
 *
 * Only DEADSTATUS notifications are acted on: a node_state entry for the
 * node is built, XML_CIB_ATTR_CLEAR_SHUTDOWN is set on it, and the result
 * is applied via update_local_cib().
 */
void
crmd_ha_status_callback(
	const char *node, const char * status,	void* private_data)
{
	crm_data_t *update = NULL;

	crm_debug_3("received callback");
	crm_notice("Status update: Node %s now has status [%s]",node,status);

	if(safe_str_neq(status, DEADSTATUS)) {
		crm_debug_3("nstatus callback was not for a dead node");
		return;
	}

	/* this node is toast */
	update = create_node_state(
		node, node, status, NULL, NULL, NULL, NULL, __FUNCTION__);

	crm_xml_add(update, XML_CIB_ATTR_CLEAR_SHUTDOWN, XML_BOOLEAN_TRUE);

	/* this change should not be broadcast */
	/* NOTE(review): ownership of the fragment created here is not
	 * visible in this file - confirm update_local_cib() releases it
	 */
	update_local_cib(create_cib_fragment(update, NULL));
	G_main_set_trigger(fsa_source);
	free_xml(update);
}

/*
 * Heartbeat client-status callback.
 *
 * Ignores clients other than CRM_SYSTEM_CRMD.  JOINSTATUS/LEAVESTATUS are
 * mapped to ONLINESTATUS/OFFLINESTATUS and the peer's state is recorded
 * in crmd_peer_state before anything else happens.
 */
void
crmd_client_status_callback(const char * node, const char * client,
		 const char * status, void * private)
{
	const char    *join = NULL;
	const char   *extra = NULL;
	crm_data_t *  update = NULL;

	crm_debug_3("received callback");
	if(safe_str_neq(client, CRM_SYSTEM_CRMD)) {
		return;
	}

	if(safe_str_eq(status, JOINSTATUS)){
		status = ONLINESTATUS;
		extra  = XML_CIB_ATTR_CLEAR_SHUTDOWN;

	} else if(safe_str_eq(status, LEAVESTATUS)){
		status = OFFLINESTATUS;
-		join   = CRMD_JOINSTATE_DOWN;
+		join   = CRMD_STATE_INACTIVE;
		extra  = XML_CIB_ATTR_CLEAR_SHUTDOWN;
	}

	set_bit_inplace(fsa_input_register, R_PEER_DATA);
	g_hash_table_replace(
		crmd_peer_state, crm_strdup(node), crm_strdup(status));

	/* no CIB updates or elections while starting up or shutting down */
	if(fsa_state == S_STARTING || fsa_state == S_STOPPING) {
		return;
	}

	crm_notice("Status update: Client %s/%s now has status [%s]",
		   node, client, status);

	if(safe_str_eq(node, fsa_our_dc)
	   && safe_str_eq(status, OFFLINESTATUS)) {
		/* did our DC leave us */
		crm_info("Got client status callback - our DC is dead");
		register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL);

	} else {
		crm_data_t *fragment = NULL;
		crm_debug_3("Got client status callback");
		update = create_node_state(
			node, node, NULL, NULL, status,
join, NULL, __FUNCTION__); crm_xml_add(update, extra, XML_BOOLEAN_TRUE); fragment = create_cib_fragment(update, NULL); /* it is safe to keep these updates on the local node * each node updates their own CIB */ fsa_cib_conn->cmds->modify( fsa_cib_conn, XML_CIB_TAG_STATUS, fragment, NULL, cib_inhibit_bcast|cib_scope_local|cib_quorum_override); free_xml(fragment); free_xml(update); if(AM_I_DC && safe_str_eq(status, OFFLINESTATUS)) { g_hash_table_remove(confirmed_nodes, node); g_hash_table_remove(finalized_nodes, node); g_hash_table_remove(integrated_nodes, node); g_hash_table_remove(welcomed_nodes, node); check_join_state(fsa_state, __FUNCTION__); } } G_main_set_trigger(fsa_source); } void crmd_ha_connection_destroy(gpointer user_data) { crm_crit("Heartbeat has left us"); /* this is always an error */ /* feed this back into the FSA */ register_fsa_input(C_HA_DISCONNECT, I_ERROR, NULL); } gboolean crmd_client_connect(IPC_Channel *client_channel, gpointer user_data) { if (client_channel == NULL) { crm_err("Channel was NULL"); } else if (client_channel->ch_status == IPC_DISCONNECT) { crm_err("Channel was disconnected"); } else { crmd_client_t *blank_client = NULL; crm_debug_3("Channel connected"); crm_malloc0(blank_client, sizeof(crmd_client_t)); if (blank_client == NULL) { return FALSE; } client_channel->ops->set_recv_qlen(client_channel, 100); client_channel->ops->set_send_qlen(client_channel, 100); blank_client->client_channel = client_channel; blank_client->sub_sys = NULL; blank_client->uuid = NULL; blank_client->table_key = NULL; blank_client->client_source = G_main_add_IPC_Channel( G_PRIORITY_LOW, client_channel, FALSE, crmd_ipc_msg_callback, blank_client, default_ipc_connection_destroy); } return TRUE; } gboolean ccm_dispatch(int fd, gpointer user_data) { int rc = 0; oc_ev_t *ccm_token = (oc_ev_t*)user_data; gboolean was_error = FALSE; crm_debug_3("received callback"); rc = oc_ev_handle_event(ccm_token); if(rc != 0) { crm_err("CCM connection appears to have 
failed: rc=%d.", rc); register_fsa_input(C_CCM_CALLBACK, I_ERROR, NULL); was_error = TRUE; } G_main_set_trigger(fsa_source); return !was_error; } static gboolean fsa_have_quorum = FALSE; void crmd_ccm_msg_callback( oc_ed_t event, void *cookie, size_t size, const void *data) { int instance = -1; gboolean update_cache = FALSE; struct crmd_ccm_data_s *event_data = NULL; const oc_ev_membership_t *membership = data; gboolean update_quorum = FALSE; gboolean trigger_transition = FALSE; crm_debug_3("received callback"); if(data != NULL) { instance = membership->m_instance; } crm_info("Quorum %s after event=%s (id=%d)", ccm_have_quorum(event)?"(re)attained":"lost", ccm_event_name(event), instance); switch(event) { case OC_EV_MS_NEW_MEMBERSHIP: case OC_EV_MS_INVALID:/* fall through */ update_cache = TRUE; update_quorum = TRUE; break; case OC_EV_MS_NOT_PRIMARY: #if UNTESTED if(AM_I_DC == FALSE) { break; } /* tell the TE to pretend it had completed and stop */ /* side effect: we'll end up in S_IDLE */ register_fsa_action(A_TE_HALT, TRUE); #endif break; case OC_EV_MS_PRIMARY_RESTORED: fsa_membership_copy->id = instance; if(AM_I_DC && need_transition(fsa_state)) { trigger_transition = TRUE; } break; case OC_EV_MS_EVICTED: update_quorum = TRUE; register_fsa_input(C_FSA_INTERNAL, I_STOP, NULL); break; default: crm_err("Unknown CCM event: %d", event); } if(update_quorum && ccm_have_quorum(event) == FALSE) { /* did we just loose quorum? 
*/ if(fsa_have_quorum && need_transition(fsa_state)) { crm_info("Quorum lost: triggering transition (%s)", ccm_event_name(event)); trigger_transition = TRUE; } fsa_have_quorum = FALSE; } else if(update_quorum) { crm_debug_2("Updating quorum after event %s", ccm_event_name(event)); fsa_have_quorum = TRUE; } if(trigger_transition) { crm_debug_2("Scheduling transition after event %s", ccm_event_name(event)); /* make sure that when we query the CIB that it has * the changes that triggered the transition */ switch(event) { case OC_EV_MS_NEW_MEMBERSHIP: case OC_EV_MS_INVALID: case OC_EV_MS_PRIMARY_RESTORED: fsa_membership_copy->id = instance; break; default: break; } if(update_cache == FALSE) { /* a stand-alone transition */ register_fsa_action(A_TE_CANCEL); } } if(update_cache) { crm_debug_2("Updating cache after event %s", ccm_event_name(event)); crm_malloc0(event_data, sizeof(struct crmd_ccm_data_s)); if(event_data == NULL) { return; } event_data->event = event; if(data != NULL) { event_data->oc = copy_ccm_oc_data(data); } register_fsa_input_adv( C_CCM_CALLBACK, I_CCM_EVENT, event_data, trigger_transition?A_TE_CANCEL:A_NOTHING, FALSE, __FUNCTION__); if (event_data->oc) { crm_free(event_data->oc); event_data->oc = NULL; } crm_free(event_data); } oc_ev_callback_done(cookie); return; } void crmd_cib_connection_destroy(gpointer user_data) { if(is_set(fsa_input_register, R_SHUTDOWN)) { crm_info("Connection to the CIB terminated..."); return; } /* eventually this will trigger a reconnect, not a shutdown */ crm_err("Connection to the CIB terminated..."); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); clear_bit_inplace(fsa_input_register, R_CIB_CONNECTED); return; } longclock_t fsa_start = 0; longclock_t fsa_stop = 0; longclock_t fsa_diff = 0; gboolean crm_fsa_trigger(gpointer user_data) { unsigned int fsa_diff_ms = 0; if(fsa_diff_max_ms > 0) { fsa_start = time_longclock(); } s_crmd_fsa(C_FSA_INTERNAL); if(fsa_diff_max_ms > 0) { fsa_stop = time_longclock(); fsa_diff = 
sub_longclock(fsa_stop, fsa_start); fsa_diff_ms = longclockto_ms(fsa_diff); if(fsa_diff_ms > fsa_diff_max_ms) { crm_err("FSA took %dms to complete", fsa_diff_ms); } else if(fsa_diff_ms > fsa_diff_warn_ms) { crm_warn("FSA took %dms to complete", fsa_diff_ms); } } return TRUE; } diff --git a/crm/crmd/ccm.c b/crm/crmd/ccm.c index f8b1e7330a..436b584c21 100644 --- a/crm/crmd/ccm.c +++ b/crm/crmd/ccm.c @@ -1,656 +1,656 @@ -/* $Id: ccm.c,v 1.86 2005/07/03 22:15:49 alan Exp $ */ +/* $Id: ccm.c,v 1.87 2005/08/25 08:29:37 andrew Exp $ */ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
*
 * You should have received a copy of the GNU General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/* put these first so that uuid_t is defined without conflicts */
#include #include #include #include #include #include #include #include #include #include #include #include #include #include

void oc_ev_special(const oc_ev_t *, oc_ev_class_t , int );

int register_with_ccm(ll_cluster_t *hb_cluster);

void msg_ccm_join(const HA_Message *msg, void *foo);

void crmd_ccm_msg_callback(oc_ed_t event,
			     void *cookie,
			     size_t size,
			     const void *data);

gboolean ghash_node_clfree(gpointer key, gpointer value, gpointer user_data);
void ghash_update_cib_node(gpointer key, gpointer value, gpointer user_data);

#define CCM_EVENT_DETAIL 0
#define CCM_EVENT_DETAIL_PARTIAL 1

oc_ev_t *fsa_ev_token;
/* consecutive failed attempts to connect to the CCM, and the limit after
 * which do_ccm_control() raises I_FAIL instead of retrying
 */
int num_ccm_register_fails = 0;
int max_ccm_register_fails = 30;

/*	 A_CCM_CONNECT	*/
/*
 * Connect to and/or disconnect from the CCM, depending on which of
 * A_CCM_CONNECT / A_CCM_DISCONNECT are set in action.
 *
 * Connecting is register -> set callback -> activate; the first step
 * that fails sets did_fail and skips the remaining ones.
 * Always returns I_NULL.
 */
enum crmd_fsa_input
do_ccm_control(long long action,
		enum crmd_fsa_cause cause,
		enum crmd_fsa_state cur_state,
		enum crmd_fsa_input current_input,
		fsa_data_t *msg_data)
{
	int      ret;
	int	 fsa_ev_fd;
	gboolean did_fail = FALSE;

	if(action & A_CCM_DISCONNECT){
		oc_ev_unregister(fsa_ev_token);
	}

	if(action & A_CCM_CONNECT) {
		crm_debug_3("Registering with CCM");
		ret = oc_ev_register(&fsa_ev_token);
		if (ret != 0) {
			crm_warn("CCM registration failed");
			did_fail = TRUE;
		}

		if(did_fail == FALSE) {
			crm_debug_3("Setting up CCM callbacks");
			ret = oc_ev_set_callback(fsa_ev_token, OC_EV_MEMB_CLASS,
						 crmd_ccm_msg_callback, NULL);
			if (ret != 0) {
				crm_warn("CCM callback not set");
				did_fail = TRUE;
			}
		}
		if(did_fail == FALSE) {
			oc_ev_special(fsa_ev_token, OC_EV_MEMB_CLASS, 0/*don't care*/);

			crm_debug_3("Activating CCM token");
			ret = oc_ev_activate(fsa_ev_token, &fsa_ev_fd);
			if (ret != 0){
				crm_warn("CCM Activation failed");
				did_fail = TRUE;
			}
		}

		if(did_fail) {
			num_ccm_register_fails++;
			/* NOTE(review): this also runs when oc_ev_register()
			 * itself failed - confirm unregistering is valid then
			 */
			oc_ev_unregister(fsa_ev_token);
if(num_ccm_register_fails < max_ccm_register_fails) {
				/* retry: start wait_timer and stall the FSA */
				crm_warn("CCM Connection failed"
					 " %d times (%d max)",
					 num_ccm_register_fails,
					 max_ccm_register_fails);

				crm_timer_start(wait_timer);
				crmd_fsa_stall(NULL);
				return I_NULL;

			} else {
				/* out of retries */
				crm_err("CCM Activation failed %d (max) times",
					num_ccm_register_fails);
				register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
				return I_NULL;
			}
		}

		crm_info("CCM Activation passed... all set to go!");

		/* have the mainloop call ccm_dispatch() for the CCM fd */
		G_main_add_fd(G_PRIORITY_HIGH, fsa_ev_fd, FALSE, ccm_dispatch,
			      fsa_ev_token,
			      default_ipc_connection_destroy);

	}

	if(action & ~(A_CCM_CONNECT|A_CCM_DISCONNECT)) {
		crm_err("Unexpected action %s in %s",
		       fsa_action2string(action), __FUNCTION__);
	}

	return I_NULL;
}

/*	 A_CCM_EVENT	*/
/*
 * React to a CCM event queued by crmd_ccm_msg_callback().
 *
 * Logs the membership detail, raises I_ERROR on eviction, and otherwise
 * either announces ourselves (when DC) or starts an election if our DC is
 * among the nodes that left.
 */
enum crmd_fsa_input
do_ccm_event(long long action,
	     enum crmd_fsa_cause cause,
	     enum crmd_fsa_state cur_state,
	     enum crmd_fsa_input current_input,
	     fsa_data_t *msg_data)
{
	enum crmd_fsa_input return_input = I_NULL;
	oc_ed_t event;
	const oc_ev_membership_t *oc = NULL;
	struct crmd_ccm_data_s *ccm_data = fsa_typed_data(fsa_dt_ccm);

	if(ccm_data == NULL) {
		crm_err("No data provided to FSA function");
		register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
		return I_NULL;

	} else if(msg_data->fsa_cause != C_CCM_CALLBACK) {
		crm_err("FSA function called in response to incorect input");
		register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
		return I_NULL;
	}

	event = ccm_data->event;
	oc = ccm_data->oc;

	/* NOTE(review): oc is dereferenced from here on without a NULL
	 * check, while crmd_ccm_msg_callback() only fills it in when the
	 * event carried data - confirm it cannot be NULL here
	 */
	ccm_event_detail(oc, event);

	if (OC_EV_MS_EVICTED == event) {
		/* todo: drop back to S_PENDING instead */
		/* get out... NOW!
		 *
		 * go via the error recovery process so that HA will
		 *    restart us if required
		 */
		register_fsa_error(cause, I_ERROR, msg_data->data);
		return I_NULL;
	}

	CRM_DEV_ASSERT(oc->m_n_in != 0 || oc->m_n_out != 0);

	if(AM_I_DC) {
		/* Membership changed, remind everyone we're here.
* This will aid detection of duplicate DCs
		 */
		HA_Message *no_op = create_request(
			CRM_OP_NOOP, NULL, NULL,
			CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL);

		send_msg_via_ha(fsa_cluster_conn, no_op);

	} else if(oc->m_n_out != 0) {
		/* Possibly move this logic to ghash_update_cib_node() */
		unsigned lpc = 0;
		int offset = oc->m_out_idx;
		/* scan the departed nodes for our DC */
		for(lpc=0; lpc < oc->m_n_out; lpc++) {
			const char *uname = oc->m_array[offset+lpc].node_uname;
			if(uname == NULL) {
				crm_err("CCM node had no name");
				continue;

			} else if(safe_str_eq(uname, fsa_our_dc)) {
				crm_warn("Our DC node (%s) left the cluster",
					 uname);
				register_fsa_input(cause, I_ELECTION, NULL);
			}
		}
	}

	return return_input;
}

/*	 A_CCM_UPDATE_CACHE	*/
/*
 * Take the opportunity to update the node status in the CIB as well
 *
 * Builds a fresh oc_node_list_t (members, new members, dead members) from
 * the CCM data attached to the FSA input and installs it as
 * fsa_membership_copy, releasing the previous copy.
 */
enum crmd_fsa_input
do_ccm_update_cache(long long action,
		    enum crmd_fsa_cause cause,
		    enum crmd_fsa_state cur_state,
		    enum crmd_fsa_input current_input,
		    fsa_data_t *msg_data)
{
	enum crmd_fsa_input next_input = I_NULL;
	unsigned int	lpc;
	int		offset;
	GHashTable *members = NULL;
	oc_ed_t event;
	const oc_ev_membership_t *oc = NULL;
	oc_node_list_t *tmp = NULL, *membership_copy = NULL;
	struct crmd_ccm_data_s *ccm_data = fsa_typed_data(fsa_dt_ccm);

	if(ccm_data == NULL) {
		crm_err("No data provided to FSA function");
		register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
		return I_NULL;
	}

	event = ccm_data->event;
	oc = ccm_data->oc;

	crm_debug_2("Updating CCM cache after a \"%s\" event.",
		 ccm_event_name(event));

	crm_debug_2("instance=%d, nodes=%d, new=%d, lost=%d n_idx=%d, "
		 "new_idx=%d, old_idx=%d",
		 oc->m_instance,
		 oc->m_n_member,
		 oc->m_n_in,
		 oc->m_n_out,
		 oc->m_memb_idx,
		 oc->m_in_idx,
		 oc->m_out_idx);
/* sanity check: report when the CCM quorum verdict and a simple
 * plurality calculation disagree
 */
#define ALAN_DEBUG 1
#ifdef ALAN_DEBUG
	{
		/*
		 *	Size	(Size + 2) / 2
		 *
		 *	3	(3+2)/2	= 5 / 2 = 2
		 *	4	(4+2)/2	= 6 / 2 = 3
		 *	5	(5+2)/2	= 7 / 2 = 3
		 *	6	(6+2)/2	= 8 / 2 = 4
		 *	7	(7+2)/2	= 9 / 2 = 4
		 */
		/* NOTE(review): clsize is derived from
		 * m_out_idx - m_n_member; confirm that really yields the
		 * cluster size
		 */
		unsigned int		clsize = (oc->m_out_idx - oc->m_n_member);
		unsigned int		plsize = (clsize + 2)/2;
		gboolean	plurality = (oc->m_n_member >= plsize);
		gboolean	Q = 
ccm_have_quorum(event); if(clsize == 2) { if (!Q) { crm_err("2 nodes w/o quorum"); } } else if(Q && !plurality) { crm_err("Quorum w/o plurality (%d/%d nodes)", oc->m_n_member, clsize); } else if(plurality && !Q) { crm_err("Plurality w/o Quorum (%d/%d nodes)", oc->m_n_member, clsize); } else { crm_debug_2("Quorum(%s) and plurality (%d/%d) agree.", Q?"true":"false", oc->m_n_member, clsize); } } #endif crm_malloc0(membership_copy, sizeof(oc_node_list_t)); if(membership_copy == NULL) { crm_crit("Couldnt create membership copy - out of memory"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return I_NULL; } membership_copy->id = oc->m_instance; membership_copy->last_event = event; crm_debug_3("Copying members"); /*--*-- All Member Nodes --*--*/ offset = oc->m_memb_idx; membership_copy->members_size = oc->m_n_member; if(membership_copy->members_size > 0) { membership_copy->members = g_hash_table_new(g_str_hash, g_str_equal); members = membership_copy->members; for(lpc=0; lpc < membership_copy->members_size; lpc++) { oc_node_t *member = NULL; crm_debug_3("Copying member %d", lpc); crm_malloc0(member, sizeof(oc_node_t)); if(member == NULL) { continue; } member->node_id = oc->m_array[offset+lpc].node_id; member->node_born_on = oc->m_array[offset+lpc].node_born_on; member->node_uname = NULL; if(oc->m_array[offset+lpc].node_uname != NULL) { member->node_uname = crm_strdup(oc->m_array[offset+lpc].node_uname); } else { crm_err("Node %d had a NULL uname", member->node_id); } g_hash_table_insert( members, member->node_uname, member); } } else { membership_copy->members = NULL; } crm_debug_3("Copying new members"); /*--*-- New Member Nodes --*--*/ offset = oc->m_in_idx; membership_copy->new_members_size = oc->m_n_in; if(membership_copy->new_members_size > 0) { membership_copy->new_members = g_hash_table_new(g_str_hash, g_str_equal); members = membership_copy->new_members; for(lpc=0; lpc < membership_copy->new_members_size; lpc++) { oc_node_t *member = NULL; 
crm_malloc0(member, sizeof(oc_node_t)); if(member == NULL) { continue; } member->node_uname = NULL; member->node_id = oc->m_array[offset+lpc].node_id; member->node_born_on = oc->m_array[offset+lpc].node_born_on; if(oc->m_array[offset+lpc].node_uname != NULL) { member->node_uname = crm_strdup(oc->m_array[offset+lpc].node_uname); } else { crm_err("Node %d had a NULL uname", member->node_id); } g_hash_table_insert( members, member->node_uname, member); g_hash_table_insert(members, member->node_uname, member); } } else { membership_copy->new_members = NULL; } crm_debug_3("Copying dead members"); /*--*-- Recently Dead Member Nodes --*--*/ offset = oc->m_out_idx; membership_copy->dead_members_size = oc->m_n_out; if(membership_copy->dead_members_size > 0) { membership_copy->dead_members = g_hash_table_new(g_str_hash, g_str_equal); members = membership_copy->dead_members; for(lpc=0; lpc < membership_copy->dead_members_size; lpc++) { oc_node_t *member = NULL; crm_malloc0(member, sizeof(oc_node_t)); if(member == NULL) { continue; } member->node_id = oc->m_array[offset+lpc].node_id; member->node_born_on = oc->m_array[offset+lpc].node_born_on; member->node_uname = NULL; CRM_DEV_ASSERT(oc->m_array[offset+lpc].node_uname != NULL); if(oc->m_array[offset+lpc].node_uname == NULL) { continue; } member->node_uname = crm_strdup(oc->m_array[offset+lpc].node_uname); g_hash_table_insert( members, member->node_uname, member); g_hash_table_insert(members, member->node_uname, member); } } else { membership_copy->dead_members = NULL; } tmp = fsa_membership_copy; fsa_membership_copy = membership_copy; crm_debug_2("Updated membership cache with %d (%d new, %d lost) members", g_hash_table_size(fsa_membership_copy->members), g_hash_table_size(fsa_membership_copy->new_members), g_hash_table_size(fsa_membership_copy->dead_members)); /* Free the old copy */ if(tmp != NULL) { if(tmp->members != NULL) g_hash_table_foreach_remove( tmp->members, ghash_node_clfree, NULL); if(tmp->new_members != NULL) 
g_hash_table_foreach_remove( tmp->new_members, ghash_node_clfree, NULL); if(tmp->dead_members != NULL) g_hash_table_foreach_remove( tmp->dead_members, ghash_node_clfree, NULL); crm_free(tmp); } crm_debug_3("Free'd old copies"); set_bit_inplace(fsa_input_register, R_CCM_DATA); if(cur_state != S_STARTING && cur_state != S_STOPPING) { crm_debug_3("Updating the CIB from CCM cache"); do_update_cib_nodes(NULL, FALSE); } return next_input; } void ccm_event_detail(const oc_ev_membership_t *oc, oc_ed_t event) { int lpc; gboolean member = FALSE; member = FALSE; crm_debug_2("-----------------------"); crm_info("%s: trans=%d, nodes=%d, new=%d, lost=%d n_idx=%d, " "new_idx=%d, old_idx=%d", ccm_event_name(event), oc->m_instance, oc->m_n_member, oc->m_n_in, oc->m_n_out, oc->m_memb_idx, oc->m_in_idx, oc->m_out_idx); #if !CCM_EVENT_DETAIL_PARTIAL for(lpc=0; lpc < oc->m_n_member; lpc++) { crm_info("\tCURRENT: %s [nodeid=%d, born=%d]", oc->m_array[oc->m_memb_idx+lpc].node_uname, oc->m_array[oc->m_memb_idx+lpc].node_id, oc->m_array[oc->m_memb_idx+lpc].node_born_on); if(safe_str_eq(fsa_our_uname, oc->m_array[oc->m_memb_idx+lpc].node_uname)) { member = TRUE; } } if (member == FALSE) { crm_warn("MY NODE IS NOT IN CCM THE MEMBERSHIP LIST"); } #endif for(lpc=0; lpc<(int)oc->m_n_in; lpc++) { crm_info("\tNEW: %s [nodeid=%d, born=%d]", oc->m_array[oc->m_in_idx+lpc].node_uname, oc->m_array[oc->m_in_idx+lpc].node_id, oc->m_array[oc->m_in_idx+lpc].node_born_on); } for(lpc=0; lpc<(int)oc->m_n_out; lpc++) { crm_info("\tLOST: %s [nodeid=%d, born=%d]", oc->m_array[oc->m_out_idx+lpc].node_uname, oc->m_array[oc->m_out_idx+lpc].node_id, oc->m_array[oc->m_out_idx+lpc].node_born_on); if(fsa_our_uname != NULL && 0 == strcmp(fsa_our_uname, oc->m_array[oc->m_out_idx+lpc].node_uname)) { crm_err("We're not part of the cluster anymore"); } } crm_debug_2("-----------------------"); } int register_with_ccm(ll_cluster_t *hb_cluster) { return 0; } void msg_ccm_join(const HA_Message *msg, void *foo) { 
crm_debug_2("###### Received ccm_join message..."); if (msg != NULL) { crm_debug_2("[type=%s]", ha_msg_value(msg, F_TYPE)); crm_debug_2("[orig=%s]", ha_msg_value(msg, F_ORIG)); crm_debug_2("[to=%s]", ha_msg_value(msg, F_TO)); crm_debug_2("[status=%s]", ha_msg_value(msg, F_STATUS)); crm_debug_2("[info=%s]", ha_msg_value(msg, F_COMMENT)); crm_debug_2("[rsc_hold=%s]", ha_msg_value(msg, F_RESOURCES)); crm_debug_2("[stable=%s]", ha_msg_value(msg, F_ISSTABLE)); crm_debug_2("[rtype=%s]", ha_msg_value(msg, F_RTYPE)); crm_debug_2("[ts=%s]", ha_msg_value(msg, F_TIME)); crm_debug_2("[seq=%s]", ha_msg_value(msg, F_SEQ)); crm_debug_2("[generation=%s]", ha_msg_value(msg, F_HBGENERATION)); /* crm_debug_2("[=%s]", ha_msg_value(msg, F_)); */ } return; } struct update_data_s { crm_data_t *updates; const char *state; const char *join; }; crm_data_t* do_update_cib_nodes(crm_data_t *updates, gboolean overwrite) { int call_options = cib_scope_local|cib_quorum_override; struct update_data_s update_data; crm_data_t *fragment = updates; crm_data_t *tmp = NULL; if(updates == NULL) { fragment = create_cib_fragment(NULL, NULL); crm_xml_add(fragment, XML_ATTR_SECTION, XML_CIB_TAG_STATUS); } tmp = find_xml_node(fragment, XML_TAG_CIB, TRUE); tmp = get_object_root(XML_CIB_TAG_STATUS, tmp); CRM_DEV_ASSERT(tmp != NULL); update_data.updates = tmp; update_data.state = XML_BOOLEAN_YES; update_data.join = NULL; if(overwrite) { crm_debug_2("Performing a join update based on CCM data"); update_data.join = CRMD_JOINSTATE_PENDING; if(fsa_membership_copy->members != NULL) { g_hash_table_foreach(fsa_membership_copy->members, ghash_update_cib_node, &update_data); } } else { call_options = call_options|cib_inhibit_bcast; crm_debug_2("Inhibiting bcast for CCM updates"); if(fsa_membership_copy->members != NULL) { g_hash_table_foreach(fsa_membership_copy->new_members, ghash_update_cib_node, &update_data); } update_data.state = XML_BOOLEAN_NO; - update_data.join = CRMD_JOINSTATE_DOWN; + update_data.join = 
CRMD_STATE_INACTIVE; if(fsa_membership_copy->dead_members != NULL) { g_hash_table_foreach(fsa_membership_copy->dead_members, ghash_update_cib_node, &update_data); } } if(update_data.updates != NULL) { fsa_cib_conn->cmds->modify(fsa_cib_conn, XML_CIB_TAG_STATUS, fragment, NULL, call_options); free_xml(fragment); } return NULL; } void ghash_update_cib_node(gpointer key, gpointer value, gpointer user_data) { crm_data_t *tmp1 = NULL; const char *node_uname = (const char*)key; struct update_data_s* data = (struct update_data_s*)user_data; crm_debug_2("%s processing %s (%s)", __FUNCTION__, node_uname, data->state); tmp1 = create_node_state(node_uname, node_uname, NULL, data->state, NULL, data->join, NULL, __FUNCTION__); add_node_copy(data->updates, tmp1); free_xml(tmp1); } gboolean ghash_node_clfree(gpointer key, gpointer value, gpointer user_data) { /* value->node_uname is free'd as "key" */ if(key != NULL) { crm_free(key); } if(value != NULL) { crm_free(value); } return TRUE; } diff --git a/crm/crmd/join_dc.c b/crm/crmd/join_dc.c index a1ca8cada5..5666f82905 100644 --- a/crm/crmd/join_dc.c +++ b/crm/crmd/join_dc.c @@ -1,626 +1,631 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include GHashTable *welcomed_nodes = NULL; GHashTable *integrated_nodes = NULL; GHashTable *finalized_nodes = NULL; GHashTable *confirmed_nodes = NULL; char *max_epoch = NULL; char *max_generation_from = NULL; crm_data_t *max_generation_xml = NULL; void initialize_join(gboolean before); gboolean finalize_join_for(gpointer key, gpointer value, gpointer user_data); void join_send_offer(gpointer key, gpointer value, gpointer user_data); void finalize_sync_callback(const HA_Message *msg, int call_id, int rc, crm_data_t *output, void *user_data); gboolean process_join_ack_msg( const char *join_from, crm_data_t *lrm_update, int join_id); gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); void join_update_complete_callback(const HA_Message *msg, int call_id, int rc, crm_data_t *output, void *user_data); void finalize_join(const char *caller); static int current_join_id = 0; /* A_DC_JOIN_OFFER_ALL */ enum crmd_fsa_input do_dc_join_offer_all(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { /* reset everyones status back to down or in_ccm in the CIB * * any nodes that are active in the CIB but not in the CCM list * will be seen as offline by the PE anyway */ do_update_cib_nodes(NULL, TRUE); crm_info("0) Offering membership to %d clients", fsa_membership_copy->members_size); initialize_join(TRUE); current_join_id++; g_hash_table_foreach( fsa_membership_copy->members, join_send_offer, NULL); /* dont waste time by invoking the PE yet; */ crm_debug("1) Waiting on %d outstanding join acks", g_hash_table_size(welcomed_nodes)); return I_NULL; } /* A_DC_JOIN_OFFER_ONE */ enum crmd_fsa_input 
do_dc_join_offer_one(long long action,
		     enum crmd_fsa_cause cause,
		     enum crmd_fsa_state cur_state,
		     enum crmd_fsa_input current_input,
		     fsa_data_t *msg_data)
{
	/*
	 * Answer a single node's join announcement: (re)send an offer to
	 * ourselves and to the announcing node, and cancel any running
	 * transition.  Always returns I_NULL.
	 *
	 * The announcing node's name is read from F_CRM_HOST_FROM in the
	 * triggering message.  If there is no message, or it carries no
	 * such field, the request is logged and dropped.
	 */
	oc_node_t member;
	ha_msg_input_t *welcome = fsa_typed_data(fsa_dt_ha_msg);
	const char *join_to = NULL;

	if(welcome == NULL) {
		crm_err("Attempt to send welcome message "
			"without a message to reply to!");
		return I_NULL;
	}

	join_to = cl_get_string(welcome->msg, F_CRM_HOST_FROM);
	if(join_to == NULL) {
		/* without a sender there is nobody to make an offer to;
		 * previously the NULL went on to crm_strdup() and the
		 * log calls below */
		crm_err("Attempt to send welcome message "
			"without a host to reply to!");
		return I_NULL;
	}

	/* note: it _is_ possible that a node will have been
	 *  sick or starting up when the original offer was made.
	 *  however, it will either re-announce itself in due course
	 *  _or_ we can re-store the original offer on the client.
	 *
	 * (The "Re-offering membership" debug message that used to sit
	 *  here was guarded by a local that was never assigned, so it
	 *  could not fire; the dead test has been dropped.)
	 */

	crm_info("Processing annouce request from %s in state %s",
		 join_to, fsa_state2string(cur_state));

	/* always offer to the DC (ourselves)
	 * this ensures the correct value for max_generation_from
	 *
	 * join_send_offer only reads node_uname from the member it is
	 * given, so that is the only field filled in here
	 */
	member.node_uname = crm_strdup(fsa_our_uname);
	join_send_offer(NULL, &member, NULL);
	crm_free(member.node_uname);

	member.node_uname = crm_strdup(join_to);
	join_send_offer(NULL, &member, NULL);
	crm_free(member.node_uname);

	/* this was a genuine join request, cancel any existing
	 * transition and invoke the PE
	 */
	if(need_transition(fsa_state)) {
		register_fsa_action(A_TE_CANCEL);
	}

	/* dont waste time by invoking the pe yet; */
	crm_debug("1) Waiting on %d outstanding join acks",
		  g_hash_table_size(welcomed_nodes));

	return I_NULL;
}

/*	 A_DC_JOIN_PROCESS_REQ	*/
enum crmd_fsa_input
do_dc_join_req(long long action,
	       enum crmd_fsa_cause cause,
	       enum crmd_fsa_state cur_state,
	       enum crmd_fsa_input current_input,
	       fsa_data_t *msg_data)
{
	crm_data_t *generation = NULL;

	int join_id = -1;
	gboolean ack_nack_bool = TRUE;
	const char *ack_nack = CRMD_JOINSTATE_MEMBER;
	ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg);

	const char *join_from =
cl_get_string(join_ack->msg,F_CRM_HOST_FROM); const char *ref = cl_get_string(join_ack->msg,XML_ATTR_REFERENCE); gpointer join_node = g_hash_table_lookup(fsa_membership_copy->members, join_from); crm_debug_3("2) Processing req from %s", join_from); generation = join_ack->xml; ha_msg_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id); crm_log_xml_debug_2(max_generation_xml, "Max generation"); crm_log_xml_debug_2(generation, "Their generation"); if(join_node == NULL) { crm_err("Node %s is not a member", join_from); ack_nack_bool = FALSE; } else if(generation == NULL) { crm_err("Generation was NULL"); ack_nack_bool = FALSE; } else if(join_id != current_join_id) { crm_debug("Response from %s was for invalid join: %d vs. %d", join_from, join_id, current_join_id); check_join_state(cur_state, __FUNCTION__); return I_NULL; } else if(max_generation_xml == NULL) { max_generation_xml = copy_xml(generation); max_generation_from = crm_strdup(join_from); } else if(cib_compare_generation(max_generation_xml, generation) < 0) { crm_debug("%s has a better generation number than" " the current max %s", join_from, max_generation_from); crm_free(max_generation_from); free_xml(max_generation_xml); max_generation_from = crm_strdup(join_from); max_generation_xml = copy_xml(join_ack->xml); } if(ack_nack_bool == FALSE) { /* NACK this client */ - ack_nack = CRMD_JOINSTATE_DOWN; + ack_nack = CRMD_STATE_INACTIVE; crm_err("2) NACK'ing node %s (ref %s)", join_from, ref); } else { crm_debug("2) Welcoming node %s after ACK (ref %s)", join_from, ref); } /* add them to our list of CRMD_STATE_ACTIVE nodes */ g_hash_table_insert( integrated_nodes, crm_strdup(join_from), crm_strdup(ack_nack)); crm_debug_2("%u nodes have been integrated", g_hash_table_size(integrated_nodes)); g_hash_table_remove(welcomed_nodes, join_from); if(check_join_state(cur_state, __FUNCTION__) == FALSE) { /* dont waste time by invoking the PE yet; */ crm_debug_2("Still waiting on %d outstanding join acks", 
g_hash_table_size(welcomed_nodes)); } return I_NULL; } #define JOIN_AFTER_SYNC 1 /* A_DC_JOIN_FINALIZE */ enum crmd_fsa_input do_dc_join_finalize(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { enum cib_errors rc = cib_ok; /* This we can do straight away and avoid clients timing us out * while we compute the latest CIB */ #if JOIN_AFTER_SYNC crm_debug("Finializing join for %d clients", g_hash_table_size(integrated_nodes)); #else crm_debug("Notifying %d clients of join results", g_hash_table_size(integrated_nodes)); g_hash_table_foreach_remove( integrated_nodes, finalize_join_for, NULL); #endif clear_bit_inplace(fsa_input_register, R_HAVE_CIB); if(max_generation_from == NULL || safe_str_eq(max_generation_from, fsa_our_uname)){ set_bit_inplace(fsa_input_register, R_HAVE_CIB); } if(is_set(fsa_input_register, R_HAVE_CIB) == FALSE) { /* ask for the agreed best CIB */ crm_info("Asking %s for its copy of the CIB", crm_str(max_generation_from)); set_bit_inplace(fsa_input_register, R_CIB_ASKED); fsa_cib_conn->call_timeout = 10; rc = fsa_cib_conn->cmds->sync_from( fsa_cib_conn, max_generation_from, NULL, cib_quorum_override); fsa_cib_conn->call_timeout = 0; /* back to the default */ add_cib_op_callback(rc, FALSE, crm_strdup(max_generation_from), finalize_sync_callback); return I_NULL; } finalize_join(__FUNCTION__); return I_NULL; } void finalize_sync_callback(const HA_Message *msg, int call_id, int rc, crm_data_t *output, void *user_data) { CRM_DEV_ASSERT(cib_not_master != rc); clear_bit_inplace(fsa_input_register, R_CIB_ASKED); if(rc == cib_remote_timeout) { crm_err("Sync from %s resulted in an error: %s." 
" Use what we have...", (char*)user_data, cib_error2string(rc)); #if 0 /* restart the whole join process */ register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION_DC, NULL, NULL, __FUNCTION__); return; #else rc = cib_ok; #endif } if(rc < cib_ok) { crm_err("Sync from %s resulted in an error: %s", (char*)user_data, cib_error2string(rc)); register_fsa_error_adv( C_FSA_INTERNAL, I_ERROR, NULL, NULL, __FUNCTION__); } else if(AM_I_DC && fsa_state == S_FINALIZE_JOIN) { finalize_join(__FUNCTION__); } else { crm_debug("No longer the DC in S_FINALIZE_JOIN: %s/%s", AM_I_DC?"DC":"CRMd", fsa_state2string(fsa_state)); } crm_free(user_data); } void finalize_join(const char *caller) { crm_data_t *cib = createEmptyCib(); crm_data_t *cib_update = NULL; set_bit_inplace(fsa_input_register, R_HAVE_CIB); clear_bit_inplace(fsa_input_register, R_CIB_ASKED); set_uuid(fsa_cluster_conn, cib, XML_ATTR_DC_UUID, fsa_our_uname); crm_debug_3("Update %s in the CIB to our uuid: %s", XML_ATTR_DC_UUID, crm_element_value(cib, XML_ATTR_DC_UUID)); cib_update = create_cib_fragment(cib, NULL); fsa_cib_conn->cmds->modify( fsa_cib_conn, NULL, cib_update, NULL, cib_quorum_override); free_xml(cib_update); free_xml(cib); crm_debug_3("Bumping the epoch and syncing to %d clients", g_hash_table_size(finalized_nodes)); fsa_cib_conn->cmds->bump_epoch( fsa_cib_conn, cib_scope_local|cib_quorum_override); #if JOIN_AFTER_SYNC /* make sure dc_uuid is re-set to us */ if(check_join_state(fsa_state, caller) == FALSE) { crm_debug("Notifying %d clients of join results", g_hash_table_size(integrated_nodes)); g_hash_table_foreach_remove( integrated_nodes, finalize_join_for, NULL); } #else check_join_state(cur_state, caller); rc = fsa_cib_conn->cmds->sync(fsa_cib_conn, NULL, cib_quorum_override); #endif } /* A_DC_JOIN_PROCESS_ACK */ enum crmd_fsa_input do_dc_join_ack(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { ha_msg_input_t *join_ack = 
fsa_typed_data(fsa_dt_ha_msg); const char *join_from = cl_get_string(join_ack->msg, F_CRM_HOST_FROM); const char *op = cl_get_string(join_ack->msg, F_CRM_TASK); if(safe_str_neq(op, CRM_OP_JOIN_CONFIRM)) { crm_debug("Ignoring op=%s message", op); } else { int join_id = -1; ha_msg_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id); process_join_ack_msg(join_from, join_ack->xml, join_id); } return I_NULL; } gboolean process_join_ack_msg(const char *join_from, crm_data_t *lrm_update, int join_id) { /* now update them to "member" */ int call_id = 0; crm_data_t *update = NULL; crm_data_t *fragment = NULL; const char *join_state = NULL; crm_debug_2("Processing ack from %s", join_from); join_state = (const char *) g_hash_table_lookup(finalized_nodes, join_from); if(join_state == NULL) { crm_err("Join not in progress: ignoring join from %s", join_from); return FALSE; } else if(safe_str_neq(join_state, CRMD_JOINSTATE_MEMBER)) { crm_err("Node %s wasnt invited to join the cluster",join_from); g_hash_table_remove(finalized_nodes, join_from); return FALSE; } else if(join_id != current_join_id) { crm_err("Node %s responded to an invalid join: %d vs. %d", join_from, join_id, current_join_id); g_hash_table_remove(finalized_nodes, join_from); return FALSE; } g_hash_table_remove(finalized_nodes, join_from); if(g_hash_table_lookup(confirmed_nodes, join_from) != NULL) { crm_err("hash already contains confirmation from %s",join_from); } g_hash_table_insert(confirmed_nodes, crm_strdup(join_from), crm_strdup(CRMD_JOINSTATE_MEMBER)); crm_info("4) Updating node state to %s for %s", CRMD_JOINSTATE_MEMBER, join_from); - + +#if 0 + ???dig into the fragment and clear shutdown?? 
+ /* the slave will re-ask if it wants to be shutdown */ + crm_xml_add(lrm_update, XML_CIB_ATTR_CLEAR_SHUTDOWN, XML_BOOLEAN_TRUE); +#endif /* update CIB with the current LRM status from the node * We dont need to notify the TE of these updates, a transition will * be started in due time */ call_id = fsa_cib_conn->cmds->modify( fsa_cib_conn, XML_CIB_TAG_STATUS, lrm_update, NULL, cib_scope_local|cib_quorum_override); add_cib_op_callback(call_id, TRUE,NULL, join_update_complete_callback); free_xml(fragment); free_xml(update); return TRUE; } gboolean finalize_join_for(gpointer key, gpointer value, gpointer user_data) { const char *join_to = NULL; const char *join_state = NULL; HA_Message *acknak = NULL; if(key == NULL || value == NULL) { return TRUE; } join_to = (const char *)key; join_state = (const char *)value; /* make sure the node exists in the config section */ create_node_entry(join_to, join_to, CRMD_JOINSTATE_MEMBER); /* send the ack/nack to the node */ acknak = create_request( CRM_OP_JOIN_ACKNAK, NULL, join_to, CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL); ha_msg_add_int(acknak, F_CRM_JOIN_ID, current_join_id); /* set the ack/nack */ if(safe_str_eq(join_state, CRMD_JOINSTATE_MEMBER)) { crm_debug("3) ACK'ing join request from %s, state %s", join_to, join_state); ha_msg_add(acknak, CRM_OP_JOIN_ACKNAK, XML_BOOLEAN_TRUE); g_hash_table_insert( finalized_nodes, crm_strdup(join_to), crm_strdup(CRMD_JOINSTATE_MEMBER)); } else { crm_warn("3) NACK'ing join request from %s, state %s", join_to, join_state); ha_msg_add(acknak, CRM_OP_JOIN_ACKNAK, XML_BOOLEAN_FALSE); } send_msg_via_ha(fsa_cluster_conn, acknak); return TRUE; } void initialize_join(gboolean before) { /* clear out/reset a bunch of stuff */ crm_debug("Initializing join data"); g_hash_table_destroy(welcomed_nodes); g_hash_table_destroy(integrated_nodes); g_hash_table_destroy(finalized_nodes); g_hash_table_destroy(confirmed_nodes); if(before) { if(max_generation_from != NULL) { crm_free(max_generation_from); 
max_generation_from = NULL;
		}
		if(max_generation_xml != NULL) {
			free_xml(max_generation_xml);
			max_generation_xml = NULL;
		}
		clear_bit_inplace(fsa_input_register, R_HAVE_CIB);
		clear_bit_inplace(fsa_input_register, R_CIB_ASKED);
	}

	/* fresh tables; g_hash_destroy_str is registered as the destroy
	 * function for both keys and values of each one */
	welcomed_nodes = g_hash_table_new_full(
		g_str_hash, g_str_equal,
		g_hash_destroy_str, g_hash_destroy_str);
	integrated_nodes = g_hash_table_new_full(
		g_str_hash, g_str_equal,
		g_hash_destroy_str, g_hash_destroy_str);
	finalized_nodes = g_hash_table_new_full(
		g_str_hash, g_str_equal,
		g_hash_destroy_str, g_hash_destroy_str);
	confirmed_nodes = g_hash_table_new_full(
		g_str_hash, g_str_equal,
		g_hash_destroy_str, g_hash_destroy_str);
}

/*
 * Send a join offer to one node.  The signature fits
 * g_hash_table_foreach(); it is also called directly.
 *
 *   key       - not used
 *   value     - an oc_node_t; only its node_uname is read
 *   user_data - not used
 *
 * Any record of the node in the four join tables is dropped first.  The
 * offer is sent, and the node recorded in welcomed_nodes as pending,
 * only if crmd_peer_state lists the node as ONLINESTATUS.
 */
void
join_send_offer(gpointer key, gpointer value, gpointer user_data)
{
	const char *join_to = NULL;
	const char *crm_online = NULL;
	const oc_node_t *member = (const oc_node_t*)value;

	if(member != NULL) {
		join_to = member->node_uname;
	}

	if(join_to == NULL) {
		crm_err("No recipient for welcome message");
		return;
	}

	/* forget whatever stage of a previous join the node had reached */
	g_hash_table_remove(confirmed_nodes,  join_to);
	g_hash_table_remove(finalized_nodes,  join_to);
	g_hash_table_remove(integrated_nodes, join_to);
	g_hash_table_remove(welcomed_nodes,   join_to);

	crm_online = g_hash_table_lookup(crmd_peer_state, join_to);

	if(safe_str_eq(crm_online, ONLINESTATUS)) {
		HA_Message *offer = create_request(
			CRM_OP_JOIN_OFFER, NULL, join_to,
			CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL);

		ha_msg_add_int(offer, F_CRM_JOIN_ID, current_join_id);
		/* send the welcome */
		crm_debug("Sending %s(%d) to %s",
			  CRM_OP_JOIN_OFFER, current_join_id, join_to);

		send_msg_via_ha(fsa_cluster_conn, offer);

		g_hash_table_insert(welcomed_nodes, crm_strdup(join_to),
				    crm_strdup(CRMD_JOINSTATE_PENDING));
	} else {
		crm_debug("Peer process on %s is not active", join_to);
	}
}

/*
 * Decide whether the current join phase is complete and, if so, queue
 * the FSA input that moves on.
 *
 *   cur_state - FSA state to evaluate (S_INTEGRATION and
 *               S_FINALIZE_JOIN are the only states acted on)
 *   source    - caller name, used in log messages only
 *
 * Returns TRUE in exactly two cases:
 *   - S_INTEGRATION with no outstanding offers (I_INTEGRATED is queued)
 *   - S_FINALIZE_JOIN while R_HAVE_CIB is not yet set
 * Returns FALSE in every other case.
 *
 * NOTE(review): when S_FINALIZE_JOIN is complete, I_FINALIZED is queued
 * but the function still returns FALSE.  finalize_join() treats FALSE
 * as "go ahead and notify clients", so the asymmetry with the
 * S_INTEGRATION branch may be deliberate - confirm before changing it.
 */
gboolean
check_join_state(enum crmd_fsa_state cur_state, const char *source)
{
	crm_debug_2("Invoked by %s in state: %s",
		    source, fsa_state2string(cur_state));

	if(cur_state == S_INTEGRATION) {
		if(g_hash_table_size(welcomed_nodes) == 0) {
			crm_debug("Integration of %d peers complete: %s",
				  g_hash_table_size(integrated_nodes), source);
			register_fsa_input_before(
				C_FSA_INTERNAL, I_INTEGRATED, NULL);
			return TRUE;
		}

	} else if(cur_state == S_FINALIZE_JOIN) {
		if(is_set(fsa_input_register, R_HAVE_CIB) == FALSE) {
			crm_debug("Delaying I_FINALIZED until we have the CIB");
			return TRUE;

		} else if(g_hash_table_size(integrated_nodes) == 0
			  && g_hash_table_size(finalized_nodes) == 0) {
			/* nobody left to notify and nobody left to confirm */
			crm_debug("Join process complete: %s", source);
			register_fsa_input_later(
				C_FSA_INTERNAL, I_FINALIZED, NULL);

		} else if(g_hash_table_size(integrated_nodes) != 0
			  && g_hash_table_size(finalized_nodes) != 0) {
			crm_err("Waiting on %d integrated nodes"
				" AND %d confirmations",
				g_hash_table_size(integrated_nodes),
				g_hash_table_size(finalized_nodes));

		} else if(g_hash_table_size(integrated_nodes) != 0) {
			crm_debug("Still waiting on %d integrated nodes",
				  g_hash_table_size(integrated_nodes));

		} else if(g_hash_table_size(finalized_nodes) != 0) {
			crm_debug_2("Still waiting on %d confirmations",
				    g_hash_table_size(finalized_nodes));
		}
	}

	return FALSE;
}

/*
 * Completion callback for the CIB update submitted by
 * process_join_ack_msg.  On success the join state is re-evaluated; on
 * failure the message is logged and I_ERROR is raised.
 */
void
join_update_complete_callback(const HA_Message *msg, int call_id, int rc,
			      crm_data_t *output, void *user_data)
{
	/* not referenced by name below; presumably consumed by the
	 * register_fsa_error() macro - confirm against its definition */
	fsa_data_t *msg_data = NULL;

	if(rc == cib_ok) {
		check_join_state(fsa_state, __FUNCTION__);

	} else {
		crm_err("Join update failed");
		crm_log_message(LOG_DEBUG, msg);
		register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
	}
}
diff --git a/include/crm/crm.h b/include/crm/crm.h
index 581b19888d..59a4240c72 100644
--- a/include/crm/crm.h
+++ b/include/crm/crm.h
@@ -1,305 +1,305 @@
-/* $Id: crm.h,v 1.72 2005/08/17 08:44:57 andrew Exp $ */
+/* $Id: crm.h,v 1.73 2005/08/25 08:29:37 andrew Exp $ */
/*
 * Copyright (C) 2004 Andrew Beekhof
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
* * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef CRM__H #define CRM__H #include #include #include #include #include #include #include #ifdef MCHECK #include #endif #include #ifndef CRM_DEV_BUILD # define CRM_DEV_BUILD 0 #endif #define CRM_DEPRECATED_SINCE_2_0_1 1 #define CRM_DEPRECATED_SINCE_2_0_2 1 #define CRM_DEPRECATED_SINCE_2_0_3 1 #define CRM_DEPRECATED_SINCE_2_0_4 1 #define CRM_DEPRECATED_SINCE_2_1_0 1 #define ipc_call_diff_max_ms 5000 #define action_diff_warn_ms 5000 #define action_diff_max_ms 20000 #define fsa_diff_warn_ms 10000 #define fsa_diff_max_ms 30000 #include #define CRM_ASSERT(expr) if((expr) == FALSE) { \ do_crm_log(LOG_CRIT, __FILE__, __PRETTY_FUNCTION__, \ "Triggered dev assert at %s:%d : %s", \ __FILE__, __LINE__, #expr); \ abort(); \ } extern gboolean crm_assert_failed; #define CRM_DEV_ASSERT(expr) crm_assert_failed = FALSE; \ if((expr) == FALSE) { \ crm_assert_failed = TRUE; \ do_crm_log(CRM_DEV_BUILD?LOG_CRIT:LOG_ERR, \ __FILE__, __PRETTY_FUNCTION__, \ "Triggered dev assert at %s:%d : %s", \ __FILE__, __LINE__, #expr); \ if(CRM_DEV_BUILD) { \ abort(); \ } \ } /* Clean these up at some point, some probably should be runtime options */ #define WORKING_DIR HA_VARLIBDIR"/heartbeat/crm" #define CRM_SOCK_DIR HA_VARRUNDIR"/heartbeat/crm" #define BIN_DIR HA_LIBDIR"/heartbeat" #define SOCKET_LEN 1024 #define APPNAME_LEN 256 #define MAX_IPC_FAIL 5 #define CIB_FILENAME WORKING_DIR"/cib.xml" #define CIB_BACKUP WORKING_DIR"/cib_backup.xml" #define CRM_VERSION "1.0" #define MSG_LOG 1 #define DOT_FSA_ACTIONS 1 #define DOT_ALL_FSA_INPUTS 1 /* #define 
FSA_TRACE 1 */ #define INFINITY_S "INFINITY" #define MINUS_INFINITY_S "-INFINITY" #define INFINITY 1000000.0 /* Sub-systems */ #define CRM_SYSTEM_DC "dc" #define CRM_SYSTEM_DCIB "dcib" /* The master CIB */ #define CRM_SYSTEM_CIB "cib" #define CRM_SYSTEM_CRMD "crmd" #define CRM_SYSTEM_LRMD "lrmd" #define CRM_SYSTEM_PENGINE "pengine" #define CRM_SYSTEM_TENGINE "tengine" /* Valid operations */ #define CRM_OP_NOOP "noop" #define CRM_OP_JOIN_ANNOUNCE "join_announce" #define CRM_OP_JOIN_OFFER "join_offer" #define CRM_OP_JOIN_REQUEST "join_request" #define CRM_OP_JOIN_ACKNAK "join_ack_nack" #define CRM_OP_JOIN_CONFIRM "join_confirm" #define CRM_OP_DIE "die_no_respawn" #define CRM_OP_RETRIVE_CIB "retrieve_cib" #define CRM_OP_PING "ping" #define CRM_OP_VOTE "vote" #define CRM_OP_HELLO "hello" #define CRM_OP_HBEAT "dc_beat" #define CRM_OP_PECALC "pe_calc" #define CRM_OP_ABORT "abort" #define CRM_OP_QUIT "quit" #define CRM_OP_LOCAL_SHUTDOWN "start_shutdown" #define CRM_OP_SHUTDOWN_REQ "req_shutdown" #define CRM_OP_SHUTDOWN "do_shutdown" #define CRM_OP_FENCE "stonith" #define CRM_OP_EVENTCC "event_cc" #define CRM_OP_TEABORT "te_abort" #define CRM_OP_TEABORTED "te_abort_confirmed" /* we asked */ #define CRM_OP_TE_HALT "te_halt" #define CRM_OP_TECOMPLETE "te_complete" #define CRM_OP_TETIMEOUT "te_timeout" #define CRM_OP_TRANSITION "transition" #define CRM_OP_REGISTER "register" #define CRM_OP_DEBUG_UP "debug_inc" #define CRM_OP_DEBUG_DOWN "debug_dec" #define CRMD_STATE_ACTIVE "member" #define CRMD_STATE_INACTIVE "down" -#define CRMD_JOINSTATE_DOWN "down" +#define CRMD_JOINSTATE_DOWN CRMD_STATE_INACTIVE #define CRMD_JOINSTATE_PENDING "pending" -#define CRMD_JOINSTATE_MEMBER "member" +#define CRMD_JOINSTATE_MEMBER CRMD_STATE_ACTIVE #define CRMD_ACTION_START "start" #define CRMD_ACTION_STARTED "running" #define CRMD_ACTION_START_FAIL "start_failed" #define CRMD_ACTION_START_PENDING "starting" #define CRMD_ACTION_STOP "stop" #define CRMD_ACTION_STOPPED "stopped" #define 
CRMD_ACTION_STOP_FAIL "stop_failed" #define CRMD_ACTION_STOP_PENDING "stopping" #define CRMD_ACTION_NOTIFY "notify" #define CRMD_ACTION_NOTIFIED "notified" #define CRMD_ACTION_MON "monitor" #define CRMD_ACTION_MON_PENDING CRMD_ACTION_STARTED #define CRMD_ACTION_MON_OK CRMD_ACTION_STARTED #define CRMD_ACTION_MON_FAIL "monitor_failed" /* #define CRMD_ACTION_GENERIC "pending" */ #define CRMD_ACTION_GENERIC_PENDING "pending" #define CRMD_ACTION_GENERIC_OK "complete" #define CRMD_ACTION_GENERIC_FAIL "pending_failed" typedef GList* GListPtr; #define crm_atoi(text, default) atoi(text?text:default) extern gboolean safe_str_eq(const char *a, const char *b); extern gboolean safe_str_neq(const char *a, const char *b); #define slist_iter(child, child_type, parent, counter, a) \ { \ GListPtr __crm_iter_head = parent; \ child_type *child = NULL; \ int counter = 0; \ for(; __crm_iter_head != NULL; counter++) { \ child = __crm_iter_head->data; \ __crm_iter_head = __crm_iter_head->next; \ { a; } \ } \ } #define LOG_DEBUG_2 LOG_DEBUG+1 #define LOG_DEBUG_3 LOG_DEBUG+2 #define LOG_DEBUG_4 LOG_DEBUG+3 #define LOG_DEBUG_5 LOG_DEBUG+4 #define LOG_DEBUG_6 LOG_DEBUG+5 #define LOG_MSG LOG_DEBUG_3 #define crm_crit(w...) do_crm_log(LOG_CRIT, __FILE__, __FUNCTION__, w) #define crm_err(w...) do_crm_log(LOG_ERR, __FILE__, __FUNCTION__, w) #define crm_warn(w...) do_crm_log(LOG_WARNING, __FILE__, __FUNCTION__, w) #define crm_notice(w...) do_crm_log(LOG_NOTICE, __FILE__, __FUNCTION__, w) #define crm_info(w...) do_crm_log(LOG_INFO, __FILE__, __FUNCTION__, w) #define crm_log_maybe(level, fmt...) if(crm_log_level >= level) { \ do_crm_log(level, __FILE__, __FUNCTION__, fmt); \ } #define crm_debug(fmt...) crm_log_maybe(LOG_DEBUG, fmt) #define crm_debug_2(fmt...) crm_log_maybe(LOG_DEBUG_2, fmt) /* If this is not a developmental build, give the compiler every chance to * optimize these away */ #if CRM_DEV_BUILD # define crm_debug_3(fmt...) crm_log_maybe(LOG_DEBUG_3, fmt) # define crm_debug_4(fmt...) 
crm_log_maybe(LOG_DEBUG_4, fmt) # define crm_debug_5(fmt...) crm_log_maybe(LOG_DEBUG_5, fmt) # define crm_debug_6(fmt...) crm_log_maybe(LOG_DEBUG_6, fmt) #else # define crm_debug_3(w...) if(0) { do_crm_log(LOG_DEBUG, NULL, NULL, w); } # define crm_debug_4(w...) if(0) { do_crm_log(LOG_DEBUG, NULL, NULL, w); } # define crm_debug_5(w...) if(0) { do_crm_log(LOG_DEBUG, NULL, NULL, w); } # define crm_debug_6(w...) if(0) { do_crm_log(LOG_DEBUG, NULL, NULL, w); } #endif extern void crm_log_message_adv( int level, const char *alt_debugfile, const HA_Message *msg); #define crm_log_message(level, msg) if(crm_log_level >= level) { \ crm_log_message_adv(level, NULL, msg); \ } #define crm_do_action(level, actions) if(crm_log_level >= level) { \ actions; \ } #define crm_action_info(x) crm_do_action(LOG_INFO, x) #define crm_action_debug(x) crm_do_action(LOG_DEBUG, x) #define crm_action_debug_2(x) crm_do_action(LOG_DEBUG_2, x) #define crm_action_debug_3(x) crm_do_action(LOG_DEBUG_3, x) #define crm_action_debug_4(x) crm_do_action(LOG_DEBUG_4, x) #define crm_log_xml(level, text, xml) if(crm_log_level >= level) { \ print_xml_formatted(level, __FUNCTION__, xml, text); \ } #define crm_log_xml_crit(xml, text) crm_log_xml(LOG_CRIT, text, xml) #define crm_log_xml_err(xml, text) crm_log_xml(LOG_ERR, text, xml) #define crm_log_xml_warn(xml, text) crm_log_xml(LOG_WARNING, text, xml) #define crm_log_xml_notice(xml, text) crm_log_xml(LOG_NOTICE, text, xml) #define crm_log_xml_info(xml, text) crm_log_xml(LOG_INFO, text, xml) #define crm_log_xml_debug(xml, text) crm_log_xml(LOG_DEBUG, text, xml) #define crm_log_xml_debug_2(xml, text) crm_log_xml(LOG_DEBUG_2, text, xml) #define crm_log_xml_debug_3(xml, text) crm_log_xml(LOG_DEBUG_3, text, xml) #define crm_log_xml_debug_4(xml, text) crm_log_xml(LOG_DEBUG_4, text, xml) #define crm_log_xml_debug_5(xml, text) crm_log_xml(LOG_DEBUG_5, text, xml) #define crm_str(x) (const char*)(x?x:"") #if CRM_USE_MALLOC # define crm_malloc0(new_obj,length) \ { \ 
new_obj = malloc(length); \ if(new_obj == NULL) { \ crm_crit("Out of memory... exiting"); \ exit(1); \ } else { \ memset(new_obj, 0, length); \ } \ } # define crm_free(x) if(x) { free(x); x=NULL; } # define crm_is_allocated(obj) obj?TRUE:FALSE #else # if CRM_DEV_BUILD # define crm_malloc0(new_obj,length) \ { \ if(new_obj) { \ crm_err("Potential memory leak:" \ " %s at %s:%d not NULL before alloc.", \ #new_obj, __FILE__, __LINE__); \ abort(); \ } \ new_obj = cl_malloc(length); \ if(new_obj == NULL) { \ crm_crit("Out of memory... exiting"); \ abort(); \ } \ memset(new_obj, 0, length); \ } #else # define crm_malloc0(new_obj,length) \ { \ new_obj = cl_malloc(length); \ if(new_obj == NULL) { \ crm_crit("Out of memory... exiting"); \ abort(); \ } \ memset(new_obj, 0, length); \ } # endif # define crm_free(x) if(x) { \ CRM_ASSERT(cl_is_allocated(x) == 1); \ cl_free(x); \ x=NULL; \ } # define crm_is_allocated(obj) cl_is_allocated(obj) #endif #define crm_msg_del(msg) if(msg != NULL) { ha_msg_del(msg); msg = NULL; } #endif