diff --git a/crmd/Makefile.am b/crmd/Makefile.am index aedce94ffc..09d4571252 100644 --- a/crmd/Makefile.am +++ b/crmd/Makefile.am @@ -1,75 +1,79 @@ # # Copyright (C) 2004 Andrew Beekhof # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # MAINTAINERCLEANFILES = Makefile.in INCLUDES = -I$(top_builddir)/include -I$(top_srcdir)/include \ -I$(top_builddir)/libltdl -I$(top_srcdir)/libltdl halibdir = $(CRM_DAEMON_DIR) ## binary progs halib_PROGRAMS = crmd noinst_PROGRAMS = atest ## SOURCES noinst_HEADERS = crmd.h crmd_fsa.h crmd_messages.h fsa_defines.h \ fsa_matrix.h fsa_proto.h crmd_utils.h crmd_callbacks.h \ crmd_lrm.h te_callbacks.h tengine.h atest_SOURCES = atest.c atest_LDADD = $(top_builddir)/lib/common/libcrmcommon.la crmd_SOURCES = main.c crmd.c corosync.c \ - fsa.c control.c messages.c ccm.c callbacks.c \ + fsa.c control.c messages.c membership.c callbacks.c \ election.c join_client.c join_dc.c subsystems.c \ cib.c pengine.c tengine.c lrm.c \ utils.c misc.c te_events.c te_actions.c te_utils.c te_callbacks.c +if BUILD_HEARTBEAT_SUPPORT +crmd_SOURCES += ccm.c +endif + crmd_LDADD = $(top_builddir)/lib/fencing/libstonithd.la \ $(top_builddir)/lib/transition/libtransitioner.la \ $(top_builddir)/lib/pengine/libpe_rules.la \ $(top_builddir)/lib/cib/libcib.la \ $(top_builddir)/lib/cluster/libcrmcluster.la \ $(top_builddir)/lib/common/libcrmcommon.la \ $(CLUSTERLIBS) -llrm if BUILD_XML_HELP man7_MANS = crmd.7 %.xml: % $(top_builddir)/crmd/$< metadata | $(XSLTPROC) --nonet --novalid --stringparam man.name $< $(top_srcdir)/xml/ocf-meta2man.xsl - > $(top_builddir)/crmd/$@ %.7: %.xml $(XSLTPROC) $(MANPAGE_XSLT) $(top_builddir)/crmd/$< endif clean-generic: rm -f *.log *.debug *.xml *~ install-exec-local: uninstall-local: graphs: fsa_inputs.png fsa_inputs_by_action.png fsa_actions_by_state.png %.png: %.dot dot -Tpng $< > $@ %.dot : fsa_matrix.h make_dot.pl perl $(top_srcdir)/crmd/make_dot.pl $(top_srcdir)/crmd/fsa_matrix.h $(top_builddir)/crmd diff --git a/crmd/callbacks.c b/crmd/callbacks.c index 69fca8ddf8..d69b58b3f9 100644 --- a/crmd/callbacks.c +++ b/crmd/callbacks.c @@ -1,689 +1,573 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void crmd_ha_connection_destroy(gpointer user_data); void crmd_ha_msg_filter(xmlNode * msg); /* From join_dc... */ extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); -#define trigger_fsa(source) crm_trace("Triggering FSA: %s", __FUNCTION__); \ - mainloop_set_trigger(source); #if SUPPORT_HEARTBEAT gboolean crmd_ha_msg_dispatch(ll_cluster_t * cluster_conn, gpointer user_data) { IPC_Channel *channel = NULL; gboolean stay_connected = TRUE; crm_trace("Invoked"); if (cluster_conn != NULL) { channel = cluster_conn->llc_ops->ipcchan(cluster_conn); } CRM_CHECK(cluster_conn != NULL,;); CRM_CHECK(channel != NULL,;); if (channel != NULL && IPC_ISRCONN(channel)) { if (cluster_conn->llc_ops->msgready(cluster_conn) == 0) { crm_trace("no message ready yet"); } /* invoke the callbacks but dont block */ cluster_conn->llc_ops->rcvmsg(cluster_conn, 0); } if (channel == NULL || channel->ch_status != IPC_CONNECT) { if (is_set(fsa_input_register, R_HA_DISCONNECTED) == FALSE) { crm_crit("Lost connection to heartbeat service."); } else { crm_info("Lost connection to heartbeat service."); } trigger_fsa(fsa_source); stay_connected = FALSE; } return stay_connected; } #endif void crmd_ha_connection_destroy(gpointer user_data) { crm_trace("Invoked"); if (is_set(fsa_input_register, R_HA_DISCONNECTED)) { /* we signed out, so this is expected */ crm_info("Heartbeat disconnection complete"); return; } crm_crit("Lost connection to heartbeat service!"); register_fsa_input(C_HA_DISCONNECT, I_ERROR, NULL); trigger_fsa(fsa_source); } void crmd_ha_msg_filter(xmlNode * msg) { if (AM_I_DC) { const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); if (safe_str_eq(sys_from, CRM_SYSTEM_DC)) { const char *from = crm_element_value(msg, F_ORIG); if (safe_str_neq(from, fsa_our_uname)) { int level = LOG_INFO; const char *op = crm_element_value(msg, F_CRM_TASK); /* make sure the election happens NOW */ if (fsa_state != S_ELECTION) { ha_msg_input_t new_input; level = LOG_WARNING; new_input.msg = msg; register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL, &new_input, __FUNCTION__); } do_crm_log(level, "Another DC detected: %s (op=%s)", from, op); goto done; } } } else { const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO); if (safe_str_eq(sys_to, CRM_SYSTEM_DC)) { return; } } /* crm_log_xml_trace("HA[inbound]", msg); */ route_message(C_HA_MESSAGE, msg); done: trigger_fsa(fsa_source); } #if SUPPORT_HEARTBEAT void crmd_ha_msg_callback(HA_Message * hamsg, void *private_data) { int level = LOG_DEBUG; crm_node_t *from_node = NULL; xmlNode *msg = convert_ha_message(NULL, hamsg, __FUNCTION__); const char *from = crm_element_value(msg, F_ORIG); const char *op = crm_element_value(msg, F_CRM_TASK); const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); CRM_CHECK(from != NULL, crm_log_xml_err(msg, "anon"); goto bail); crm_trace("HA[inbound]: %s from %s", op, from); if (crm_peer_cache == NULL || crm_active_members() == 0) { crm_debug("Ignoring HA messages until we are" " connected to the CCM (%s op from %s)", op, from); crm_log_xml_trace(msg, "HA[inbound]: Ignore (No CCM)"); goto bail; } from_node = crm_get_peer(0, from); if (crm_is_member_active(from_node) == FALSE) { if (safe_str_eq(op, CRM_OP_VOTE)) { level = LOG_WARNING; } else if (AM_I_DC && safe_str_eq(op, CRM_OP_JOIN_ANNOUNCE)) { level = LOG_WARNING; } else if (safe_str_eq(sys_from, CRM_SYSTEM_DC)) { level = LOG_WARNING; } do_crm_log(level, "Ignoring HA message (op=%s) from %s: not in our" " membership list (size=%d)", op, from, crm_active_members()); crm_log_xml_trace(msg, "HA[inbound]: CCM Discard"); } else { crmd_ha_msg_filter(msg); } bail: free_xml(msg); return; } #endif /* * Apparently returning TRUE means "stay connected, keep doing stuff". * Returning FALSE means "we're all done, close the connection" */ gboolean crmd_ipc_msg_callback(IPC_Channel * client, gpointer user_data) { int lpc = 0; xmlNode *msg = NULL; crmd_client_t *curr_client = (crmd_client_t *) user_data; gboolean stay_connected = TRUE; crm_trace("Invoked: %s", curr_client->table_key); while (IPC_ISRCONN(client)) { if (client->ops->is_message_pending(client) == 0) { break; } msg = xmlfromIPC(client, MAX_IPC_DELAY); if (msg == NULL) { break; } #if ENABLE_ACL determine_request_user(&curr_client->user, client, msg, F_CRM_USER); #endif lpc++; crm_trace("Processing msg from %s", curr_client->table_key); crm_log_xml_trace(msg, "CRMd[inbound]"); if (crmd_authorize_message(msg, curr_client)) { route_message(C_IPC_MESSAGE, msg); } free_xml(msg); msg = NULL; if (client->ch_status != IPC_CONNECT) { break; } } crm_trace("Processed %d messages", lpc); if (client->ch_status != IPC_CONNECT) { stay_connected = FALSE; process_client_disconnect(curr_client); } trigger_fsa(fsa_source); return stay_connected; } extern GCHSource *lrm_source; gboolean lrm_dispatch(IPC_Channel * src_not_used, gpointer user_data) { /* ?? src == lrm_channel ?? */ ll_lrm_t *lrm = (ll_lrm_t *) user_data; IPC_Channel *lrm_channel = lrm->lrm_ops->ipcchan(lrm); lrm->lrm_ops->rcvmsg(lrm, FALSE); if (lrm_channel->ch_status != IPC_CONNECT) { lrm_connection_destroy(NULL); return FALSE; } return TRUE; } extern gboolean process_lrm_event(lrm_op_t * op); void lrm_op_callback(lrm_op_t * op) { CRM_CHECK(op != NULL, return); process_lrm_event(op); } static void crmd_peer_update(crm_node_t * member, enum crm_proc_flag client) { const char *status = NULL; CRM_CHECK(member != NULL, return); status = (member->processes & client) ? ONLINESTATUS : OFFLINESTATUS; crm_notice("Status update: Client %s/%s now has status [%s] (DC=%s)", member->uname, peer2text(client), status, AM_I_DC ? "true" : crm_str(fsa_our_dc)); if ((client & crm_proc_crmd) == 0) { return; } else if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { return; } else if (fsa_state == S_STOPPING) { return; } if (safe_str_eq(member->uname, fsa_our_dc) && crm_is_full_member(member) == FALSE) { /* Did the DC leave us? */ crm_info("Got client status callback - our DC is dead"); register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL); } else if (AM_I_DC) { xmlNode *update = NULL; update = create_node_state(member->uname, NULL, NULL, status, NULL, NULL, FALSE, __FUNCTION__); fsa_cib_anon_update(XML_CIB_TAG_STATUS, update, cib_scope_local | cib_quorum_override | cib_can_create); free_xml(update); if ((member->processes & client) == 0) { erase_node_from_join(member->uname); check_join_state(fsa_state, __FUNCTION__); fail_incompletable_actions(transition_graph, member->uuid); } else { register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } } trigger_fsa(fsa_source); } void ais_status_callback(enum crm_status_type type, crm_node_t * node, const void *data) { gboolean reset_status_entry = FALSE; uint32_t old = 0; set_bit_inplace(fsa_input_register, R_PEER_DATA); if (node->uname == NULL) { return; } switch (type) { case crm_status_uname: crm_info("status: %s is now %s", node->uname, node->state); /* reset_status_entry = TRUE; */ /* If we've never seen the node, then it also wont be in the status section */ break; case crm_status_nstate: crm_info("status: %s is now %s (was %s)", node->uname, node->state, (const char *)data); reset_status_entry = TRUE; break; case crm_status_processes: if (data) { old = *(const uint32_t *)data; } if ((node->processes ^ old) & crm_proc_crmd) { crmd_peer_update(node, crm_proc_crmd); } break; } /* Can this be removed now that do_cl_join_finalize_respond() does the same thing? */ if (AM_I_DC && reset_status_entry && safe_str_eq(CRMD_STATE_ACTIVE, node->state)) { crm_action_t *down = match_down_event(0, node->uname, NULL); erase_status_tag(node->uname, XML_CIB_TAG_LRM, cib_scope_local); erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); if (down) { const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK); if (safe_str_eq(task, CRM_OP_FENCE)) { crm_info("Node return implies stonith of %s (action %d) completed", node->uname, down->id); down->confirmed = TRUE; } } /* TODO: potentially we also want to set XML_CIB_ATTR_JOINSTATE and XML_CIB_ATTR_EXPSTATE here */ } } void crmd_ha_status_callback(const char *node, const char *status, void *private) { xmlNode *update = NULL; crm_node_t *member = NULL; crm_notice("Status update: Node %s now has status [%s]", node, status); member = crm_get_peer(0, node); if (member == NULL || crm_is_member_active(member) == FALSE) { /* Make sure it is created so crm_update_peer_proc() succeeds */ const char *uuid = get_uuid(node); member = crm_update_peer(0, 0, 0, -1, 0, uuid, node, NULL, NULL); CRM_ASSERT(member); } if (safe_str_eq(status, PINGSTATUS)) { return; } if (safe_str_eq(status, DEADSTATUS)) { /* this node is toast */ crm_update_peer_proc(node, crm_proc_ais, OFFLINESTATUS); if (AM_I_DC) { update = create_node_state(node, DEADSTATUS, XML_BOOLEAN_NO, OFFLINESTATUS, CRMD_JOINSTATE_DOWN, NULL, TRUE, __FUNCTION__); } } else { crm_update_peer_proc(node, crm_proc_ais, ONLINESTATUS); if (AM_I_DC) { update = create_node_state(node, ACTIVESTATUS, NULL, NULL, CRMD_JOINSTATE_PENDING, NULL, FALSE, __FUNCTION__); } } trigger_fsa(fsa_source); if (update != NULL) { fsa_cib_anon_update(XML_CIB_TAG_STATUS, update, cib_scope_local | cib_quorum_override | cib_can_create); free_xml(update); } } void crmd_client_status_callback(const char *node, const char *client, const char *status, void *private) { const char *join = NULL; crm_node_t *member = NULL; gboolean clear_shutdown = FALSE; crm_trace("Invoked"); if (safe_str_neq(client, CRM_SYSTEM_CRMD)) { return; } if (safe_str_eq(status, JOINSTATUS)) { clear_shutdown = TRUE; status = ONLINESTATUS; join = CRMD_JOINSTATE_PENDING; } else if (safe_str_eq(status, LEAVESTATUS)) { status = OFFLINESTATUS; join = CRMD_JOINSTATE_DOWN; /* clear_shutdown = TRUE; */ } set_bit_inplace(fsa_input_register, R_PEER_DATA); crm_notice("Status update: Client %s/%s now has status [%s] (DC=%s)", node, client, status, AM_I_DC ? "true" : "false"); if (safe_str_eq(status, ONLINESTATUS)) { /* remove the cached value in case it changed */ crm_trace("Uncaching UUID for %s", node); unget_uuid(node); } member = crm_get_peer(0, node); if (member == NULL || crm_is_member_active(member) == FALSE) { /* Make sure it is created so crm_update_peer_proc() succeeds */ const char *uuid = get_uuid(node); member = crm_update_peer(0, 0, 0, -1, 0, uuid, node, NULL, NULL); CRM_ASSERT(member); } if (AM_I_DC) { xmlNode *update = NULL; crm_trace("Got client status callback"); update = create_node_state(node, NULL, NULL, status, join, NULL, clear_shutdown, __FUNCTION__); fsa_cib_anon_update(XML_CIB_TAG_STATUS, update, cib_scope_local | cib_quorum_override | cib_can_create); free_xml(update); } crm_update_peer_proc(node, crm_proc_crmd, status); } void crmd_ipc_connection_destroy(gpointer user_data) { GCHSource *source = NULL; crmd_client_t *client = user_data; /* Calling this function on an _active_ connection results in: * crmd_ipc_connection_destroy (callbacks.c:431) * -> G_main_del_IPC_Channel (GSource.c:478) * -> g_source_unref * -> G_CH_destroy_int (GSource.c:647) * -> crmd_ipc_connection_destroy (callbacks.c:437)\ * * A better alternative is to call G_main_del_IPC_Channel() directly */ if (client == NULL) { crm_trace("No client to delete"); return; } crm_trace("Disconnecting client %s (%p)", client->table_key, client); source = client->client_source; client->client_source = NULL; if (source != NULL) { crm_trace("Deleting %s (%p) from mainloop", client->table_key, source); G_main_del_IPC_Channel(source); } crm_free(client->table_key); crm_free(client->sub_sys); crm_free(client->uuid); crm_free(client->user); crm_free(client); return; } gboolean crmd_client_connect(IPC_Channel * client_channel, gpointer user_data) { crm_trace("Invoked"); if (client_channel == NULL) { crm_err("Channel was NULL"); } else if (client_channel->ch_status == IPC_DISCONNECT) { crm_err("Channel was disconnected"); } else { crmd_client_t *blank_client = NULL; crm_trace("Channel connected"); crm_malloc0(blank_client, sizeof(crmd_client_t)); CRM_ASSERT(blank_client != NULL); crm_trace("Created client: %p", blank_client); client_channel->ops->set_recv_qlen(client_channel, 1024); client_channel->ops->set_send_qlen(client_channel, 1024); blank_client->client_channel = client_channel; blank_client->sub_sys = NULL; blank_client->uuid = NULL; blank_client->table_key = NULL; blank_client->client_source = G_main_add_IPC_Channel(G_PRIORITY_LOW, client_channel, FALSE, crmd_ipc_msg_callback, blank_client, crmd_ipc_connection_destroy); } return TRUE; } -#if SUPPORT_HEARTBEAT -static void *ccm_library = NULL; -int (*ccm_api_callback_done) (void *cookie) = NULL; -int (*ccm_api_handle_event) (const oc_ev_t * token) = NULL; -static gboolean fsa_have_quorum = FALSE; - -gboolean -ccm_dispatch(int fd, gpointer user_data) -{ - int rc = 0; - oc_ev_t *ccm_token = (oc_ev_t *) user_data; - gboolean was_error = FALSE; - - crm_trace("Invoked"); - if (ccm_api_handle_event == NULL) { - ccm_api_handle_event = - find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_handle_event"); - } - rc = (*ccm_api_handle_event) (ccm_token); - - if (rc != 0) { - if (is_set(fsa_input_register, R_CCM_DISCONNECTED) == FALSE) { - /* we signed out, so this is expected */ - register_fsa_input(C_CCM_CALLBACK, I_ERROR, NULL); - crm_err("CCM connection appears to have failed: rc=%d.", rc); - } - was_error = TRUE; - } - - trigger_fsa(fsa_source); - return !was_error; -} - -void -crmd_ccm_msg_callback(oc_ed_t event, void *cookie, size_t size, const void *data) -{ - gboolean update_cache = FALSE; - const oc_ev_membership_t *membership = data; - - gboolean update_quorum = FALSE; - - crm_trace("Invoked"); - CRM_ASSERT(data != NULL); - - crm_info("Quorum %s after event=%s (id=%d)", - ccm_have_quorum(event) ? "(re)attained" : "lost", - ccm_event_name(event), membership->m_instance); - - if (crm_peer_seq > membership->m_instance) { - crm_err("Membership instance ID went backwards! %llu->%d", - crm_peer_seq, membership->m_instance); - CRM_ASSERT(crm_peer_seq <= membership->m_instance); - return; - } - - /* - * OC_EV_MS_NEW_MEMBERSHIP: membership with quorum - * OC_EV_MS_MS_INVALID: membership without quorum - * OC_EV_MS_NOT_PRIMARY: previous membership no longer valid - * OC_EV_MS_PRIMARY_RESTORED: previous membership restored - * OC_EV_MS_EVICTED: the client is evicted from ccm. - */ - - switch (event) { - case OC_EV_MS_NEW_MEMBERSHIP: - case OC_EV_MS_INVALID: - update_cache = TRUE; - update_quorum = TRUE; - break; - case OC_EV_MS_NOT_PRIMARY: - break; - case OC_EV_MS_PRIMARY_RESTORED: - update_cache = TRUE; - crm_peer_seq = membership->m_instance; - break; - case OC_EV_MS_EVICTED: - update_quorum = TRUE; - register_fsa_input(C_FSA_INTERNAL, I_STOP, NULL); - crm_err("Shutting down after CCM event: %s", ccm_event_name(event)); - break; - default: - crm_err("Unknown CCM event: %d", event); - } - - if (update_quorum) { - crm_have_quorum = ccm_have_quorum(event); - crm_update_quorum(crm_have_quorum, FALSE); - - if (crm_have_quorum == FALSE) { - /* did we just loose quorum? */ - if (fsa_have_quorum) { - crm_info("Quorum lost: %s", ccm_event_name(event)); - } - } - } - - if (update_cache) { - crm_trace("Updating cache after event %s", ccm_event_name(event)); - do_ccm_update_cache(C_CCM_CALLBACK, fsa_state, event, data, NULL); - - } else if (event != OC_EV_MS_NOT_PRIMARY) { - crm_peer_seq = membership->m_instance; - register_fsa_action(A_TE_CANCEL); - } - - if (ccm_api_callback_done == NULL) { - ccm_api_callback_done = - find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_callback_done"); - } - (*ccm_api_callback_done) (cookie); - return; -} -#endif - void crmd_cib_connection_destroy(gpointer user_data) { CRM_CHECK(user_data == fsa_cib_conn,;); crm_trace("Invoked"); trigger_fsa(fsa_source); fsa_cib_conn->state = cib_disconnected; if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { crm_info("Connection to the CIB terminated..."); return; } /* eventually this will trigger a reconnect, not a shutdown */ crm_err("Connection to the CIB terminated..."); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); clear_bit_inplace(fsa_input_register, R_CIB_CONNECTED); return; } gboolean crm_fsa_trigger(gpointer user_data) { crm_trace("Invoked (queue len: %d)", g_list_length(fsa_message_queue)); s_crmd_fsa(C_FSA_INTERNAL); crm_trace("Exited (queue len: %d)", g_list_length(fsa_message_queue)); return TRUE; } diff --git a/crmd/ccm.c b/crmd/ccm.c index 00406db5cb..3de1768e12 100644 --- a/crmd/ccm.c +++ b/crmd/ccm.c @@ -1,482 +1,365 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* put these first so that uuid_t is defined without conflicts */ #include -#if SUPPORT_HEARTBEAT -# include -# include -static void *ccm_library = NULL; -#endif - #include #include #include #include #include #include #include #include #include #include #include +#include -gboolean membership_flux_hack = FALSE; -void post_cache_update(int instance); +#include +#include -#if SUPPORT_HEARTBEAT -oc_ev_t *fsa_ev_token; void oc_ev_special(const oc_ev_t *, oc_ev_class_t, int); - void crmd_ccm_msg_callback(oc_ed_t event, void *cookie, size_t size, const void *data); -#endif - -void ghash_update_cib_node(gpointer key, gpointer value, gpointer user_data); -void check_dead_member(const char *uname, GHashTable * members); -void reap_dead_ccm_nodes(gpointer key, gpointer value, gpointer user_data); #define CCM_EVENT_DETAIL 0 #define CCM_EVENT_DETAIL_PARTIAL 0 -int num_ccm_register_fails = 0; -int max_ccm_register_fails = 30; -int last_peer_update = 0; - -extern GHashTable *voted; - -struct update_data_s { - const char *state; - const char *caller; - xmlNode *updates; - gboolean overwrite_join; -}; - -void -reap_dead_ccm_nodes(gpointer key, gpointer value, gpointer user_data) -{ - crm_node_t *node = value; - - if (crm_is_member_active(node) == FALSE) { - check_dead_member(node->uname, NULL); - fail_incompletable_actions(transition_graph, node->uuid); - } -} - -extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); - -void -check_dead_member(const char *uname, GHashTable * members) -{ - CRM_CHECK(uname != NULL, return); - if (members != NULL && g_hash_table_lookup(members, uname) != NULL) { - crm_err("%s didnt really leave the membership!", uname); - return; - } - erase_node_from_join(uname); - if (voted != NULL) { - g_hash_table_remove(voted, uname); - } - - if (safe_str_eq(fsa_our_uname, uname)) { - crm_err("We're not part of the cluster anymore"); - register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); - - } else if (AM_I_DC == FALSE && safe_str_eq(uname, fsa_our_dc)) { - crm_warn("Our DC node (%s) left the cluster", uname); - register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); +int (*ccm_api_callback_done) (void *cookie) = NULL; +int (*ccm_api_handle_event) (const oc_ev_t * token) = NULL; +static gboolean fsa_have_quorum = FALSE; - } else if (fsa_state == S_INTEGRATION || fsa_state == S_FINALIZE_JOIN) { - check_join_state(fsa_state, __FUNCTION__); - } -} +static oc_ev_t *fsa_ev_token; +static void *ccm_library = NULL; +static int num_ccm_register_fails = 0; +static int max_ccm_register_fails = 30; /* A_CCM_CONNECT */ void do_ccm_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { -#if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { int (*ccm_api_register) (oc_ev_t ** token) = find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_register"); int (*ccm_api_set_callback) (const oc_ev_t * token, oc_ev_class_t class, oc_ev_callback_t * fn, oc_ev_callback_t ** prev_fn) = find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_set_callback"); void (*ccm_api_special) (const oc_ev_t *, oc_ev_class_t, int) = find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_special"); int (*ccm_api_activate) (const oc_ev_t * token, int *fd) = find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_activate"); int (*ccm_api_unregister) (oc_ev_t * token) = find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_unregister"); if (action & A_CCM_DISCONNECT) { set_bit_inplace(fsa_input_register, R_CCM_DISCONNECTED); (*ccm_api_unregister) (fsa_ev_token); } if (action & A_CCM_CONNECT) { int ret; int fsa_ev_fd; gboolean did_fail = FALSE; crm_trace("Registering with CCM"); clear_bit_inplace(fsa_input_register, R_CCM_DISCONNECTED); ret = (*ccm_api_register) (&fsa_ev_token); if (ret != 0) { crm_warn("CCM registration failed"); did_fail = TRUE; } if (did_fail == FALSE) { crm_trace("Setting up CCM callbacks"); ret = (*ccm_api_set_callback) (fsa_ev_token, OC_EV_MEMB_CLASS, crmd_ccm_msg_callback, NULL); if (ret != 0) { crm_warn("CCM callback not set"); did_fail = TRUE; } } if (did_fail == FALSE) { (*ccm_api_special) (fsa_ev_token, OC_EV_MEMB_CLASS, 0 /*don't care */ ); crm_trace("Activating CCM token"); ret = (*ccm_api_activate) (fsa_ev_token, &fsa_ev_fd); if (ret != 0) { crm_warn("CCM Activation failed"); did_fail = TRUE; } } if (did_fail) { num_ccm_register_fails++; (*ccm_api_unregister) (fsa_ev_token); if (num_ccm_register_fails < max_ccm_register_fails) { crm_warn("CCM Connection failed" " %d times (%d max)", num_ccm_register_fails, max_ccm_register_fails); crm_timer_start(wait_timer); crmd_fsa_stall(NULL); return; } else { crm_err("CCM Activation failed %d (max) times", num_ccm_register_fails); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); return; } } crm_info("CCM connection established..." " waiting for first callback"); G_main_add_fd(G_PRIORITY_HIGH, fsa_ev_fd, FALSE, ccm_dispatch, fsa_ev_token, default_ipc_connection_destroy); } } -#endif if (action & ~(A_CCM_CONNECT | A_CCM_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__); } } -#if SUPPORT_HEARTBEAT void ccm_event_detail(const oc_ev_membership_t * oc, oc_ed_t event) { int lpc; gboolean member = FALSE; member = FALSE; crm_trace("-----------------------"); crm_info("%s: trans=%d, nodes=%d, new=%d, lost=%d n_idx=%d, " "new_idx=%d, old_idx=%d", ccm_event_name(event), oc->m_instance, oc->m_n_member, oc->m_n_in, oc->m_n_out, oc->m_memb_idx, oc->m_in_idx, oc->m_out_idx); # if !CCM_EVENT_DETAIL_PARTIAL for (lpc = 0; lpc < oc->m_n_member; lpc++) { crm_info("\tCURRENT: %s [nodeid=%d, born=%d]", oc->m_array[oc->m_memb_idx + lpc].node_uname, oc->m_array[oc->m_memb_idx + lpc].node_id, oc->m_array[oc->m_memb_idx + lpc].node_born_on); if (safe_str_eq(fsa_our_uname, oc->m_array[oc->m_memb_idx + lpc].node_uname)) { member = TRUE; } } if (member == FALSE) { crm_warn("MY NODE IS NOT IN CCM THE MEMBERSHIP LIST"); } # endif for (lpc = 0; lpc < (int)oc->m_n_in; lpc++) { crm_info("\tNEW: %s [nodeid=%d, born=%d]", oc->m_array[oc->m_in_idx + lpc].node_uname, oc->m_array[oc->m_in_idx + lpc].node_id, oc->m_array[oc->m_in_idx + lpc].node_born_on); } for (lpc = 0; lpc < (int)oc->m_n_out; lpc++) { crm_info("\tLOST: %s [nodeid=%d, born=%d]", oc->m_array[oc->m_out_idx + lpc].node_uname, oc->m_array[oc->m_out_idx + lpc].node_id, oc->m_array[oc->m_out_idx + lpc].node_born_on); } crm_trace("-----------------------"); } -#endif - -gboolean ever_had_quorum = FALSE; - -void -post_cache_update(int instance) -{ - xmlNode *no_op = NULL; - - crm_peer_seq = instance; - crm_debug("Updated cache after membership event %d.", instance); - - g_hash_table_foreach(crm_peer_cache, reap_dead_ccm_nodes, NULL); - set_bit_inplace(fsa_input_register, R_CCM_DATA); - - if (AM_I_DC) { - populate_cib_nodes(FALSE); - do_update_cib_nodes(FALSE, __FUNCTION__); - } - - /* - * If we lost nodes, we should re-check the election status - * Safe to call outside of an election - */ - register_fsa_action(A_ELECTION_CHECK); - - /* Membership changed, remind everyone we're here. - * This will aid detection of duplicate DCs - */ - no_op = create_request(CRM_OP_NOOP, NULL, NULL, CRM_SYSTEM_CRMD, - AM_I_DC ? CRM_SYSTEM_DC : CRM_SYSTEM_CRMD, NULL); - send_cluster_message(NULL, crm_msg_crmd, no_op, FALSE); - free_xml(no_op); -} /* A_CCM_UPDATE_CACHE */ /* * Take the opportunity to update the node status in the CIB as well */ -#if SUPPORT_HEARTBEAT void do_ccm_update_cache(enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, oc_ed_t event, const oc_ev_membership_t * oc, xmlNode * xml) { unsigned long long instance = 0; unsigned int lpc = 0; if (is_heartbeat_cluster()) { CRM_ASSERT(oc != NULL); instance = oc->m_instance; } CRM_ASSERT(crm_peer_seq <= instance); switch (cur_state) { case S_STOPPING: case S_TERMINATE: case S_HALT: crm_debug("Ignoring %s CCM event %llu, we're in state %s", ccm_event_name(event), instance, fsa_state2string(cur_state)); return; case S_ELECTION: register_fsa_action(A_ELECTION_CHECK); break; default: break; } if (is_heartbeat_cluster()) { ccm_event_detail(oc, event); /*--*-- Recently Dead Member Nodes --*--*/ for (lpc = 0; lpc < oc->m_n_out; lpc++) { crm_update_ccm_node(oc, lpc + oc->m_out_idx, CRM_NODE_LOST, instance); } /*--*-- All Member Nodes --*--*/ for (lpc = 0; lpc < oc->m_n_member; lpc++) { crm_update_ccm_node(oc, lpc + oc->m_memb_idx, CRM_NODE_ACTIVE, instance); } } if (event == OC_EV_MS_EVICTED) { crm_update_peer(0, 0, 0, -1, 0, fsa_our_uuid, fsa_our_uname, NULL, CRM_NODE_EVICTED); /* todo: drop back to S_PENDING instead */ /* get out... NOW! * * go via the error recovery process so that HA will * restart us if required */ register_fsa_error_adv(cause, I_ERROR, NULL, NULL, __FUNCTION__); } post_cache_update(instance); return; } -#endif -static void -ccm_node_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) +gboolean +ccm_dispatch(int fd, gpointer user_data) { - fsa_data_t *msg_data = NULL; - - last_peer_update = 0; - - if (rc == cib_ok) { - crm_trace("Node update %d complete", call_id); - - } else { - crm_err("Node update %d failed", call_id); - crm_log_xml_debug(msg, "failed"); - register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); - } -} - -void -ghash_update_cib_node(gpointer key, gpointer value, gpointer user_data) -{ - xmlNode *tmp1 = NULL; - const char *join = NULL; - crm_node_t *node = value; - struct update_data_s *data = (struct update_data_s *)user_data; - - data->state = XML_BOOLEAN_NO; - if (safe_str_eq(node->state, CRM_NODE_ACTIVE)) { - data->state = XML_BOOLEAN_YES; + int rc = 0; + oc_ev_t *ccm_token = (oc_ev_t *) user_data; + gboolean was_error = FALSE; + + crm_trace("Invoked"); + if (ccm_api_handle_event == NULL) { + ccm_api_handle_event = + find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_handle_event"); } + rc = (*ccm_api_handle_event) (ccm_token); - crm_debug("Updating %s: %s (overwrite=%s) hash_size=%d", - node->uname, data->state, data->overwrite_join ? "true" : "false", - g_hash_table_size(confirmed_nodes)); - - if (data->overwrite_join) { - if ((node->processes & crm_proc_crmd) == FALSE) { - join = CRMD_JOINSTATE_DOWN; - - } else { - const char *peer_member = g_hash_table_lookup(confirmed_nodes, node->uname); - - if (peer_member != NULL) { - join = CRMD_JOINSTATE_MEMBER; - } else { - join = CRMD_JOINSTATE_PENDING; - } + if (rc != 0) { + if (is_set(fsa_input_register, R_CCM_DISCONNECTED) == FALSE) { + /* we signed out, so this is expected */ + register_fsa_input(C_CCM_CALLBACK, I_ERROR, NULL); + crm_err("CCM connection appears to have failed: rc=%d.", rc); } + was_error = TRUE; } - tmp1 = - create_node_state(node->uname, (node->processes & crm_proc_ais) ? ACTIVESTATUS : DEADSTATUS, - data->state, - (node->processes & crm_proc_crmd) ? ONLINESTATUS : OFFLINESTATUS, join, - NULL, FALSE, data->caller); - - add_node_copy(data->updates, tmp1); - free_xml(tmp1); + trigger_fsa(fsa_source); + return !was_error; } void -do_update_cib_nodes(gboolean overwrite, const char *caller) +crmd_ccm_msg_callback(oc_ed_t event, void *cookie, size_t size, const void *data) { - int call_id = 0; - int call_options = cib_scope_local | cib_quorum_override; - struct update_data_s update_data; - xmlNode *fragment = NULL; - - if (crm_peer_cache == NULL) { - /* We got a replace notification before being connected to - * the CCM. - * So there is no need to update the local CIB with our values - * - since we have none. - */ - return; + gboolean update_cache = FALSE; + const oc_ev_membership_t *membership = data; - } else if (AM_I_DC == FALSE) { - return; - } + gboolean update_quorum = FALSE; - fragment = create_xml_node(NULL, XML_CIB_TAG_STATUS); + crm_trace("Invoked"); + CRM_ASSERT(data != NULL); - update_data.caller = caller; - update_data.updates = fragment; - update_data.overwrite_join = overwrite; + crm_info("Quorum %s after event=%s (id=%d)", + ccm_have_quorum(event) ? "(re)attained" : "lost", + ccm_event_name(event), membership->m_instance); + + if (crm_peer_seq > membership->m_instance) { + crm_err("Membership instance ID went backwards! %llu->%d", + crm_peer_seq, membership->m_instance); + CRM_ASSERT(crm_peer_seq <= membership->m_instance); + return; + } - g_hash_table_foreach(crm_peer_cache, ghash_update_cib_node, &update_data); + /* + * OC_EV_MS_NEW_MEMBERSHIP: membership with quorum + * OC_EV_MS_MS_INVALID: membership without quorum + * OC_EV_MS_NOT_PRIMARY: previous membership no longer valid + * OC_EV_MS_PRIMARY_RESTORED: previous membership restored + * OC_EV_MS_EVICTED: the client is evicted from ccm. + */ - fsa_cib_update(XML_CIB_TAG_STATUS, fragment, call_options, call_id, NULL); - add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, ccm_node_update_complete); - last_peer_update = call_id; + switch (event) { + case OC_EV_MS_NEW_MEMBERSHIP: + case OC_EV_MS_INVALID: + update_cache = TRUE; + update_quorum = TRUE; + break; + case OC_EV_MS_NOT_PRIMARY: + break; + case OC_EV_MS_PRIMARY_RESTORED: + update_cache = TRUE; + crm_peer_seq = membership->m_instance; + break; + case OC_EV_MS_EVICTED: + update_quorum = TRUE; + register_fsa_input(C_FSA_INTERNAL, I_STOP, NULL); + crm_err("Shutting down after CCM event: %s", ccm_event_name(event)); + break; + default: + crm_err("Unknown CCM event: %d", event); + } - free_xml(fragment); -} + if (update_quorum) { + crm_have_quorum = ccm_have_quorum(event); + crm_update_quorum(crm_have_quorum, FALSE); -static void -cib_quorum_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) -{ - fsa_data_t *msg_data = NULL; + if (crm_have_quorum == FALSE) { + /* did we just loose quorum? */ + if (fsa_have_quorum) { + crm_info("Quorum lost: %s", ccm_event_name(event)); + } + } + } - if (rc == cib_ok) { - crm_trace("Quorum update %d complete", call_id); + if (update_cache) { + crm_trace("Updating cache after event %s", ccm_event_name(event)); + do_ccm_update_cache(C_CCM_CALLBACK, fsa_state, event, data, NULL); - } else { - crm_err("Quorum update %d failed", call_id); - crm_log_xml_debug(msg, "failed"); - register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); + } else if (event != OC_EV_MS_NOT_PRIMARY) { + crm_peer_seq = membership->m_instance; + register_fsa_action(A_TE_CANCEL); } -} -void -crm_update_quorum(gboolean quorum, gboolean force_update) -{ - ever_had_quorum |= quorum; - if (AM_I_DC && (force_update || fsa_has_quorum != quorum)) { - int call_id = 0; - xmlNode *update = NULL; - int call_options = cib_scope_local | cib_quorum_override; - - update = create_xml_node(NULL, XML_TAG_CIB); - crm_xml_add_int(update, XML_ATTR_HAVE_QUORUM, quorum); - set_uuid(update, XML_ATTR_DC_UUID, fsa_our_uname); - - fsa_cib_update(XML_TAG_CIB, update, call_options, call_id, NULL); - crm_debug("Updating quorum status to %s (call=%d)", quorum ? "true" : "false", call_id); - add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, cib_quorum_update_complete); - free_xml(update); + if (ccm_api_callback_done == NULL) { + ccm_api_callback_done = + find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_callback_done"); } - fsa_has_quorum = quorum; + (*ccm_api_callback_done) (cookie); + return; } + diff --git a/crmd/control.c b/crmd/control.c index ecdbcf9528..993e0376db 100644 --- a/crmd/control.c +++ b/crmd/control.c @@ -1,903 +1,903 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include "../lib/cluster/stack.h" #include #include #include #include #include #include #include #include #include char *ipc_server = NULL; extern gboolean crm_connect_corosync(void); extern void crmd_ha_connection_destroy(gpointer user_data); void crm_shutdown(int nsig); gboolean crm_read_options(gpointer user_data); gboolean fsa_has_quorum = FALSE; GHashTable *ipc_clients = NULL; crm_trigger_t *fsa_source = NULL; crm_trigger_t *config_read = NULL; /* A_HA_CONNECT */ void do_ha_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean registered = FALSE; if (action & A_HA_DISCONNECT) { if (is_openais_cluster()) { crm_peer_destroy(); #if SUPPORT_COROSYNC terminate_ais_connection(); #endif crm_info("Disconnected from OpenAIS"); #if SUPPORT_HEARTBEAT } else if (fsa_cluster_conn != NULL) { set_bit_inplace(fsa_input_register, R_HA_DISCONNECTED); fsa_cluster_conn->llc_ops->signoff(fsa_cluster_conn, FALSE); crm_info("Disconnected from Heartbeat"); #endif } } if (action & A_HA_CONNECT) { crm_set_status_callback(&ais_status_callback); if (is_openais_cluster()) { #if SUPPORT_COROSYNC registered = crm_connect_corosync(); #endif } else if (is_heartbeat_cluster()) { #if SUPPORT_HEARTBEAT registered = crm_cluster_connect(&fsa_our_uname, &fsa_our_uuid, crmd_ha_msg_callback, crmd_ha_connection_destroy, &fsa_cluster_conn); #endif } #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { crm_trace("Be informed of Node Status changes"); if (registered && fsa_cluster_conn->llc_ops->set_nstatus_callback(fsa_cluster_conn, crmd_ha_status_callback, fsa_cluster_conn) != HA_OK) { crm_err("Cannot set nstatus callback: %s", fsa_cluster_conn->llc_ops->errmsg(fsa_cluster_conn)); registered = FALSE; } crm_trace("Be informed of CRM Client Status changes"); if (registered && fsa_cluster_conn->llc_ops->set_cstatus_callback(fsa_cluster_conn, crmd_client_status_callback, fsa_cluster_conn) != HA_OK) { crm_err("Cannot set cstatus callback: %s", fsa_cluster_conn->llc_ops->errmsg(fsa_cluster_conn)); registered = FALSE; } if (registered) { crm_trace("Requesting an initial dump of CRMD client_status"); fsa_cluster_conn->llc_ops->client_status(fsa_cluster_conn, NULL, CRM_SYSTEM_CRMD, -1); } } #endif if (registered == FALSE) { set_bit_inplace(fsa_input_register, R_HA_DISCONNECTED); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } clear_bit_inplace(fsa_input_register, R_HA_DISCONNECTED); crm_info("Connected to the cluster"); } if (action & ~(A_HA_CONNECT | A_HA_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__); } } /* A_SHUTDOWN */ void do_shutdown(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { /* just in case */ set_bit_inplace(fsa_input_register, R_SHUTDOWN); if (is_heartbeat_cluster()) { if (is_set(fsa_input_register, pe_subsystem->flag_connected)) { crm_info("Terminating the %s", pe_subsystem->name); if (stop_subsystem(pe_subsystem, TRUE) == FALSE) { /* its gone... */ crm_err("Faking %s exit", pe_subsystem->name); clear_bit_inplace(fsa_input_register, pe_subsystem->flag_connected); } else { crm_info("Waiting for subsystems to exit"); crmd_fsa_stall(NULL); } } crm_info("All subsystems stopped, continuing"); } if (stonith_api) { /* Prevent it from comming up again */ clear_bit_inplace(fsa_input_register, R_ST_REQUIRED); crm_info("Disconnecting STONITH..."); stonith_api->cmds->disconnect(stonith_api); } } /* A_SHUTDOWN_REQ */ void do_shutdown_req(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { xmlNode *msg = NULL; crm_info("Sending shutdown request to %s", crm_str(fsa_our_dc)); msg = create_request(CRM_OP_SHUTDOWN_REQ, NULL, NULL, CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL); /* set_bit_inplace(fsa_input_register, R_STAYDOWN); */ if (send_cluster_message(NULL, crm_msg_crmd, msg, TRUE) == FALSE) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } free_xml(msg); } extern char *max_generation_from; extern xmlNode *max_generation_xml; extern GHashTable *resource_history; extern GHashTable *voted; extern GHashTable *reload_hash; void log_connected_client(gpointer key, gpointer value, gpointer user_data); void log_connected_client(gpointer key, gpointer value, gpointer user_data) { crmd_client_t *client = value; crm_err("%s is still connected at exit", client->table_key); } static void free_mem(fsa_data_t * msg_data) { g_main_loop_quit(crmd_mainloop); g_main_loop_unref(crmd_mainloop); #if SUPPORT_HEARTBEAT if (fsa_cluster_conn) { fsa_cluster_conn->llc_ops->delete(fsa_cluster_conn); fsa_cluster_conn = NULL; } #endif slist_destroy(fsa_data_t, fsa_data, fsa_message_queue, crm_info("Dropping %s: [ state=%s cause=%s origin=%s ]", fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); delete_fsa_input(fsa_data); ); delete_fsa_input(msg_data); if (ipc_clients) { crm_debug("Number of connected clients: %d", g_hash_table_size(ipc_clients)); /* g_hash_table_foreach(ipc_clients, log_connected_client, NULL); */ g_hash_table_destroy(ipc_clients); } empty_uuid_cache(); crm_peer_destroy(); - clear_bit_inplace(fsa_input_register, R_CCM_DATA); + clear_bit_inplace(fsa_input_register, R_MEMBERSHIP); if (te_subsystem->client && te_subsystem->client->client_source) { crm_debug("Full destroy: TE"); G_main_del_IPC_Channel(te_subsystem->client->client_source); } else { crm_debug("Partial destroy: TE"); crmd_ipc_connection_destroy(te_subsystem->client); } crm_free(te_subsystem); if (pe_subsystem->client && pe_subsystem->client->client_source) { crm_debug("Full destroy: PE"); G_main_del_IPC_Channel(pe_subsystem->client->client_source); } else { crm_debug("Partial destroy: PE"); crmd_ipc_connection_destroy(pe_subsystem->client); } crm_free(pe_subsystem); crm_free(cib_subsystem); if (integrated_nodes) { g_hash_table_destroy(integrated_nodes); } if (finalized_nodes) { g_hash_table_destroy(finalized_nodes); } if (confirmed_nodes) { g_hash_table_destroy(confirmed_nodes); } if (reload_hash) { g_hash_table_destroy(reload_hash); } if (resource_history) { g_hash_table_destroy(resource_history); } if (voted) { g_hash_table_destroy(voted); } cib_delete(fsa_cib_conn); fsa_cib_conn = NULL; if (fsa_lrm_conn) { fsa_lrm_conn->lrm_ops->delete(fsa_lrm_conn); } crm_free(transition_timer); crm_free(integration_timer); crm_free(finalization_timer); crm_free(election_trigger); crm_free(election_timeout); crm_free(shutdown_escalation_timer); crm_free(wait_timer); crm_free(recheck_timer); crm_free(fsa_our_dc_version); crm_free(fsa_our_uname); crm_free(fsa_our_uuid); crm_free(fsa_our_dc); crm_free(ipc_server); crm_free(max_generation_from); free_xml(max_generation_xml); crm_xml_cleanup(); } /* A_EXIT_0, A_EXIT_1 */ void do_exit(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int exit_code = 0; int log_level = LOG_INFO; const char *exit_type = "gracefully"; if (action & A_EXIT_1) { exit_code = 1; log_level = LOG_ERR; exit_type = "forcefully"; } verify_stopped(cur_state, LOG_ERR); do_crm_log(log_level, "Performing %s - %s exiting the CRMd", fsa_action2string(action), exit_type); if (is_set(fsa_input_register, R_IN_RECOVERY)) { crm_err("Could not recover from internal error"); exit_code = 2; } if (is_set(fsa_input_register, R_STAYDOWN)) { crm_warn("Inhibiting respawn by Heartbeat"); exit_code = 100; } free_mem(msg_data); crm_info("[%s] stopped (%d)", crm_system_name, exit_code); cl_flush_logs(); exit(exit_code); } /* A_STARTUP */ void do_startup(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int was_error = 0; int interval = 1; /* seconds between DC heartbeats */ crm_debug("Registering Signal Handlers"); mainloop_add_signal(SIGTERM, crm_shutdown); fsa_source = mainloop_add_trigger(G_PRIORITY_HIGH, crm_fsa_trigger, NULL); config_read = mainloop_add_trigger(G_PRIORITY_HIGH, crm_read_options, NULL); ipc_clients = g_hash_table_new(crm_str_hash, g_str_equal); crm_debug("Creating CIB and LRM objects"); fsa_cib_conn = cib_new(); fsa_lrm_conn = ll_lrm_new(XML_CIB_TAG_LRM); /* set up the timers */ crm_malloc0(transition_timer, sizeof(fsa_timer_t)); crm_malloc0(integration_timer, sizeof(fsa_timer_t)); crm_malloc0(finalization_timer, sizeof(fsa_timer_t)); crm_malloc0(election_trigger, sizeof(fsa_timer_t)); crm_malloc0(election_timeout, sizeof(fsa_timer_t)); crm_malloc0(shutdown_escalation_timer, sizeof(fsa_timer_t)); crm_malloc0(wait_timer, sizeof(fsa_timer_t)); crm_malloc0(recheck_timer, sizeof(fsa_timer_t)); interval = interval * 1000; if (election_trigger != NULL) { election_trigger->source_id = 0; election_trigger->period_ms = -1; election_trigger->fsa_input = I_DC_TIMEOUT; election_trigger->callback = crm_timer_popped; election_trigger->repeat = FALSE; } else { was_error = TRUE; } if (election_timeout != NULL) { election_timeout->source_id = 0; election_timeout->period_ms = -1; election_timeout->fsa_input = I_ELECTION_DC; election_timeout->callback = crm_timer_popped; election_timeout->repeat = FALSE; } else { was_error = TRUE; } if (transition_timer != NULL) { transition_timer->source_id = 0; transition_timer->period_ms = -1; transition_timer->fsa_input = I_PE_CALC; transition_timer->callback = crm_timer_popped; transition_timer->repeat = FALSE; } else { was_error = TRUE; } if (integration_timer != NULL) { integration_timer->source_id = 0; integration_timer->period_ms = -1; integration_timer->fsa_input = I_INTEGRATED; integration_timer->callback = crm_timer_popped; integration_timer->repeat = FALSE; } else { was_error = TRUE; } if (finalization_timer != NULL) { finalization_timer->source_id = 0; finalization_timer->period_ms = -1; finalization_timer->fsa_input = I_FINALIZED; finalization_timer->callback = crm_timer_popped; finalization_timer->repeat = FALSE; /* for possible enabling... a bug in the join protocol left * a slave in S_PENDING while we think its in S_NOT_DC * * raising I_FINALIZED put us into a transition loop which is * never resolved. * in this loop we continually send probes which the node * NACK's because its in S_PENDING * * if we have nodes where heartbeat is active but the * CRM is not... then this will be handled in the * integration phase */ finalization_timer->fsa_input = I_ELECTION; } else { was_error = TRUE; } if (shutdown_escalation_timer != NULL) { shutdown_escalation_timer->source_id = 0; shutdown_escalation_timer->period_ms = -1; shutdown_escalation_timer->fsa_input = I_STOP; shutdown_escalation_timer->callback = crm_timer_popped; shutdown_escalation_timer->repeat = FALSE; } else { was_error = TRUE; } if (wait_timer != NULL) { wait_timer->source_id = 0; wait_timer->period_ms = 2000; wait_timer->fsa_input = I_NULL; wait_timer->callback = crm_timer_popped; wait_timer->repeat = FALSE; } else { was_error = TRUE; } if (recheck_timer != NULL) { recheck_timer->source_id = 0; recheck_timer->period_ms = -1; recheck_timer->fsa_input = I_PE_CALC; recheck_timer->callback = crm_timer_popped; recheck_timer->repeat = FALSE; } else { was_error = TRUE; } /* set up the sub systems */ crm_malloc0(cib_subsystem, sizeof(struct crm_subsystem_s)); crm_malloc0(te_subsystem, sizeof(struct crm_subsystem_s)); crm_malloc0(pe_subsystem, sizeof(struct crm_subsystem_s)); if (cib_subsystem != NULL) { cib_subsystem->pid = -1; cib_subsystem->path = CRM_DAEMON_DIR; cib_subsystem->name = CRM_SYSTEM_CIB; cib_subsystem->command = CRM_DAEMON_DIR "/" CRM_SYSTEM_CIB; cib_subsystem->args = "-VVc"; cib_subsystem->flag_connected = R_CIB_CONNECTED; cib_subsystem->flag_required = R_CIB_REQUIRED; } else { was_error = TRUE; } if (te_subsystem != NULL) { te_subsystem->pid = -1; te_subsystem->path = CRM_DAEMON_DIR; te_subsystem->name = CRM_SYSTEM_TENGINE; te_subsystem->command = CRM_DAEMON_DIR "/" CRM_SYSTEM_TENGINE; te_subsystem->args = NULL; te_subsystem->flag_connected = R_TE_CONNECTED; te_subsystem->flag_required = R_TE_REQUIRED; } else { was_error = TRUE; } if (pe_subsystem != NULL) { pe_subsystem->pid = -1; pe_subsystem->path = CRM_DAEMON_DIR; pe_subsystem->name = CRM_SYSTEM_PENGINE; pe_subsystem->command = CRM_DAEMON_DIR "/" CRM_SYSTEM_PENGINE; pe_subsystem->args = NULL; pe_subsystem->flag_connected = R_PE_CONNECTED; pe_subsystem->flag_required = R_PE_REQUIRED; } else { was_error = TRUE; } if (was_error) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } welcomed_nodes = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); integrated_nodes = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); finalized_nodes = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); confirmed_nodes = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); set_sigchld_proctrack(G_PRIORITY_HIGH, DEFAULT_MAXDISPATCHTIME); } /* A_STOP */ void do_stop(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } /* A_STARTED */ void do_started(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { if (cur_state != S_STARTING) { crm_err("Start cancelled... %s", fsa_state2string(cur_state)); return; - } else if (is_set(fsa_input_register, R_CCM_DATA) == FALSE) { - crm_info("Delaying start, no membership data (%.16llx)", R_CCM_DATA); + } else if (is_set(fsa_input_register, R_MEMBERSHIP) == FALSE) { + crm_info("Delaying start, no membership data (%.16llx)", R_MEMBERSHIP); crmd_fsa_stall(NULL); return; } else if (is_set(fsa_input_register, R_LRM_CONNECTED) == FALSE) { crm_info("Delaying start, LRM not connected (%.16llx)", R_LRM_CONNECTED); crmd_fsa_stall(NULL); return; } else if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { crm_info("Delaying start, CIB not connected (%.16llx)", R_CIB_CONNECTED); crmd_fsa_stall(NULL); return; } else if (is_set(fsa_input_register, R_READ_CONFIG) == FALSE) { crm_info("Delaying start, Config not read (%.16llx)", R_READ_CONFIG); crmd_fsa_stall(NULL); return; } else if (is_set(fsa_input_register, R_PEER_DATA) == FALSE) { HA_Message *msg = NULL; /* try reading from HA */ crm_info("Delaying start, No peer data (%.16llx)", R_PEER_DATA); crm_trace("Looking for a HA message"); #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { msg = fsa_cluster_conn->llc_ops->readmsg(fsa_cluster_conn, 0); } #endif if (msg != NULL) { crm_trace("There was a HA message"); crm_msg_del(msg); } crmd_fsa_stall(NULL); return; } crm_debug("Init server comms"); if (ipc_server == NULL) { ipc_server = crm_strdup(CRM_SYSTEM_CRMD); } if (init_server_ipc_comms(ipc_server, crmd_client_connect, default_ipc_connection_destroy)) { crm_err("Couldn't start IPC server"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } if (stonith_reconnect == NULL) { int dummy; stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, te_connect_stonith, &dummy); } set_bit_inplace(fsa_input_register, R_ST_REQUIRED); mainloop_set_trigger(stonith_reconnect); crm_notice("The local CRM is operational"); clear_bit_inplace(fsa_input_register, R_STARTING); register_fsa_input(msg_data->fsa_cause, I_PENDING, NULL); } /* A_RECOVER */ void do_recover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { set_bit_inplace(fsa_input_register, R_IN_RECOVERY); crm_err("Action %s (%.16llx) not supported", fsa_action2string(action), action); register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } /* *INDENT-OFF* */ pe_cluster_option crmd_opts[] = { /* name, old-name, validate, default, description */ { "dc-version", NULL, "string", NULL, "none", NULL, "Version of Pacemaker on the cluster's DC.", "Includes the hash which identifies the exact Mercurial changeset it was built from. Used for diagnostic purposes." }, { "cluster-infrastructure", NULL, "string", NULL, "heartbeat", NULL, "The messaging stack on which Pacemaker is currently running.", "Used for informational and diagnostic purposes." }, { XML_CONFIG_ATTR_DC_DEADTIME, "dc_deadtime", "time", NULL, "20s", &check_time, "How long to wait for a response from other nodes during startup.", "The \"correct\" value will depend on the speed/load of your network and the type of switches used." }, { XML_CONFIG_ATTR_RECHECK, "cluster_recheck_interval", "time", "Zero disables polling. Positive values are an interval in seconds (unless other SI units are specified. eg. 5min)", "15min", &check_timer, "Polling interval for time based changes to options, resource parameters and constraints.", "The Cluster is primarily event driven, however the configuration can have elements that change based on time." " To ensure these changes take effect, we can optionally poll the cluster's status for changes." }, { XML_CONFIG_ATTR_ELECTION_FAIL, "election_timeout", "time", NULL, "2min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { XML_CONFIG_ATTR_FORCE_QUIT, "shutdown_escalation", "time", NULL, "20min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer, "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer, "*** Advanced Use Only ***\nEnabling this option will slow down cluster recovery under all conditions", "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\nUseful if your configuration is sensitive to the order in which ping updates arrive." }, { XML_ATTR_EXPECTED_VOTES, NULL, "integer", NULL, "2", &check_number, "The number of nodes expected to be in the cluster", "Used to calculate quorum in openais based clusters." }, }; /* *INDENT-ON* */ void crmd_metadata(void) { config_metadata("CRM Daemon", "1.0", "CRM Daemon Options", "This is a fake resource that details the options that can be configured for the CRM Daemon.", crmd_opts, DIMOF(crmd_opts)); } static void verify_crmd_options(GHashTable * options) { verify_all_options(options, crmd_opts, DIMOF(crmd_opts)); } static const char * crmd_pref(GHashTable * options, const char *name) { return get_cluster_pref(options, crmd_opts, DIMOF(crmd_opts), name); } static void config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { const char *value = NULL; GHashTable *config_hash = NULL; ha_time_t *now = new_ha_date(TRUE); if (rc != cib_ok) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query resulted in an error: %s", cib_error2string(rc)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); if (rc == cib_bad_permissions || rc == cib_dtd_validation || rc == cib_bad_digest || rc == cib_bad_config) { crm_err("The cluster is mis-configured - shutting down and staying down"); set_bit_inplace(fsa_input_register, R_STAYDOWN); } goto bail; } crm_debug("Call %d : Parsing CIB options", call_id); config_hash = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); unpack_instance_attributes(output, output, XML_CIB_TAG_PROPSET, NULL, config_hash, CIB_OPTIONS_FIRST, FALSE, now); verify_crmd_options(config_hash); value = crmd_pref(config_hash, XML_CONFIG_ATTR_DC_DEADTIME); election_trigger->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, XML_CONFIG_ATTR_FORCE_QUIT); shutdown_escalation_timer->period_ms = crm_get_msec(value); crm_debug("Shutdown escalation occurs after: %dms", shutdown_escalation_timer->period_ms); value = crmd_pref(config_hash, XML_CONFIG_ATTR_ELECTION_FAIL); election_timeout->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, XML_CONFIG_ATTR_RECHECK); recheck_timer->period_ms = crm_get_msec(value); crm_debug("Checking for expired actions every %dms", recheck_timer->period_ms); value = crmd_pref(config_hash, "crmd-transition-delay"); transition_timer->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, "crmd-integration-timeout"); integration_timer->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, "crmd-finalization-timeout"); finalization_timer->period_ms = crm_get_msec(value); #if SUPPORT_COROSYNC if (is_classic_ais_cluster()) { value = crmd_pref(config_hash, XML_ATTR_EXPECTED_VOTES); crm_debug("Sending expected-votes=%s to corosync", value); send_ais_text(crm_class_quorum, value, TRUE, NULL, crm_msg_ais); } #endif set_bit_inplace(fsa_input_register, R_READ_CONFIG); crm_trace("Triggering FSA: %s", __FUNCTION__); mainloop_set_trigger(fsa_source); g_hash_table_destroy(config_hash); bail: free_ha_date(now); } gboolean crm_read_options(gpointer user_data) { int call_id = fsa_cib_conn->cmds->query(fsa_cib_conn, XML_CIB_TAG_CRMCONFIG, NULL, cib_scope_local); add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, config_query_callback); crm_trace("Querying the CIB... call %d", call_id); return TRUE; } /* A_READCONFIG */ void do_read_config(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { mainloop_set_trigger(config_read); } void crm_shutdown(int nsig) { if (crmd_mainloop != NULL && g_main_is_running(crmd_mainloop)) { if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_err("Escalating the shutdown"); register_fsa_input_before(C_SHUTDOWN, I_ERROR, NULL); } else { set_bit_inplace(fsa_input_register, R_SHUTDOWN); register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL); if (shutdown_escalation_timer->period_ms < 1) { const char *value = crmd_pref(NULL, XML_CONFIG_ATTR_FORCE_QUIT); int msec = crm_get_msec(value); crm_debug("Using default shutdown escalation: %dms", msec); shutdown_escalation_timer->period_ms = msec; } /* cant rely on this... */ crm_notice("Requesting shutdown, upper limit is %dms", shutdown_escalation_timer->period_ms); crm_timer_start(shutdown_escalation_timer); } } else { crm_info("exit from shutdown"); exit(LSB_EXIT_OK); } } static void default_cib_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { if (rc != cib_ok) { fsa_data_t *msg_data = NULL; crm_err("CIB Update failed: %s", cib_error2string(rc)); crm_log_xml_warn(output, "update:failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } #if SUPPORT_HEARTBEAT static void populate_cib_nodes_ha(gboolean with_client_status) { int call_id = 0; const char *ha_node = NULL; xmlNode *cib_node_list = NULL; if (fsa_cluster_conn == NULL) { crm_debug("Not connected"); return; } /* Async get client status information in the cluster */ crm_info("Requesting the list of configured nodes"); fsa_cluster_conn->llc_ops->init_nodewalk(fsa_cluster_conn); cib_node_list = create_xml_node(NULL, XML_CIB_TAG_NODES); do { const char *ha_node_type = NULL; const char *ha_node_uuid = NULL; xmlNode *cib_new_node = NULL; ha_node = fsa_cluster_conn->llc_ops->nextnode(fsa_cluster_conn); if (ha_node == NULL) { continue; } ha_node_type = fsa_cluster_conn->llc_ops->node_type(fsa_cluster_conn, ha_node); if (safe_str_neq(NORMALNODE, ha_node_type)) { crm_debug("Node %s: skipping '%s'", ha_node, ha_node_type); continue; } ha_node_uuid = get_uuid(ha_node); if (ha_node_uuid == NULL) { crm_warn("Node %s: no uuid found", ha_node); continue; } crm_debug("Node: %s (uuid: %s)", ha_node, ha_node_uuid); cib_new_node = create_xml_node(cib_node_list, XML_CIB_TAG_NODE); crm_xml_add(cib_new_node, XML_ATTR_ID, ha_node_uuid); crm_xml_add(cib_new_node, XML_ATTR_UNAME, ha_node); crm_xml_add(cib_new_node, XML_ATTR_TYPE, ha_node_type); } while (ha_node != NULL); fsa_cluster_conn->llc_ops->end_nodewalk(fsa_cluster_conn); /* Now update the CIB with the list of nodes */ fsa_cib_update(XML_CIB_TAG_NODES, cib_node_list, cib_scope_local | cib_quorum_override, call_id, NULL); add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, default_cib_update_callback); free_xml(cib_node_list); crm_trace("Complete"); } #endif static void create_cib_node_definition(gpointer key, gpointer value, gpointer user_data) { crm_node_t *node = value; xmlNode *cib_nodes = user_data; xmlNode *cib_new_node = NULL; crm_trace("Creating node entry for %s/%s", node->uname, node->uuid); cib_new_node = create_xml_node(cib_nodes, XML_CIB_TAG_NODE); crm_xml_add(cib_new_node, XML_ATTR_ID, node->uuid); crm_xml_add(cib_new_node, XML_ATTR_UNAME, node->uname); crm_xml_add(cib_new_node, XML_ATTR_TYPE, NORMALNODE); } void populate_cib_nodes(gboolean with_client_status) { int call_id = 0; xmlNode *cib_node_list = NULL; #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { populate_cib_nodes_ha(with_client_status); return; } #endif cib_node_list = create_xml_node(NULL, XML_CIB_TAG_NODES); g_hash_table_foreach(crm_peer_cache, create_cib_node_definition, cib_node_list); fsa_cib_update(XML_CIB_TAG_NODES, cib_node_list, cib_scope_local | cib_quorum_override, call_id, NULL); add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, default_cib_update_callback); free_xml(cib_node_list); crm_trace("Complete"); } diff --git a/crmd/corosync.c b/crmd/corosync.c index 52bd485b6e..3d846b4cd2 100644 --- a/crmd/corosync.c +++ b/crmd/corosync.c @@ -1,179 +1,179 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern void post_cache_update(int seq); extern void crmd_ha_connection_destroy(gpointer user_data); /* A_HA_CONNECT */ #if SUPPORT_COROSYNC extern void crmd_ha_msg_filter(xmlNode * msg); static gboolean crmd_ais_dispatch(AIS_Message * wrapper, char *data, int sender) { int seq = 0; xmlNode *xml = NULL; const char *seq_s = NULL; xml = string2xml(data); if (xml == NULL) { crm_err("Could not parse message content (%d): %.100s", wrapper->header.id, data); return TRUE; } switch (wrapper->header.id) { case crm_class_members: seq_s = crm_element_value(xml, "id"); seq = crm_int_helper(seq_s, NULL); set_bit_inplace(fsa_input_register, R_PEER_DATA); post_cache_update(seq); /* fall through */ case crm_class_quorum: crm_update_quorum(crm_have_quorum, FALSE); if (AM_I_DC) { const char *votes = crm_element_value(xml, "expected"); if (votes == NULL || check_number(votes) == FALSE) { crm_log_xml_err(xml, "Invalid quorum/membership update"); } else { int rc = update_attr(fsa_cib_conn, cib_quorum_override | cib_scope_local | cib_inhibit_notify, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, XML_ATTR_EXPECTED_VOTES, votes, FALSE); crm_info("Setting expected votes to %s", votes); if (cib_ok > rc) { crm_err("Quorum update failed: %s", cib_error2string(rc)); } } } break; case crm_class_cluster: crm_xml_add(xml, F_ORIG, wrapper->sender.uname); crm_xml_add_int(xml, F_SEQ, wrapper->id); crmd_ha_msg_filter(xml); break; case crm_class_rmpeer: /* Ignore */ break; case crm_class_notify: case crm_class_nodeid: crm_err("Unexpected message class (%d): %.100s", wrapper->header.id, data); break; default: crm_err("Invalid message class (%d): %.100s", wrapper->header.id, data); } free_xml(xml); return TRUE; } static gboolean crmd_cman_dispatch(unsigned long long seq, gboolean quorate) { crm_update_quorum(quorate, FALSE); post_cache_update(seq); return TRUE; } static void crmd_cman_destroy(gpointer user_data) { if (is_set(fsa_input_register, R_HA_DISCONNECTED)) { crm_err("connection terminated"); exit(1); } else { crm_info("connection closed"); } } static void crmd_quorum_destroy(gpointer user_data) { if (is_set(fsa_input_register, R_HA_DISCONNECTED)) { crm_err("connection terminated"); exit(1); } else { crm_info("connection closed"); } } static void crmd_ais_destroy(gpointer user_data) { if (is_set(fsa_input_register, R_HA_DISCONNECTED)) { crm_err("connection terminated"); exit(1); } else { crm_info("connection closed"); } } extern gboolean crm_connect_corosync(void); gboolean crm_connect_corosync(void) { gboolean rc = FALSE; if (is_openais_cluster()) { crm_set_status_callback(&ais_status_callback); rc = crm_cluster_connect(&fsa_our_uname, &fsa_our_uuid, crmd_ais_dispatch, crmd_ais_destroy, NULL); } if (rc && is_corosync_cluster()) { init_quorum_connection(crmd_cman_dispatch, crmd_quorum_destroy); } if (rc && is_cman_cluster()) { init_cman_connection(crmd_cman_dispatch, crmd_cman_destroy); - set_bit_inplace(fsa_input_register, R_CCM_DATA); + set_bit_inplace(fsa_input_register, R_MEMBERSHIP); } return rc; } #endif diff --git a/crmd/crmd_fsa.h b/crmd/crmd_fsa.h index 5d69b023d2..cf81e041a2 100644 --- a/crmd/crmd_fsa.h +++ b/crmd/crmd_fsa.h @@ -1,131 +1,134 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef CRMD_FSA__H # define CRMD_FSA__H # include # include # include # include # include # include # include # if SUPPORT_HEARTBEAT extern ll_cluster_t *fsa_cluster_conn; # endif /* copy from struct client_child in heartbeat.h * * Plus a couple of other things */ struct crm_subsystem_s { pid_t pid; /* Process id of child process */ const char *name; /* executable name */ const char *path; /* Command location */ const char *command; /* Command with path */ const char *args; /* Command arguments */ crmd_client_t *client; /* Client connection object */ gboolean sent_kill; IPC_Channel *ipc; /* How can we communicate with it */ long long flag_connected; /* */ long long flag_required; /* */ }; typedef struct fsa_timer_s fsa_timer_t; struct fsa_timer_s { guint source_id; /* timer source id */ int period_ms; /* timer period */ enum crmd_fsa_input fsa_input; gboolean(*callback) (gpointer data); gboolean repeat; }; enum fsa_data_type { fsa_dt_none, fsa_dt_ha_msg, fsa_dt_xml, fsa_dt_lrm, }; typedef struct fsa_data_s fsa_data_t; struct fsa_data_s { int id; enum crmd_fsa_input fsa_input; enum crmd_fsa_cause fsa_cause; long long actions; const char *origin; void *data; enum fsa_data_type data_type; }; extern enum crmd_fsa_state s_crmd_fsa(enum crmd_fsa_cause cause); /* Global FSA stuff */ extern volatile gboolean do_fsa_stall; extern volatile enum crmd_fsa_state fsa_state; extern volatile long long fsa_input_register; extern volatile long long fsa_actions; extern ll_lrm_t *fsa_lrm_conn; extern cib_t *fsa_cib_conn; extern char *fsa_our_uname; extern char *fsa_our_uuid; extern char *fsa_pe_ref; /* the last invocation of the PE */ extern char *fsa_our_dc; extern char *fsa_our_dc_version; extern GListPtr fsa_message_queue; extern fsa_timer_t *election_trigger; /* */ extern fsa_timer_t *election_timeout; /* */ extern fsa_timer_t *shutdown_escalation_timer; /* */ extern fsa_timer_t *transition_timer; extern fsa_timer_t *integration_timer; extern fsa_timer_t *finalization_timer; extern fsa_timer_t *wait_timer; extern fsa_timer_t *recheck_timer; extern crm_trigger_t *fsa_source; extern crm_trigger_t *config_read; extern struct crm_subsystem_s *cib_subsystem; extern struct crm_subsystem_s *te_subsystem; extern struct crm_subsystem_s *pe_subsystem; extern GHashTable *welcomed_nodes; extern GHashTable *integrated_nodes; extern GHashTable *finalized_nodes; extern GHashTable *confirmed_nodes; extern GHashTable *crmd_peer_state; /* these two should be moved elsewhere... */ extern void do_update_cib_nodes(gboolean overwrite, const char *caller); extern gboolean do_dc_heartbeat(gpointer data); # define AM_I_DC is_set(fsa_input_register, R_THE_DC) # define AM_I_OPERATIONAL (is_set(fsa_input_register, R_STARTING)==FALSE) extern unsigned long long saved_ccm_membership_id; extern gboolean ever_had_quorum; # include # include +#define trigger_fsa(source) crm_trace("Triggering FSA: %s", __FUNCTION__); \ + mainloop_set_trigger(source); + #endif diff --git a/crmd/fsa.c b/crmd/fsa.c index 9d340a9a83..57ddf91fbf 100644 --- a/crmd/fsa.c +++ b/crmd/fsa.c @@ -1,670 +1,682 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include char *fsa_our_dc = NULL; cib_t *fsa_cib_conn = NULL; char *fsa_our_dc_version = NULL; ll_lrm_t *fsa_lrm_conn; char *fsa_our_uuid = NULL; char *fsa_our_uname = NULL; #if SUPPORT_HEARTBEAT ll_cluster_t *fsa_cluster_conn; #endif fsa_timer_t *wait_timer = NULL; fsa_timer_t *recheck_timer = NULL; fsa_timer_t *election_trigger = NULL; fsa_timer_t *election_timeout = NULL; fsa_timer_t *transition_timer = NULL; fsa_timer_t *integration_timer = NULL; fsa_timer_t *finalization_timer = NULL; fsa_timer_t *shutdown_escalation_timer = NULL; volatile gboolean do_fsa_stall = FALSE; volatile long long fsa_input_register = 0; volatile long long fsa_actions = A_NOTHING; volatile enum crmd_fsa_state fsa_state = S_STARTING; extern uint highest_born_on; extern uint num_join_invites; extern GHashTable *welcomed_nodes; extern GHashTable *finalized_nodes; extern GHashTable *confirmed_nodes; extern GHashTable *integrated_nodes; extern void initialize_join(gboolean before); #define DOT_PREFIX "actions:trace: " #define do_dot_log(fmt, args...) crm_trace( fmt, ##args) long long do_state_transition(long long actions, enum crmd_fsa_state cur_state, enum crmd_fsa_state next_state, fsa_data_t * msg_data); void dump_rsc_info(void); void dump_rsc_info_callback(const xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data); void ghash_print_node(gpointer key, gpointer value, gpointer user_data); void s_crmd_fsa_actions(fsa_data_t * fsa_data); void log_fsa_input(fsa_data_t * stored_msg); void init_dotfile(void); void init_dotfile(void) { do_dot_log(DOT_PREFIX "digraph \"g\" {"); do_dot_log(DOT_PREFIX " size = \"30,30\""); do_dot_log(DOT_PREFIX " graph ["); do_dot_log(DOT_PREFIX " fontsize = \"12\""); do_dot_log(DOT_PREFIX " fontname = \"Times-Roman\""); do_dot_log(DOT_PREFIX " fontcolor = \"black\""); do_dot_log(DOT_PREFIX " bb = \"0,0,398.922306,478.927856\""); do_dot_log(DOT_PREFIX " color = \"black\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX " node ["); do_dot_log(DOT_PREFIX " fontsize = \"12\""); do_dot_log(DOT_PREFIX " fontname = \"Times-Roman\""); do_dot_log(DOT_PREFIX " fontcolor = \"black\""); do_dot_log(DOT_PREFIX " shape = \"ellipse\""); do_dot_log(DOT_PREFIX " color = \"black\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX " edge ["); do_dot_log(DOT_PREFIX " fontsize = \"12\""); do_dot_log(DOT_PREFIX " fontname = \"Times-Roman\""); do_dot_log(DOT_PREFIX " fontcolor = \"black\""); do_dot_log(DOT_PREFIX " color = \"black\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX "// special nodes"); do_dot_log(DOT_PREFIX " \"S_PENDING\" "); do_dot_log(DOT_PREFIX " ["); do_dot_log(DOT_PREFIX " color = \"blue\""); do_dot_log(DOT_PREFIX " fontcolor = \"blue\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX " \"S_TERMINATE\" "); do_dot_log(DOT_PREFIX " ["); do_dot_log(DOT_PREFIX " color = \"red\""); do_dot_log(DOT_PREFIX " fontcolor = \"red\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX "// DC only nodes"); do_dot_log(DOT_PREFIX " \"S_INTEGRATION\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_POLICY_ENGINE\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_TRANSITION_ENGINE\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_RELEASE_DC\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_IDLE\" [ fontcolor = \"green\" ]"); } static void do_fsa_action(fsa_data_t * fsa_data, long long an_action, void (*function) (long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t * msg_data)) { int action_log_level = LOG_DEBUG; /* The calls to fsa_action2string() is expensive, * only make it if we will use the result */ if (an_action & A_MSG_ROUTE) { action_log_level = LOG_DEBUG_2; } fsa_actions &= ~an_action; do_crm_log(action_log_level, DOT_PREFIX "\t// %s", fsa_action2string(an_action)); function(an_action, fsa_data->fsa_cause, fsa_state, fsa_data->fsa_input, fsa_data); } static long long startup_actions = A_STARTUP | A_CIB_START | A_LRM_CONNECT | A_CCM_CONNECT | A_HA_CONNECT | A_READCONFIG | A_STARTED | A_CL_JOIN_QUERY; enum crmd_fsa_state s_crmd_fsa(enum crmd_fsa_cause cause) { fsa_data_t *fsa_data = NULL; long long register_copy = fsa_input_register; long long new_actions = A_NOTHING; enum crmd_fsa_state last_state = fsa_state; crm_trace("FSA invoked with Cause: %s\tState: %s", fsa_cause2string(cause), fsa_state2string(fsa_state)); do_fsa_stall = FALSE; if (is_message() == FALSE && fsa_actions != A_NOTHING) { /* fake the first message so we can get into the loop */ crm_malloc0(fsa_data, sizeof(fsa_data_t)); fsa_data->fsa_input = I_NULL; fsa_data->fsa_cause = C_FSA_INTERNAL; fsa_data->origin = __FUNCTION__; fsa_data->data_type = fsa_dt_none; fsa_message_queue = g_list_append(fsa_message_queue, fsa_data); fsa_data = NULL; } while (is_message() && do_fsa_stall == FALSE) { crm_trace("Checking messages (%d remaining)", g_list_length(fsa_message_queue)); fsa_data = get_message(); CRM_CHECK(fsa_data != NULL, continue); log_fsa_input(fsa_data); /* add any actions back to the queue */ fsa_actions |= fsa_data->actions; /* get the next batch of actions */ new_actions = crmd_fsa_actions[fsa_data->fsa_input][fsa_state]; fsa_actions |= new_actions; if (fsa_data->fsa_input != I_NULL && fsa_data->fsa_input != I_ROUTER) { crm_debug("Processing %s: [ state=%s cause=%s origin=%s ]", fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); } #ifdef FSA_TRACE if (new_actions != A_NOTHING) { crm_trace("Adding FSA actions %.16llx for %s/%s", new_actions, fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state)); fsa_dump_actions(new_actions, "\tFSA scheduled"); } else if (fsa_data->fsa_input != I_NULL && new_actions == A_NOTHING) { crm_debug("No action specified for input,state (%s,%s)", fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state)); } if (fsa_data->actions != A_NOTHING) { crm_trace("Adding input actions %.16llx for %s/%s", new_actions, fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state)); fsa_dump_actions(fsa_data->actions, "\tInput scheduled"); } #endif /* logging : *before* the state is changed */ if (is_set(fsa_actions, A_ERROR)) { do_fsa_action(fsa_data, A_ERROR, do_log); } if (is_set(fsa_actions, A_WARN)) { do_fsa_action(fsa_data, A_WARN, do_log); } if (is_set(fsa_actions, A_LOG)) { do_fsa_action(fsa_data, A_LOG, do_log); } /* update state variables */ last_state = fsa_state; fsa_state = crmd_fsa_state[fsa_data->fsa_input][fsa_state]; /* * Remove certain actions during shutdown */ if (fsa_state == S_STOPPING || ((fsa_input_register & R_SHUTDOWN) == R_SHUTDOWN)) { clear_bit_inplace(fsa_actions, startup_actions); } /* * Hook for change of state. * Allows actions to be added or removed when entering a state */ if (last_state != fsa_state) { fsa_actions = do_state_transition(fsa_actions, last_state, fsa_state, fsa_data); } else { do_dot_log(DOT_PREFIX "\t// FSA input: State=%s \tCause=%s" " \tInput=%s \tOrigin=%s() \tid=%d", fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_input2string(fsa_data->fsa_input), fsa_data->origin, fsa_data->id); } /* start doing things... */ s_crmd_fsa_actions(fsa_data); delete_fsa_input(fsa_data); fsa_data = NULL; } if (g_list_length(fsa_message_queue) > 0 || fsa_actions != A_NOTHING || do_fsa_stall) { crm_debug("Exiting the FSA: queue=%d, fsa_actions=0x%llx, stalled=%s", g_list_length(fsa_message_queue), fsa_actions, do_fsa_stall ? "true" : "false"); } else { crm_trace("Exiting the FSA"); } /* cleanup inputs? */ if (register_copy != fsa_input_register) { long long same = register_copy & fsa_input_register; fsa_dump_inputs(LOG_DEBUG, "Added input:", fsa_input_register ^ same); fsa_dump_inputs(LOG_DEBUG, "Removed input:", register_copy ^ same); } fsa_dump_queue(LOG_DEBUG); return fsa_state; } void s_crmd_fsa_actions(fsa_data_t * fsa_data) { /* * Process actions in order of priority but do only one * action at a time to avoid complicating the ordering. */ CRM_CHECK(fsa_data != NULL, return); while (fsa_actions != A_NOTHING && do_fsa_stall == FALSE) { /* regular action processing in order of action priority * * Make sure all actions that connect to required systems * are performed first */ if (fsa_actions & A_ERROR) { do_fsa_action(fsa_data, A_ERROR, do_log); } else if (fsa_actions & A_WARN) { do_fsa_action(fsa_data, A_WARN, do_log); } else if (fsa_actions & A_LOG) { do_fsa_action(fsa_data, A_LOG, do_log); /* get out of here NOW! before anything worse happens */ } else if (fsa_actions & A_EXIT_1) { do_fsa_action(fsa_data, A_EXIT_1, do_exit); /* essential start tasks */ } else if (fsa_actions & A_STARTUP) { do_fsa_action(fsa_data, A_STARTUP, do_startup); } else if (fsa_actions & A_CIB_START) { do_fsa_action(fsa_data, A_CIB_START, do_cib_control); } else if (fsa_actions & A_HA_CONNECT) { do_fsa_action(fsa_data, A_HA_CONNECT, do_ha_control); } else if (fsa_actions & A_READCONFIG) { do_fsa_action(fsa_data, A_READCONFIG, do_read_config); /* sub-system start/connect */ } else if (fsa_actions & A_LRM_CONNECT) { do_fsa_action(fsa_data, A_LRM_CONNECT, do_lrm_control); } else if (fsa_actions & A_CCM_CONNECT) { - do_fsa_action(fsa_data, A_CCM_CONNECT, do_ccm_control); +#if SUPPORT_HEARTBEAT + if (is_heartbeat_cluster()) { + do_fsa_action(fsa_data, A_CCM_CONNECT, do_ccm_control); + } +#endif + fsa_actions &= ~A_CCM_CONNECT; + } else if (fsa_actions & A_TE_START) { do_fsa_action(fsa_data, A_TE_START, do_te_control); } else if (fsa_actions & A_PE_START) { do_fsa_action(fsa_data, A_PE_START, do_pe_control); /* sub-system restart */ } else if ((fsa_actions & O_CIB_RESTART) == O_CIB_RESTART) { do_fsa_action(fsa_data, O_CIB_RESTART, do_cib_control); } else if ((fsa_actions & O_PE_RESTART) == O_PE_RESTART) { do_fsa_action(fsa_data, O_PE_RESTART, do_pe_control); } else if ((fsa_actions & O_TE_RESTART) == O_TE_RESTART) { do_fsa_action(fsa_data, O_TE_RESTART, do_te_control); /* Timers */ /* else if(fsa_actions & O_DC_TIMER_RESTART) { do_fsa_action(fsa_data, O_DC_TIMER_RESTART, do_timer_control) */ ; } else if (fsa_actions & A_DC_TIMER_STOP) { do_fsa_action(fsa_data, A_DC_TIMER_STOP, do_timer_control); } else if (fsa_actions & A_INTEGRATE_TIMER_STOP) { do_fsa_action(fsa_data, A_INTEGRATE_TIMER_STOP, do_timer_control); } else if (fsa_actions & A_INTEGRATE_TIMER_START) { do_fsa_action(fsa_data, A_INTEGRATE_TIMER_START, do_timer_control); } else if (fsa_actions & A_FINALIZE_TIMER_STOP) { do_fsa_action(fsa_data, A_FINALIZE_TIMER_STOP, do_timer_control); } else if (fsa_actions & A_FINALIZE_TIMER_START) { do_fsa_action(fsa_data, A_FINALIZE_TIMER_START, do_timer_control); /* * Highest priority actions */ } else if (fsa_actions & A_MSG_ROUTE) { do_fsa_action(fsa_data, A_MSG_ROUTE, do_msg_route); } else if (fsa_actions & A_RECOVER) { do_fsa_action(fsa_data, A_RECOVER, do_recover); } else if (fsa_actions & A_CL_JOIN_RESULT) { do_fsa_action(fsa_data, A_CL_JOIN_RESULT, do_cl_join_finalize_respond); } else if (fsa_actions & A_CL_JOIN_REQUEST) { do_fsa_action(fsa_data, A_CL_JOIN_REQUEST, do_cl_join_offer_respond); } else if (fsa_actions & A_SHUTDOWN_REQ) { do_fsa_action(fsa_data, A_SHUTDOWN_REQ, do_shutdown_req); } else if (fsa_actions & A_ELECTION_VOTE) { do_fsa_action(fsa_data, A_ELECTION_VOTE, do_election_vote); } else if (fsa_actions & A_ELECTION_COUNT) { do_fsa_action(fsa_data, A_ELECTION_COUNT, do_election_count_vote); } else if (fsa_actions & A_LRM_EVENT) { do_fsa_action(fsa_data, A_LRM_EVENT, do_lrm_event); /* * High priority actions */ } else if (fsa_actions & A_STARTED) { do_fsa_action(fsa_data, A_STARTED, do_started); } else if (fsa_actions & A_CL_JOIN_QUERY) { do_fsa_action(fsa_data, A_CL_JOIN_QUERY, do_cl_join_query); } else if (fsa_actions & A_DC_TIMER_START) { do_fsa_action(fsa_data, A_DC_TIMER_START, do_timer_control); /* * Medium priority actions */ } else if (fsa_actions & A_DC_TAKEOVER) { do_fsa_action(fsa_data, A_DC_TAKEOVER, do_dc_takeover); } else if (fsa_actions & A_DC_RELEASE) { do_fsa_action(fsa_data, A_DC_RELEASE, do_dc_release); } else if (fsa_actions & A_DC_JOIN_FINAL) { do_fsa_action(fsa_data, A_DC_JOIN_FINAL, do_dc_join_final); } else if (fsa_actions & A_ELECTION_CHECK) { do_fsa_action(fsa_data, A_ELECTION_CHECK, do_election_check); } else if (fsa_actions & A_ELECTION_START) { do_fsa_action(fsa_data, A_ELECTION_START, do_election_vote); } else if (fsa_actions & A_TE_HALT) { do_fsa_action(fsa_data, A_TE_HALT, do_te_invoke); } else if (fsa_actions & A_TE_CANCEL) { do_fsa_action(fsa_data, A_TE_CANCEL, do_te_invoke); } else if (fsa_actions & A_DC_JOIN_OFFER_ALL) { do_fsa_action(fsa_data, A_DC_JOIN_OFFER_ALL, do_dc_join_offer_all); } else if (fsa_actions & A_DC_JOIN_OFFER_ONE) { do_fsa_action(fsa_data, A_DC_JOIN_OFFER_ONE, do_dc_join_offer_all); } else if (fsa_actions & A_DC_JOIN_PROCESS_REQ) { do_fsa_action(fsa_data, A_DC_JOIN_PROCESS_REQ, do_dc_join_filter_offer); } else if (fsa_actions & A_DC_JOIN_PROCESS_ACK) { do_fsa_action(fsa_data, A_DC_JOIN_PROCESS_ACK, do_dc_join_ack); /* * Low(er) priority actions * Make sure the CIB is always updated before invoking the * PE, and the PE before the TE */ } else if (fsa_actions & A_DC_JOIN_FINALIZE) { do_fsa_action(fsa_data, A_DC_JOIN_FINALIZE, do_dc_join_finalize); } else if (fsa_actions & A_LRM_INVOKE) { do_fsa_action(fsa_data, A_LRM_INVOKE, do_lrm_invoke); } else if (fsa_actions & A_PE_INVOKE) { do_fsa_action(fsa_data, A_PE_INVOKE, do_pe_invoke); } else if (fsa_actions & A_TE_INVOKE) { do_fsa_action(fsa_data, A_TE_INVOKE, do_te_invoke); } else if (fsa_actions & A_CL_JOIN_ANNOUNCE) { do_fsa_action(fsa_data, A_CL_JOIN_ANNOUNCE, do_cl_join_announce); /* sub-system stop */ } else if (fsa_actions & A_DC_RELEASED) { do_fsa_action(fsa_data, A_DC_RELEASED, do_dc_release); } else if (fsa_actions & A_PE_STOP) { do_fsa_action(fsa_data, A_PE_STOP, do_pe_control); } else if (fsa_actions & A_TE_STOP) { do_fsa_action(fsa_data, A_TE_STOP, do_te_control); } else if (fsa_actions & A_SHUTDOWN) { do_fsa_action(fsa_data, A_SHUTDOWN, do_shutdown); } else if (fsa_actions & A_LRM_DISCONNECT) { do_fsa_action(fsa_data, A_LRM_DISCONNECT, do_lrm_control); } else if (fsa_actions & A_CCM_DISCONNECT) { - do_fsa_action(fsa_data, A_CCM_DISCONNECT, do_ccm_control); +#if SUPPORT_HEARTBEAT + if (is_heartbeat_cluster()) { + do_fsa_action(fsa_data, A_CCM_DISCONNECT, do_ccm_control); + } +#endif + fsa_actions &= ~A_CCM_DISCONNECT; + } else if (fsa_actions & A_HA_DISCONNECT) { do_fsa_action(fsa_data, A_HA_DISCONNECT, do_ha_control); } else if (fsa_actions & A_CIB_STOP) { do_fsa_action(fsa_data, A_CIB_STOP, do_cib_control); } else if (fsa_actions & A_STOP) { do_fsa_action(fsa_data, A_STOP, do_stop); /* exit gracefully */ } else if (fsa_actions & A_EXIT_0) { do_fsa_action(fsa_data, A_EXIT_0, do_exit); /* Error checking and reporting */ } else { crm_err("Action %s (0x%llx) not supported ", fsa_action2string(fsa_actions), fsa_actions); register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, fsa_data, NULL, __FUNCTION__); } } } void log_fsa_input(fsa_data_t * stored_msg) { crm_trace("Processing queued input %d", stored_msg->id); if (stored_msg->fsa_cause == C_CCM_CALLBACK) { crm_trace("FSA processing CCM callback from %s", stored_msg->origin); } else if (stored_msg->fsa_cause == C_LRM_OP_CALLBACK) { crm_trace("FSA processing LRM callback from %s", stored_msg->origin); } else if (stored_msg->data == NULL) { crm_trace("FSA processing input from %s", stored_msg->origin); } else { ha_msg_input_t *ha_input = fsa_typed_data_adv(stored_msg, fsa_dt_ha_msg, __FUNCTION__); crm_trace("FSA processing XML message from %s", stored_msg->origin); crm_log_xml_trace(ha_input->xml, "FSA message data"); } } long long do_state_transition(long long actions, enum crmd_fsa_state cur_state, enum crmd_fsa_state next_state, fsa_data_t * msg_data) { long long tmp = actions; gboolean clear_recovery_bit = TRUE; enum crmd_fsa_cause cause = msg_data->fsa_cause; enum crmd_fsa_input current_input = msg_data->fsa_input; const char *state_from = fsa_state2string(cur_state); const char *state_to = fsa_state2string(next_state); const char *input = fsa_input2string(current_input); CRM_LOG_ASSERT(cur_state != next_state); do_dot_log(DOT_PREFIX "\t%s -> %s [ label=%s cause=%s origin=%s ]", state_from, state_to, input, fsa_cause2string(cause), msg_data->origin); crm_notice("State transition %s -> %s [ input=%s cause=%s origin=%s ]", state_from, state_to, input, fsa_cause2string(cause), msg_data->origin); /* the last two clauses might cause trouble later */ if (election_timeout != NULL && next_state != S_ELECTION && cur_state != S_RELEASE_DC) { crm_timer_stop(election_timeout); /* } else { */ /* crm_timer_start(election_timeout); */ } #if 0 if ((fsa_input_register & R_SHUTDOWN)) { set_bit_inplace(tmp, A_DC_TIMER_STOP); } #endif if (next_state == S_INTEGRATION) { set_bit_inplace(tmp, A_INTEGRATE_TIMER_START); } else { set_bit_inplace(tmp, A_INTEGRATE_TIMER_STOP); } if (next_state == S_FINALIZE_JOIN) { set_bit_inplace(tmp, A_FINALIZE_TIMER_START); } else { set_bit_inplace(tmp, A_FINALIZE_TIMER_STOP); } if (next_state != S_PENDING) { set_bit_inplace(tmp, A_DC_TIMER_STOP); } if (next_state != S_ELECTION) { highest_born_on = 0; } if (next_state != S_IDLE) { crm_timer_stop(recheck_timer); } if (cur_state == S_FINALIZE_JOIN && next_state == S_POLICY_ENGINE) { populate_cib_nodes(FALSE); do_update_cib_nodes(TRUE, __FUNCTION__); } switch (next_state) { case S_PENDING: fsa_cib_conn->cmds->set_slave(fsa_cib_conn, cib_scope_local); /* fall through */ case S_ELECTION: crm_trace("Resetting our DC to NULL on transition to %s", fsa_state2string(next_state)); update_dc(NULL); break; case S_NOT_DC: if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_info("(Re)Issuing shutdown request now" " that we have a new DC"); set_bit_inplace(tmp, A_SHUTDOWN_REQ); } CRM_LOG_ASSERT(fsa_our_dc != NULL); if (fsa_our_dc == NULL) { crm_err("Reached S_NOT_DC without a DC" " being recorded"); } break; case S_RECOVERY: clear_recovery_bit = FALSE; break; case S_FINALIZE_JOIN: CRM_LOG_ASSERT(AM_I_DC); if (cause == C_TIMER_POPPED) { crm_warn("Progressed to state %s after %s", fsa_state2string(next_state), fsa_cause2string(cause)); } if (g_hash_table_size(welcomed_nodes) > 0) { char *msg = crm_strdup(" Welcome reply not received from"); crm_warn("%u cluster nodes failed to respond" " to the join offer.", g_hash_table_size(welcomed_nodes)); g_hash_table_foreach(welcomed_nodes, ghash_print_node, msg); crm_free(msg); } else { crm_debug("All %d cluster nodes " "responded to the join offer.", g_hash_table_size(integrated_nodes)); } break; case S_POLICY_ENGINE: CRM_LOG_ASSERT(AM_I_DC); if (cause == C_TIMER_POPPED) { crm_info("Progressed to state %s after %s", fsa_state2string(next_state), fsa_cause2string(cause)); } if (g_hash_table_size(finalized_nodes) > 0) { char *msg = crm_strdup(" Confirm not received from"); crm_err("%u cluster nodes failed to confirm" " their join.", g_hash_table_size(finalized_nodes)); g_hash_table_foreach(finalized_nodes, ghash_print_node, msg); crm_free(msg); } else if (g_hash_table_size(confirmed_nodes) == crm_active_members()) { crm_debug("All %u cluster nodes are" " eligible to run resources.", crm_active_members()); } else if (g_hash_table_size(confirmed_nodes) > crm_active_members()) { crm_err("We have more confirmed nodes than our membership does: %d vs. %d", g_hash_table_size(confirmed_nodes), crm_active_members()); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); } else if (saved_ccm_membership_id != crm_peer_seq) { crm_info("Membership changed: %llu -> %llu - join restart", saved_ccm_membership_id, crm_peer_seq); register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } else { crm_warn("Only %u of %u cluster " "nodes are eligible to run resources - continue %d", g_hash_table_size(confirmed_nodes), crm_active_members(), g_hash_table_size(welcomed_nodes)); } /* initialize_join(FALSE); */ break; case S_STOPPING: case S_TERMINATE: /* possibly redundant */ set_bit_inplace(fsa_input_register, R_SHUTDOWN); break; case S_IDLE: CRM_LOG_ASSERT(AM_I_DC); dump_rsc_info(); if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_info("(Re)Issuing shutdown request now" " that we are the DC"); set_bit_inplace(tmp, A_SHUTDOWN_REQ); } if (recheck_timer->period_ms > 0) { crm_debug("Starting %s", get_timer_desc(recheck_timer)); crm_timer_start(recheck_timer); } break; default: break; } if (clear_recovery_bit && next_state != S_PENDING) { tmp &= ~A_RECOVER; } else if (clear_recovery_bit == FALSE) { tmp |= A_RECOVER; } if (tmp != actions) { /* fsa_dump_actions(actions ^ tmp, "New actions"); */ actions = tmp; } return actions; } void dump_rsc_info(void) { } void ghash_print_node(gpointer key, gpointer value, gpointer user_data) { const char *text = user_data; const char *uname = key; const char *value_s = value; crm_info("%s: %s %s", text, uname, value_s); } diff --git a/crmd/fsa_defines.h b/crmd/fsa_defines.h index b991863523..0efefc3668 100644 --- a/crmd/fsa_defines.h +++ b/crmd/fsa_defines.h @@ -1,492 +1,492 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef FSA_DEFINES__H # define FSA_DEFINES__H /*====================================== * States the DC/CRMd can be in *======================================*/ enum crmd_fsa_state { S_IDLE = 0, /* Nothing happening */ S_ELECTION, /* Take part in the election algorithm as * described below */ S_INTEGRATION, /* integrate that status of new nodes (which is * all of them if we have just been elected DC) * to form a complete and up-to-date picture of * the CIB */ S_FINALIZE_JOIN, /* integrate that status of new nodes (which is * all of them if we have just been elected DC) * to form a complete and up-to-date picture of * the CIB */ S_NOT_DC, /* we are in crmd/slave mode */ S_POLICY_ENGINE, /* Determin the next stable state of the cluster */ S_RECOVERY, /* Something bad happened, check everything is ok * before continuing and attempt to recover if * required */ S_RELEASE_DC, /* we were the DC, but now we arent anymore, * possibly by our own request, and we should * release all unnecessary sub-systems, finish * any pending actions, do general cleanup and * unset anything that makes us think we are * special :) */ S_STARTING, /* we are just starting out */ S_PENDING, /* we are not a full/active member yet */ S_STOPPING, /* We are in the final stages of shutting down */ S_TERMINATE, /* We are going to shutdown, this is the equiv of * "Sending TERM signal to all processes" in Linux * and in worst case scenarios could be considered * a self STONITH */ S_TRANSITION_ENGINE, /* Attempt to make the calculated next stable * state of the cluster a reality */ S_HALT, /* Freeze - dont do anything * Something ad happened that needs the admin to fix * Wait for I_ELECTION */ /* ----------- Last input found in table is above ---------- */ S_ILLEGAL /* This is an illegal FSA state */ /* (must be last) */ }; # define MAXSTATE S_ILLEGAL /* A state diagram can be constructed from the dc_fsa.dot with the following command: dot -Tpng crmd_fsa.dot > crmd_fsa.png Description: Once we start and do some basic sanity checks, we go into the S_NOT_DC state and await instructions from the DC or input from the CCM which indicates the election algorithm needs to run. If the election algorithm is triggered we enter the S_ELECTION state from where we can either go back to the S_NOT_DC state or progress to the S_INTEGRATION state (or S_RELEASE_DC if we used to be the DC but arent anymore). The election algorithm has been adapted from http://www.cs.indiana.edu/cgi-bin/techreports/TRNNN.cgi?trnum=TR521 Loosly known as the Bully Algorithm, its major points are: - Election is initiated by any node (N) notices that the coordinator is no longer responding - Concurrent multiple elections are possible - Algorithm + N sends ELECTION messages to all nodes that occur earlier in the CCM's membership list. + If no one responds, N wins and becomes coordinator + N sends out COORDINATOR messages to all other nodes in the partition + If one of higher-ups answers, it takes over. N is done. Once the election is complete, if we are the DC, we enter the S_INTEGRATION state which is a DC-in-waiting style state. We are the DC, but we shouldnt do anything yet because we may not have an up-to-date picture of the cluster. There may of course be times when this fails, so we should go back to the S_RECOVERY stage and check everything is ok. We may also end up here if a new node came online, since each node is authorative on itself and we would want to incorporate its information into the CIB. Once we have the latest CIB, we then enter the S_POLICY_ENGINE state where invoke the Policy Engine. It is possible that between invoking the Policy Engine and recieving an answer, that we recieve more input. In this case we would discard the orginal result and invoke it again. Once we are satisfied with the output from the Policy Engine we enter S_TRANSITION_ENGINE and feed the Policy Engine's output to the Transition Engine who attempts to make the Policy Engine's calculation a reality. If the transition completes successfully, we enter S_IDLE, otherwise we go back to S_POLICY_ENGINE with the current unstable state and try again. Of course we may be asked to shutdown at any time, however we must progress to S_NOT_DC before doing so. Once we have handed over DC duties to another node, we can then shut down like everyone else, that is by asking the DC for permission and waiting it to take all our resources away. The case where we are the DC and the only node in the cluster is a special case and handled as an escalation which takes us to S_SHUTDOWN. Similarly if any other point in the shutdown fails or stalls, this is escalated and we end up in S_TERMINATE. At any point, the CRMd/DC can relay messages for its sub-systems, but outbound messages (from sub-systems) should probably be blocked until S_INTEGRATION (for the DC case) or the join protocol has completed (for the CRMd case) */ /*====================================== * * Inputs/Events/Stimuli to be given to the finite state machine * * Some of these a true events, and others a synthesised based on * the "register" (see below) and the contents or source of messages. * * At this point, my plan is to have a loop of some sort that keeps * going until recieving I_NULL * *======================================*/ enum crmd_fsa_input { /* 0 */ I_NULL, /* Nothing happened */ /* 1 */ I_CIB_OP, /* An update to the CIB occurred */ I_CIB_UPDATE, /* An update to the CIB occurred */ I_DC_TIMEOUT, /* We have lost communication with the DC */ I_ELECTION, /* Someone started an election */ I_PE_CALC, /* The Policy Engine needs to be invoked */ I_RELEASE_DC, /* The election completed and we were not * elected, but we were the DC beforehand */ I_ELECTION_DC, /* The election completed and we were (re-)elected * DC */ I_ERROR, /* Something bad happened (more serious than * I_FAIL) and may not have been due to the action * being performed. For example, we may have lost * our connection to the CIB. */ /* 9 */ I_FAIL, /* The action failed to complete successfully */ I_INTEGRATED, I_FINALIZED, I_NODE_JOIN, /* A node has entered the cluster */ I_NOT_DC, /* We are not and were not the DC before or after * the current operation or state */ I_RECOVERED, /* The recovery process completed successfully */ I_RELEASE_FAIL, /* We could not give up DC status for some reason */ I_RELEASE_SUCCESS, /* We are no longer the DC */ I_RESTART, /* The current set of actions needs to be * restarted */ I_TE_SUCCESS, /* Some non-resource, non-ccm action is required * of us, eg. ping */ /* 20 */ I_ROUTER, /* Do our job as router and forward this to the * right place */ I_SHUTDOWN, /* We are asking to shutdown */ I_STOP, /* We have been told to shutdown */ I_TERMINATE, /* Actually exit */ I_STARTUP, I_PE_SUCCESS, /* The action completed successfully */ I_JOIN_OFFER, /* The DC is offering membership */ I_JOIN_REQUEST, /* The client is requesting membership */ I_JOIN_RESULT, /* If not the DC: The result of a join request * Else: A client is responding with its local state info */ I_WAIT_FOR_EVENT, /* we may be waiting for an async task to "happen" * and until it does, we cant do anything else */ I_DC_HEARTBEAT, /* The DC is telling us that it is alive and well */ I_LRM_EVENT, /* 30 */ I_PENDING, I_HALT, /* ------------ Last input found in table is above ----------- */ I_ILLEGAL /* This is an illegal value for an FSA input */ /* (must be last) */ }; # define MAXINPUT I_ILLEGAL # define I_MESSAGE I_ROUTER /*====================================== * * actions * * Some of the actions below will always occur together for now, but I can * forsee that this may not always be the case. So I've spilt them up so * that if they ever do need to be called independantly in the future, it * wont be a problem. * * For example, separating A_LRM_CONNECT from A_STARTUP might be useful * if we ever try to recover from a faulty or disconnected LRM. * *======================================*/ /* Dont do anything */ # define A_NOTHING 0x0000000000000000ULL /* -- Startup actions -- */ /* Hook to perform any actions (other than starting the CIB, * connecting to HA or the CCM) that might be needed as part * of the startup. */ # define A_STARTUP 0x0000000000000001ULL /* Hook to perform any actions that might be needed as part * after startup is successful. */ # define A_STARTED 0x0000000000000002ULL /* Connect to Heartbeat */ # define A_HA_CONNECT 0x0000000000000004ULL # define A_HA_DISCONNECT 0x0000000000000008ULL # define A_INTEGRATE_TIMER_START 0x0000000000000010ULL # define A_INTEGRATE_TIMER_STOP 0x0000000000000020ULL # define A_FINALIZE_TIMER_START 0x0000000000000040ULL # define A_FINALIZE_TIMER_STOP 0x0000000000000080ULL /* -- Election actions -- */ # define A_DC_TIMER_START 0x0000000000000100ULL # define A_DC_TIMER_STOP 0x0000000000000200ULL # define A_ELECTION_COUNT 0x0000000000000400ULL # define A_ELECTION_VOTE 0x0000000000000800ULL # define A_ELECTION_START 0x0000000000001000ULL /* -- Message processing -- */ /* Process the queue of requests */ # define A_MSG_PROCESS 0x0000000000002000ULL /* Send the message to the correct recipient */ # define A_MSG_ROUTE 0x0000000000004000ULL /* Send a welcome message to new node(s) */ # define A_DC_JOIN_OFFER_ONE 0x0000000000008000ULL /* -- Server Join protocol actions -- */ /* Send a welcome message to all nodes */ # define A_DC_JOIN_OFFER_ALL 0x0000000000010000ULL /* Process the remote node's ack of our join message */ # define A_DC_JOIN_PROCESS_REQ 0x0000000000020000ULL /* Send out the reults of the Join phase */ # define A_DC_JOIN_FINALIZE 0x0000000000040000ULL /* Send out the reults of the Join phase */ # define A_DC_JOIN_PROCESS_ACK 0x0000000000080000ULL /* -- Client Join protocol actions -- */ # define A_CL_JOIN_QUERY 0x0000000000100000ULL # define A_CL_JOIN_ANNOUNCE 0x0000000000200000ULL /* Request membership to the DC list */ # define A_CL_JOIN_REQUEST 0x0000000000400000ULL /* Did the DC accept or reject the request */ # define A_CL_JOIN_RESULT 0x0000000000800000ULL /* -- Recovery, DC start/stop -- */ /* Something bad happened, try to recover */ # define A_RECOVER 0x0000000001000000ULL /* Hook to perform any actions (apart from starting, the TE, PE * and gathering the latest CIB) that might be necessary before * giving up the responsibilities of being the DC. */ # define A_DC_RELEASE 0x0000000002000000ULL /* */ # define A_DC_RELEASED 0x0000000004000000ULL /* Hook to perform any actions (apart from starting, the TE, PE * and gathering the latest CIB) that might be necessary before * taking over the responsibilities of being the DC. */ # define A_DC_TAKEOVER 0x0000000008000000ULL /* -- Shutdown actions -- */ # define A_SHUTDOWN 0x0000000010000000ULL # define A_STOP 0x0000000020000000ULL # define A_EXIT_0 0x0000000040000000ULL # define A_EXIT_1 0x0000000080000000ULL # define A_SHUTDOWN_REQ 0x0000000100000000ULL # define A_ELECTION_CHECK 0x0000000200000000ULL # define A_DC_JOIN_FINAL 0x0000000400000000ULL /* -- CCM actions -- */ # define A_CCM_CONNECT 0x0000001000000000ULL # define A_CCM_DISCONNECT 0x0000002000000000ULL /* -- CIB actions -- */ # define A_CIB_START 0x0000020000000000ULL # define A_CIB_STOP 0x0000040000000000ULL /* -- Transition Engine actions -- */ /* Attempt to reach the newly calculated cluster state. This is * only called once per transition (except if it is asked to * stop the transition or start a new one). * Once given a cluster state to reach, the TE will determin * tasks that can be performed in parallel, execute them, wait * for replies and then determin the next set until the new * state is reached or no further tasks can be taken. */ # define A_TE_INVOKE 0x0000100000000000ULL # define A_TE_START 0x0000200000000000ULL # define A_TE_STOP 0x0000400000000000ULL # define A_TE_CANCEL 0x0000800000000000ULL # define A_TE_HALT 0x0001000000000000ULL /* -- Policy Engine actions -- */ /* Calculate the next state for the cluster. This is only * invoked once per needed calculation. */ # define A_PE_INVOKE 0x0002000000000000ULL # define A_PE_START 0x0004000000000000ULL # define A_PE_STOP 0x0008000000000000ULL /* -- Misc actions -- */ /* Add a system generate "block" so that resources arent moved * to or are activly moved away from the affected node. This * way we can return quickly even if busy with other things. */ # define A_NODE_BLOCK 0x0010000000000000ULL /* Update our information in the local CIB */ # define A_UPDATE_NODESTATUS 0x0020000000000000ULL # define A_CIB_BUMPGEN 0x0040000000000000ULL # define A_READCONFIG 0x0080000000000000ULL /* -- LRM Actions -- */ /* Connect to the Local Resource Manager */ # define A_LRM_CONNECT 0x0100000000000000ULL /* Disconnect from the Local Resource Manager */ # define A_LRM_DISCONNECT 0x0200000000000000ULL # define A_LRM_INVOKE 0x0400000000000000ULL # define A_LRM_EVENT 0x0800000000000000ULL /* -- Logging actions -- */ # define A_LOG 0x1000000000000000ULL # define A_ERROR 0x2000000000000000ULL # define A_WARN 0x4000000000000000ULL # define O_EXIT (A_SHUTDOWN|A_STOP|A_CCM_DISCONNECT|A_LRM_DISCONNECT|A_HA_DISCONNECT|A_EXIT_0|A_CIB_STOP) # define O_RELEASE (A_DC_TIMER_STOP|A_DC_RELEASE|A_PE_STOP|A_TE_STOP|A_DC_RELEASED) # define O_PE_RESTART (A_PE_START|A_PE_STOP) # define O_TE_RESTART (A_TE_START|A_TE_STOP) # define O_CIB_RESTART (A_CIB_START|A_CIB_STOP) # define O_DC_TIMER_RESTART (A_DC_TIMER_STOP|A_DC_TIMER_START) /*====================================== * * "register" contents * * Things we may want to remember regardless of which state we are in. * * These also count as inputs for synthesizing I_* * *======================================*/ # define R_THE_DC 0x00000001ULL /* Are we the DC? */ # define R_STARTING 0x00000002ULL /* Are we starting up? */ # define R_SHUTDOWN 0x00000004ULL /* Are we trying to shut down? */ # define R_STAYDOWN 0x00000008ULL /* Should we restart? */ # define R_JOIN_OK 0x00000010ULL /* Have we completed the join process */ # define R_READ_CONFIG 0x00000040ULL # define R_INVOKE_PE 0x00000080ULL /* Does the PE needed to be invoked at the next appropriate point? */ # define R_CIB_CONNECTED 0x00000100ULL /* Is the CIB connected? */ # define R_PE_CONNECTED 0x00000200ULL /* Is the Policy Engine connected? */ # define R_TE_CONNECTED 0x00000400ULL /* Is the Transition Engine connected? */ # define R_LRM_CONNECTED 0x00000800ULL /* Is the Local Resource Manager connected? */ # define R_CIB_REQUIRED 0x00001000ULL /* Is the CIB required? */ # define R_PE_REQUIRED 0x00002000ULL /* Is the Policy Engine required? */ # define R_TE_REQUIRED 0x00004000ULL /* Is the Transition Engine required? */ # define R_ST_REQUIRED 0x00008000ULL /* Is the Stonith daemon required? */ # define R_CIB_DONE 0x00010000ULL /* Have we calculated the CIB? */ # define R_HAVE_CIB 0x00020000ULL /* Do we have an up-to-date CIB */ # define R_CIB_ASKED 0x00040000ULL /* Have we asked for an up-to-date CIB */ -# define R_CCM_DATA 0x00100000ULL /* Have we got CCM data yet */ +# define R_MEMBERSHIP 0x00100000ULL /* Have we got CCM data yet */ # define R_PEER_DATA 0x00200000ULL /* Have we got T_CL_STATUS data yet */ # define R_HA_DISCONNECTED 0x00400000ULL /* did we sign out of our own accord */ # define R_CCM_DISCONNECTED 0x00800000ULL /* did we sign out of our own accord */ # define R_REQ_PEND 0x01000000ULL /* Are there Requests waiting for processing? */ # define R_PE_PEND 0x02000000ULL /* Has the PE been invoked and we're awaiting a reply? */ # define R_TE_PEND 0x04000000ULL /* Has the TE been invoked and we're awaiting completion? */ # define R_RESP_PEND 0x08000000ULL /* Do we have clients waiting on a response? if so perhaps we shouldnt stop yet */ # define R_IN_TRANSITION 0x10000000ULL /* */ # define R_SENT_RSC_STOP 0x20000000ULL /* Have we sent a stop action to all * resources in preparation for * shutting down */ # define R_IN_RECOVERY 0x80000000ULL enum crmd_fsa_cause { C_UNKNOWN = 0, C_STARTUP, C_IPC_MESSAGE, C_HA_MESSAGE, C_CCM_CALLBACK, C_CRMD_STATUS_CALLBACK, C_LRM_OP_CALLBACK, C_LRM_MONITOR_CALLBACK, C_TIMER_POPPED, C_SHUTDOWN, C_HEARTBEAT_FAILED, C_SUBSYSTEM_CONNECT, C_HA_DISCONNECT, C_FSA_INTERNAL, C_ILLEGAL }; extern const char *fsa_input2string(enum crmd_fsa_input input); extern const char *fsa_state2string(enum crmd_fsa_state state); extern const char *fsa_cause2string(enum crmd_fsa_cause cause); extern const char *fsa_action2string(long long action); #endif diff --git a/crmd/membership.c b/crmd/membership.c new file mode 100644 index 0000000000..f5e1389fbf --- /dev/null +++ b/crmd/membership.c @@ -0,0 +1,253 @@ +/* + * Copyright (C) 2004 Andrew Beekhof + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* put these first so that uuid_t is defined without conflicts */ +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +gboolean membership_flux_hack = FALSE; +void post_cache_update(int instance); + +void ghash_update_cib_node(gpointer key, gpointer value, gpointer user_data); +void check_dead_member(const char *uname, GHashTable * members); + +int last_peer_update = 0; + +extern GHashTable *voted; + +struct update_data_s { + const char *state; + const char *caller; + xmlNode *updates; + gboolean overwrite_join; +}; + +static void +reap_dead_nodes(gpointer key, gpointer value, gpointer user_data) +{ + crm_node_t *node = value; + + if (crm_is_member_active(node) == FALSE) { + check_dead_member(node->uname, NULL); + fail_incompletable_actions(transition_graph, node->uuid); + } +} + +extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); + +void +check_dead_member(const char *uname, GHashTable * members) +{ + CRM_CHECK(uname != NULL, return); + if (members != NULL && g_hash_table_lookup(members, uname) != NULL) { + crm_err("%s didnt really leave the membership!", uname); + return; + } + erase_node_from_join(uname); + if (voted != NULL) { + g_hash_table_remove(voted, uname); + } + + if (safe_str_eq(fsa_our_uname, uname)) { + crm_err("We're not part of the cluster anymore"); + register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); + + } else if (AM_I_DC == FALSE && safe_str_eq(uname, fsa_our_dc)) { + crm_warn("Our DC node (%s) left the cluster", uname); + register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); + + } else if (fsa_state == S_INTEGRATION || fsa_state == S_FINALIZE_JOIN) { + check_join_state(fsa_state, __FUNCTION__); + } +} + + +gboolean ever_had_quorum = FALSE; + +void +post_cache_update(int instance) +{ + xmlNode *no_op = NULL; + + crm_peer_seq = instance; + crm_debug("Updated cache after membership event %d.", instance); + + g_hash_table_foreach(crm_peer_cache, reap_dead_nodes, NULL); + set_bit_inplace(fsa_input_register, R_MEMBERSHIP); + + if (AM_I_DC) { + populate_cib_nodes(FALSE); + do_update_cib_nodes(FALSE, __FUNCTION__); + } + + /* + * If we lost nodes, we should re-check the election status + * Safe to call outside of an election + */ + register_fsa_action(A_ELECTION_CHECK); + + /* Membership changed, remind everyone we're here. + * This will aid detection of duplicate DCs + */ + no_op = create_request(CRM_OP_NOOP, NULL, NULL, CRM_SYSTEM_CRMD, + AM_I_DC ? CRM_SYSTEM_DC : CRM_SYSTEM_CRMD, NULL); + send_cluster_message(NULL, crm_msg_crmd, no_op, FALSE); + free_xml(no_op); +} + +static void +crmd_node_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) +{ + fsa_data_t *msg_data = NULL; + + last_peer_update = 0; + + if (rc == cib_ok) { + crm_trace("Node update %d complete", call_id); + + } else { + crm_err("Node update %d failed", call_id); + crm_log_xml_debug(msg, "failed"); + register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); + } +} + +void +ghash_update_cib_node(gpointer key, gpointer value, gpointer user_data) +{ + xmlNode *tmp1 = NULL; + const char *join = NULL; + crm_node_t *node = value; + struct update_data_s *data = (struct update_data_s *)user_data; + + data->state = XML_BOOLEAN_NO; + if (safe_str_eq(node->state, CRM_NODE_ACTIVE)) { + data->state = XML_BOOLEAN_YES; + } + + crm_debug("Updating %s: %s (overwrite=%s) hash_size=%d", + node->uname, data->state, data->overwrite_join ? "true" : "false", + g_hash_table_size(confirmed_nodes)); + + if (data->overwrite_join) { + if ((node->processes & crm_proc_crmd) == FALSE) { + join = CRMD_JOINSTATE_DOWN; + + } else { + const char *peer_member = g_hash_table_lookup(confirmed_nodes, node->uname); + + if (peer_member != NULL) { + join = CRMD_JOINSTATE_MEMBER; + } else { + join = CRMD_JOINSTATE_PENDING; + } + } + } + + tmp1 = + create_node_state(node->uname, (node->processes & crm_proc_ais) ? ACTIVESTATUS : DEADSTATUS, + data->state, + (node->processes & crm_proc_crmd) ? ONLINESTATUS : OFFLINESTATUS, join, + NULL, FALSE, data->caller); + + add_node_copy(data->updates, tmp1); + free_xml(tmp1); +} + +void +do_update_cib_nodes(gboolean overwrite, const char *caller) +{ + int call_id = 0; + int call_options = cib_scope_local | cib_quorum_override; + struct update_data_s update_data; + xmlNode *fragment = NULL; + + if (crm_peer_cache == NULL) { + /* We got a replace notification before being connected to + * the CCM. + * So there is no need to update the local CIB with our values + * - since we have none. + */ + return; + + } else if (AM_I_DC == FALSE) { + return; + } + + fragment = create_xml_node(NULL, XML_CIB_TAG_STATUS); + + update_data.caller = caller; + update_data.updates = fragment; + update_data.overwrite_join = overwrite; + + g_hash_table_foreach(crm_peer_cache, ghash_update_cib_node, &update_data); + + fsa_cib_update(XML_CIB_TAG_STATUS, fragment, call_options, call_id, NULL); + add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, crmd_node_update_complete); + last_peer_update = call_id; + + free_xml(fragment); +} + +static void +cib_quorum_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) +{ + fsa_data_t *msg_data = NULL; + + if (rc == cib_ok) { + crm_trace("Quorum update %d complete", call_id); + + } else { + crm_err("Quorum update %d failed", call_id); + crm_log_xml_debug(msg, "failed"); + register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); + } +} + +void +crm_update_quorum(gboolean quorum, gboolean force_update) +{ + ever_had_quorum |= quorum; + if (AM_I_DC && (force_update || fsa_has_quorum != quorum)) { + int call_id = 0; + xmlNode *update = NULL; + int call_options = cib_scope_local | cib_quorum_override; + + update = create_xml_node(NULL, XML_TAG_CIB); + crm_xml_add_int(update, XML_ATTR_HAVE_QUORUM, quorum); + set_uuid(update, XML_ATTR_DC_UUID, fsa_our_uname); + + fsa_cib_update(XML_TAG_CIB, update, call_options, call_id, NULL); + crm_debug("Updating quorum status to %s (call=%d)", quorum ? "true" : "false", call_id); + add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, cib_quorum_update_complete); + free_xml(update); + } + fsa_has_quorum = quorum; +} diff --git a/crmd/utils.c b/crmd/utils.c index 5eb4d3cbee..92dac8d53d 100644 --- a/crmd/utils.c +++ b/crmd/utils.c @@ -1,1208 +1,1208 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include <../tools/attrd.h> /* A_DC_TIMER_STOP, A_DC_TIMER_START, * A_FINALIZE_TIMER_STOP, A_FINALIZE_TIMER_START * A_INTEGRATE_TIMER_STOP, A_INTEGRATE_TIMER_START */ void do_timer_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean timer_op_ok = TRUE; if (action & A_DC_TIMER_STOP) { timer_op_ok = crm_timer_stop(election_trigger); } else if (action & A_FINALIZE_TIMER_STOP) { timer_op_ok = crm_timer_stop(finalization_timer); } else if (action & A_INTEGRATE_TIMER_STOP) { timer_op_ok = crm_timer_stop(integration_timer); /* } else if(action & A_ELECTION_TIMEOUT_STOP) { */ /* timer_op_ok = crm_timer_stop(election_timeout); */ } /* dont start a timer that wasnt already running */ if (action & A_DC_TIMER_START && timer_op_ok) { crm_timer_start(election_trigger); if (AM_I_DC) { /* there can be only one */ register_fsa_input(cause, I_ELECTION, NULL); } } else if (action & A_FINALIZE_TIMER_START) { crm_timer_start(finalization_timer); } else if (action & A_INTEGRATE_TIMER_START) { crm_timer_start(integration_timer); /* } else if(action & A_ELECTION_TIMEOUT_START) { */ /* crm_timer_start(election_timeout); */ } } const char * get_timer_desc(fsa_timer_t * timer) { if (timer == election_trigger) { return "Election Trigger"; } else if (timer == election_timeout) { return "Election Timeout"; } else if (timer == shutdown_escalation_timer) { return "Shutdown Escalation"; } else if (timer == integration_timer) { return "Integration Timer"; } else if (timer == finalization_timer) { return "Finalization Timer"; } else if (timer == transition_timer) { return "New Transition Timer"; } else if (timer == wait_timer) { return "Wait Timer"; } else if (timer == recheck_timer) { return "PEngine Recheck Timer"; } return "Unknown Timer"; } gboolean crm_timer_popped(gpointer data) { fsa_timer_t *timer = (fsa_timer_t *) data; if (timer == wait_timer || timer == recheck_timer || timer == transition_timer || timer == finalization_timer || timer == election_trigger) { crm_info("%s (%s) just popped (%dms)", get_timer_desc(timer), fsa_input2string(timer->fsa_input), timer->period_ms); } else { crm_err("%s (%s) just popped in state %s! (%dms)", get_timer_desc(timer), fsa_input2string(timer->fsa_input), fsa_state2string(fsa_state), timer->period_ms); } if (timer->repeat == FALSE) { crm_timer_stop(timer); /* make it _not_ go off again */ } if (timer->fsa_input == I_INTEGRATED) { crm_info("Welcomed: %d, Integrated: %d", g_hash_table_size(welcomed_nodes), g_hash_table_size(integrated_nodes)); if (g_hash_table_size(welcomed_nodes) == 0) { /* If we don't even have ourself, start again */ register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL, NULL, __FUNCTION__); } else { register_fsa_input_before(C_TIMER_POPPED, timer->fsa_input, NULL); } } else if (timer == recheck_timer && fsa_state != S_IDLE) { crm_debug("Discarding %s event in state: %s", fsa_input2string(timer->fsa_input), fsa_state2string(fsa_state)); } else if (timer == finalization_timer && fsa_state != S_FINALIZE_JOIN) { crm_debug("Discarding %s event in state: %s", fsa_input2string(timer->fsa_input), fsa_state2string(fsa_state)); } else if (timer->fsa_input != I_NULL) { register_fsa_input(C_TIMER_POPPED, timer->fsa_input, NULL); } crm_trace("Triggering FSA: %s", __FUNCTION__); mainloop_set_trigger(fsa_source); return TRUE; } gboolean crm_timer_start(fsa_timer_t * timer) { const char *timer_desc = get_timer_desc(timer); if (timer->source_id == 0 && timer->period_ms > 0) { timer->source_id = g_timeout_add(timer->period_ms, timer->callback, (void *)timer); CRM_ASSERT(timer->source_id != 0); crm_debug("Started %s (%s:%dms), src=%d", timer_desc, fsa_input2string(timer->fsa_input), timer->period_ms, timer->source_id); } else if (timer->period_ms < 0) { crm_err("Tried to start %s (%s:%dms) with a -ve period", timer_desc, fsa_input2string(timer->fsa_input), timer->period_ms); } else { crm_debug("%s (%s:%dms) already running: src=%d", timer_desc, fsa_input2string(timer->fsa_input), timer->period_ms, timer->source_id); return FALSE; } return TRUE; } gboolean crm_timer_stop(fsa_timer_t * timer) { const char *timer_desc = get_timer_desc(timer); if (timer == NULL) { crm_err("Attempted to stop NULL timer"); return FALSE; } else if (timer->source_id != 0) { crm_trace("Stopping %s (%s:%dms), src=%d", timer_desc, fsa_input2string(timer->fsa_input), timer->period_ms, timer->source_id); g_source_remove(timer->source_id); timer->source_id = 0; } else { crm_trace("%s (%s:%dms) already stopped", timer_desc, fsa_input2string(timer->fsa_input), timer->period_ms); return FALSE; } return TRUE; } const char * fsa_input2string(enum crmd_fsa_input input) { const char *inputAsText = NULL; switch (input) { case I_NULL: inputAsText = "I_NULL"; break; case I_CIB_OP: inputAsText = "I_CIB_OP (unused)"; break; case I_CIB_UPDATE: inputAsText = "I_CIB_UPDATE"; break; case I_DC_TIMEOUT: inputAsText = "I_DC_TIMEOUT"; break; case I_ELECTION: inputAsText = "I_ELECTION"; break; case I_PE_CALC: inputAsText = "I_PE_CALC"; break; case I_RELEASE_DC: inputAsText = "I_RELEASE_DC"; break; case I_ELECTION_DC: inputAsText = "I_ELECTION_DC"; break; case I_ERROR: inputAsText = "I_ERROR"; break; case I_FAIL: inputAsText = "I_FAIL"; break; case I_INTEGRATED: inputAsText = "I_INTEGRATED"; break; case I_FINALIZED: inputAsText = "I_FINALIZED"; break; case I_NODE_JOIN: inputAsText = "I_NODE_JOIN"; break; case I_JOIN_OFFER: inputAsText = "I_JOIN_OFFER"; break; case I_JOIN_REQUEST: inputAsText = "I_JOIN_REQUEST"; break; case I_JOIN_RESULT: inputAsText = "I_JOIN_RESULT"; break; case I_NOT_DC: inputAsText = "I_NOT_DC"; break; case I_RECOVERED: inputAsText = "I_RECOVERED"; break; case I_RELEASE_FAIL: inputAsText = "I_RELEASE_FAIL"; break; case I_RELEASE_SUCCESS: inputAsText = "I_RELEASE_SUCCESS"; break; case I_RESTART: inputAsText = "I_RESTART"; break; case I_PE_SUCCESS: inputAsText = "I_PE_SUCCESS"; break; case I_ROUTER: inputAsText = "I_ROUTER"; break; case I_SHUTDOWN: inputAsText = "I_SHUTDOWN"; break; case I_STARTUP: inputAsText = "I_STARTUP"; break; case I_TE_SUCCESS: inputAsText = "I_TE_SUCCESS"; break; case I_STOP: inputAsText = "I_STOP"; break; case I_DC_HEARTBEAT: inputAsText = "I_DC_HEARTBEAT"; break; case I_WAIT_FOR_EVENT: inputAsText = "I_WAIT_FOR_EVENT"; break; case I_LRM_EVENT: inputAsText = "I_LRM_EVENT"; break; case I_PENDING: inputAsText = "I_PENDING"; break; case I_HALT: inputAsText = "I_HALT"; break; case I_TERMINATE: inputAsText = "I_TERMINATE"; break; case I_ILLEGAL: inputAsText = "I_ILLEGAL"; break; } if (inputAsText == NULL) { crm_err("Input %d is unknown", input); inputAsText = ""; } return inputAsText; } const char * fsa_state2string(enum crmd_fsa_state state) { const char *stateAsText = NULL; switch (state) { case S_IDLE: stateAsText = "S_IDLE"; break; case S_ELECTION: stateAsText = "S_ELECTION"; break; case S_INTEGRATION: stateAsText = "S_INTEGRATION"; break; case S_FINALIZE_JOIN: stateAsText = "S_FINALIZE_JOIN"; break; case S_NOT_DC: stateAsText = "S_NOT_DC"; break; case S_POLICY_ENGINE: stateAsText = "S_POLICY_ENGINE"; break; case S_RECOVERY: stateAsText = "S_RECOVERY"; break; case S_RELEASE_DC: stateAsText = "S_RELEASE_DC"; break; case S_PENDING: stateAsText = "S_PENDING"; break; case S_STOPPING: stateAsText = "S_STOPPING"; break; case S_TERMINATE: stateAsText = "S_TERMINATE"; break; case S_TRANSITION_ENGINE: stateAsText = "S_TRANSITION_ENGINE"; break; case S_STARTING: stateAsText = "S_STARTING"; break; case S_HALT: stateAsText = "S_HALT"; break; case S_ILLEGAL: stateAsText = "S_ILLEGAL"; break; } if (stateAsText == NULL) { crm_err("State %d is unknown", state); stateAsText = ""; } return stateAsText; } const char * fsa_cause2string(enum crmd_fsa_cause cause) { const char *causeAsText = NULL; switch (cause) { case C_UNKNOWN: causeAsText = "C_UNKNOWN"; break; case C_STARTUP: causeAsText = "C_STARTUP"; break; case C_IPC_MESSAGE: causeAsText = "C_IPC_MESSAGE"; break; case C_HA_MESSAGE: causeAsText = "C_HA_MESSAGE"; break; case C_CCM_CALLBACK: causeAsText = "C_CCM_CALLBACK"; break; case C_TIMER_POPPED: causeAsText = "C_TIMER_POPPED"; break; case C_SHUTDOWN: causeAsText = "C_SHUTDOWN"; break; case C_HEARTBEAT_FAILED: causeAsText = "C_HEARTBEAT_FAILED"; break; case C_SUBSYSTEM_CONNECT: causeAsText = "C_SUBSYSTEM_CONNECT"; break; case C_LRM_OP_CALLBACK: causeAsText = "C_LRM_OP_CALLBACK"; break; case C_LRM_MONITOR_CALLBACK: causeAsText = "C_LRM_MONITOR_CALLBACK"; break; case C_CRMD_STATUS_CALLBACK: causeAsText = "C_CRMD_STATUS_CALLBACK"; break; case C_HA_DISCONNECT: causeAsText = "C_HA_DISCONNECT"; break; case C_FSA_INTERNAL: causeAsText = "C_FSA_INTERNAL"; break; case C_ILLEGAL: causeAsText = "C_ILLEGAL"; break; } if (causeAsText == NULL) { crm_err("Cause %d is unknown", cause); causeAsText = ""; } return causeAsText; } const char * fsa_action2string(long long action) { const char *actionAsText = NULL; switch (action) { case A_NOTHING: actionAsText = "A_NOTHING"; break; case A_ELECTION_START: actionAsText = "A_ELECTION_START"; break; case A_DC_JOIN_FINAL: actionAsText = "A_DC_JOIN_FINAL"; break; case A_READCONFIG: actionAsText = "A_READCONFIG"; break; case O_RELEASE: actionAsText = "O_RELEASE"; break; case A_STARTUP: actionAsText = "A_STARTUP"; break; case A_STARTED: actionAsText = "A_STARTED"; break; case A_HA_CONNECT: actionAsText = "A_HA_CONNECT"; break; case A_HA_DISCONNECT: actionAsText = "A_HA_DISCONNECT"; break; case A_LRM_CONNECT: actionAsText = "A_LRM_CONNECT"; break; case A_LRM_EVENT: actionAsText = "A_LRM_EVENT"; break; case A_LRM_INVOKE: actionAsText = "A_LRM_INVOKE"; break; case A_LRM_DISCONNECT: actionAsText = "A_LRM_DISCONNECT"; break; case A_CL_JOIN_QUERY: actionAsText = "A_CL_JOIN_QUERY"; break; case A_DC_TIMER_STOP: actionAsText = "A_DC_TIMER_STOP"; break; case A_DC_TIMER_START: actionAsText = "A_DC_TIMER_START"; break; case A_INTEGRATE_TIMER_START: actionAsText = "A_INTEGRATE_TIMER_START"; break; case A_INTEGRATE_TIMER_STOP: actionAsText = "A_INTEGRATE_TIMER_STOP"; break; case A_FINALIZE_TIMER_START: actionAsText = "A_FINALIZE_TIMER_START"; break; case A_FINALIZE_TIMER_STOP: actionAsText = "A_FINALIZE_TIMER_STOP"; break; case A_ELECTION_COUNT: actionAsText = "A_ELECTION_COUNT"; break; case A_ELECTION_VOTE: actionAsText = "A_ELECTION_VOTE"; break; case A_ELECTION_CHECK: actionAsText = "A_ELECTION_CHECK"; break; case A_CL_JOIN_ANNOUNCE: actionAsText = "A_CL_JOIN_ANNOUNCE"; break; case A_CL_JOIN_REQUEST: actionAsText = "A_CL_JOIN_REQUEST"; break; case A_CL_JOIN_RESULT: actionAsText = "A_CL_JOIN_RESULT"; break; case A_DC_JOIN_OFFER_ALL: actionAsText = "A_DC_JOIN_OFFER_ALL"; break; case A_DC_JOIN_OFFER_ONE: actionAsText = "A_DC_JOIN_OFFER_ONE"; break; case A_DC_JOIN_PROCESS_REQ: actionAsText = "A_DC_JOIN_PROCESS_REQ"; break; case A_DC_JOIN_PROCESS_ACK: actionAsText = "A_DC_JOIN_PROCESS_ACK"; break; case A_DC_JOIN_FINALIZE: actionAsText = "A_DC_JOIN_FINALIZE"; break; case A_MSG_PROCESS: actionAsText = "A_MSG_PROCESS"; break; case A_MSG_ROUTE: actionAsText = "A_MSG_ROUTE"; break; case A_RECOVER: actionAsText = "A_RECOVER"; break; case A_DC_RELEASE: actionAsText = "A_DC_RELEASE"; break; case A_DC_RELEASED: actionAsText = "A_DC_RELEASED"; break; case A_DC_TAKEOVER: actionAsText = "A_DC_TAKEOVER"; break; case A_SHUTDOWN: actionAsText = "A_SHUTDOWN"; break; case A_SHUTDOWN_REQ: actionAsText = "A_SHUTDOWN_REQ"; break; case A_STOP: actionAsText = "A_STOP "; break; case A_EXIT_0: actionAsText = "A_EXIT_0"; break; case A_EXIT_1: actionAsText = "A_EXIT_1"; break; case A_CCM_CONNECT: actionAsText = "A_CCM_CONNECT"; break; case A_CCM_DISCONNECT: actionAsText = "A_CCM_DISCONNECT"; break; case O_CIB_RESTART: actionAsText = "O_CIB_RESTART"; break; case A_CIB_START: actionAsText = "A_CIB_START"; break; case A_CIB_STOP: actionAsText = "A_CIB_STOP"; break; case A_TE_INVOKE: actionAsText = "A_TE_INVOKE"; break; case O_TE_RESTART: actionAsText = "O_TE_RESTART"; break; case A_TE_START: actionAsText = "A_TE_START"; break; case A_TE_STOP: actionAsText = "A_TE_STOP"; break; case A_TE_HALT: actionAsText = "A_TE_HALT"; break; case A_TE_CANCEL: actionAsText = "A_TE_CANCEL"; break; case A_PE_INVOKE: actionAsText = "A_PE_INVOKE"; break; case O_PE_RESTART: actionAsText = "O_PE_RESTART"; break; case A_PE_START: actionAsText = "A_PE_START"; break; case A_PE_STOP: actionAsText = "A_PE_STOP"; break; case A_NODE_BLOCK: actionAsText = "A_NODE_BLOCK"; break; case A_UPDATE_NODESTATUS: actionAsText = "A_UPDATE_NODESTATUS"; break; case A_LOG: actionAsText = "A_LOG "; break; case A_ERROR: actionAsText = "A_ERROR "; break; case A_WARN: actionAsText = "A_WARN "; break; /* Composite actions */ case A_DC_TIMER_START | A_CL_JOIN_QUERY: actionAsText = "A_DC_TIMER_START|A_CL_JOIN_QUERY"; break; } if (actionAsText == NULL) { crm_err("Action %.16llx is unknown", action); actionAsText = ""; } return actionAsText; } void fsa_dump_inputs(int log_level, const char *text, long long input_register) { if (input_register == A_NOTHING) { return; } if (text == NULL) { text = "Input register contents:"; } if (is_set(input_register, R_THE_DC)) { crm_trace( "%s %.16llx (R_THE_DC)", text, R_THE_DC); } if (is_set(input_register, R_STARTING)) { crm_trace( "%s %.16llx (R_STARTING)", text, R_STARTING); } if (is_set(input_register, R_SHUTDOWN)) { crm_trace( "%s %.16llx (R_SHUTDOWN)", text, R_SHUTDOWN); } if (is_set(input_register, R_STAYDOWN)) { crm_trace( "%s %.16llx (R_STAYDOWN)", text, R_STAYDOWN); } if (is_set(input_register, R_JOIN_OK)) { crm_trace( "%s %.16llx (R_JOIN_OK)", text, R_JOIN_OK); } if (is_set(input_register, R_READ_CONFIG)) { crm_trace( "%s %.16llx (R_READ_CONFIG)", text, R_READ_CONFIG); } if (is_set(input_register, R_INVOKE_PE)) { crm_trace( "%s %.16llx (R_INVOKE_PE)", text, R_INVOKE_PE); } if (is_set(input_register, R_CIB_CONNECTED)) { crm_trace( "%s %.16llx (R_CIB_CONNECTED)", text, R_CIB_CONNECTED); } if (is_set(input_register, R_PE_CONNECTED)) { crm_trace( "%s %.16llx (R_PE_CONNECTED)", text, R_PE_CONNECTED); } if (is_set(input_register, R_TE_CONNECTED)) { crm_trace( "%s %.16llx (R_TE_CONNECTED)", text, R_TE_CONNECTED); } if (is_set(input_register, R_LRM_CONNECTED)) { crm_trace( "%s %.16llx (R_LRM_CONNECTED)", text, R_LRM_CONNECTED); } if (is_set(input_register, R_CIB_REQUIRED)) { crm_trace( "%s %.16llx (R_CIB_REQUIRED)", text, R_CIB_REQUIRED); } if (is_set(input_register, R_PE_REQUIRED)) { crm_trace( "%s %.16llx (R_PE_REQUIRED)", text, R_PE_REQUIRED); } if (is_set(input_register, R_TE_REQUIRED)) { crm_trace( "%s %.16llx (R_TE_REQUIRED)", text, R_TE_REQUIRED); } if (is_set(input_register, R_REQ_PEND)) { crm_trace( "%s %.16llx (R_REQ_PEND)", text, R_REQ_PEND); } if (is_set(input_register, R_PE_PEND)) { crm_trace( "%s %.16llx (R_PE_PEND)", text, R_PE_PEND); } if (is_set(input_register, R_TE_PEND)) { crm_trace( "%s %.16llx (R_TE_PEND)", text, R_TE_PEND); } if (is_set(input_register, R_RESP_PEND)) { crm_trace( "%s %.16llx (R_RESP_PEND)", text, R_RESP_PEND); } if (is_set(input_register, R_CIB_DONE)) { crm_trace( "%s %.16llx (R_CIB_DONE)", text, R_CIB_DONE); } if (is_set(input_register, R_HAVE_CIB)) { crm_trace( "%s %.16llx (R_HAVE_CIB)", text, R_HAVE_CIB); } if (is_set(input_register, R_CIB_ASKED)) { crm_trace( "%s %.16llx (R_CIB_ASKED)", text, R_CIB_ASKED); } - if (is_set(input_register, R_CCM_DATA)) { - crm_trace( "%s %.16llx (R_CCM_DATA)", text, R_CCM_DATA); + if (is_set(input_register, R_MEMBERSHIP)) { + crm_trace( "%s %.16llx (R_MEMBERSHIP)", text, R_MEMBERSHIP); } if (is_set(input_register, R_PEER_DATA)) { crm_trace( "%s %.16llx (R_PEER_DATA)", text, R_PEER_DATA); } if (is_set(input_register, R_IN_RECOVERY)) { crm_trace( "%s %.16llx (R_IN_RECOVERY)", text, R_IN_RECOVERY); } } void fsa_dump_actions(long long action, const char *text) { if (is_set(action, A_READCONFIG)) { crm_trace( "Action %.16llx (A_READCONFIG) %s", A_READCONFIG, text); } if (is_set(action, A_STARTUP)) { crm_trace( "Action %.16llx (A_STARTUP) %s", A_STARTUP, text); } if (is_set(action, A_STARTED)) { crm_trace( "Action %.16llx (A_STARTED) %s", A_STARTED, text); } if (is_set(action, A_HA_CONNECT)) { crm_trace( "Action %.16llx (A_CONNECT) %s", A_HA_CONNECT, text); } if (is_set(action, A_HA_DISCONNECT)) { crm_trace( "Action %.16llx (A_DISCONNECT) %s", A_HA_DISCONNECT, text); } if (is_set(action, A_LRM_CONNECT)) { crm_trace( "Action %.16llx (A_LRM_CONNECT) %s", A_LRM_CONNECT, text); } if (is_set(action, A_LRM_EVENT)) { crm_trace( "Action %.16llx (A_LRM_EVENT) %s", A_LRM_EVENT, text); } if (is_set(action, A_LRM_INVOKE)) { crm_trace( "Action %.16llx (A_LRM_INVOKE) %s", A_LRM_INVOKE, text); } if (is_set(action, A_LRM_DISCONNECT)) { crm_trace( "Action %.16llx (A_LRM_DISCONNECT) %s", A_LRM_DISCONNECT, text); } if (is_set(action, A_DC_TIMER_STOP)) { crm_trace( "Action %.16llx (A_DC_TIMER_STOP) %s", A_DC_TIMER_STOP, text); } if (is_set(action, A_DC_TIMER_START)) { crm_trace( "Action %.16llx (A_DC_TIMER_START) %s", A_DC_TIMER_START, text); } if (is_set(action, A_INTEGRATE_TIMER_START)) { crm_trace( "Action %.16llx (A_INTEGRATE_TIMER_START) %s", A_INTEGRATE_TIMER_START, text); } if (is_set(action, A_INTEGRATE_TIMER_STOP)) { crm_trace( "Action %.16llx (A_INTEGRATE_TIMER_STOP) %s", A_INTEGRATE_TIMER_STOP, text); } if (is_set(action, A_FINALIZE_TIMER_START)) { crm_trace( "Action %.16llx (A_FINALIZE_TIMER_START) %s", A_FINALIZE_TIMER_START, text); } if (is_set(action, A_FINALIZE_TIMER_STOP)) { crm_trace( "Action %.16llx (A_FINALIZE_TIMER_STOP) %s", A_FINALIZE_TIMER_STOP, text); } if (is_set(action, A_ELECTION_COUNT)) { crm_trace( "Action %.16llx (A_ELECTION_COUNT) %s", A_ELECTION_COUNT, text); } if (is_set(action, A_ELECTION_VOTE)) { crm_trace( "Action %.16llx (A_ELECTION_VOTE) %s", A_ELECTION_VOTE, text); } if (is_set(action, A_ELECTION_CHECK)) { crm_trace( "Action %.16llx (A_ELECTION_CHECK) %s", A_ELECTION_CHECK, text); } if (is_set(action, A_CL_JOIN_ANNOUNCE)) { crm_trace( "Action %.16llx (A_CL_JOIN_ANNOUNCE) %s", A_CL_JOIN_ANNOUNCE, text); } if (is_set(action, A_CL_JOIN_REQUEST)) { crm_trace( "Action %.16llx (A_CL_JOIN_REQUEST) %s", A_CL_JOIN_REQUEST, text); } if (is_set(action, A_CL_JOIN_RESULT)) { crm_trace( "Action %.16llx (A_CL_JOIN_RESULT) %s", A_CL_JOIN_RESULT, text); } if (is_set(action, A_DC_JOIN_OFFER_ALL)) { crm_trace( "Action %.16llx (A_DC_JOIN_OFFER_ALL) %s", A_DC_JOIN_OFFER_ALL, text); } if (is_set(action, A_DC_JOIN_OFFER_ONE)) { crm_trace( "Action %.16llx (A_DC_JOIN_OFFER_ONE) %s", A_DC_JOIN_OFFER_ONE, text); } if (is_set(action, A_DC_JOIN_PROCESS_REQ)) { crm_trace( "Action %.16llx (A_DC_JOIN_PROCESS_REQ) %s", A_DC_JOIN_PROCESS_REQ, text); } if (is_set(action, A_DC_JOIN_PROCESS_ACK)) { crm_trace( "Action %.16llx (A_DC_JOIN_PROCESS_ACK) %s", A_DC_JOIN_PROCESS_ACK, text); } if (is_set(action, A_DC_JOIN_FINALIZE)) { crm_trace( "Action %.16llx (A_DC_JOIN_FINALIZE) %s", A_DC_JOIN_FINALIZE, text); } if (is_set(action, A_MSG_PROCESS)) { crm_trace( "Action %.16llx (A_MSG_PROCESS) %s", A_MSG_PROCESS, text); } if (is_set(action, A_MSG_ROUTE)) { crm_trace( "Action %.16llx (A_MSG_ROUTE) %s", A_MSG_ROUTE, text); } if (is_set(action, A_RECOVER)) { crm_trace( "Action %.16llx (A_RECOVER) %s", A_RECOVER, text); } if (is_set(action, A_DC_RELEASE)) { crm_trace( "Action %.16llx (A_DC_RELEASE) %s", A_DC_RELEASE, text); } if (is_set(action, A_DC_RELEASED)) { crm_trace( "Action %.16llx (A_DC_RELEASED) %s", A_DC_RELEASED, text); } if (is_set(action, A_DC_TAKEOVER)) { crm_trace( "Action %.16llx (A_DC_TAKEOVER) %s", A_DC_TAKEOVER, text); } if (is_set(action, A_SHUTDOWN)) { crm_trace( "Action %.16llx (A_SHUTDOWN) %s", A_SHUTDOWN, text); } if (is_set(action, A_SHUTDOWN_REQ)) { crm_trace( "Action %.16llx (A_SHUTDOWN_REQ) %s", A_SHUTDOWN_REQ, text); } if (is_set(action, A_STOP)) { crm_trace( "Action %.16llx (A_STOP ) %s", A_STOP, text); } if (is_set(action, A_EXIT_0)) { crm_trace( "Action %.16llx (A_EXIT_0) %s", A_EXIT_0, text); } if (is_set(action, A_EXIT_1)) { crm_trace( "Action %.16llx (A_EXIT_1) %s", A_EXIT_1, text); } if (is_set(action, A_CCM_CONNECT)) { crm_trace( "Action %.16llx (A_CCM_CONNECT) %s", A_CCM_CONNECT, text); } if (is_set(action, A_CCM_DISCONNECT)) { crm_trace( "Action %.16llx (A_CCM_DISCONNECT) %s", A_CCM_DISCONNECT, text); } if (is_set(action, A_CIB_START)) { crm_trace( "Action %.16llx (A_CIB_START) %s", A_CIB_START, text); } if (is_set(action, A_CIB_STOP)) { crm_trace( "Action %.16llx (A_CIB_STOP) %s", A_CIB_STOP, text); } if (is_set(action, A_TE_INVOKE)) { crm_trace( "Action %.16llx (A_TE_INVOKE) %s", A_TE_INVOKE, text); } if (is_set(action, A_TE_START)) { crm_trace( "Action %.16llx (A_TE_START) %s", A_TE_START, text); } if (is_set(action, A_TE_STOP)) { crm_trace( "Action %.16llx (A_TE_STOP) %s", A_TE_STOP, text); } if (is_set(action, A_TE_CANCEL)) { crm_trace( "Action %.16llx (A_TE_CANCEL) %s", A_TE_CANCEL, text); } if (is_set(action, A_PE_INVOKE)) { crm_trace( "Action %.16llx (A_PE_INVOKE) %s", A_PE_INVOKE, text); } if (is_set(action, A_PE_START)) { crm_trace( "Action %.16llx (A_PE_START) %s", A_PE_START, text); } if (is_set(action, A_PE_STOP)) { crm_trace( "Action %.16llx (A_PE_STOP) %s", A_PE_STOP, text); } if (is_set(action, A_NODE_BLOCK)) { crm_trace( "Action %.16llx (A_NODE_BLOCK) %s", A_NODE_BLOCK, text); } if (is_set(action, A_UPDATE_NODESTATUS)) { crm_trace( "Action %.16llx (A_UPDATE_NODESTATUS) %s", A_UPDATE_NODESTATUS, text); } if (is_set(action, A_LOG)) { crm_trace( "Action %.16llx (A_LOG ) %s", A_LOG, text); } if (is_set(action, A_ERROR)) { crm_trace( "Action %.16llx (A_ERROR ) %s", A_ERROR, text); } if (is_set(action, A_WARN)) { crm_trace( "Action %.16llx (A_WARN ) %s", A_WARN, text); } } void create_node_entry(const char *uuid, const char *uname, const char *type) { /* make sure a node entry exists for the new node * * this will add anyone except the first ever node in the cluster * since it will also be the DC which doesnt go through the * join process (with itself). We can include a special case * later if desired. */ xmlNode *tmp1 = create_xml_node(NULL, XML_CIB_TAG_NODE); crm_trace("Creating node entry for %s", uname); set_uuid(tmp1, XML_ATTR_UUID, uname); crm_xml_add(tmp1, XML_ATTR_UNAME, uname); crm_xml_add(tmp1, XML_ATTR_TYPE, type); fsa_cib_anon_update(XML_CIB_TAG_NODES, tmp1, cib_scope_local | cib_quorum_override | cib_can_create); free_xml(tmp1); } xmlNode * create_node_state(const char *uname, const char *ha_state, const char *ccm_state, const char *crmd_state, const char *join_state, const char *exp_state, gboolean clear_shutdown, const char *src) { xmlNode *node_state = create_xml_node(NULL, XML_CIB_TAG_STATE); crm_trace("%s Creating node state entry for %s", src, uname); set_uuid(node_state, XML_ATTR_UUID, uname); if (crm_element_value(node_state, XML_ATTR_UUID) == NULL) { crm_debug("Node %s is not a cluster member", uname); free_xml(node_state); return NULL; } crm_xml_add(node_state, XML_ATTR_UNAME, uname); crm_xml_add(node_state, XML_CIB_ATTR_HASTATE, ha_state); crm_xml_add(node_state, XML_CIB_ATTR_INCCM, ccm_state); crm_xml_add(node_state, XML_CIB_ATTR_CRMDSTATE, crmd_state); crm_xml_add(node_state, XML_CIB_ATTR_JOINSTATE, join_state); crm_xml_add(node_state, XML_CIB_ATTR_EXPSTATE, exp_state); crm_xml_add(node_state, XML_ATTR_ORIGIN, src); if (clear_shutdown) { crm_xml_add(node_state, XML_CIB_ATTR_SHUTDOWN, "0"); } crm_log_xml_trace(node_state, "created"); return node_state; } extern GHashTable *ipc_clients; void process_client_disconnect(crmd_client_t * curr_client) { struct crm_subsystem_s *the_subsystem = NULL; CRM_CHECK(curr_client != NULL, return); crm_trace("received HUP from %s", curr_client->table_key); if (curr_client->sub_sys == NULL) { crm_trace("Client hadn't registered with us yet"); } else if (strcasecmp(CRM_SYSTEM_PENGINE, curr_client->sub_sys) == 0) { the_subsystem = pe_subsystem; } else if (strcasecmp(CRM_SYSTEM_TENGINE, curr_client->sub_sys) == 0) { the_subsystem = te_subsystem; } else if (strcasecmp(CRM_SYSTEM_CIB, curr_client->sub_sys) == 0) { the_subsystem = cib_subsystem; } if (the_subsystem != NULL) { the_subsystem->ipc = NULL; the_subsystem->client = NULL; crm_info("Received HUP from %s:[%d]", the_subsystem->name, the_subsystem->pid); } else { /* else that was a transient client */ crm_trace("Received HUP from transient client"); } if (curr_client->table_key != NULL) { /* * Key is destroyed below as: * curr_client->table_key * Value is cleaned up by: * crmd_ipc_connection_destroy * which will also call: * G_main_del_IPC_Channel */ g_hash_table_remove(ipc_clients, curr_client->table_key); } } gboolean update_dc(xmlNode * msg) { char *last_dc = fsa_our_dc; const char *dc_version = NULL; const char *welcome_from = NULL; if (msg != NULL) { gboolean invalid = FALSE; dc_version = crm_element_value(msg, F_CRM_VERSION); welcome_from = crm_element_value(msg, F_CRM_HOST_FROM); CRM_CHECK(dc_version != NULL, return FALSE); CRM_CHECK(welcome_from != NULL, return FALSE); if (AM_I_DC && safe_str_neq(welcome_from, fsa_our_uname)) { invalid = TRUE; } else if (fsa_our_dc && safe_str_neq(welcome_from, fsa_our_dc)) { invalid = TRUE; } if (invalid) { CRM_CHECK(fsa_our_dc != NULL, crm_err("We have no DC")); if (AM_I_DC) { crm_err("Not updating DC to %s (%s): we are also a DC", welcome_from, dc_version); } else { crm_warn("New DC %s is not %s", welcome_from, fsa_our_dc); } register_fsa_action(A_CL_JOIN_QUERY | A_DC_TIMER_START); return FALSE; } } crm_free(fsa_our_dc_version); fsa_our_dc_version = NULL; fsa_our_dc = NULL; /* Free'd as last_dc */ if (welcome_from != NULL) { fsa_our_dc = crm_strdup(welcome_from); } if (dc_version != NULL) { fsa_our_dc_version = crm_strdup(dc_version); } if (safe_str_eq(fsa_our_dc, last_dc)) { /* do nothing */ } else if (fsa_our_dc != NULL) { crm_info("Set DC to %s (%s)", crm_str(fsa_our_dc), crm_str(fsa_our_dc_version)); } else if (last_dc != NULL) { crm_debug("Unset DC. Was %s", crm_str(last_dc)); } crm_free(last_dc); return TRUE; } #define STATUS_PATH_MAX 512 static void erase_xpath_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { char *xpath = user_data; do_crm_log(rc == 0 ? LOG_DEBUG : LOG_NOTICE, "Deletion of \"%s\": %s (rc=%d)", xpath, cib_error2string(rc), rc); crm_free(xpath); } void erase_status_tag(const char *uname, const char *tag, int options) { int rc = cib_ok; char xpath[STATUS_PATH_MAX]; int cib_opts = cib_quorum_override | cib_xpath | options; if (fsa_cib_conn && uname) { snprintf(xpath, STATUS_PATH_MAX, "//node_state[@uname='%s']/%s", uname, tag); crm_info("Deleting xpath: %s", xpath); rc = fsa_cib_conn->cmds->delete(fsa_cib_conn, xpath, NULL, cib_opts); add_cib_op_callback(fsa_cib_conn, rc, FALSE, crm_strdup(xpath), erase_xpath_callback); } } static IPC_Channel *attrd = NULL; static gboolean attrd_dispatch(IPC_Channel * client, gpointer user_data) { xmlNode *msg = NULL; gboolean stay_connected = TRUE; while (IPC_ISRCONN(client)) { if (client->ops->is_message_pending(client) == 0) { break; } msg = xmlfromIPC(client, MAX_IPC_DELAY); if (msg != NULL) { crm_log_xml_err(msg, "attrd:msg"); free_xml(msg); } } if (client->ch_status != IPC_CONNECT) { stay_connected = FALSE; } return stay_connected; } static void attrd_connection_destroy(gpointer user_data) { crm_err("Lost connection to attrd"); attrd = NULL; return; } void update_attrd(const char *host, const char *name, const char *value, const char *user_name) { int retries = 0; gboolean rc = FALSE; do { rc = FALSE; if (attrd == NULL) { crm_info("Connecting to attrd..."); attrd = init_client_ipc_comms_nodispatch(T_ATTRD); if (attrd) { G_main_add_IPC_Channel(G_PRIORITY_LOW, attrd, FALSE, attrd_dispatch, NULL, attrd_connection_destroy); } } if (attrd != NULL) { rc = attrd_update_delegate(attrd, 'U', host, name, value, XML_CIB_TAG_STATUS, NULL, NULL, user_name); } else { crm_warn("Could not connect to %s", T_ATTRD); } if (rc == FALSE) { attrd = NULL; if (retries < 5) { retries++; sleep(retries); } } } while(rc == FALSE && retries < 5); if (rc == FALSE) { crm_err("Could not send %s %s %s (%d)", T_ATTRD, name ? "update" : "refresh", name?name:"", is_set(fsa_input_register, R_SHUTDOWN)); if(is_set(fsa_input_register, R_SHUTDOWN)) { register_fsa_input(C_FSA_INTERNAL, I_FAIL, NULL); } } else if(retries) { crm_debug("Needed %d retries to send %s %s %s", retries, T_ATTRD, name ? "update" : "refresh", name?name:""); } }