diff --git a/crm/crmd/join_dc.c b/crm/crmd/join_dc.c index 2b7fa429f6..053f8c6403 100644 --- a/crm/crmd/join_dc.c +++ b/crm/crmd/join_dc.c @@ -1,622 +1,621 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include GHashTable *welcomed_nodes = NULL; GHashTable *integrated_nodes = NULL; GHashTable *finalized_nodes = NULL; GHashTable *confirmed_nodes = NULL; char *max_epoch = NULL; char *max_generation_from = NULL; crm_data_t *max_generation_xml = NULL; void initialize_join(gboolean before); gboolean finalize_join_for(gpointer key, gpointer value, gpointer user_data); void join_send_offer(gpointer key, gpointer value, gpointer user_data); void finalize_sync_callback(const HA_Message *msg, int call_id, int rc, crm_data_t *output, void *user_data); gboolean process_join_ack_msg( const char *join_from, crm_data_t *lrm_update, int join_id); gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); void join_update_complete_callback(const HA_Message *msg, int call_id, int rc, crm_data_t *output, void *user_data); void finalize_join(const char *caller); static int current_join_id = 0; /* A_DC_JOIN_OFFER_ALL */ enum crmd_fsa_input do_dc_join_offer_all(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { /* reset everyones status back to down or in_ccm in the CIB * * any nodes that are active in the CIB but not in the CCM list * will be seen as offline by the PE anyway */ do_update_cib_nodes(NULL, TRUE); crm_info("0) Offering membership to %d clients", fsa_membership_copy->members_size); initialize_join(TRUE); current_join_id++; g_hash_table_foreach( fsa_membership_copy->members, join_send_offer, NULL); /* dont waste time by invoking the PE yet; */ crm_debug("1) Waiting on %d outstanding join acks", g_hash_table_size(welcomed_nodes)); return I_NULL; } /* A_DC_JOIN_OFFER_ONE */ enum crmd_fsa_input do_dc_join_offer_one(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { oc_node_t member; gpointer a_node = NULL; ha_msg_input_t *welcome = fsa_typed_data(fsa_dt_ha_msg); const char *join_to = NULL; if(welcome == NULL) { crm_err("Attempt to send welcome message " "without a message to reply to!"); return I_NULL; } join_to = cl_get_string(welcome->msg, F_CRM_HOST_FROM); if(a_node != NULL && (cur_state == S_INTEGRATION || cur_state == S_FINALIZE_JOIN)) { /* note: it _is_ possible that a node will have been * sick or starting up when the original offer was made. * however, it will either re-announce itself in due course * _or_ we can re-store the original offer on the client. */ crm_debug("Re-offering membership to %s...", join_to); } crm_info("Processing annouce request from %s in state %s", join_to, fsa_state2string(cur_state)); /* always offer to the DC (ourselves) * this ensures the correct value for max_generation_from */ member.node_uname = crm_strdup(fsa_our_uname); join_send_offer(NULL, &member, NULL); crm_free(member.node_uname); member.node_uname = crm_strdup(join_to); join_send_offer(NULL, &member, NULL); crm_free(member.node_uname); /* this was a genuine join request, cancel any existing * transition and invoke the PE */ if(need_transition(fsa_state)) { register_fsa_action(A_TE_CANCEL); } /* dont waste time by invoking the pe yet; */ crm_debug("1) Waiting on %d outstanding join acks", g_hash_table_size(welcomed_nodes)); return I_NULL; } /* A_DC_JOIN_PROCESS_REQ */ enum crmd_fsa_input do_dc_join_req(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { crm_data_t *generation = NULL; int join_id = -1; gboolean ack_nack_bool = TRUE; const char *ack_nack = CRMD_JOINSTATE_MEMBER; ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg); const char *join_from = cl_get_string(join_ack->msg,F_CRM_HOST_FROM); const char *ref = cl_get_string(join_ack->msg,XML_ATTR_REFERENCE); gpointer join_node = g_hash_table_lookup(fsa_membership_copy->members, join_from); crm_debug_3("2) Processing req from %s", join_from); generation = join_ack->xml; ha_msg_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id); crm_log_xml_debug_2(max_generation_xml, "Max generation"); crm_log_xml_debug_2(generation, "Their generation"); if(join_node == NULL) { crm_err("Node %s is not a member", join_from); ack_nack_bool = FALSE; } else if(generation == NULL) { crm_err("Generation was NULL"); ack_nack_bool = FALSE; } else if(join_id != current_join_id) { crm_debug("Response from %s was for invalid join: %d vs. %d", join_from, join_id, current_join_id); check_join_state(cur_state, __FUNCTION__); return I_NULL; } else if(max_generation_xml == NULL) { max_generation_xml = copy_xml(generation); max_generation_from = crm_strdup(join_from); } else if(cib_compare_generation(max_generation_xml, generation) < 0) { crm_debug("%s has a better generation number than" " the current max %s", join_from, max_generation_from); crm_free(max_generation_from); free_xml(max_generation_xml); max_generation_from = crm_strdup(join_from); max_generation_xml = copy_xml(join_ack->xml); } if(ack_nack_bool == FALSE) { /* NACK this client */ ack_nack = CRMD_STATE_INACTIVE; crm_err("2) NACK'ing node %s (ref %s)", join_from, ref); } else { crm_debug("2) Welcoming node %s after ACK (ref %s)", join_from, ref); } /* add them to our list of CRMD_STATE_ACTIVE nodes */ g_hash_table_insert( integrated_nodes, crm_strdup(join_from), crm_strdup(ack_nack)); crm_debug_2("%u nodes have been integrated", g_hash_table_size(integrated_nodes)); g_hash_table_remove(welcomed_nodes, join_from); if(check_join_state(cur_state, __FUNCTION__) == FALSE) { /* dont waste time by invoking the PE yet; */ crm_debug_2("Still waiting on %d outstanding join acks", g_hash_table_size(welcomed_nodes)); } return I_NULL; } #define JOIN_AFTER_SYNC 1 /* A_DC_JOIN_FINALIZE */ enum crmd_fsa_input do_dc_join_finalize(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { enum cib_errors rc = cib_ok; /* This we can do straight away and avoid clients timing us out * while we compute the latest CIB */ #if JOIN_AFTER_SYNC crm_debug("Finializing join for %d clients", g_hash_table_size(integrated_nodes)); #else crm_debug("Notifying %d clients of join results", g_hash_table_size(integrated_nodes)); g_hash_table_foreach_remove( integrated_nodes, finalize_join_for, NULL); #endif clear_bit_inplace(fsa_input_register, R_HAVE_CIB); if(max_generation_from == NULL || safe_str_eq(max_generation_from, fsa_our_uname)){ set_bit_inplace(fsa_input_register, R_HAVE_CIB); } if(is_set(fsa_input_register, R_HAVE_CIB) == FALSE) { /* ask for the agreed best CIB */ crm_info("Asking %s for its copy of the CIB", crm_str(max_generation_from)); - + crm_log_xml_debug(max_generation_xml, "Requesting version"); + set_bit_inplace(fsa_input_register, R_CIB_ASKED); fsa_cib_conn->call_timeout = 10; rc = fsa_cib_conn->cmds->sync_from( fsa_cib_conn, max_generation_from, NULL, cib_quorum_override); fsa_cib_conn->call_timeout = 0; /* back to the default */ add_cib_op_callback(rc, FALSE, crm_strdup(max_generation_from), finalize_sync_callback); return I_NULL; } finalize_join(__FUNCTION__); return I_NULL; } void finalize_sync_callback(const HA_Message *msg, int call_id, int rc, crm_data_t *output, void *user_data) { CRM_DEV_ASSERT(cib_not_master != rc); clear_bit_inplace(fsa_input_register, R_CIB_ASKED); if(rc != cib_ok) { - crm_err("Sync from %s resulted in an error: %s", - (char*)user_data, cib_error2string(rc)); -#if 0 - register_fsa_error_adv( - C_FSA_INTERNAL, I_ERROR, NULL, NULL, __FUNCTION__); -#else + crm_log_maybe(rc==cib_old_data?LOG_WARNING:LOG_ERR, + "Sync from %s resulted in an error: %s", + (char*)user_data, cib_error2string(rc)); + /* restart the whole join process */ register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION_DC, NULL, NULL, __FUNCTION__); -#endif + } else if(AM_I_DC && fsa_state == S_FINALIZE_JOIN) { finalize_join(__FUNCTION__); } else { crm_debug("No longer the DC in S_FINALIZE_JOIN: %s/%s", AM_I_DC?"DC":"CRMd", fsa_state2string(fsa_state)); } crm_free(user_data); } void finalize_join(const char *caller) { crm_data_t *cib = createEmptyCib(); crm_data_t *cib_update = NULL; set_bit_inplace(fsa_input_register, R_HAVE_CIB); clear_bit_inplace(fsa_input_register, R_CIB_ASKED); set_uuid(fsa_cluster_conn, cib, XML_ATTR_DC_UUID, fsa_our_uname); crm_debug_3("Update %s in the CIB to our uuid: %s", XML_ATTR_DC_UUID, crm_element_value(cib, XML_ATTR_DC_UUID)); cib_update = create_cib_fragment(cib, NULL); fsa_cib_conn->cmds->modify( fsa_cib_conn, NULL, cib_update, NULL, cib_quorum_override); free_xml(cib_update); free_xml(cib); crm_debug_3("Bumping the epoch and syncing to %d clients", g_hash_table_size(finalized_nodes)); fsa_cib_conn->cmds->bump_epoch( fsa_cib_conn, cib_scope_local|cib_quorum_override); #if JOIN_AFTER_SYNC /* make sure dc_uuid is re-set to us */ if(check_join_state(fsa_state, caller) == FALSE) { crm_debug("Notifying %d clients of join results", g_hash_table_size(integrated_nodes)); g_hash_table_foreach_remove( integrated_nodes, finalize_join_for, NULL); } #else check_join_state(cur_state, caller); rc = fsa_cib_conn->cmds->sync(fsa_cib_conn, NULL, cib_quorum_override); #endif } /* A_DC_JOIN_PROCESS_ACK */ enum crmd_fsa_input do_dc_join_ack(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg); const char *join_from = cl_get_string(join_ack->msg, F_CRM_HOST_FROM); const char *op = cl_get_string(join_ack->msg, F_CRM_TASK); if(safe_str_neq(op, CRM_OP_JOIN_CONFIRM)) { crm_debug("Ignoring op=%s message", op); } else { int join_id = -1; ha_msg_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id); process_join_ack_msg(join_from, join_ack->xml, join_id); } return I_NULL; } gboolean process_join_ack_msg(const char *join_from, crm_data_t *lrm_update, int join_id) { /* now update them to "member" */ int call_id = 0; crm_data_t *update = NULL; crm_data_t *fragment = NULL; const char *join_state = NULL; crm_debug_2("Processing ack from %s", join_from); join_state = (const char *) g_hash_table_lookup(finalized_nodes, join_from); if(join_state == NULL) { crm_err("Join not in progress: ignoring join from %s", join_from); return FALSE; } else if(safe_str_neq(join_state, CRMD_JOINSTATE_MEMBER)) { crm_err("Node %s wasnt invited to join the cluster",join_from); g_hash_table_remove(finalized_nodes, join_from); return FALSE; } else if(join_id != current_join_id) { crm_err("Node %s responded to an invalid join: %d vs. %d", join_from, join_id, current_join_id); g_hash_table_remove(finalized_nodes, join_from); return FALSE; } g_hash_table_remove(finalized_nodes, join_from); if(g_hash_table_lookup(confirmed_nodes, join_from) != NULL) { crm_err("hash already contains confirmation from %s",join_from); } g_hash_table_insert(confirmed_nodes, crm_strdup(join_from), crm_strdup(CRMD_JOINSTATE_MEMBER)); crm_info("4) Updating node state to %s for %s", CRMD_JOINSTATE_MEMBER, join_from); #if 0 ???dig into the fragment and clear shutdown?? /* the slave will re-ask if it wants to be shutdown */ crm_xml_add(lrm_update, XML_CIB_ATTR_CLEAR_SHUTDOWN, XML_BOOLEAN_TRUE); #endif /* update CIB with the current LRM status from the node * We dont need to notify the TE of these updates, a transition will * be started in due time */ call_id = fsa_cib_conn->cmds->modify( fsa_cib_conn, XML_CIB_TAG_STATUS, lrm_update, NULL, cib_scope_local|cib_quorum_override); add_cib_op_callback(call_id, TRUE,NULL, join_update_complete_callback); free_xml(fragment); free_xml(update); return TRUE; } gboolean finalize_join_for(gpointer key, gpointer value, gpointer user_data) { const char *join_to = NULL; const char *join_state = NULL; HA_Message *acknak = NULL; if(key == NULL || value == NULL) { return TRUE; } join_to = (const char *)key; join_state = (const char *)value; /* make sure the node exists in the config section */ create_node_entry(join_to, join_to, CRMD_JOINSTATE_MEMBER); /* send the ack/nack to the node */ acknak = create_request( CRM_OP_JOIN_ACKNAK, NULL, join_to, CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL); ha_msg_add_int(acknak, F_CRM_JOIN_ID, current_join_id); /* set the ack/nack */ if(safe_str_eq(join_state, CRMD_JOINSTATE_MEMBER)) { crm_debug("3) ACK'ing join request from %s, state %s", join_to, join_state); ha_msg_add(acknak, CRM_OP_JOIN_ACKNAK, XML_BOOLEAN_TRUE); g_hash_table_insert( finalized_nodes, crm_strdup(join_to), crm_strdup(CRMD_JOINSTATE_MEMBER)); } else { crm_warn("3) NACK'ing join request from %s, state %s", join_to, join_state); ha_msg_add(acknak, CRM_OP_JOIN_ACKNAK, XML_BOOLEAN_FALSE); } send_msg_via_ha(fsa_cluster_conn, acknak); return TRUE; } void initialize_join(gboolean before) { /* clear out/reset a bunch of stuff */ crm_debug("Initializing join data"); g_hash_table_destroy(welcomed_nodes); g_hash_table_destroy(integrated_nodes); g_hash_table_destroy(finalized_nodes); g_hash_table_destroy(confirmed_nodes); if(before) { if(max_generation_from != NULL) { crm_free(max_generation_from); max_generation_from = NULL; } if(max_generation_xml != NULL) { free_xml(max_generation_xml); max_generation_xml = NULL; } clear_bit_inplace(fsa_input_register, R_HAVE_CIB); clear_bit_inplace(fsa_input_register, R_CIB_ASKED); } welcomed_nodes = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); integrated_nodes = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); finalized_nodes = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); confirmed_nodes = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); } void join_send_offer(gpointer key, gpointer value, gpointer user_data) { const char *join_to = NULL; const char *crm_online = NULL; const oc_node_t *member = (const oc_node_t*)value; if(member != NULL) { join_to = member->node_uname; } if(join_to == NULL) { crm_err("No recipient for welcome message"); return; } g_hash_table_remove(confirmed_nodes, join_to); g_hash_table_remove(finalized_nodes, join_to); g_hash_table_remove(integrated_nodes, join_to); g_hash_table_remove(welcomed_nodes, join_to); crm_online = g_hash_table_lookup(crmd_peer_state, join_to); if(safe_str_eq(crm_online, ONLINESTATUS)) { HA_Message *offer = create_request( CRM_OP_JOIN_OFFER, NULL, join_to, CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL); ha_msg_add_int(offer, F_CRM_JOIN_ID, current_join_id); /* send the welcome */ crm_debug("Sending %s(%d) to %s", CRM_OP_JOIN_OFFER, current_join_id, join_to); send_msg_via_ha(fsa_cluster_conn, offer); g_hash_table_insert(welcomed_nodes, crm_strdup(join_to), crm_strdup(CRMD_JOINSTATE_PENDING)); } else { crm_debug("Peer process on %s is not active", join_to); } } gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source) { crm_debug_2("Invoked by %s in state: %s", source, fsa_state2string(cur_state)); if(cur_state == S_INTEGRATION) { if(g_hash_table_size(welcomed_nodes) == 0) { crm_debug("Integration of %d peers complete: %s", g_hash_table_size(integrated_nodes), source); register_fsa_input_before( C_FSA_INTERNAL, I_INTEGRATED, NULL); return TRUE; } } else if(cur_state == S_FINALIZE_JOIN) { if(is_set(fsa_input_register, R_HAVE_CIB) == FALSE) { crm_debug("Delaying I_FINALIZED until we have the CIB"); return TRUE; } else if(g_hash_table_size(integrated_nodes) == 0 && g_hash_table_size(finalized_nodes) == 0) { crm_debug("Join process complete: %s", source); register_fsa_input_later( C_FSA_INTERNAL, I_FINALIZED, NULL); } else if(g_hash_table_size(integrated_nodes) != 0 && g_hash_table_size(finalized_nodes) != 0) { crm_err("Waiting on %d integrated nodes" " AND %d confirmations", g_hash_table_size(integrated_nodes), g_hash_table_size(finalized_nodes)); } else if(g_hash_table_size(integrated_nodes) != 0) { crm_debug("Still waiting on %d integrated nodes", g_hash_table_size(integrated_nodes)); } else if(g_hash_table_size(finalized_nodes) != 0) { crm_debug_2("Still waiting on %d confirmations", g_hash_table_size(finalized_nodes)); } } return FALSE; } void join_update_complete_callback(const HA_Message *msg, int call_id, int rc, crm_data_t *output, void *user_data) { fsa_data_t *msg_data = NULL; if(rc == cib_ok) { check_join_state(fsa_state, __FUNCTION__); } else { crm_err("Join update failed"); crm_log_message(LOG_DEBUG, msg); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } diff --git a/crm/crmd/lrm.c b/crm/crmd/lrm.c index 14dcd2e5cd..0c0a1494ec 100644 --- a/crm/crmd/lrm.c +++ b/crm/crmd/lrm.c @@ -1,1299 +1,1299 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include /* for access */ #include #include #include #include #include #include #include #include #include #include #include #include char *make_stop_id(const char *rsc, int call_id); void ghash_print_pending(gpointer key, gpointer value, gpointer user_data); gboolean stop_all_resources(void); gboolean resource_stopped(gpointer key, gpointer value, gpointer user_data); gboolean build_operation_update( crm_data_t *rsc_list, lrm_op_t *op, const char *src, int lpc); gboolean build_active_RAs(crm_data_t *rsc_list); void do_update_resource(lrm_op_t *op); enum crmd_fsa_input do_lrm_rsc_op( lrm_rsc_t *rsc, char *rid, const char *operation, crm_data_t *msg, HA_Message *request); enum crmd_fsa_input do_fake_lrm_op(gpointer data); void stop_recurring_action( gpointer key, gpointer value, gpointer user_data); gboolean remove_recurring_action( gpointer key, gpointer value, gpointer user_data); void free_recurring_op(gpointer value); void nack_rsc_op(lrm_op_t* op, HA_Message *msg); GHashTable *xml2list(crm_data_t *parent); GHashTable *monitors = NULL; GHashTable *resources = NULL; GHashTable *resources_confirmed = NULL; GHashTable *shutdown_ops = NULL; int num_lrm_register_fails = 0; int max_lrm_register_fails = 30; enum crmd_rscstate { crmd_rscstate_NULL, crmd_rscstate_START, crmd_rscstate_START_PENDING, crmd_rscstate_START_OK, crmd_rscstate_START_FAIL, crmd_rscstate_STOP, crmd_rscstate_STOP_PENDING, crmd_rscstate_STOP_OK, crmd_rscstate_STOP_FAIL, crmd_rscstate_MON, crmd_rscstate_MON_PENDING, crmd_rscstate_MON_OK, crmd_rscstate_MON_FAIL, crmd_rscstate_GENERIC_PENDING, crmd_rscstate_GENERIC_OK, crmd_rscstate_GENERIC_FAIL }; const char *crmd_rscstate2string(enum crmd_rscstate state); const char * crmd_rscstate2string(enum crmd_rscstate state) { switch(state) { case crmd_rscstate_NULL: return NULL; case crmd_rscstate_START: return CRMD_ACTION_START; case crmd_rscstate_START_PENDING: return CRMD_ACTION_START_PENDING; case crmd_rscstate_START_OK: return CRMD_ACTION_STARTED; case crmd_rscstate_START_FAIL: return CRMD_ACTION_START_FAIL; case crmd_rscstate_STOP: return CRMD_ACTION_STOP; case crmd_rscstate_STOP_PENDING: return CRMD_ACTION_STOP_PENDING; case crmd_rscstate_STOP_OK: return CRMD_ACTION_STOPPED; case crmd_rscstate_STOP_FAIL: return CRMD_ACTION_STOP_FAIL; case crmd_rscstate_MON: return CRMD_ACTION_MON; case crmd_rscstate_MON_PENDING: return CRMD_ACTION_MON_PENDING; case crmd_rscstate_MON_OK: return CRMD_ACTION_MON_OK; case crmd_rscstate_MON_FAIL: return CRMD_ACTION_MON_FAIL; case crmd_rscstate_GENERIC_PENDING: return CRMD_ACTION_GENERIC_PENDING; case crmd_rscstate_GENERIC_OK: return CRMD_ACTION_GENERIC_OK; case crmd_rscstate_GENERIC_FAIL: return CRMD_ACTION_GENERIC_FAIL; } return ""; } /* A_LRM_CONNECT */ enum crmd_fsa_input do_lrm_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { int ret = HA_OK; if(action & A_LRM_DISCONNECT) { if(fsa_lrm_conn) { fsa_lrm_conn->lrm_ops->signoff(fsa_lrm_conn); } /* TODO: Clean up the hashtable */ } if(action & A_LRM_CONNECT) { crm_debug_4("LRM: connect..."); ret = HA_OK; monitors = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, free_recurring_op); resources = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); resources_confirmed = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); shutdown_ops = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); fsa_lrm_conn = ll_lrm_new(XML_CIB_TAG_LRM); if(NULL == fsa_lrm_conn) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); ret = HA_FAIL; } if(ret == HA_OK) { crm_debug_4("LRM: sigon..."); ret = fsa_lrm_conn->lrm_ops->signon( fsa_lrm_conn, CRM_SYSTEM_CRMD); } if(ret != HA_OK) { if(++num_lrm_register_fails < max_lrm_register_fails) { crm_warn("Failed to sign on to the LRM %d" " (%d max) times", num_lrm_register_fails, max_lrm_register_fails); crm_timer_start(wait_timer); crmd_fsa_stall(NULL); return I_NULL; } } if(ret == HA_OK) { crm_debug_4("LRM: set_lrm_callback..."); ret = fsa_lrm_conn->lrm_ops->set_lrm_callback( fsa_lrm_conn, lrm_op_callback); if(ret != HA_OK) { crm_err("Failed to set LRM callbacks"); } } if(ret != HA_OK) { crm_err("Failed to sign on to the LRM %d" " (max) times", num_lrm_register_fails); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return I_NULL; } /* TODO: create a destroy handler that causes * some recovery to happen */ G_main_add_IPC_Channel(G_PRIORITY_LOW, fsa_lrm_conn->lrm_ops->ipcchan(fsa_lrm_conn), FALSE, lrm_dispatch, fsa_lrm_conn, default_ipc_connection_destroy); set_bit_inplace(fsa_input_register, R_LRM_CONNECTED); } if(action & ~(A_LRM_CONNECT|A_LRM_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__); } return I_NULL; } gboolean stop_all_resources(void) { GListPtr lrm_list = NULL; crm_info("Makeing sure all active resources are stopped before exit"); if(fsa_lrm_conn == NULL) { return TRUE; } else if(is_set(fsa_input_register, R_SENT_RSC_STOP)) { crm_debug("Already sent stop operation"); return TRUE; } lrm_list = fsa_lrm_conn->lrm_ops->get_all_rscs(fsa_lrm_conn); slist_iter( rsc_id, char, lrm_list, lpc, const char *last_op = g_hash_table_lookup(resources, rsc_id); if(safe_str_neq(last_op, CRMD_ACTION_STOP)) { crm_warn("Resource %s was active at shutdown", rsc_id); do_lrm_rsc_op(NULL, rsc_id, CRMD_ACTION_STOP, NULL, NULL); } ); set_bit_inplace(fsa_input_register, R_SENT_RSC_STOP); if(g_hash_table_size(shutdown_ops) == 0) { register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } else { crm_info("Waiting for %d pending stop operations " " to complete before exiting", g_hash_table_size(shutdown_ops)); } return TRUE; } gboolean build_operation_update( crm_data_t *xml_rsc, lrm_op_t *op, const char *src, int lpc) { int len = 0; char *tmp = NULL; char *fail_state = NULL; const char *state = NULL; crm_data_t *xml_op = NULL; char *op_id = NULL; CRM_DEV_ASSERT(op != NULL); if(crm_assert_failed) { return FALSE; } crm_debug("%s: Updating resouce %s after %s %s op", src, op->rsc_id, op_status2text(op->op_status), op->op_type); if(op->op_status == LRM_OP_CANCELLED) { crm_debug("Ignoring cancelled op"); return TRUE; } if(safe_str_eq(op->op_type, CRMD_ACTION_NOTIFY)) { const char *n_type = g_hash_table_lookup( op->params, "notify_type"); const char *n_task = g_hash_table_lookup( op->params, "notify_operation"); CRM_DEV_ASSERT(n_type != NULL); CRM_DEV_ASSERT(n_task != NULL); op_id = generate_notify_key(op->rsc_id, n_type, n_task); /* these are not yet allowed to fail */ op->op_status = LRM_OP_DONE; op->rc = 0; } else { op_id = generate_op_key(op->rsc_id, op->op_type, op->interval); } /* Handle recurring ops - infer last op_status */ if(op->op_status == LRM_OP_PENDING && op->interval > 0) { if(op->rc == 0) { crm_debug("Mapping pending operation to DONE"); op->op_status = LRM_OP_DONE; } else { crm_debug("Mapping pending operation to ERROR"); op->op_status = LRM_OP_ERROR; } } xml_op = find_entity(xml_rsc, XML_LRM_TAG_RSC_OP, op_id); if(xml_op != NULL) { const char *old_status_s = crm_element_value( xml_op, XML_LRM_ATTR_OPSTATUS); int old_status = crm_atoi(old_status_s, "-2"); int log_level = LOG_ERR; if(old_status_s == NULL) { crm_err("No value for "XML_LRM_ATTR_OPSTATUS); } else if(old_status == op->op_status) { /* safe to mask */ log_level = LOG_WARNING; } else if(old_status == LRM_OP_PENDING){ /* ??safe to mask?? */ /* log_level = LOG_WARNING; */ } crm_log_maybe(log_level, "Duplicate %s operations in get_cur_state()", op_id); crm_log_maybe(log_level-2, "New entry: %s %s (call=%d, status=%s)", op_id, op->user_data, op->call_id, op_status2text(op->op_status)); crm_log_xml(log_level-2, "Existing entry", xml_op); crm_free(op_id); return FALSE; } else { xml_op = create_xml_node(xml_rsc, XML_LRM_TAG_RSC_OP); } crm_xml_add(xml_op, XML_ATTR_ID, op_id); crm_free(op_id); crm_xml_add(xml_rsc, XML_LRM_ATTR_LASTOP, op->op_type); crm_xml_add(xml_op, XML_LRM_ATTR_TASK, op->op_type); crm_xml_add(xml_op, "origin", src); if(op->user_data == NULL) { op->user_data = generate_transition_key(-1, fsa_our_uname); } fail_state = generate_transition_magic(op->user_data, op->op_status); crm_xml_add(xml_op, XML_ATTR_TRANSITION_KEY, op->user_data); crm_xml_add(xml_op, XML_ATTR_TRANSITION_MAGIC, fail_state); crm_free(fail_state); switch(op->op_status) { case LRM_OP_PENDING: break; case LRM_OP_CANCELLED: crm_err("What to do here"); break; case LRM_OP_ERROR: case LRM_OP_TIMEOUT: case LRM_OP_NOTSUPPORTED: crm_debug("Resource action %s/%s failed: %d", op->rsc_id, op->op_type, op->op_status); len = strlen(op->op_type); len += strlen("_failed_"); crm_malloc0(fail_state, sizeof(char)*len); if(fail_state != NULL) { sprintf(fail_state, "%s_failed", op->op_type); } crm_xml_add(xml_op, XML_LRM_ATTR_RSCSTATE, fail_state); crm_xml_add(xml_rsc, XML_LRM_ATTR_RSCSTATE, fail_state); crm_free(fail_state); break; case LRM_OP_DONE: if(safe_str_eq(CRMD_ACTION_START, op->op_type)) { state = CRMD_ACTION_STARTED; } else if(safe_str_eq(CRMD_ACTION_STOP, op->op_type)) { state = CRMD_ACTION_STOPPED; } else if(safe_str_eq(CRMD_ACTION_MON, op->op_type)) { state = CRMD_ACTION_STARTED; } else { crm_warn("Using status \"%s\" for op \"%s\"" "... this is still in the experimental stage.", CRMD_ACTION_GENERIC_OK, op->op_type); state = CRMD_ACTION_GENERIC_OK; } crm_xml_add(xml_op, XML_LRM_ATTR_RSCSTATE, state); crm_xml_add(xml_rsc, XML_LRM_ATTR_RSCSTATE, state); break; } tmp = crm_itoa(op->call_id); crm_xml_add(xml_op, XML_LRM_ATTR_CALLID, tmp); crm_free(tmp); /* set these on 'xml_rsc' too to make life easy for the TE */ tmp = crm_itoa(op->rc); crm_xml_add(xml_op, XML_LRM_ATTR_RC, tmp); crm_xml_add(xml_rsc, XML_LRM_ATTR_RC, tmp); crm_free(tmp); tmp = crm_itoa(op->op_status); crm_xml_add(xml_op, XML_LRM_ATTR_OPSTATUS, tmp); crm_xml_add(xml_rsc, XML_LRM_ATTR_OPSTATUS, tmp); crm_free(tmp); set_node_tstamp(xml_op); #if 0 if(safe_str_neq(op->op_type, CRMD_ACTION_STOP)) { /* this will enable us to later determin that the * resource's parameters have changed and we should force * a restart * however it will come at the cost of a potentially much * larger CIB * it may also requires MAXDEPTH to be increased */ crm_data_t *args_xml = NULL; args_xml = create_xml_node(xml_op, XML_TAG_PARAMS); g_hash_table_foreach(op->params, hash2field, args_xml); } #endif return TRUE; } gboolean build_active_RAs(crm_data_t *rsc_list) { GList *op_list = NULL; GList *lrm_list = NULL; gboolean found_op = FALSE; state_flag_t cur_state = 0; if(fsa_lrm_conn == NULL) { return FALSE; } lrm_list = fsa_lrm_conn->lrm_ops->get_all_rscs(fsa_lrm_conn); slist_iter( rid, char, lrm_list, lpc, lrm_rsc_t *the_rsc = fsa_lrm_conn->lrm_ops->get_rsc(fsa_lrm_conn, rid); crm_data_t *xml_rsc = create_xml_node( rsc_list, XML_LRM_TAG_RESOURCE); int max_call_id = -1; crm_debug("Processing lrm_rsc_t entry %s", rid); if(the_rsc == NULL) { crm_err("NULL resource returned from the LRM"); continue; } crm_xml_add(xml_rsc, XML_ATTR_ID, the_rsc->id); crm_xml_add(xml_rsc, XML_ATTR_TYPE, the_rsc->type); crm_xml_add(xml_rsc, XML_AGENT_ATTR_CLASS, the_rsc->class); crm_xml_add(xml_rsc, XML_AGENT_ATTR_PROVIDER,the_rsc->provider); op_list = the_rsc->ops->get_cur_state(the_rsc, &cur_state); crm_debug_2("\tcurrent state:%s", cur_state==LRM_RSC_IDLE?"Idle":"Busy"); slist_iter( op, lrm_op_t, op_list, llpc, crm_debug_2("Processing op %s for %s (status=%d, rc=%d)", op->op_type, the_rsc->id, op->op_status, op->rc); if(max_call_id < op->call_id) { build_operation_update(xml_rsc, op, __FUNCTION__, llpc); } else if(max_call_id > op->call_id) { crm_err("Bad call_id in list=%d. Previous call_id=%d", op->call_id, max_call_id); } else { crm_debug("Skipping duplicate entry for call_id=%d", op->call_id); } max_call_id = op->call_id; found_op = TRUE; ); if(found_op == FALSE && g_list_length(op_list) != 0) { crm_err("Could not properly determin last op" " for %s from %d entries", the_rsc->id, g_list_length(op_list)); } g_list_free(op_list); ); g_list_free(lrm_list); return TRUE; } crm_data_t* do_lrm_query(gboolean is_replace) { crm_data_t *xml_result= NULL; crm_data_t *xml_state = NULL; crm_data_t *xml_data = NULL; crm_data_t *rsc_list = NULL; const char *exp_state = CRMD_STATE_ACTIVE; if(is_set(fsa_input_register, R_SHUTDOWN)) { exp_state = CRMD_STATE_INACTIVE; } xml_state = create_node_state( fsa_our_uname, fsa_our_uname, ACTIVESTATUS, XML_BOOLEAN_TRUE, ONLINESTATUS, CRMD_JOINSTATE_MEMBER, exp_state, __FUNCTION__); xml_data = create_xml_node(xml_state, XML_CIB_TAG_LRM); rsc_list = create_xml_node(xml_data, XML_LRM_TAG_RESOURCES); /* Build a list of active (not always running) resources */ build_active_RAs(rsc_list); if(is_replace) { crm_xml_add(xml_state, XML_CIB_ATTR_REPLACE, XML_CIB_TAG_LRM); } xml_result = create_cib_fragment(xml_state, NULL); crm_log_xml_debug_3(xml_state, "Current state of the LRM"); return xml_result; } /* A_LRM_INVOKE */ enum crmd_fsa_input do_lrm_invoke(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { const char *crm_op = NULL; const char *operation = NULL; enum crmd_fsa_input next_input = I_NULL; ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg); crm_op = cl_get_string(input->msg, F_CRM_TASK); operation = crm_element_value(input->xml, XML_LRM_ATTR_TASK); if(crm_op != NULL && safe_str_eq(crm_op, "lrm_query")) { crm_data_t *data = do_lrm_query(FALSE); HA_Message *reply = create_reply(input->msg, data); if(relay_message(reply, TRUE) == FALSE) { crm_err("Unable to route reply"); crm_log_message(LOG_ERR, reply); crm_msg_del(reply); } free_xml(data); } else if(operation != NULL) { char rid[64]; lrm_rsc_t *rsc = NULL; const char *id_from_cib = NULL; crm_data_t *xml_rsc = find_xml_node( input->xml, XML_CIB_TAG_RESOURCE, TRUE); CRM_DEV_ASSERT(xml_rsc != NULL); if(crm_assert_failed) { crm_log_xml_err(input->xml, "Bad resource"); return I_NULL; } id_from_cib = crm_element_value(xml_rsc, XML_ATTR_ID); CRM_DEV_ASSERT(id_from_cib != NULL); if(crm_assert_failed) { crm_err("No value for %s in %s.", XML_ATTR_ID, crm_element_name(xml_rsc)); crm_log_xml_err(input->xml, "Bad command"); return I_NULL; } /* only the first 16 chars are used by the LRM */ strncpy(rid, id_from_cib, 64); rid[63] = 0; rsc = fsa_lrm_conn->lrm_ops->get_rsc(fsa_lrm_conn, rid); next_input = do_lrm_rsc_op(rsc, rid, operation, input->xml, input->msg); } else { crm_err("Operation was neither a lrm_query, nor a rsc op. %s", crm_str(crm_op)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } return next_input; } struct recurring_op_s { char *rsc_id; int call_id; }; void nack_rsc_op(lrm_op_t* op, HA_Message *msg) { HA_Message *reply = NULL; crm_data_t *update, *iter; crm_data_t *fragment; CRM_DEV_ASSERT(op != NULL); if(crm_assert_failed) { return; } CRM_DEV_ASSERT(msg != NULL); if(crm_assert_failed) { return; } - crm_err("NACK'ing resource op"); + crm_info("NACK'ing resource op: %s for %s", op->op_type, op->rsc_id); update = create_xml_node(NULL, XML_CIB_TAG_STATE); set_uuid(fsa_cluster_conn, update, XML_ATTR_UUID, fsa_our_uname); crm_xml_add(update, XML_ATTR_UNAME, fsa_our_uname); iter = create_xml_node(update, XML_CIB_TAG_LRM); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE); crm_xml_add(iter, XML_ATTR_ID, op->rsc_id); op->rc = 99; op->op_status = LRM_OP_ERROR; build_operation_update(iter, op, __FUNCTION__, 0); fragment = create_cib_fragment(update, NULL); reply = create_reply(msg, fragment); - crm_log_xml_info(update, "NACK Update"); - crm_log_message_adv(LOG_INFO, "NACK'd msg", msg); - crm_log_message_adv(LOG_INFO, "NACK Reply", reply); + crm_log_xml_debug_2(update, "NACK Update"); + crm_log_message_adv(LOG_DEBUG_2, "NACK'd msg", msg); + crm_log_message_adv(LOG_DEBUG_2, "NACK Reply", reply); if(relay_message(reply, TRUE) == FALSE) { crm_log_message_adv(LOG_ERR, "Unable to route reply", reply); crm_msg_del(reply); } free_xml(fragment); free_xml(update); } enum crmd_fsa_input do_lrm_rsc_op(lrm_rsc_t *rsc, char *rid, const char *operation, crm_data_t *msg, HA_Message *request) { int call_id = 0; char *op_id = NULL; lrm_op_t* op = NULL; const char *type = NULL; const char *class = NULL; const char *provider = NULL; const char *transition = NULL; GHashTable *params = NULL; fsa_data_t *msg_data = NULL; CRM_DEV_ASSERT(rid != NULL); if(rsc != NULL) { class = rsc->class; type = rsc->type; } else if(msg != NULL) { crm_data_t *xml_rsc = find_xml_node( msg, XML_CIB_TAG_RESOURCE, TRUE); class = crm_element_value(xml_rsc, XML_AGENT_ATTR_CLASS); CRM_DEV_ASSERT(class != NULL); if(crm_assert_failed) { return I_NULL; } type = crm_element_value(xml_rsc, XML_ATTR_TYPE); CRM_DEV_ASSERT(type != NULL); if(crm_assert_failed) { return I_NULL; } provider = crm_element_value(xml_rsc, XML_AGENT_ATTR_PROVIDER); } if(msg != NULL) { transition = crm_element_value(msg, XML_ATTR_TRANSITION_KEY); if(transition == NULL) { crm_err("Missing transition"); crm_log_message(LOG_ERR, msg); } } if(rsc == NULL) { /* check if its already there */ rsc = fsa_lrm_conn->lrm_ops->get_rsc(fsa_lrm_conn, rid); } if(rsc == NULL) { /* add it to the list */ crm_debug_2("adding rsc %s before operation", rid); if(msg != NULL) { params = xml2list(msg); } else { CRM_DEV_ASSERT(safe_str_eq(CRMD_ACTION_STOP, operation)); } fsa_lrm_conn->lrm_ops->add_rsc( fsa_lrm_conn, rid, class, type, provider, params); rsc = fsa_lrm_conn->lrm_ops->get_rsc(fsa_lrm_conn, rid); } if(rsc == NULL) { crm_err("Could not add resource %s to LRM", rid); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); return I_NULL; } /* stop the monitor before stopping the resource */ if(safe_str_eq(operation, CRMD_ACTION_STOP)) { g_hash_table_foreach(monitors, stop_recurring_action, rsc); g_hash_table_foreach_remove( monitors, remove_recurring_action, rsc); } /* now do the op */ crm_info("Performing op %s on %s", operation, rid); crm_malloc0(op, sizeof(lrm_op_t)); op->op_type = crm_strdup(operation); op->op_status = LRM_OP_PENDING; op->user_data = NULL; op->user_data_len = 0; if(transition != NULL) { op->user_data = crm_strdup(transition); op->user_data_len = 1+strlen(op->user_data); } else { CRM_DEV_ASSERT(safe_str_eq(CRMD_ACTION_STOP, operation)); } if(params == NULL) { if(msg != NULL) { params = xml2list(msg); } else { CRM_DEV_ASSERT(safe_str_eq( CRMD_ACTION_STOP, operation)); } } op->params = params; op->interval = crm_get_msec(g_hash_table_lookup(op->params,"interval")); op->timeout = crm_get_msec(g_hash_table_lookup(op->params, "timeout")); op->start_delay = crm_get_msec( g_hash_table_lookup(op->params,"start_delay")); /* sanity */ if(op->interval < 0) { op->interval = 0; } if(op->timeout < 0) { op->timeout = 0; } if(op->start_delay < 0) { op->start_delay = 0; } if(g_hash_table_lookup(op->params, "timeout") != NULL) { char *timeout_ms = crm_itoa(op->timeout); g_hash_table_replace( op->params, crm_strdup("timeout"), timeout_ms); } if(g_hash_table_lookup(op->params, "interval") != NULL) { char *interval_ms = crm_itoa(op->interval); g_hash_table_replace( op->params, crm_strdup("interval"), interval_ms); } if(g_hash_table_lookup(op->params, "start_delay") != NULL) { char *delay_ms = crm_itoa(op->start_delay); g_hash_table_replace( op->params, crm_strdup("start_delay"), delay_ms); } if(safe_str_eq(operation, CRMD_ACTION_START) || safe_str_eq(operation, CRMD_ACTION_STOP)) { char *tmp = g_hash_table_lookup(op->params, "interval"); /* CRM_DEV_ASSERT(op->interval == 0); */ /* CRM_DEV_ASSERT(tmp == NULL); */ if(op->interval != 0) { crm_err("Interval for %s oepration was not 0", operation); } if(tmp != NULL) { crm_warn("An interval (%s) was specified for a" " %s operation", tmp, operation); } } if(safe_str_neq(operation, CRMD_ACTION_STOP)) { if((AM_I_DC == FALSE && fsa_state != S_NOT_DC) || (AM_I_DC && fsa_state != S_TRANSITION_ENGINE)) { crm_info("Discarding attempt to perform action %s on %s" " in state %s", operation, rid, fsa_state2string(fsa_state)); op->rsc_id = crm_strdup(rsc->id); nack_rsc_op(op, request); free_lrm_op(op); crm_free(op_id); return I_NULL; } } if(op->interval > 0) { struct recurring_op_s *existing_op = NULL; op_id = generate_op_key(rsc->id, op->op_type, op->interval); existing_op = g_hash_table_lookup(monitors, op_id); if(existing_op != NULL) { crm_debug("Cancelling previous invocation of" " %s on %s (%d)", crm_str(op_id), rsc->id,existing_op->call_id); /*cancel it so we can then restart it without conflict*/ rsc->ops->cancel_op(rsc, existing_op->call_id); g_hash_table_remove(monitors, op_id); } } op->app_name = crm_strdup(CRM_SYSTEM_CRMD); if(safe_str_eq(operation, CRMD_ACTION_MON)) { op->target_rc = CHANGED; } else { op->target_rc = EVERYTIME; } if(safe_str_eq(CRMD_ACTION_MON, operation)) { const char *last_op = g_hash_table_lookup(resources, rsc->id); if(safe_str_eq(last_op, CRMD_ACTION_STOP)) { crm_warn("Attempting to schedule %s after a stop.", op_id); free_lrm_op(op); crm_free(op_id); return I_NULL; } } g_hash_table_replace( resources, crm_strdup(rsc->id), crm_strdup(operation)); call_id = rsc->ops->perform_op(rsc, op); if(call_id <= 0) { crm_err("Operation %s on %s failed: %d", operation, rid, call_id); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); } else if(op->interval > 0) { struct recurring_op_s *op = NULL; crm_malloc0(op, sizeof(struct recurring_op_s)); crm_debug("Adding recurring %s op for %s (%d)", op_id, rsc->id, call_id); op->call_id = call_id; op->rsc_id = crm_strdup(rsc->id); g_hash_table_insert(monitors, op_id, op); op_id = NULL; } else { /* record all non-recurring operations so we can wait * for them to complete during shutdown */ char *call_id_s = make_stop_id(rsc->id, call_id); g_hash_table_replace( shutdown_ops, call_id_s, crm_strdup(rsc->id)); crm_debug("Recording pending op: %s", call_id_s); } crm_free(op_id); free_lrm_op(op); return I_NULL; } void stop_recurring_action(gpointer key, gpointer value, gpointer user_data) { lrm_rsc_t *rsc = user_data; struct recurring_op_s *op = (struct recurring_op_s*)value; if(safe_str_eq(op->rsc_id, rsc->id)) { if(op->call_id > 0) { crm_debug("Stopping recurring op %d for %s (%s)", op->call_id, rsc->id, (char*)key); rsc->ops->cancel_op(rsc, op->call_id); } else { crm_err("Invalid call_id %d for %s", op->call_id, rsc->id); /* TODO: we probably need to look up the LRM to find it */ } } } gboolean remove_recurring_action(gpointer key, gpointer value, gpointer user_data) { lrm_rsc_t *rsc = user_data; struct recurring_op_s *op = (struct recurring_op_s*)value; if(safe_str_eq(op->rsc_id, rsc->id)) { return TRUE; } return FALSE; } void free_recurring_op(gpointer value) { struct recurring_op_s *op = (struct recurring_op_s*)value; crm_free(op->rsc_id); crm_free(op); } void free_lrm_op(lrm_op_t *op) { g_hash_table_destroy(op->params); crm_free(op->user_data); crm_free(op->output); crm_free(op->rsc_id); crm_free(op->op_type); crm_free(op->app_name); crm_free(op); } static void dup_attr(gpointer key, gpointer value, gpointer user_data) { g_hash_table_replace(user_data, crm_strdup(key), crm_strdup(value)); } lrm_op_t * copy_lrm_op(const lrm_op_t *op) { lrm_op_t *op_copy = NULL; CRM_DEV_ASSERT(op != NULL); if(crm_assert_failed) { return NULL; } CRM_ASSERT(op->rsc_id != NULL); crm_malloc0(op_copy, sizeof(lrm_op_t)); op_copy->op_type = crm_strdup(op->op_type); /* input fields */ op_copy->params = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); if(op->params != NULL) { g_hash_table_foreach(op->params, dup_attr, op_copy->params); } op_copy->timeout = op->timeout; op_copy->interval = op->interval; op_copy->target_rc = op->target_rc; /* in the CRM, this is always a string */ if(op->user_data != NULL) { op_copy->user_data = crm_strdup(op->user_data); } /* output fields */ op_copy->op_status = op->op_status; op_copy->rc = op->rc; op_copy->call_id = op->call_id; op_copy->output = NULL; op_copy->rsc_id = crm_strdup(op->rsc_id); if(op->app_name != NULL) { op_copy->app_name = crm_strdup(op->app_name); } if(op->output != NULL) { op_copy->output = crm_strdup(op->output); } return op_copy; } lrm_rsc_t * copy_lrm_rsc(const lrm_rsc_t *rsc) { lrm_rsc_t *rsc_copy = NULL; if(rsc == NULL) { return NULL; } crm_malloc0(rsc_copy, sizeof(lrm_rsc_t)); rsc_copy->id = crm_strdup(rsc->id); rsc_copy->type = crm_strdup(rsc->type); rsc_copy->class = NULL; rsc_copy->provider = NULL; if(rsc->class != NULL) { rsc_copy->class = crm_strdup(rsc->class); } if(rsc->provider != NULL) { rsc_copy->provider = crm_strdup(rsc->provider); } /* GHashTable* params; */ rsc_copy->params = NULL; rsc_copy->ops = NULL; return rsc_copy; } void do_update_resource(lrm_op_t* op) { /* */ crm_data_t *update, *iter; crm_data_t *fragment; int rc = cib_ok; CRM_DEV_ASSERT(op != NULL); if(crm_assert_failed) { return; } update = create_xml_node(NULL, XML_CIB_TAG_STATE); set_uuid(fsa_cluster_conn, update, XML_ATTR_UUID, fsa_our_uname); crm_xml_add(update, XML_ATTR_UNAME, fsa_our_uname); iter = create_xml_node(update, XML_CIB_TAG_LRM); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE); crm_xml_add(iter, XML_ATTR_ID, op->rsc_id); build_operation_update(iter, op, __FUNCTION__, 0); fragment = create_cib_fragment(update, NULL); /* make it an asyncronous call and be done with it * * Best case: * the resource state will be discovered during * the next signup or election. * * Bad case: * we are shutting down and there is no DC at the time, * but then why were we shutting down then anyway? * (probably because of an internal error) * * Worst case: * we get shot for having resources "running" when the really weren't * * the alternative however means blocking here for too long, which * isnt acceptable */ rc = fsa_cib_conn->cmds->modify( fsa_cib_conn, XML_CIB_TAG_STATUS, fragment, NULL, cib_quorum_override); if(rc > 0) { /* the return code is a call number, not an error code */ crm_debug_3("Sent resource state update message: %d", rc); } else { crm_err("Resource state update failed: %s", cib_error2string(rc)); CRM_DEV_ASSERT(rc == cib_ok); } free_xml(fragment); free_xml(update); } enum crmd_fsa_input do_lrm_event(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data) { lrm_op_t* op = NULL; const char *last_op = NULL; if(msg_data->fsa_cause != C_LRM_OP_CALLBACK) { register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); return I_NULL; } op = fsa_typed_data(fsa_dt_lrm); CRM_DEV_ASSERT(op != NULL); CRM_DEV_ASSERT(op != NULL && op->rsc_id != NULL); if(crm_assert_failed) { return I_NULL; } if(op->op_status == LRM_OP_DONE && op->rc != EXECRA_OK) { crm_warn("Mapping operation %d status with a rc=%d" " to status %d", op->op_status, op->rc, LRM_OP_ERROR); op->op_status = LRM_OP_ERROR; } switch(op->op_status) { case LRM_OP_PENDING: /* this really shouldnt happen */ crm_err("LRM operation (%d) %s on %s %s: %s", op->call_id, op->op_type, crm_str(op->rsc_id), op_status2text(op->op_status), execra_code2string(op->rc)); break; case LRM_OP_ERROR: crm_err("LRM operation (%d) %s on %s %s: %s", op->call_id, op->op_type, crm_str(op->rsc_id), op_status2text(op->op_status), execra_code2string(op->rc)); crm_debug("Result: %s", op->output); break; case LRM_OP_CANCELLED: crm_warn("LRM operation (%d) %s on %s %s", op->call_id, op->op_type, crm_str(op->rsc_id), op_status2text(op->op_status)); return I_NULL; break; case LRM_OP_TIMEOUT: last_op = g_hash_table_lookup( resources_confirmed, crm_strdup(op->rsc_id)); if(safe_str_eq(last_op, CRMD_ACTION_STOP) && safe_str_eq(op->op_type, CRMD_ACTION_MON)) { crm_err("LRM sent a timed out %s operation" " _after_ a confirmed stop", op->op_type); return I_NULL; } crm_err("LRM operation (%d) %s on %s %s", op->call_id, op->op_type, crm_str(op->rsc_id), op_status2text(op->op_status)); break; case LRM_OP_NOTSUPPORTED: crm_err("LRM operation (%d) %s on %s %s", op->call_id, op->op_type, crm_str(op->rsc_id), op_status2text(op->op_status)); break; case LRM_OP_DONE: crm_debug("LRM operation (%d) %s on %s %s", op->call_id, op->op_type, crm_str(op->rsc_id), op_status2text(op->op_status)); break; } g_hash_table_replace(resources_confirmed, crm_strdup(op->rsc_id), crm_strdup(op->op_type)); do_update_resource(op); if(g_hash_table_size(shutdown_ops) > 0) { char *op_id = make_stop_id(op->rsc_id, op->call_id); if(g_hash_table_remove(shutdown_ops, op_id)) { crm_debug("Op %d (%s %s) confirmed", op->call_id, op->op_type, op->rsc_id); } else if(op->interval == 0) { crm_err("Op %d (%s %s) not matched: %s", op->call_id, op->op_type, op->rsc_id, op_id); } crm_free(op_id); } if(is_set(fsa_input_register, R_SENT_RSC_STOP)) { if(g_hash_table_size(shutdown_ops) == 0) { register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } else { crm_debug("Still waiting for %d pending stop operations" " to complete before exiting", g_hash_table_size(shutdown_ops)); g_hash_table_foreach( shutdown_ops, ghash_print_pending, NULL); } } return I_NULL; } char * make_stop_id(const char *rsc, int call_id) { char *op_id = NULL; crm_malloc0(op_id, strlen(rsc) + 34); if(op_id != NULL) { snprintf(op_id, strlen(rsc) + 34, "%s:%d", rsc, call_id); } return op_id; } void ghash_print_pending(gpointer key, gpointer value, gpointer user_data) { const char *uname = key; crm_debug("Pending action: %s", uname); } gboolean resource_stopped(gpointer key, gpointer value, gpointer user_data) { const char *this_rsc = value; const char *target_rsc = user_data; if(safe_str_eq(this_rsc, target_rsc)) { return TRUE; } return FALSE; } diff --git a/crm/tengine/utils.c b/crm/tengine/utils.c index 3865c45d58..bc5f837896 100644 --- a/crm/tengine/utils.c +++ b/crm/tengine/utils.c @@ -1,527 +1,527 @@ -/* $Id: utils.c,v 1.44 2005/09/12 11:09:55 andrew Exp $ */ +/* $Id: utils.c,v 1.45 2005/09/14 15:24:48 andrew Exp $ */ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include extern cib_t *te_cib_conn; extern int global_transition_timer; extern int transition_counter; void print_input(const char *prefix, action_t *input, gboolean to_file); void print_action(const char *prefix, action_t *action, gboolean to_file); gboolean timer_callback(gpointer data); int unconfirmed_actions(void) { int unconfirmed = 0; crm_debug_2("Unconfirmed actions..."); slist_iter( synapse, synapse_t, graph, lpc, /* lookup event */ slist_iter( action, action_t, synapse->actions, lpc2, if(action->invoked && action->complete == FALSE) { unconfirmed++; crm_debug("Action %d: unconfirmed",action->id); } ); ); return unconfirmed; } void send_complete(const char *text, crm_data_t *msg, te_reason_t reason, te_fsa_input_t input) { int log_level = LOG_DEBUG; int unconfirmed = unconfirmed_actions(); int pending_callbacks = num_cib_op_callbacks(); HA_Message *cmd = NULL; const char *op = CRM_OP_TEABORT; static te_reason_t last_reason = te_done; static crm_data_t *last_msg = NULL; static const char *last_text = NULL; te_fsa_state_t last_state = te_fsa_state; te_fsa_state = te_state_matrix[input][te_fsa_state]; if(te_fsa_state != last_state) { crm_debug("State change %d->%d", last_state, te_fsa_state); } if(te_fsa_state == s_abort_pending && (unconfirmed == 0 || reason == te_timeout || reason == te_abort_timeout)) { crm_debug("Faking i_cmd_complete: %d/%d", last_state, input); te_fsa_state = te_state_matrix[i_cmd_complete][te_fsa_state]; crm_debug("State change %d->%d", last_state, te_fsa_state); crm_debug("Stopping abort timer"); stop_te_timer(abort_timer); } else if(te_fsa_state ==s_abort_pending && te_fsa_state !=last_state) { abort_timer->timeout = transition_timer->timeout; crm_info("Starting abort timer: %dms", abort_timer->timeout); start_te_timer(abort_timer); } if(te_fsa_state == s_updates_pending && pending_callbacks == 0) { crm_debug("Faking i_cib_complete: %d/%d", last_state, input); te_fsa_state = te_state_matrix[i_cib_complete][te_fsa_state]; crm_debug("State change %d->%d", last_state, te_fsa_state); } if(te_fsa_state == last_state && (last_state==s_abort_pending || last_state==s_updates_pending)) { crm_info("Transaction already cancelled"); } switch(reason) { case te_update: te_log_action( LOG_INFO, "%d - Transition status: %s by CIB update: %s", transition_counter, last_state!=s_idle?"Aborted":"Triggered", text); if(msg != NULL) { if(safe_str_eq(crm_element_name(msg), XML_TAG_CIB)) { crm_data_t *status = get_object_root(XML_CIB_TAG_STATUS, msg); crm_data_t *generation = create_xml_node(NULL, XML_TAG_CIB); crm_debug("Cause:" " full CIB replace/update"); copy_in_properties(generation, msg); crm_log_xml_debug(generation, "[generation]"); crm_log_xml_debug(status, "[in ]"); free_xml(generation); } else { crm_log_xml_debug(msg, "Cause"); } } break; case te_halt: te_log_action( LOG_INFO, "%d - Transition status: Stopped%s%s", transition_counter, text?": ":"", text?text:""); break; case te_abort_confirmed: te_log_action( LOG_INFO, "%d - Transition status: Confirmed Stopped%s%s", transition_counter, text?": ":"", text?text:""); break; case te_abort: te_log_action( LOG_INFO,"%d - Transition status: Stopped%s%s", transition_counter, text?": ":"", text?text:""); break; case te_done: te_log_action( LOG_INFO,"%d - Transition status: Complete%s%s", transition_counter, text?": ":"", text?text:""); break; case te_abort_timeout: te_log_action( LOG_ERR, "%d - Transition status: Abort timed out after %dms", transition_counter, abort_timer->timeout); log_level = LOG_WARNING; break; case te_timeout: te_log_action( LOG_ERR, "%d - Transition status: Timed out after %dms", transition_counter, transition_timer->timeout); log_level = LOG_WARNING; break; case te_failed: te_log_action( - LOG_ERR, "%d - Transition status: Aborted by failed action: %s", + LOG_WARNING, "%d - Transition status: Aborted by failed action: %s", transition_counter, text); crm_log_xml_debug(msg, "Cause"); log_level = LOG_WARNING; break; } if(te_fsa_state != s_idle) { if(last_text == NULL) { /* store the original input */ crm_debug("Storing TE input."); last_msg = NULL; if(msg != NULL) { last_msg = copy_xml(msg); } last_text = text; last_reason = reason; } crm_info("%d - Delay abort until %d updates and %d actions complete (state=%d).", transition_counter, pending_callbacks, unconfirmed, te_fsa_state); return; } else if(last_text != NULL) { /* restore the original reason we aborted */ crm_debug("Restoring TE input."); msg = last_msg; text = last_text; reason = last_reason; last_msg = NULL; last_text = NULL; } CRM_DEV_ASSERT(pending_callbacks == 0); print_state(log_level); initialize_graph(); switch(reason) { case te_abort: case te_abort_timeout: case te_failed: case te_update: op = CRM_OP_TEABORT; break; case te_halt: op = CRM_OP_TECOMPLETE; break; case te_abort_confirmed: op = CRM_OP_TEABORTED; break; case te_done: op = CRM_OP_TECOMPLETE; break; case te_timeout: op = CRM_OP_TETIMEOUT; break; } cmd = create_request( op, NULL, NULL, CRM_SYSTEM_DC, CRM_SYSTEM_TENGINE, NULL); if(text != NULL) { ha_msg_add(cmd, "message", text); } free_xml(last_msg); #ifdef TESTING if(reason == te_done) { crm_log_message(LOG_INFO, cmd); } else { crm_log_message(LOG_ERR, cmd); } g_main_quit(mainloop); return; #else send_ipc_message(crm_ch, cmd); #endif #if 0 if(is_ipc_empty(crm_ch) && is_ipc_empty(te_cib_conn->cmds->channel(te_cib_conn)) ) { static gboolean mem_needs_init = TRUE; if(mem_needs_init) { crm_debug("Reached a stable point:" " reseting memory usage stats to zero"); crm_zero_mem_stats(NULL); mem_needs_init = FALSE; } else { crm_err("Reached a stable point:" " checking memory usage"); crm_mem_stats(NULL); } } #endif } void print_state(unsigned int log_level) { gboolean first_synapse = TRUE; if(graph == NULL && log_level > LOG_DEBUG) { crm_debug("## Empty transition graph ##"); return; } slist_iter( synapse, synapse_t, graph, lpc, first_synapse = FALSE; crm_log_maybe(log_level, "Synapse %d %s", synapse->id, synapse->confirmed?"was confirmed":synapse->complete?"was executed":"is pending"); if(synapse->confirmed == FALSE) { slist_iter( action, action_t, synapse->actions, lpc2, print_action("\t", action, log_level); ); } if(synapse->complete == FALSE) { slist_iter( input, action_t, synapse->inputs, lpc2, print_input("\t", input, log_level); ); } ); if(first_synapse && log_level > LOG_DEBUG) { crm_debug("## Empty transition graph ##"); return; } } void print_input(const char *prefix, action_t *input, int log_level) { do_crm_log(log_level, __FILE__, __FUNCTION__, "%s[Input %d] %s (%s)", prefix, input->id, input->complete?"Satisfied":"Pending", actiontype2text(input->type)); if(input->complete == FALSE) { crm_log_xml(log_level+2, "\t\t\tRaw input: ", input->xml); } } void print_action(const char *prefix, action_t *action, int log_level) { do_crm_log(log_level, __FILE__, __FUNCTION__, "%s[Action %d] %s (%s fail)", prefix, action->id, action->complete?"Completed": action->invoked?"In-flight": action->sent_update?"Update sent":"Pending", action->can_fail?"can":"cannot"); switch(action->type) { case action_type_pseudo: do_crm_log(log_level, __FILE__, __FUNCTION__, "%s\tPseudo Op: %s", prefix, crm_element_value( action->xml, XML_LRM_ATTR_TASK)); break; case action_type_rsc: do_crm_log(log_level, __FILE__, __FUNCTION__, "%s\tResource Op: %s/%s on %s (%s)", prefix, crm_element_value( action->xml, XML_LRM_ATTR_RSCID), crm_element_value( action->xml, XML_LRM_ATTR_TASK), crm_element_value( action->xml, XML_LRM_ATTR_TARGET), crm_element_value( action->xml, XML_LRM_ATTR_TARGET_UUID)); break; case action_type_crm: do_crm_log(log_level, __FILE__, __FUNCTION__, "%s\tCRM Op: %s on %s (%s)", prefix, crm_element_value( action->xml, XML_LRM_ATTR_TASK), crm_element_value( action->xml, XML_LRM_ATTR_TARGET), crm_element_value( action->xml, XML_LRM_ATTR_TARGET_UUID)); break; } if(action->timeout > 0 || action->timer->source_id > 0) { do_crm_log(log_level, __FILE__, __FUNCTION__, "%s\ttimeout=%d, timer=%d", prefix, action->timeout, action->timer->source_id); } if(action->complete == FALSE) { crm_log_xml(log_level+2, "\t\t\tRaw action: ", action->xml); } } gboolean timer_callback(gpointer data) { te_timer_t *timer = NULL; if(data == NULL) { crm_err("Timer popped with no data"); return FALSE; } timer = (te_timer_t*)data; if(timer->source_id > 0) { Gmain_timeout_remove(timer->source_id); } timer->source_id = -1; crm_warn("Timer popped in state=%d", te_fsa_state); if(timer->reason == timeout_abort) { crm_err("Transition abort timeout reached..." " marking transition complete."); send_complete(XML_ATTR_TIMEOUT, NULL, te_abort_timeout, i_cmd_complete); return TRUE; } else if(te_fsa_state != s_in_transition) { crm_debug("Ignoring timeout while not in transition"); return TRUE; } else if(timer->reason == timeout_timeout) { /* global timeout - abort the transition */ crm_warn("Transition timeout reached..." " marking transition complete."); crm_warn("Some actions may not have been executed."); send_complete(XML_ATTR_TIMEOUT, NULL, te_timeout, i_cancel); return TRUE; } else if(timer->action == NULL) { crm_err("Action not present!"); return FALSE; } else if(timer->reason == timeout_action_warn) { print_action("Action missed its timeout", timer->action, LOG_WARNING); return TRUE; } else { /* fail the action * - which may or may not abort the transition */ /* TODO: send a cancel notice to the LRM */ /* TODO: use the ack from above to update the CIB */ return cib_action_update(timer->action, LRM_OP_TIMEOUT); } } gboolean start_te_timer(te_timer_t *timer) { if(((int)timer->source_id) < 0 && timer->timeout > 0) { timer->source_id = Gmain_timeout_add( timer->timeout, timer_callback, (void*)timer); return TRUE; } else if(timer->timeout < 0) { crm_err("Tried to start timer with -ve period"); } else { crm_debug_3("#!!#!!# Timer already running (%d)", timer->source_id); } return FALSE; } gboolean stop_te_timer(te_timer_t *timer) { if(timer == NULL) { return FALSE; } if(((int)timer->source_id) > 0) { Gmain_timeout_remove(timer->source_id); timer->source_id = -2; } else { return FALSE; } return TRUE; } const char * actiontype2text(action_type_e type) { switch(type) { case action_type_pseudo: return "pseduo"; case action_type_rsc: return "rsc"; case action_type_crm: return "crm"; } return ""; } const char * get_rsc_state(const char *task, op_status_t status) { if(safe_str_eq(CRMD_ACTION_START, task)) { if(status == LRM_OP_PENDING) { return CRMD_ACTION_START_PENDING; } else if(status == LRM_OP_DONE) { return CRMD_ACTION_STARTED; } else { return CRMD_ACTION_START_FAIL; } } else if(safe_str_eq(CRMD_ACTION_STOP, task)) { if(status == LRM_OP_PENDING) { return CRMD_ACTION_STOP_PENDING; } else if(status == LRM_OP_DONE) { return CRMD_ACTION_STOPPED; } else { return CRMD_ACTION_STOP_FAIL; } } else { if(safe_str_eq(CRMD_ACTION_MON, task)) { if(status == LRM_OP_PENDING) { return CRMD_ACTION_MON_PENDING; } else if(status == LRM_OP_DONE) { return CRMD_ACTION_MON_OK; } else { return CRMD_ACTION_MON_FAIL; } } else { const char *rsc_state = NULL; if(status == LRM_OP_PENDING) { rsc_state = CRMD_ACTION_GENERIC_PENDING; } else if(status == LRM_OP_DONE) { rsc_state = CRMD_ACTION_GENERIC_OK; } else { rsc_state = CRMD_ACTION_GENERIC_FAIL; } crm_warn("Using status \"%s\" for op \"%s\"..." " this is still in the experimental stage.", rsc_state, task); return rsc_state; } } }