diff --git a/crmd/fsa_defines.h b/crmd/fsa_defines.h index 7b2ca00087..ad8ca98cb6 100644 --- a/crmd/fsa_defines.h +++ b/crmd/fsa_defines.h @@ -1,493 +1,499 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef FSA_DEFINES__H # define FSA_DEFINES__H /*====================================== * States the DC/CRMd can be in *======================================*/ enum crmd_fsa_state { S_IDLE = 0, /* Nothing happening */ S_ELECTION, /* Take part in the election algorithm as * described below */ S_INTEGRATION, /* integrate that status of new nodes (which is * all of them if we have just been elected DC) * to form a complete and up-to-date picture of * the CIB */ S_FINALIZE_JOIN, /* integrate that status of new nodes (which is * all of them if we have just been elected DC) * to form a complete and up-to-date picture of * the CIB */ S_NOT_DC, /* we are in crmd/slave mode */ S_POLICY_ENGINE, /* Determin the next stable state of the cluster */ S_RECOVERY, /* Something bad happened, check everything is ok * before continuing and attempt to recover if * required */ S_RELEASE_DC, /* we were the DC, but now we arent anymore, * possibly by our own request, and we should * release all unnecessary sub-systems, finish * any pending actions, do general cleanup and * unset anything that makes us think we are * special :) */ S_STARTING, /* we are just starting out */ S_PENDING, /* we are not a full/active member yet */ S_STOPPING, /* We are in the final stages of shutting down */ S_TERMINATE, /* We are going to shutdown, this is the equiv of * "Sending TERM signal to all processes" in Linux * and in worst case scenarios could be considered * a self STONITH */ S_TRANSITION_ENGINE, /* Attempt to make the calculated next stable * state of the cluster a reality */ S_HALT, /* Freeze - dont do anything * Something ad happened that needs the admin to fix * Wait for I_ELECTION */ /* ----------- Last input found in table is above ---------- */ S_ILLEGAL /* This is an illegal FSA state */ /* (must be last) */ }; # define MAXSTATE S_ILLEGAL /* A state diagram can be constructed from the dc_fsa.dot with the following command: dot -Tpng crmd_fsa.dot > crmd_fsa.png Description: Once we start and do some basic sanity checks, we go into the S_NOT_DC state and await instructions from the DC or input from the CCM which indicates the election algorithm needs to run. If the election algorithm is triggered we enter the S_ELECTION state from where we can either go back to the S_NOT_DC state or progress to the S_INTEGRATION state (or S_RELEASE_DC if we used to be the DC but arent anymore). The election algorithm has been adapted from http://www.cs.indiana.edu/cgi-bin/techreports/TRNNN.cgi?trnum=TR521 Loosly known as the Bully Algorithm, its major points are: - Election is initiated by any node (N) notices that the coordinator is no longer responding - Concurrent multiple elections are possible - Algorithm + N sends ELECTION messages to all nodes that occur earlier in the CCM's membership list. + If no one responds, N wins and becomes coordinator + N sends out COORDINATOR messages to all other nodes in the partition + If one of higher-ups answers, it takes over. N is done. Once the election is complete, if we are the DC, we enter the S_INTEGRATION state which is a DC-in-waiting style state. We are the DC, but we shouldnt do anything yet because we may not have an up-to-date picture of the cluster. There may of course be times when this fails, so we should go back to the S_RECOVERY stage and check everything is ok. We may also end up here if a new node came online, since each node is authorative on itself and we would want to incorporate its information into the CIB. Once we have the latest CIB, we then enter the S_POLICY_ENGINE state where invoke the Policy Engine. It is possible that between invoking the Policy Engine and receiving an answer, that we receive more input. In this case we would discard the orginal result and invoke it again. Once we are satisfied with the output from the Policy Engine we enter S_TRANSITION_ENGINE and feed the Policy Engine's output to the Transition Engine who attempts to make the Policy Engine's calculation a reality. If the transition completes successfully, we enter S_IDLE, otherwise we go back to S_POLICY_ENGINE with the current unstable state and try again. Of course we may be asked to shutdown at any time, however we must progress to S_NOT_DC before doing so. Once we have handed over DC duties to another node, we can then shut down like everyone else, that is by asking the DC for permission and waiting it to take all our resources away. The case where we are the DC and the only node in the cluster is a special case and handled as an escalation which takes us to S_SHUTDOWN. Similarly if any other point in the shutdown fails or stalls, this is escalated and we end up in S_TERMINATE. At any point, the CRMd/DC can relay messages for its sub-systems, but outbound messages (from sub-systems) should probably be blocked until S_INTEGRATION (for the DC case) or the join protocol has completed (for the CRMd case) */ /*====================================== * * Inputs/Events/Stimuli to be given to the finite state machine * * Some of these a true events, and others a synthesised based on * the "register" (see below) and the contents or source of messages. * * At this point, my plan is to have a loop of some sort that keeps * going until receiving I_NULL * *======================================*/ enum crmd_fsa_input { /* 0 */ I_NULL, /* Nothing happened */ /* 1 */ I_CIB_OP, /* An update to the CIB occurred */ I_CIB_UPDATE, /* An update to the CIB occurred */ I_DC_TIMEOUT, /* We have lost communication with the DC */ I_ELECTION, /* Someone started an election */ I_PE_CALC, /* The Policy Engine needs to be invoked */ I_RELEASE_DC, /* The election completed and we were not * elected, but we were the DC beforehand */ I_ELECTION_DC, /* The election completed and we were (re-)elected * DC */ I_ERROR, /* Something bad happened (more serious than * I_FAIL) and may not have been due to the action * being performed. For example, we may have lost * our connection to the CIB. */ /* 9 */ I_FAIL, /* The action failed to complete successfully */ I_INTEGRATED, I_FINALIZED, I_NODE_JOIN, /* A node has entered the cluster */ I_NOT_DC, /* We are not and were not the DC before or after * the current operation or state */ I_RECOVERED, /* The recovery process completed successfully */ I_RELEASE_FAIL, /* We could not give up DC status for some reason */ I_RELEASE_SUCCESS, /* We are no longer the DC */ I_RESTART, /* The current set of actions needs to be * restarted */ I_TE_SUCCESS, /* Some non-resource, non-ccm action is required * of us, eg. ping */ /* 20 */ I_ROUTER, /* Do our job as router and forward this to the * right place */ I_SHUTDOWN, /* We are asking to shutdown */ I_STOP, /* We have been told to shutdown */ I_TERMINATE, /* Actually exit */ I_STARTUP, I_PE_SUCCESS, /* The action completed successfully */ I_JOIN_OFFER, /* The DC is offering membership */ I_JOIN_REQUEST, /* The client is requesting membership */ I_JOIN_RESULT, /* If not the DC: The result of a join request * Else: A client is responding with its local state info */ I_WAIT_FOR_EVENT, /* we may be waiting for an async task to "happen" * and until it does, we cant do anything else */ I_DC_HEARTBEAT, /* The DC is telling us that it is alive and well */ I_LRM_EVENT, /* 30 */ I_PENDING, I_HALT, /* ------------ Last input found in table is above ----------- */ I_ILLEGAL /* This is an illegal value for an FSA input */ /* (must be last) */ }; # define MAXINPUT I_ILLEGAL # define I_MESSAGE I_ROUTER /*====================================== * * actions * * Some of the actions below will always occur together for now, but I can * forsee that this may not always be the case. So I've spilt them up so * that if they ever do need to be called independantly in the future, it * wont be a problem. * * For example, separating A_LRM_CONNECT from A_STARTUP might be useful * if we ever try to recover from a faulty or disconnected LRM. * *======================================*/ /* Dont do anything */ # define A_NOTHING 0x0000000000000000ULL /* -- Startup actions -- */ /* Hook to perform any actions (other than starting the CIB, * connecting to HA or the CCM) that might be needed as part * of the startup. */ # define A_STARTUP 0x0000000000000001ULL /* Hook to perform any actions that might be needed as part * after startup is successful. */ # define A_STARTED 0x0000000000000002ULL /* Connect to Heartbeat */ # define A_HA_CONNECT 0x0000000000000004ULL # define A_HA_DISCONNECT 0x0000000000000008ULL # define A_INTEGRATE_TIMER_START 0x0000000000000010ULL # define A_INTEGRATE_TIMER_STOP 0x0000000000000020ULL # define A_FINALIZE_TIMER_START 0x0000000000000040ULL # define A_FINALIZE_TIMER_STOP 0x0000000000000080ULL /* -- Election actions -- */ # define A_DC_TIMER_START 0x0000000000000100ULL # define A_DC_TIMER_STOP 0x0000000000000200ULL # define A_ELECTION_COUNT 0x0000000000000400ULL # define A_ELECTION_VOTE 0x0000000000000800ULL # define A_ELECTION_START 0x0000000000001000ULL /* -- Message processing -- */ /* Process the queue of requests */ # define A_MSG_PROCESS 0x0000000000002000ULL /* Send the message to the correct recipient */ # define A_MSG_ROUTE 0x0000000000004000ULL /* Send a welcome message to new node(s) */ # define A_DC_JOIN_OFFER_ONE 0x0000000000008000ULL /* -- Server Join protocol actions -- */ /* Send a welcome message to all nodes */ # define A_DC_JOIN_OFFER_ALL 0x0000000000010000ULL /* Process the remote node's ack of our join message */ # define A_DC_JOIN_PROCESS_REQ 0x0000000000020000ULL /* Send out the reults of the Join phase */ # define A_DC_JOIN_FINALIZE 0x0000000000040000ULL /* Send out the reults of the Join phase */ # define A_DC_JOIN_PROCESS_ACK 0x0000000000080000ULL /* -- Client Join protocol actions -- */ # define A_CL_JOIN_QUERY 0x0000000000100000ULL # define A_CL_JOIN_ANNOUNCE 0x0000000000200000ULL /* Request membership to the DC list */ # define A_CL_JOIN_REQUEST 0x0000000000400000ULL /* Did the DC accept or reject the request */ # define A_CL_JOIN_RESULT 0x0000000000800000ULL /* -- Recovery, DC start/stop -- */ /* Something bad happened, try to recover */ # define A_RECOVER 0x0000000001000000ULL /* Hook to perform any actions (apart from starting, the TE, PE * and gathering the latest CIB) that might be necessary before * giving up the responsibilities of being the DC. */ # define A_DC_RELEASE 0x0000000002000000ULL /* */ # define A_DC_RELEASED 0x0000000004000000ULL /* Hook to perform any actions (apart from starting, the TE, PE * and gathering the latest CIB) that might be necessary before * taking over the responsibilities of being the DC. */ # define A_DC_TAKEOVER 0x0000000008000000ULL /* -- Shutdown actions -- */ # define A_SHUTDOWN 0x0000000010000000ULL # define A_STOP 0x0000000020000000ULL # define A_EXIT_0 0x0000000040000000ULL # define A_EXIT_1 0x0000000080000000ULL # define A_SHUTDOWN_REQ 0x0000000100000000ULL # define A_ELECTION_CHECK 0x0000000200000000ULL # define A_DC_JOIN_FINAL 0x0000000400000000ULL /* -- CCM actions -- */ # define A_CCM_CONNECT 0x0000001000000000ULL # define A_CCM_DISCONNECT 0x0000002000000000ULL /* -- CIB actions -- */ # define A_CIB_START 0x0000020000000000ULL # define A_CIB_STOP 0x0000040000000000ULL /* -- Transition Engine actions -- */ /* Attempt to reach the newly calculated cluster state. This is * only called once per transition (except if it is asked to * stop the transition or start a new one). * Once given a cluster state to reach, the TE will determin * tasks that can be performed in parallel, execute them, wait * for replies and then determin the next set until the new * state is reached or no further tasks can be taken. */ # define A_TE_INVOKE 0x0000100000000000ULL # define A_TE_START 0x0000200000000000ULL # define A_TE_STOP 0x0000400000000000ULL # define A_TE_CANCEL 0x0000800000000000ULL # define A_TE_HALT 0x0001000000000000ULL /* -- Policy Engine actions -- */ /* Calculate the next state for the cluster. This is only * invoked once per needed calculation. */ # define A_PE_INVOKE 0x0002000000000000ULL # define A_PE_START 0x0004000000000000ULL # define A_PE_STOP 0x0008000000000000ULL /* -- Misc actions -- */ /* Add a system generate "block" so that resources arent moved * to or are activly moved away from the affected node. This * way we can return quickly even if busy with other things. */ # define A_NODE_BLOCK 0x0010000000000000ULL /* Update our information in the local CIB */ # define A_UPDATE_NODESTATUS 0x0020000000000000ULL # define A_CIB_BUMPGEN 0x0040000000000000ULL # define A_READCONFIG 0x0080000000000000ULL /* -- LRM Actions -- */ /* Connect to the Local Resource Manager */ # define A_LRM_CONNECT 0x0100000000000000ULL /* Disconnect from the Local Resource Manager */ # define A_LRM_DISCONNECT 0x0200000000000000ULL # define A_LRM_INVOKE 0x0400000000000000ULL # define A_LRM_EVENT 0x0800000000000000ULL /* -- Logging actions -- */ # define A_LOG 0x1000000000000000ULL # define A_ERROR 0x2000000000000000ULL # define A_WARN 0x4000000000000000ULL # define O_EXIT (A_SHUTDOWN|A_STOP|A_CCM_DISCONNECT|A_LRM_DISCONNECT|A_HA_DISCONNECT|A_EXIT_0|A_CIB_STOP) # define O_RELEASE (A_DC_TIMER_STOP|A_DC_RELEASE|A_PE_STOP|A_TE_STOP|A_DC_RELEASED) # define O_PE_RESTART (A_PE_START|A_PE_STOP) # define O_TE_RESTART (A_TE_START|A_TE_STOP) # define O_CIB_RESTART (A_CIB_START|A_CIB_STOP) # define O_LRM_RECONNECT (A_LRM_CONNECT|A_LRM_DISCONNECT) # define O_DC_TIMER_RESTART (A_DC_TIMER_STOP|A_DC_TIMER_START) /*====================================== * * "register" contents * * Things we may want to remember regardless of which state we are in. * * These also count as inputs for synthesizing I_* * *======================================*/ # define R_THE_DC 0x00000001ULL /* Are we the DC? */ # define R_STARTING 0x00000002ULL /* Are we starting up? */ # define R_SHUTDOWN 0x00000004ULL /* Are we trying to shut down? */ # define R_STAYDOWN 0x00000008ULL /* Should we restart? */ # define R_JOIN_OK 0x00000010ULL /* Have we completed the join process */ # define R_READ_CONFIG 0x00000040ULL # define R_INVOKE_PE 0x00000080ULL /* Does the PE needed to be invoked at the next appropriate point? */ # define R_CIB_CONNECTED 0x00000100ULL /* Is the CIB connected? */ # define R_PE_CONNECTED 0x00000200ULL /* Is the Policy Engine connected? */ # define R_TE_CONNECTED 0x00000400ULL /* Is the Transition Engine connected? */ # define R_LRM_CONNECTED 0x00000800ULL /* Is the Local Resource Manager connected? */ # define R_CIB_REQUIRED 0x00001000ULL /* Is the CIB required? */ # define R_PE_REQUIRED 0x00002000ULL /* Is the Policy Engine required? */ # define R_TE_REQUIRED 0x00004000ULL /* Is the Transition Engine required? */ # define R_ST_REQUIRED 0x00008000ULL /* Is the Stonith daemon required? */ # define R_CIB_DONE 0x00010000ULL /* Have we calculated the CIB? */ # define R_HAVE_CIB 0x00020000ULL /* Do we have an up-to-date CIB */ # define R_CIB_ASKED 0x00040000ULL /* Have we asked for an up-to-date CIB */ # define R_MEMBERSHIP 0x00100000ULL /* Have we got CCM data yet */ # define R_PEER_DATA 0x00200000ULL /* Have we got T_CL_STATUS data yet */ # define R_HA_DISCONNECTED 0x00400000ULL /* did we sign out of our own accord */ # define R_CCM_DISCONNECTED 0x00800000ULL /* did we sign out of our own accord */ # define R_REQ_PEND 0x01000000ULL /* Are there Requests waiting for processing? */ # define R_PE_PEND 0x02000000ULL /* Has the PE been invoked and we're awaiting a reply? */ # define R_TE_PEND 0x04000000ULL /* Has the TE been invoked and we're awaiting completion? */ # define R_RESP_PEND 0x08000000ULL /* Do we have clients waiting on a response? if so perhaps we shouldnt stop yet */ # define R_IN_TRANSITION 0x10000000ULL /* */ # define R_SENT_RSC_STOP 0x20000000ULL /* Have we sent a stop action to all * resources in preparation for * shutting down */ # define R_IN_RECOVERY 0x80000000ULL +/* + * Magic RC used within CRMd to indicate direct nacks + * (operation is invalid in current state) + */ +#define CRM_DIRECT_NACK_RC (99) + enum crmd_fsa_cause { C_UNKNOWN = 0, C_STARTUP, C_IPC_MESSAGE, C_HA_MESSAGE, C_CCM_CALLBACK, C_CRMD_STATUS_CALLBACK, C_LRM_OP_CALLBACK, C_LRM_MONITOR_CALLBACK, C_TIMER_POPPED, C_SHUTDOWN, C_HEARTBEAT_FAILED, C_SUBSYSTEM_CONNECT, C_HA_DISCONNECT, C_FSA_INTERNAL, C_ILLEGAL }; extern const char *fsa_input2string(enum crmd_fsa_input input); extern const char *fsa_state2string(enum crmd_fsa_state state); extern const char *fsa_cause2string(enum crmd_fsa_cause cause); extern const char *fsa_action2string(long long action); #endif diff --git a/crmd/lrm.c b/crmd/lrm.c index 95ab6fb7c2..f3743898eb 100644 --- a/crmd/lrm.c +++ b/crmd/lrm.c @@ -1,2269 +1,2269 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #define START_DELAY_THRESHOLD 5 * 60 * 1000 #define MAX_LRM_REG_FAILS 30 struct delete_event_s { int rc; const char *rsc; lrm_state_t *lrm_state; }; gboolean process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op); static gboolean is_rsc_active(lrm_state_t * lrm_state, const char *rsc_id); static gboolean build_active_RAs(lrm_state_t * lrm_state, xmlNode * rsc_list); static gboolean stop_recurring_actions(gpointer key, gpointer value, gpointer user_data); static int delete_rsc_status(lrm_state_t * lrm_state, const char *rsc_id, int call_options, const char *user_name); static lrmd_event_data_t *construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op, const char *rsc_id, const char *operation); static void do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operation, xmlNode * msg, xmlNode * request); void send_direct_ack(const char *to_host, const char *to_sys, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op, const char *rsc_id); static gboolean lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state, int log_level); static void lrm_connection_destroy(void) { if (is_set(fsa_input_register, R_LRM_CONNECTED)) { crm_crit("LRM Connection failed"); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); clear_bit(fsa_input_register, R_LRM_CONNECTED); } else { crm_info("LRM Connection disconnected"); } } static char * make_stop_id(const char *rsc, int call_id) { char *op_id = NULL; op_id = calloc(1, strlen(rsc) + 34); if (op_id != NULL) { snprintf(op_id, strlen(rsc) + 34, "%s:%d", rsc, call_id); } return op_id; } static void copy_instance_keys(gpointer key, gpointer value, gpointer user_data) { if (strstr(key, CRM_META "_") == NULL) { g_hash_table_replace(user_data, strdup((const char *)key), strdup((const char *)value)); } } static void copy_meta_keys(gpointer key, gpointer value, gpointer user_data) { if (strstr(key, CRM_META "_") != NULL) { g_hash_table_replace(user_data, strdup((const char *)key), strdup((const char *)value)); } } static void update_history_cache(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op) { int target_rc = 0; rsc_history_t *entry = NULL; if (op->rsc_deleted) { crm_debug("Purged history for '%s' after %s", op->rsc_id, op->op_type); delete_rsc_status(lrm_state, op->rsc_id, cib_quorum_override, NULL); return; } if (safe_str_eq(op->op_type, RSC_NOTIFY)) { return; } crm_debug("Updating history for '%s' with %s op", op->rsc_id, op->op_type); entry = g_hash_table_lookup(lrm_state->resource_history, op->rsc_id); if (entry == NULL && rsc) { entry = calloc(1, sizeof(rsc_history_t)); entry->id = strdup(op->rsc_id); g_hash_table_insert(lrm_state->resource_history, entry->id, entry); entry->rsc.id = entry->id; entry->rsc.type = strdup(rsc->type); entry->rsc.class = strdup(rsc->class); if (rsc->provider) { entry->rsc.provider = strdup(rsc->provider); } else { entry->rsc.provider = NULL; } } else if (entry == NULL) { crm_info("Resource %s no longer exists, not updating cache", op->rsc_id); return; } entry->last_callid = op->call_id; target_rc = rsc_op_expected_rc(op); if (op->op_status == PCMK_LRM_OP_CANCELLED) { if (op->interval > 0) { GList *gIter, *gIterNext; crm_trace("Removing cancelled recurring op: %s_%s_%d", op->rsc_id, op->op_type, op->interval); for (gIter = entry->recurring_op_list; gIter != NULL; gIter = gIterNext) { lrmd_event_data_t *existing = gIter->data; gIterNext = gIter->next; if (crm_str_eq(op->rsc_id, existing->rsc_id, TRUE) && safe_str_eq(op->op_type, existing->op_type) && op->interval == existing->interval) { lrmd_free_event(existing); entry->recurring_op_list = g_list_delete_link(entry->recurring_op_list, gIter); } } return; } else { crm_trace("Skipping %s_%s_%d rc=%d, status=%d", op->rsc_id, op->op_type, op->interval, op->rc, op->op_status); } } else if (did_rsc_op_fail(op, target_rc)) { /* We must store failed monitors here * - otherwise the block below will cause them to be forgetten them when a stop happens */ if (entry->failed) { lrmd_free_event(entry->failed); } entry->failed = lrmd_copy_event(op); } else if (op->interval == 0) { if (entry->last) { lrmd_free_event(entry->last); } entry->last = lrmd_copy_event(op); if (op->params && (safe_str_eq(CRMD_ACTION_START, op->op_type) || safe_str_eq("reload", op->op_type) || safe_str_eq(CRMD_ACTION_STATUS, op->op_type))) { if (entry->stop_params) { g_hash_table_destroy(entry->stop_params); } entry->stop_params = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); g_hash_table_foreach(op->params, copy_instance_keys, entry->stop_params); } } if (op->interval > 0) { GListPtr iter = NULL; for(iter = entry->recurring_op_list; iter; iter = iter->next) { lrmd_event_data_t *o = iter->data; /* op->rsc_id is implied */ if(op->interval == o->interval && strcmp(op->op_type, o->op_type) == 0) { crm_trace("Removing existing recurring op entry: %s_%s_%d", op->rsc_id, op->op_type, op->interval); entry->recurring_op_list = g_list_remove(entry->recurring_op_list, o); break; } } crm_trace("Adding recurring op: %s_%s_%d", op->rsc_id, op->op_type, op->interval); entry->recurring_op_list = g_list_prepend(entry->recurring_op_list, lrmd_copy_event(op)); } else if (entry->recurring_op_list && safe_str_eq(op->op_type, RSC_STATUS) == FALSE) { GList *gIter = entry->recurring_op_list; crm_trace("Dropping %d recurring ops because of: %s_%s_%d", g_list_length(gIter), op->rsc_id, op->op_type, op->interval); for (; gIter != NULL; gIter = gIter->next) { lrmd_free_event(gIter->data); } g_list_free(entry->recurring_op_list); entry->recurring_op_list = NULL; } } void lrm_op_callback(lrmd_event_data_t * op) { const char *nodename = NULL; lrm_state_t *lrm_state = NULL; CRM_CHECK(op != NULL, return); /* determine the node name for this connection. */ nodename = op->remote_nodename ? op->remote_nodename : fsa_our_uname; if (op->type == lrmd_event_disconnect && (safe_str_eq(nodename, fsa_our_uname))) { /* if this is the local lrmd ipc connection, set the right bits in the * crmd when the connection goes down */ lrm_connection_destroy(); return; } else if (op->type != lrmd_event_exec_complete) { /* we only need to process execution results */ return; } lrm_state = lrm_state_find(nodename); CRM_ASSERT(lrm_state != NULL); process_lrm_event(lrm_state, op); } /* A_LRM_CONNECT */ void do_lrm_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { /* This only pertains to local lrmd connections. Remote connections are handled as * resources within the pengine. Connecting and disconnecting from remote lrmd instances * handled differently than the local. */ lrm_state_t *lrm_state = NULL; if(fsa_our_uname == NULL) { return; /* Nothing to do */ } lrm_state = lrm_state_find_or_create(fsa_our_uname); if (lrm_state == NULL) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } if (action & A_LRM_DISCONNECT) { if (lrm_state_verify_stopped(lrm_state, cur_state, LOG_INFO) == FALSE) { if (action == A_LRM_DISCONNECT) { crmd_fsa_stall(FALSE); return; } } clear_bit(fsa_input_register, R_LRM_CONNECTED); crm_info("Disconnecting from the LRM"); lrm_state_disconnect(lrm_state); lrm_state_reset_tables(lrm_state); crm_notice("Disconnected from the LRM"); } if (action & A_LRM_CONNECT) { int ret = pcmk_ok; crm_debug("Connecting to the LRM"); ret = lrm_state_ipc_connect(lrm_state); if (ret != pcmk_ok) { if (lrm_state->num_lrm_register_fails < MAX_LRM_REG_FAILS) { crm_warn("Failed to sign on to the LRM %d" " (%d max) times", lrm_state->num_lrm_register_fails, MAX_LRM_REG_FAILS); crm_timer_start(wait_timer); crmd_fsa_stall(FALSE); return; } } if (ret != pcmk_ok) { crm_err("Failed to sign on to the LRM %d" " (max) times", lrm_state->num_lrm_register_fails); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } set_bit(fsa_input_register, R_LRM_CONNECTED); crm_info("LRM connection established"); } if (action & ~(A_LRM_CONNECT | A_LRM_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__); } } static gboolean lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state, int log_level) { int counter = 0; gboolean rc = TRUE; const char *when = "lrm disconnect"; GHashTableIter gIter; const char *key = NULL; rsc_history_t *entry = NULL; struct recurring_op_s *pending = NULL; crm_debug("Checking for active resources before exit"); if (cur_state == S_TERMINATE) { log_level = LOG_ERR; when = "shutdown"; } else if (is_set(fsa_input_register, R_SHUTDOWN)) { when = "shutdown... waiting"; } if (lrm_state->pending_ops && lrm_state_is_connected(lrm_state) == TRUE) { guint removed = g_hash_table_foreach_remove( lrm_state->pending_ops, stop_recurring_actions, lrm_state); crm_notice("Stopped %u recurring operations at %s (%u ops remaining)", removed, when, g_hash_table_size(lrm_state->pending_ops)); } if (lrm_state->pending_ops) { g_hash_table_iter_init(&gIter, lrm_state->pending_ops); while (g_hash_table_iter_next(&gIter, NULL, (void **)&pending)) { /* Ignore recurring actions in the shutdown calculations */ if (pending->interval == 0) { counter++; } } } if (counter > 0) { do_crm_log(log_level, "%d pending LRM operations at %s", counter, when); if (cur_state == S_TERMINATE || !is_set(fsa_input_register, R_SENT_RSC_STOP)) { g_hash_table_iter_init(&gIter, lrm_state->pending_ops); while (g_hash_table_iter_next(&gIter, (gpointer*)&key, (gpointer*)&pending)) { do_crm_log(log_level, "Pending action: %s (%s)", key, pending->op_key); } } else { rc = FALSE; } return rc; } if (lrm_state->resource_history == NULL) { return rc; } if (cur_state == S_TERMINATE || is_set(fsa_input_register, R_SHUTDOWN)) { /* At this point we're not waiting, we're just shutting down */ when = "shutdown"; } counter = 0; g_hash_table_iter_init(&gIter, lrm_state->resource_history); while (g_hash_table_iter_next(&gIter, NULL, (gpointer*)&entry)) { if (is_rsc_active(lrm_state, entry->id) == FALSE) { continue; } counter++; crm_trace("Found %s active", entry->id); if (lrm_state->pending_ops) { GHashTableIter hIter; g_hash_table_iter_init(&hIter, lrm_state->pending_ops); while (g_hash_table_iter_next(&hIter, (gpointer*)&key, (gpointer*)&pending)) { if (crm_str_eq(entry->id, pending->rsc_id, TRUE)) { crm_notice("%sction %s (%s) incomplete at %s", pending->interval == 0 ? "A" : "Recurring a", key, pending->op_key, when); } } } } if (counter) { crm_err("%d resources were active at %s.", counter, when); } return rc; } static char * get_rsc_metadata(const char *type, const char *class, const char *provider) { int rc = 0; char *metadata = NULL; /* Always use a local connection for this operation */ lrm_state_t *lrm_state = lrm_state_find(fsa_our_uname); CRM_CHECK(type != NULL, return NULL); CRM_CHECK(class != NULL, return NULL); CRM_CHECK(lrm_state != NULL, return NULL); if (provider == NULL) { provider = "heartbeat"; } crm_trace("Retreiving metadata for %s::%s:%s", type, class, provider); rc = lrm_state_get_metadata(lrm_state, class, provider, type, &metadata, 0); if (metadata == NULL) { crm_warn("No metadata found for %s::%s:%s: %s (%d)", type, class, provider, pcmk_strerror(rc), rc); } return metadata; } typedef struct reload_data_s { char *key; char *metadata; time_t last_query; gboolean can_reload; GListPtr restart_list; } reload_data_t; static void g_hash_destroy_reload(gpointer data) { reload_data_t *reload = data; free(reload->key); free(reload->metadata); g_list_free_full(reload->restart_list, free); free(reload); } GHashTable *reload_hash = NULL; static GListPtr get_rsc_restart_list(lrmd_rsc_info_t * rsc, lrmd_event_data_t * op) { int len = 0; char *key = NULL; char *copy = NULL; const char *value = NULL; const char *provider = NULL; xmlNode *param = NULL; xmlNode *params = NULL; xmlNode *actions = NULL; xmlNode *metadata = NULL; time_t now = time(NULL); reload_data_t *reload = NULL; if (reload_hash == NULL) { reload_hash = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, g_hash_destroy_reload); } provider = rsc->provider; if (provider == NULL) { provider = "heartbeat"; } len = strlen(rsc->type) + strlen(rsc->class) + strlen(provider) + 4; key = malloc(len); if(key) { snprintf(key, len, "%s::%s:%s", rsc->type, rsc->class, provider); reload = g_hash_table_lookup(reload_hash, key); } if (reload && ((now - 9) > reload->last_query) && safe_str_eq(op->op_type, RSC_START)) { reload = NULL; /* re-query */ } if (reload == NULL) { xmlNode *action = NULL; reload = calloc(1, sizeof(reload_data_t)); g_hash_table_replace(reload_hash, key, reload); reload->last_query = now; reload->key = key; key = NULL; reload->metadata = get_rsc_metadata(rsc->type, rsc->class, provider); if(reload->metadata == NULL) { goto cleanup; } metadata = string2xml(reload->metadata); if (metadata == NULL) { crm_err("Metadata for %s::%s:%s is not valid XML", rsc->provider, rsc->class, rsc->type); goto cleanup; } actions = find_xml_node(metadata, "actions", TRUE); for (action = __xml_first_child(actions); action != NULL; action = __xml_next(action)) { if (crm_str_eq((const char *)action->name, "action", TRUE)) { value = crm_element_value(action, "name"); if (safe_str_eq("reload", value)) { reload->can_reload = TRUE; break; } } } if (reload->can_reload == FALSE) { goto cleanup; } params = find_xml_node(metadata, "parameters", TRUE); for (param = __xml_first_child(params); param != NULL; param = __xml_next(param)) { if (crm_str_eq((const char *)param->name, "parameter", TRUE)) { value = crm_element_value(param, "unique"); if (crm_is_true(value)) { value = crm_element_value(param, "name"); if (value == NULL) { crm_err("%s: NULL param", key); continue; } crm_debug("Attr %s is not reloadable", value); copy = strdup(value); CRM_LOG_ASSERT(copy != NULL); if(copy == NULL) { continue; }; reload->restart_list = g_list_append(reload->restart_list, copy); } } } } cleanup: free(key); free_xml(metadata); return reload->restart_list; } static void append_restart_list(lrmd_rsc_info_t * rsc, lrmd_event_data_t * op, xmlNode * update, const char *version) { int len = 0; char *list = NULL; char *digest = NULL; const char *value = NULL; xmlNode *restart = NULL; GListPtr restart_list = NULL; GListPtr lpc = NULL; if (op->interval > 0) { /* monitors are not reloadable */ return; } else if (op->params == NULL) { crm_debug("%s has no parameters", ID(update)); return; } else if (rsc == NULL) { return; } else if (crm_str_eq(CRMD_ACTION_STOP, op->op_type, TRUE)) { /* Stopped resources don't need to be reloaded */ return; } else if (compare_version("1.0.8", version) > 0) { /* Caller version does not support reloads */ return; } restart_list = get_rsc_restart_list(rsc, op); if (restart_list == NULL) { /* Resource does not support reloads */ return; } restart = create_xml_node(NULL, XML_TAG_PARAMS); for (lpc = restart_list; lpc != NULL; lpc = lpc->next) { const char *param = (const char *)lpc->data; int start = len; CRM_LOG_ASSERT(param != NULL); if(param == NULL) { continue; }; value = g_hash_table_lookup(op->params, param); if (value != NULL) { crm_xml_add(restart, param, value); } len += strlen(param) + 2; list = realloc_safe(list, len + 1); sprintf(list + start, " %s ", param); } digest = calculate_operation_digest(restart, version); crm_xml_add(update, XML_LRM_ATTR_OP_RESTART, list); crm_xml_add(update, XML_LRM_ATTR_RESTART_DIGEST, digest); crm_trace("%s: %s, %s", rsc->id, digest, list); crm_log_xml_trace(restart, "restart digest source"); free_xml(restart); free(digest); free(list); } static gboolean build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op, const char *src) { int target_rc = 0; xmlNode *xml_op = NULL; const char *caller_version = CRM_FEATURE_SET; if (op == NULL) { return FALSE; } else if (AM_I_DC) { } else if (fsa_our_dc_version != NULL) { caller_version = fsa_our_dc_version; } else if (op->params == NULL) { caller_version = fsa_our_dc_version; } else { /* there is a small risk in formerly mixed clusters that * it will be sub-optimal. * however with our upgrade policy, the update we send * should still be completely supported anyway */ caller_version = g_hash_table_lookup(op->params, XML_ATTR_CRM_VERSION); crm_debug("Falling back to operation originator version: %s", caller_version); } target_rc = rsc_op_expected_rc(op); xml_op = create_operation_update(parent, op, caller_version, target_rc, fsa_our_uname, src, LOG_DEBUG); if (xml_op) { append_restart_list(rsc, op, xml_op, caller_version); } return TRUE; } static gboolean is_rsc_active(lrm_state_t * lrm_state, const char *rsc_id) { rsc_history_t *entry = NULL; entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id); if (entry == NULL || entry->last == NULL) { return FALSE; } crm_trace("Processing %s: %s.%d=%d", rsc_id, entry->last->op_type, entry->last->interval, entry->last->rc); if (entry->last->rc == PCMK_OCF_OK && safe_str_eq(entry->last->op_type, CRMD_ACTION_STOP)) { return FALSE; } else if (entry->last->rc == PCMK_OCF_OK && safe_str_eq(entry->last->op_type, CRMD_ACTION_MIGRATE)) { /* a stricter check is too complex... * leave that to the PE */ return FALSE; } else if (entry->last->rc == PCMK_OCF_NOT_RUNNING) { return FALSE; } else if (entry->last->interval == 0 && entry->last->rc == PCMK_OCF_NOT_CONFIGURED) { /* Badly configured resources can't be reliably stopped */ return FALSE; } return TRUE; } static gboolean build_active_RAs(lrm_state_t * lrm_state, xmlNode * rsc_list) { GHashTableIter iter; rsc_history_t *entry = NULL; g_hash_table_iter_init(&iter, lrm_state->resource_history); while (g_hash_table_iter_next(&iter, NULL, (void **)&entry)) { GList *gIter = NULL; xmlNode *xml_rsc = create_xml_node(rsc_list, XML_LRM_TAG_RESOURCE); crm_xml_add(xml_rsc, XML_ATTR_ID, entry->id); crm_xml_add(xml_rsc, XML_ATTR_TYPE, entry->rsc.type); crm_xml_add(xml_rsc, XML_AGENT_ATTR_CLASS, entry->rsc.class); crm_xml_add(xml_rsc, XML_AGENT_ATTR_PROVIDER, entry->rsc.provider); if (entry->last && entry->last->params) { const char *container = g_hash_table_lookup(entry->last->params, CRM_META"_"XML_RSC_ATTR_CONTAINER); if (container) { crm_trace("Resource %s is a part of container resource %s", entry->id, container); crm_xml_add(xml_rsc, XML_RSC_ATTR_CONTAINER, container); } } build_operation_update(xml_rsc, &(entry->rsc), entry->failed, __FUNCTION__); build_operation_update(xml_rsc, &(entry->rsc), entry->last, __FUNCTION__); for (gIter = entry->recurring_op_list; gIter != NULL; gIter = gIter->next) { build_operation_update(xml_rsc, &(entry->rsc), gIter->data, __FUNCTION__); } } return FALSE; } xmlNode * do_lrm_query_internal(lrm_state_t * lrm_state, gboolean is_replace) { xmlNode *xml_state = NULL; xmlNode *xml_data = NULL; xmlNode *rsc_list = NULL; const char *uuid = NULL; if (safe_str_eq(lrm_state->node_name, fsa_our_uname)) { crm_node_t *peer = crm_get_peer(0, lrm_state->node_name); xml_state = do_update_node_cib(peer, node_update_cluster|node_update_peer, NULL, __FUNCTION__); /* The next two lines shouldn't be necessary for newer DCs */ crm_xml_add(xml_state, XML_NODE_JOIN_STATE, CRMD_JOINSTATE_MEMBER); crm_xml_add(xml_state, XML_NODE_EXPECTED, CRMD_JOINSTATE_MEMBER); uuid = fsa_our_uuid; } else { xml_state = create_xml_node(NULL, XML_CIB_TAG_STATE); crm_xml_add(xml_state, XML_NODE_IS_REMOTE, "true"); crm_xml_add(xml_state, XML_ATTR_ID, lrm_state->node_name); crm_xml_add(xml_state, XML_ATTR_UNAME, lrm_state->node_name); uuid = lrm_state->node_name; } xml_data = create_xml_node(xml_state, XML_CIB_TAG_LRM); crm_xml_add(xml_data, XML_ATTR_ID, uuid); rsc_list = create_xml_node(xml_data, XML_LRM_TAG_RESOURCES); /* Build a list of active (not always running) resources */ build_active_RAs(lrm_state, rsc_list); crm_log_xml_trace(xml_state, "Current state of the LRM"); return xml_state; } xmlNode * do_lrm_query(gboolean is_replace, const char *node_name) { lrm_state_t *lrm_state = lrm_state_find(node_name); if (!lrm_state) { crm_err("Could not query lrm state for lrmd node %s", node_name); return NULL; } return do_lrm_query_internal(lrm_state, is_replace); } static void notify_deleted(lrm_state_t * lrm_state, ha_msg_input_t * input, const char *rsc_id, int rc) { lrmd_event_data_t *op = NULL; const char *from_sys = crm_element_value(input->msg, F_CRM_SYS_FROM); const char *from_host = crm_element_value(input->msg, F_CRM_HOST_FROM); crm_info("Notifying %s on %s that %s was%s deleted", from_sys, from_host, rsc_id, rc == pcmk_ok ? "" : " not"); op = construct_op(lrm_state, input->xml, rsc_id, CRMD_ACTION_DELETE); CRM_ASSERT(op != NULL); if (rc == pcmk_ok) { op->op_status = PCMK_LRM_OP_DONE; op->rc = PCMK_OCF_OK; } else { op->op_status = PCMK_LRM_OP_ERROR; op->rc = PCMK_OCF_UNKNOWN_ERROR; } send_direct_ack(from_host, from_sys, NULL, op, rsc_id); lrmd_free_event(op); if (safe_str_neq(from_sys, CRM_SYSTEM_TENGINE)) { /* this isn't expected - trigger a new transition */ time_t now = time(NULL); char *now_s = crm_itoa(now); crm_debug("Triggering a refresh after %s deleted %s from the LRM", from_sys, rsc_id); update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, "last-lrm-refresh", now_s, FALSE, NULL, NULL); free(now_s); } } static gboolean lrm_remove_deleted_rsc(gpointer key, gpointer value, gpointer user_data) { struct delete_event_s *event = user_data; struct pending_deletion_op_s *op = value; if (crm_str_eq(event->rsc, op->rsc, TRUE)) { notify_deleted(event->lrm_state, op->input, event->rsc, event->rc); return TRUE; } return FALSE; } static gboolean lrm_remove_deleted_op(gpointer key, gpointer value, gpointer user_data) { const char *rsc = user_data; struct recurring_op_s *pending = value; if (crm_str_eq(rsc, pending->rsc_id, TRUE)) { crm_info("Removing op %s:%d for deleted resource %s", pending->op_key, pending->call_id, rsc); return TRUE; } return FALSE; } /* * Remove the rsc from the CIB * * Avoids refreshing the entire LRM section of this host */ #define rsc_template "//"XML_CIB_TAG_STATE"[@uname='%s']//"XML_LRM_TAG_RESOURCE"[@id='%s']" static int delete_rsc_status(lrm_state_t * lrm_state, const char *rsc_id, int call_options, const char *user_name) { char *rsc_xpath = NULL; int max = 0; int rc = pcmk_ok; CRM_CHECK(rsc_id != NULL, return -ENXIO); max = strlen(rsc_template) + strlen(rsc_id) + strlen(lrm_state->node_name) + 1; rsc_xpath = calloc(1, max); snprintf(rsc_xpath, max, rsc_template, lrm_state->node_name, rsc_id); rc = cib_internal_op(fsa_cib_conn, CIB_OP_DELETE, NULL, rsc_xpath, NULL, NULL, call_options | cib_xpath, user_name); free(rsc_xpath); return rc; } static void delete_rsc_entry(lrm_state_t * lrm_state, ha_msg_input_t * input, const char *rsc_id, GHashTableIter * rsc_gIter, int rc, const char *user_name) { struct delete_event_s event; CRM_CHECK(rsc_id != NULL, return); if (rc == pcmk_ok) { char *rsc_id_copy = strdup(rsc_id); if (rsc_gIter) g_hash_table_iter_remove(rsc_gIter); else g_hash_table_remove(lrm_state->resource_history, rsc_id_copy); crm_debug("sync: Sending delete op for %s", rsc_id_copy); delete_rsc_status(lrm_state, rsc_id_copy, cib_quorum_override, user_name); g_hash_table_foreach_remove(lrm_state->pending_ops, lrm_remove_deleted_op, rsc_id_copy); free(rsc_id_copy); } if (input) { notify_deleted(lrm_state, input, rsc_id, rc); } event.rc = rc; event.rsc = rsc_id; event.lrm_state = lrm_state; g_hash_table_foreach_remove(lrm_state->deletion_ops, lrm_remove_deleted_rsc, &event); } /* * Remove the op from the CIB * * Avoids refreshing the entire LRM section of this host */ #define op_template "//"XML_CIB_TAG_STATE"[@uname='%s']//"XML_LRM_TAG_RESOURCE"[@id='%s']/"XML_LRM_TAG_RSC_OP"[@id='%s']" #define op_call_template "//"XML_CIB_TAG_STATE"[@uname='%s']//"XML_LRM_TAG_RESOURCE"[@id='%s']/"XML_LRM_TAG_RSC_OP"[@id='%s' and @"XML_LRM_ATTR_CALLID"='%d']" static void delete_op_entry(lrm_state_t * lrm_state, lrmd_event_data_t * op, const char *rsc_id, const char *key, int call_id) { xmlNode *xml_top = NULL; if (op != NULL) { xml_top = create_xml_node(NULL, XML_LRM_TAG_RSC_OP); crm_xml_add_int(xml_top, XML_LRM_ATTR_CALLID, op->call_id); crm_xml_add(xml_top, XML_ATTR_TRANSITION_KEY, op->user_data); if (op->interval > 0) { char *op_id = generate_op_key(op->rsc_id, op->op_type, op->interval); /* Avoid deleting last_failure too (if it was a result of this recurring op failing) */ crm_xml_add(xml_top, XML_ATTR_ID, op_id); free(op_id); } crm_debug("async: Sending delete op for %s_%s_%d (call=%d)", op->rsc_id, op->op_type, op->interval, op->call_id); fsa_cib_conn->cmds->delete(fsa_cib_conn, XML_CIB_TAG_STATUS, xml_top, cib_quorum_override); } else if (rsc_id != NULL && key != NULL) { int max = 0; char *op_xpath = NULL; if (call_id > 0) { max = strlen(op_call_template) + strlen(rsc_id) + strlen(lrm_state->node_name) + strlen(key) + 10; op_xpath = calloc(1, max); snprintf(op_xpath, max, op_call_template, lrm_state->node_name, rsc_id, key, call_id); } else { max = strlen(op_template) + strlen(rsc_id) + strlen(lrm_state->node_name) + strlen(key) + 1; op_xpath = calloc(1, max); snprintf(op_xpath, max, op_template, lrm_state->node_name, rsc_id, key); } crm_debug("sync: Sending delete op for %s (call=%d)", rsc_id, call_id); fsa_cib_conn->cmds->delete(fsa_cib_conn, op_xpath, NULL, cib_quorum_override | cib_xpath); free(op_xpath); } else { crm_err("Not enough information to delete op entry: rsc=%p key=%p", rsc_id, key); return; } crm_log_xml_trace(xml_top, "op:cancel"); free_xml(xml_top); } void lrm_clear_last_failure(const char *rsc_id, const char *node_name) { char *attr = NULL; GHashTableIter iter; GList *lrm_state_list = lrm_state_get_list(); GList *state_entry; rsc_history_t *entry = NULL; attr = generate_op_key(rsc_id, "last_failure", 0); /* This clears last failure for every lrm state that has this rsc.*/ for (state_entry = lrm_state_list; state_entry != NULL; state_entry = state_entry->next) { lrm_state_t *lrm_state = state_entry->data; if (node_name != NULL) { if (strcmp(node_name, lrm_state->node_name) != 0) { /* filter by node_name if node_name is present */ continue; } } delete_op_entry(lrm_state, NULL, rsc_id, attr, 0); if (!lrm_state->resource_history) { continue; } g_hash_table_iter_init(&iter, lrm_state->resource_history); while (g_hash_table_iter_next(&iter, NULL, (void **)&entry)) { if (crm_str_eq(rsc_id, entry->id, TRUE)) { lrmd_free_event(entry->failed); entry->failed = NULL; } } } free(attr); g_list_free(lrm_state_list); } /* Returns: gboolean - cancellation is in progress */ static gboolean cancel_op(lrm_state_t * lrm_state, const char *rsc_id, const char *key, int op, gboolean remove) { int rc = pcmk_ok; char *local_key = NULL; struct recurring_op_s *pending = NULL; CRM_CHECK(op != 0, return FALSE); CRM_CHECK(rsc_id != NULL, return FALSE); if (key == NULL) { local_key = make_stop_id(rsc_id, op); key = local_key; } pending = g_hash_table_lookup(lrm_state->pending_ops, key); if (pending) { if (remove && pending->remove == FALSE) { pending->remove = TRUE; crm_debug("Scheduling %s for removal", key); } if (pending->cancelled) { crm_debug("Operation %s already cancelled", key); free(local_key); return FALSE; } pending->cancelled = TRUE; } else { crm_info("No pending op found for %s", key); free(local_key); return FALSE; } crm_debug("Cancelling op %d for %s (%s)", op, rsc_id, key); rc = lrm_state_cancel(lrm_state, pending->rsc_id, pending->op_type, pending->interval); if (rc == pcmk_ok) { crm_debug("Op %d for %s (%s): cancelled", op, rsc_id, key); free(local_key); return TRUE; } crm_debug("Op %d for %s (%s): Nothing to cancel", op, rsc_id, key); /* The caller needs to make sure the entry is * removed from the pending_ops list * * Usually by returning TRUE inside the worker function * supplied to g_hash_table_foreach_remove() * * Not removing the entry from pending_ops will block * the node from shutting down */ free(local_key); return FALSE; } struct cancel_data { gboolean done; gboolean remove; const char *key; lrmd_rsc_info_t *rsc; lrm_state_t *lrm_state; }; static gboolean cancel_action_by_key(gpointer key, gpointer value, gpointer user_data) { gboolean remove = FALSE; struct cancel_data *data = user_data; struct recurring_op_s *op = (struct recurring_op_s *)value; if (crm_str_eq(op->op_key, data->key, TRUE)) { data->done = TRUE; remove = !cancel_op(data->lrm_state, data->rsc->id, key, op->call_id, data->remove); } return remove; } static gboolean cancel_op_key(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *key, gboolean remove) { guint removed = 0; struct cancel_data data; CRM_CHECK(rsc != NULL, return FALSE); CRM_CHECK(key != NULL, return FALSE); data.key = key; data.rsc = rsc; data.done = FALSE; data.remove = remove; data.lrm_state = lrm_state; removed = g_hash_table_foreach_remove(lrm_state->pending_ops, cancel_action_by_key, &data); crm_trace("Removed %u op cache entries, new size: %u", removed, g_hash_table_size(lrm_state->pending_ops)); return data.done; } static lrmd_rsc_info_t * get_lrm_resource(lrm_state_t * lrm_state, xmlNode * resource, xmlNode * op_msg, gboolean do_create) { lrmd_rsc_info_t *rsc = NULL; const char *id = ID(resource); const char *type = crm_element_value(resource, XML_ATTR_TYPE); const char *class = crm_element_value(resource, XML_AGENT_ATTR_CLASS); const char *provider = crm_element_value(resource, XML_AGENT_ATTR_PROVIDER); const char *long_id = crm_element_value(resource, XML_ATTR_ID_LONG); crm_trace("Retrieving %s from the LRM.", id); CRM_CHECK(id != NULL, return NULL); rsc = lrm_state_get_rsc_info(lrm_state, id, 0); if (!rsc && long_id) { rsc = lrm_state_get_rsc_info(lrm_state, long_id, 0); } if (!rsc && do_create) { CRM_CHECK(class != NULL, return NULL); CRM_CHECK(type != NULL, return NULL); crm_trace("Adding rsc %s before operation", id); lrm_state_register_rsc(lrm_state, id, class, provider, type, lrmd_opt_drop_recurring); rsc = lrm_state_get_rsc_info(lrm_state, id, 0); if (!rsc) { fsa_data_t *msg_data = NULL; crm_err("Could not add resource %s to LRM %s", id, lrm_state->node_name); /* only register this as a internal error if this involves the local * lrmd. Otherwise we're likely dealing with an unresponsive remote-node * which is not a FSA failure. */ if (lrm_state_is_local(lrm_state) == TRUE) { register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); } } } return rsc; } static void delete_resource(lrm_state_t * lrm_state, const char *id, lrmd_rsc_info_t * rsc, GHashTableIter * gIter, const char *sys, const char *host, const char *user, ha_msg_input_t * request, gboolean unregister) { int rc = pcmk_ok; crm_info("Removing resource %s for %s (%s) on %s", id, sys, user ? user : "internal", host); if (rsc && unregister) { rc = lrm_state_unregister_rsc(lrm_state, id, 0); } if (rc == pcmk_ok) { crm_trace("Resource '%s' deleted", id); } else if (rc == -EINPROGRESS) { crm_info("Deletion of resource '%s' pending", id); if (request) { struct pending_deletion_op_s *op = NULL; char *ref = crm_element_value_copy(request->msg, XML_ATTR_REFERENCE); op = calloc(1, sizeof(struct pending_deletion_op_s)); op->rsc = strdup(rsc->id); op->input = copy_ha_msg_input(request); g_hash_table_insert(lrm_state->deletion_ops, ref, op); } return; } else { crm_warn("Deletion of resource '%s' for %s (%s) on %s failed: %d", id, sys, user ? user : "internal", host, rc); } delete_rsc_entry(lrm_state, request, id, gIter, rc, user); } static int get_fake_call_id(lrm_state_t *lrm_state, const char *rsc_id) { int call_id = 0; rsc_history_t *entry = NULL; entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id); /* Make sure the call id is greater than the last successful operation, * otherwise the failure will not result in a possible recovery of the resource * as it could appear the failure occurred before the successful start */ if (entry) { call_id = entry->last_callid + 1; } if (call_id < 0) { call_id = 1; } return call_id; } static void force_reprobe(lrm_state_t *lrm_state, const char *from_sys, const char *from_host, const char *user_name, gboolean is_remote_node) { GHashTableIter gIter; rsc_history_t *entry = NULL; crm_info("clearing resource history on node %s", lrm_state->node_name); g_hash_table_iter_init(&gIter, lrm_state->resource_history); while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { /* only unregister the resource during a reprobe if it is not a remote connection * resource. otherwise unregistering the connection will terminate remote-node * membership */ gboolean unregister = TRUE; if (is_remote_lrmd_ra(NULL, NULL, entry->id)) { lrm_state_t *remote_lrm_state = lrm_state_find(entry->id); if (remote_lrm_state) { /* when forcing a reprobe, make sure to clear remote node before * clearing the remote node's connection resource */ force_reprobe(remote_lrm_state, from_sys, from_host, user_name, TRUE); } unregister = FALSE; } delete_resource(lrm_state, entry->id, &entry->rsc, &gIter, from_sys, from_host, user_name, NULL, unregister); } /* Now delete the copy in the CIB */ erase_status_tag(lrm_state->node_name, XML_CIB_TAG_LRM, cib_scope_local); /* And finally, _delete_ the value in attrd * Setting it to FALSE results in the PE sending us back here again */ update_attrd(lrm_state->node_name, CRM_OP_PROBED, NULL, user_name, is_remote_node); } /* A_LRM_INVOKE */ void do_lrm_invoke(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean create_rsc = TRUE; lrm_state_t *lrm_state = NULL; const char *crm_op = NULL; const char *from_sys = NULL; const char *from_host = NULL; const char *operation = NULL; ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg); const char *user_name = NULL; const char *target_node = NULL; gboolean is_remote_node = FALSE; gboolean crm_rsc_delete = FALSE; if (input->xml != NULL) { /* Remote node operations are routed here to their remote connections */ target_node = crm_element_value(input->xml, XML_LRM_ATTR_TARGET); } if (target_node == NULL) { target_node = fsa_our_uname; } else if (safe_str_neq(target_node, fsa_our_uname)) { is_remote_node = TRUE; } lrm_state = lrm_state_find(target_node); if (lrm_state == NULL && is_remote_node) { crm_err("no lrmd connection for remote node %s found on cluster node %s. Can not process request.", target_node, fsa_our_uname); return; } CRM_ASSERT(lrm_state != NULL); #if ENABLE_ACL user_name = crm_acl_get_set_user(input->msg, F_CRM_USER, NULL); crm_trace("LRM command from user '%s'", user_name); #endif crm_op = crm_element_value(input->msg, F_CRM_TASK); from_sys = crm_element_value(input->msg, F_CRM_SYS_FROM); if (safe_str_neq(from_sys, CRM_SYSTEM_TENGINE)) { from_host = crm_element_value(input->msg, F_CRM_HOST_FROM); } crm_trace("LRM command from: %s", from_sys); if (safe_str_eq(crm_op, CRM_OP_LRM_DELETE)) { /* remember this delete op came from crm_resource */ crm_rsc_delete = TRUE; operation = CRMD_ACTION_DELETE; } else if (safe_str_eq(crm_op, CRM_OP_LRM_REFRESH)) { operation = CRM_OP_LRM_REFRESH; } else if (safe_str_eq(crm_op, CRM_OP_LRM_FAIL)) { lrmd_event_data_t *op = NULL; lrmd_rsc_info_t *rsc = NULL; xmlNode *xml_rsc = find_xml_node(input->xml, XML_CIB_TAG_RESOURCE, TRUE); CRM_CHECK(xml_rsc != NULL, return); /* The lrmd can not fail a resource, it does not understand the * concept of success or failure in relation to a resource, it simply * executes operations and reports the results. We determine what a failure is. * Becaues of this, if we want to fail a resource we have to fake what we * understand a failure to look like. * * To do this we create a fake lrmd operation event for the resource * we want to fail. We then pass that event to the lrmd client callback * so it will be processed as if it actually came from the lrmd. */ op = construct_op(lrm_state, input->xml, ID(xml_rsc), "asyncmon"); CRM_ASSERT(op != NULL); free((char *)op->user_data); op->user_data = NULL; op->call_id = get_fake_call_id(lrm_state, op->rsc_id); op->interval = 0; op->op_status = PCMK_LRM_OP_DONE; op->rc = PCMK_OCF_UNKNOWN_ERROR; op->t_run = time(NULL); op->t_rcchange = op->t_run; #if ENABLE_ACL if (user_name && is_privileged(user_name) == FALSE) { crm_err("%s does not have permission to fail %s", user_name, ID(xml_rsc)); send_direct_ack(from_host, from_sys, NULL, op, ID(xml_rsc)); lrmd_free_event(op); return; } #endif rsc = get_lrm_resource(lrm_state, xml_rsc, input->xml, create_rsc); if (rsc) { crm_info("Failing resource %s...", rsc->id); process_lrm_event(lrm_state, op); op->op_status = PCMK_LRM_OP_DONE; op->rc = PCMK_OCF_OK; lrmd_free_rsc_info(rsc); } else { crm_info("Cannot find/create resource in order to fail it..."); crm_log_xml_warn(input->msg, "bad input"); } send_direct_ack(from_host, from_sys, NULL, op, ID(xml_rsc)); lrmd_free_event(op); return; } else if (input->xml != NULL) { operation = crm_element_value(input->xml, XML_LRM_ATTR_TASK); } if (safe_str_eq(crm_op, CRM_OP_LRM_REFRESH)) { int rc = pcmk_ok; xmlNode *fragment = do_lrm_query_internal(lrm_state, TRUE); fsa_cib_update(XML_CIB_TAG_STATUS, fragment, cib_quorum_override, rc, user_name); crm_info("Forced a local LRM refresh: call=%d", rc); if(strcmp(CRM_SYSTEM_CRMD, from_sys) != 0) { xmlNode *reply = create_request( CRM_OP_INVOKE_LRM, fragment, from_host, from_sys, CRM_SYSTEM_LRMD, fsa_our_uuid); crm_debug("ACK'ing refresh from %s (%s)", from_sys, from_host); if (relay_message(reply, TRUE) == FALSE) { crm_log_xml_err(reply, "Unable to route reply"); } free_xml(reply); } free_xml(fragment); } else if (safe_str_eq(crm_op, CRM_OP_LRM_QUERY)) { xmlNode *data = do_lrm_query_internal(lrm_state, FALSE); xmlNode *reply = create_reply(input->msg, data); if (relay_message(reply, TRUE) == FALSE) { crm_err("Unable to route reply"); crm_log_xml_err(reply, "reply"); } free_xml(reply); free_xml(data); } else if (safe_str_eq(operation, CRM_OP_PROBED)) { update_attrd(lrm_state->node_name, CRM_OP_PROBED, XML_BOOLEAN_TRUE, user_name, is_remote_node); } else if (safe_str_eq(operation, CRM_OP_REPROBE) || safe_str_eq(crm_op, CRM_OP_REPROBE)) { crm_notice("Forcing the status of all resources to be redetected"); force_reprobe(lrm_state, from_sys, from_host, user_name, is_remote_node); if(strcmp(CRM_SYSTEM_TENGINE, from_sys) != 0 && strcmp(CRM_SYSTEM_TENGINE, from_sys) != 0) { xmlNode *reply = create_request( CRM_OP_INVOKE_LRM, NULL, from_host, from_sys, CRM_SYSTEM_LRMD, fsa_our_uuid); crm_debug("ACK'ing re-probe from %s (%s)", from_sys, from_host); if (relay_message(reply, TRUE) == FALSE) { crm_log_xml_err(reply, "Unable to route reply"); } free_xml(reply); } } else if (operation != NULL) { lrmd_rsc_info_t *rsc = NULL; xmlNode *params = NULL; xmlNode *xml_rsc = find_xml_node(input->xml, XML_CIB_TAG_RESOURCE, TRUE); CRM_CHECK(xml_rsc != NULL, return); /* only the first 16 chars are used by the LRM */ params = find_xml_node(input->xml, XML_TAG_ATTRS, TRUE); if (safe_str_eq(operation, CRMD_ACTION_DELETE)) { create_rsc = FALSE; } rsc = get_lrm_resource(lrm_state, xml_rsc, input->xml, create_rsc); if (rsc == NULL && create_rsc) { lrmd_event_data_t *op = NULL; /* if the operation couldn't complete because we can't register * the resource, return a generic error */ op = construct_op(lrm_state, input->xml, ID(xml_rsc), operation); CRM_ASSERT(op != NULL); op->op_status = PCMK_LRM_OP_DONE; op->rc = PCMK_OCF_UNKNOWN_ERROR; op->t_run = time(NULL); op->t_rcchange = op->t_run; send_direct_ack(from_host, from_sys, NULL, op, ID(xml_rsc)); lrmd_free_event(op); crm_err("Invalid resource definition"); crm_log_xml_warn(input->msg, "bad input"); } else if (rsc == NULL) { lrmd_event_data_t *op = NULL; crm_notice("Not creating resource for a %s event: %s", operation, ID(input->xml)); delete_rsc_entry(lrm_state, input, ID(xml_rsc), NULL, pcmk_ok, user_name); op = construct_op(lrm_state, input->xml, ID(xml_rsc), operation); op->op_status = PCMK_LRM_OP_DONE; op->rc = PCMK_OCF_OK; CRM_ASSERT(op != NULL); send_direct_ack(from_host, from_sys, NULL, op, ID(xml_rsc)); lrmd_free_event(op); } else if (safe_str_eq(operation, CRMD_ACTION_CANCEL)) { char *op_key = NULL; char *meta_key = NULL; int call = 0; const char *call_id = NULL; const char *op_task = NULL; const char *op_interval = NULL; gboolean in_progress = FALSE; CRM_CHECK(params != NULL, crm_log_xml_warn(input->xml, "Bad command"); return); meta_key = crm_meta_name(XML_LRM_ATTR_INTERVAL); op_interval = crm_element_value(params, meta_key); free(meta_key); meta_key = crm_meta_name(XML_LRM_ATTR_TASK); op_task = crm_element_value(params, meta_key); free(meta_key); meta_key = crm_meta_name(XML_LRM_ATTR_CALLID); call_id = crm_element_value(params, meta_key); free(meta_key); CRM_CHECK(op_task != NULL, crm_log_xml_warn(input->xml, "Bad command"); return); CRM_CHECK(op_interval != NULL, crm_log_xml_warn(input->xml, "Bad command"); return); op_key = generate_op_key(rsc->id, op_task, crm_parse_int(op_interval, "0")); crm_debug("PE requested op %s (call=%s) be cancelled", op_key, call_id ? call_id : "NA"); call = crm_parse_int(call_id, "0"); if (call == 0) { /* the normal case when the PE cancels a recurring op */ in_progress = cancel_op_key(lrm_state, rsc, op_key, TRUE); } else { /* the normal case when the PE cancels an orphan op */ in_progress = cancel_op(lrm_state, rsc->id, NULL, call, TRUE); } if (in_progress == FALSE) { lrmd_event_data_t *op = construct_op(lrm_state, input->xml, rsc->id, op_task); crm_info("Nothing known about operation %d for %s", call, op_key); delete_op_entry(lrm_state, NULL, rsc->id, op_key, call); CRM_ASSERT(op != NULL); op->rc = PCMK_OCF_OK; op->op_status = PCMK_LRM_OP_DONE; send_direct_ack(from_host, from_sys, rsc, op, rsc->id); lrmd_free_event(op); /* needed?? surely not otherwise the cancel_op_(_key) wouldn't * have failed in the first place */ g_hash_table_remove(lrm_state->pending_ops, op_key); } free(op_key); } else if (rsc != NULL && safe_str_eq(operation, CRMD_ACTION_DELETE)) { gboolean unregister = TRUE; #if ENABLE_ACL int cib_rc = delete_rsc_status(lrm_state, rsc->id, cib_dryrun | cib_sync_call, user_name); if (cib_rc != pcmk_ok) { lrmd_event_data_t *op = NULL; crm_err ("Attempted deletion of resource status '%s' from CIB for %s (user=%s) on %s failed: (rc=%d) %s", rsc->id, from_sys, user_name ? user_name : "unknown", from_host, cib_rc, pcmk_strerror(cib_rc)); op = construct_op(lrm_state, input->xml, rsc->id, operation); op->op_status = PCMK_LRM_OP_ERROR; if (cib_rc == -EACCES) { op->rc = PCMK_OCF_INSUFFICIENT_PRIV; } else { op->rc = PCMK_OCF_UNKNOWN_ERROR; } send_direct_ack(from_host, from_sys, NULL, op, rsc->id); lrmd_free_event(op); return; } #endif if (crm_rsc_delete == TRUE && is_remote_lrmd_ra(NULL, NULL, rsc->id)) { unregister = FALSE; } delete_resource(lrm_state, rsc->id, rsc, NULL, from_sys, from_host, user_name, input, unregister); } else if (rsc != NULL) { do_lrm_rsc_op(lrm_state, rsc, operation, input->xml, input->msg); } lrmd_free_rsc_info(rsc); } else { crm_err("Operation was neither a lrm_query, nor a rsc op. %s", crm_str(crm_op)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } static lrmd_event_data_t * construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op, const char *rsc_id, const char *operation) { lrmd_event_data_t *op = NULL; const char *op_delay = NULL; const char *op_timeout = NULL; const char *op_interval = NULL; GHashTable *params = NULL; const char *transition = NULL; CRM_ASSERT(rsc_id != NULL); op = calloc(1, sizeof(lrmd_event_data_t)); op->type = lrmd_event_exec_complete; op->op_type = strdup(operation); op->op_status = PCMK_LRM_OP_PENDING; op->rc = -1; op->rsc_id = strdup(rsc_id); op->interval = 0; op->timeout = 0; op->start_delay = 0; if (rsc_op == NULL) { CRM_LOG_ASSERT(safe_str_eq(CRMD_ACTION_STOP, operation)); op->user_data = NULL; /* the stop_all_resources() case * by definition there is no DC (or they'd be shutting * us down). * So we should put our version here. */ op->params = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); g_hash_table_insert(op->params, strdup(XML_ATTR_CRM_VERSION), strdup(CRM_FEATURE_SET)); crm_trace("Constructed %s op for %s", operation, rsc_id); return op; } params = xml2list(rsc_op); g_hash_table_remove(params, CRM_META "_op_target_rc"); op_delay = crm_meta_value(params, XML_OP_ATTR_START_DELAY); op_timeout = crm_meta_value(params, XML_ATTR_TIMEOUT); op_interval = crm_meta_value(params, XML_LRM_ATTR_INTERVAL); op->interval = crm_parse_int(op_interval, "0"); op->timeout = crm_parse_int(op_timeout, "0"); op->start_delay = crm_parse_int(op_delay, "0"); if (safe_str_neq(operation, RSC_STOP)) { op->params = params; } else { rsc_history_t *entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id); /* If we do not have stop parameters cached, use * whatever we are given */ if (!entry || !entry->stop_params) { op->params = params; } else { /* Copy the cached parameter list so that we stop the resource * with the old attributes, not the new ones */ op->params = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); g_hash_table_foreach(params, copy_meta_keys, op->params); g_hash_table_foreach(entry->stop_params, copy_instance_keys, op->params); g_hash_table_destroy(params); params = NULL; } } /* sanity */ if (op->interval < 0) { op->interval = 0; } if (op->timeout <= 0) { op->timeout = op->interval; } if (op->start_delay < 0) { op->start_delay = 0; } transition = crm_element_value(rsc_op, XML_ATTR_TRANSITION_KEY); CRM_CHECK(transition != NULL, return op); op->user_data = strdup(transition); if (op->interval != 0) { if (safe_str_eq(operation, CRMD_ACTION_START) || safe_str_eq(operation, CRMD_ACTION_STOP)) { crm_err("Start and Stop actions cannot have an interval: %d", op->interval); op->interval = 0; } } crm_trace("Constructed %s op for %s: interval=%d", operation, rsc_id, op->interval); return op; } void send_direct_ack(const char *to_host, const char *to_sys, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op, const char *rsc_id) { xmlNode *reply = NULL; xmlNode *update, *iter; crm_node_t *peer = NULL; CRM_CHECK(op != NULL, return); if (op->rsc_id == NULL) { CRM_ASSERT(rsc_id != NULL); op->rsc_id = strdup(rsc_id); } if (to_sys == NULL) { to_sys = CRM_SYSTEM_TENGINE; } peer = crm_get_peer(0, fsa_our_uname); update = do_update_node_cib(peer, node_update_none, NULL, __FUNCTION__); iter = create_xml_node(update, XML_CIB_TAG_LRM); crm_xml_add(iter, XML_ATTR_ID, fsa_our_uuid); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE); crm_xml_add(iter, XML_ATTR_ID, op->rsc_id); build_operation_update(iter, rsc, op, __FUNCTION__); reply = create_request(CRM_OP_INVOKE_LRM, update, to_host, to_sys, CRM_SYSTEM_LRMD, NULL); crm_log_xml_trace(update, "ACK Update"); crm_debug("ACK'ing resource op %s_%s_%d from %s: %s", op->rsc_id, op->op_type, op->interval, op->user_data, crm_element_value(reply, XML_ATTR_REFERENCE)); if (relay_message(reply, TRUE) == FALSE) { crm_log_xml_err(reply, "Unable to route reply"); } free_xml(update); free_xml(reply); } gboolean verify_stopped(enum crmd_fsa_state cur_state, int log_level) { gboolean res = TRUE; GList *lrm_state_list = lrm_state_get_list(); GList *state_entry; for (state_entry = lrm_state_list; state_entry != NULL; state_entry = state_entry->next) { lrm_state_t *lrm_state = state_entry->data; if (!lrm_state_verify_stopped(lrm_state, cur_state, log_level)) { /* keep iterating through all even when false is returned */ res = FALSE; } } set_bit(fsa_input_register, R_SENT_RSC_STOP); g_list_free(lrm_state_list); lrm_state_list = NULL; return res; } struct stop_recurring_action_s { lrmd_rsc_info_t *rsc; lrm_state_t *lrm_state; }; static gboolean stop_recurring_action_by_rsc(gpointer key, gpointer value, gpointer user_data) { gboolean remove = FALSE; struct stop_recurring_action_s *event = user_data; struct recurring_op_s *op = (struct recurring_op_s *)value; if (op->interval != 0 && crm_str_eq(op->rsc_id, event->rsc->id, TRUE)) { crm_debug("Cancelling op %d for %s (%s)", op->call_id, op->rsc_id, key); remove = !cancel_op(event->lrm_state, event->rsc->id, key, op->call_id, FALSE); } return remove; } static gboolean stop_recurring_actions(gpointer key, gpointer value, gpointer user_data) { gboolean remove = FALSE; lrm_state_t *lrm_state = user_data; struct recurring_op_s *op = (struct recurring_op_s *)value; if (op->interval != 0) { crm_info("Cancelling op %d for %s (%s)", op->call_id, op->rsc_id, key); remove = !cancel_op(lrm_state, op->rsc_id, key, op->call_id, FALSE); } return remove; } static void do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operation, xmlNode * msg, xmlNode * request) { int call_id = 0; char *op_id = NULL; lrmd_event_data_t *op = NULL; lrmd_key_value_t *params = NULL; fsa_data_t *msg_data = NULL; const char *transition = NULL; gboolean stop_recurring = FALSE; CRM_CHECK(rsc != NULL, return); CRM_CHECK(operation != NULL, return); if (msg != NULL) { transition = crm_element_value(msg, XML_ATTR_TRANSITION_KEY); if (transition == NULL) { crm_log_xml_err(msg, "Missing transition number"); } } op = construct_op(lrm_state, msg, rsc->id, operation); CRM_CHECK(op != NULL, return); if (is_remote_lrmd_ra(NULL, NULL, rsc->id) && op->interval == 0 && strcmp(operation, CRMD_ACTION_MIGRATE) == 0) { /* pcmk remote connections are a special use case. * We never ever want to stop monitoring a connection resource until * the entire migration has completed. If the connection is ever unexpected * severed, even during a migration, this is an event we must detect.*/ stop_recurring = FALSE; } else if (op->interval == 0 && strcmp(operation, CRMD_ACTION_STATUS) != 0 && strcmp(operation, CRMD_ACTION_NOTIFY) != 0) { /* stop any previous monitor operations before changing the resource state */ stop_recurring = TRUE; } if (stop_recurring == TRUE) { guint removed = 0; struct stop_recurring_action_s data; data.rsc = rsc; data.lrm_state = lrm_state; removed = g_hash_table_foreach_remove( lrm_state->pending_ops, stop_recurring_action_by_rsc, &data); crm_debug("Stopped %u recurring operations in preparation for %s_%s_%d", removed, rsc->id, operation, op->interval); } /* now do the op */ crm_info("Performing key=%s op=%s_%s_%d", transition, rsc->id, operation, op->interval); if (fsa_state != S_NOT_DC && fsa_state != S_POLICY_ENGINE && fsa_state != S_TRANSITION_ENGINE) { if (safe_str_neq(operation, "fail") && safe_str_neq(operation, CRMD_ACTION_STOP)) { crm_info("Discarding attempt to perform action %s on %s in state %s", operation, rsc->id, fsa_state2string(fsa_state)); - op->rc = 99; + op->rc = CRM_DIRECT_NACK_RC; op->op_status = PCMK_LRM_OP_ERROR; send_direct_ack(NULL, NULL, rsc, op, rsc->id); lrmd_free_event(op); free(op_id); return; } } op_id = generate_op_key(rsc->id, op->op_type, op->interval); if (op->interval > 0) { /* cancel it so we can then restart it without conflict */ cancel_op_key(lrm_state, rsc, op_id, FALSE); } if (op->params) { char *key = NULL; char *value = NULL; GHashTableIter iter; g_hash_table_iter_init(&iter, op->params); while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) { params = lrmd_key_value_add(params, key, value); } } call_id = lrm_state_exec(lrm_state, rsc->id, op->op_type, op->user_data, op->interval, op->timeout, op->start_delay, params); if (call_id <= 0 && lrm_state_is_local(lrm_state)) { crm_err("Operation %s on %s failed: %d", operation, rsc->id, call_id); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); } else if (call_id <= 0) { crm_err("Operation %s on resource %s failed to execute on remote node %s: %d", operation, rsc->id, lrm_state->node_name, call_id); op->call_id = get_fake_call_id(lrm_state, rsc->id); op->op_status = PCMK_LRM_OP_DONE; op->rc = PCMK_OCF_UNKNOWN_ERROR; op->t_run = time(NULL); op->t_rcchange = op->t_run; process_lrm_event(lrm_state, op); } else { /* record all operations so we can wait * for them to complete during shutdown */ char *call_id_s = make_stop_id(rsc->id, call_id); struct recurring_op_s *pending = NULL; pending = calloc(1, sizeof(struct recurring_op_s)); crm_trace("Recording pending op: %d - %s %s", call_id, op_id, call_id_s); pending->call_id = call_id; pending->interval = op->interval; pending->op_type = strdup(operation); pending->op_key = strdup(op_id); pending->rsc_id = strdup(rsc->id); g_hash_table_replace(lrm_state->pending_ops, call_id_s, pending); if (op->interval > 0 && op->start_delay > START_DELAY_THRESHOLD) { char *uuid = NULL; int dummy = 0, target_rc = 0; crm_info("Faking confirmation of %s: execution postponed for over 5 minutes", op_id); decode_transition_key(op->user_data, &uuid, &dummy, &dummy, &target_rc); free(uuid); op->rc = target_rc; op->op_status = PCMK_LRM_OP_DONE; send_direct_ack(NULL, NULL, rsc, op, rsc->id); } } free(op_id); lrmd_free_event(op); return; } int last_resource_update = 0; static void cib_rsc_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { switch (rc) { case pcmk_ok: case -pcmk_err_diff_failed: case -pcmk_err_diff_resync: crm_trace("Resource update %d complete: rc=%d", call_id, rc); break; default: crm_warn("Resource update %d failed: (rc=%d) %s", call_id, rc, pcmk_strerror(rc)); } if (call_id == last_resource_update) { last_resource_update = 0; trigger_fsa(fsa_source); } } static void remote_node_init_status(const char *node_name, int call_opt) { int call_id = 0; xmlNode *update = create_xml_node(NULL, XML_CIB_TAG_STATUS); simple_remote_node_status(node_name, update,__FUNCTION__); fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL); if (call_id != pcmk_ok) { crm_debug("Failed to init status section for remote-node %s", node_name); } free_xml(update); } static void remote_node_clear_status(const char *node_name, int call_opt) { if (node_name == NULL) { return; } remote_node_init_status(node_name, call_opt); erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt); erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt); } static int do_update_resource(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op) { /* */ int rc = pcmk_ok; xmlNode *update, *iter = NULL; int call_opt = cib_quorum_override; const char *uuid = NULL; CRM_CHECK(op != NULL, return 0); if (fsa_state == S_ELECTION || fsa_state == S_PENDING) { crm_info("Sending update to local CIB in state: %s", fsa_state2string(fsa_state)); call_opt |= cib_scope_local; } iter = create_xml_node(iter, XML_CIB_TAG_STATUS); update = iter; iter = create_xml_node(iter, XML_CIB_TAG_STATE); if (safe_str_eq(lrm_state->node_name, fsa_our_uname)) { uuid = fsa_our_uuid; } else { /* remote nodes uuid and uname are equal */ uuid = lrm_state->node_name; crm_xml_add(iter, XML_NODE_IS_REMOTE, "true"); } CRM_LOG_ASSERT(uuid != NULL); if(uuid == NULL) { rc = -EINVAL; goto done; } crm_xml_add(iter, XML_ATTR_UUID, uuid); crm_xml_add(iter, XML_ATTR_UNAME, lrm_state->node_name); crm_xml_add(iter, XML_ATTR_ORIGIN, __FUNCTION__); iter = create_xml_node(iter, XML_CIB_TAG_LRM); crm_xml_add(iter, XML_ATTR_ID, uuid); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE); crm_xml_add(iter, XML_ATTR_ID, op->rsc_id); build_operation_update(iter, rsc, op, __FUNCTION__); if (rsc) { const char *container = NULL; crm_xml_add(iter, XML_ATTR_TYPE, rsc->type); crm_xml_add(iter, XML_AGENT_ATTR_CLASS, rsc->class); crm_xml_add(iter, XML_AGENT_ATTR_PROVIDER, rsc->provider); if (op->params) { container = g_hash_table_lookup(op->params, CRM_META"_"XML_RSC_ATTR_CONTAINER); } if (container) { crm_trace("Resource %s is a part of container resource %s", op->rsc_id, container); crm_xml_add(iter, XML_RSC_ATTR_CONTAINER, container); } CRM_CHECK(rsc->type != NULL, crm_err("Resource %s has no value for type", op->rsc_id)); CRM_CHECK(rsc->class != NULL, crm_err("Resource %s has no value for class", op->rsc_id)); /* check to see if we need to initialize remote-node related status sections */ if (safe_str_eq(op->op_type, "start") && op->rc == 0 && op->op_status == PCMK_LRM_OP_DONE) { const char *remote_node = g_hash_table_lookup(op->params, CRM_META"_remote_node"); if (remote_node) { /* A container for a remote-node has started, initalize remote-node's status */ crm_info("Initalizing lrm status for container remote-node %s. Container successfully started.", remote_node); remote_node_clear_status(remote_node, call_opt); } else if (container == FALSE && safe_str_eq(rsc->type, "remote") && safe_str_eq(rsc->provider, "pacemaker")) { /* baremetal remote node connection resource has started, initalize remote-node's status */ crm_info("Initializing lrm status for baremetal remote-node %s", rsc->id); remote_node_clear_status(rsc->id, call_opt); } } } else { crm_warn("Resource %s no longer exists in the lrmd", op->rsc_id); send_direct_ack(NULL, NULL, rsc, op, op->rsc_id); goto cleanup; } crm_log_xml_trace(update, __FUNCTION__); /* make it an asyncronous call and be done with it * * Best case: * the resource state will be discovered during * the next signup or election. * * Bad case: * we are shutting down and there is no DC at the time, * but then why were we shutting down then anyway? * (probably because of an internal error) * * Worst case: * we get shot for having resources "running" when the really weren't * * the alternative however means blocking here for too long, which * isnt acceptable */ fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, rc, NULL); if (rc > 0) { last_resource_update = rc; } done: /* the return code is a call number, not an error code */ crm_trace("Sent resource state update message: %d for %s=%d on %s", rc, op->op_type, op->interval, op->rsc_id); fsa_register_cib_callback(rc, FALSE, NULL, cib_rsc_callback); cleanup: free_xml(update); return rc; } void do_lrm_event(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t * msg_data) { CRM_CHECK(FALSE, return); } gboolean process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op) { char *op_id = NULL; char *op_key = NULL; int update_id = 0; gboolean removed = FALSE; lrmd_rsc_info_t *rsc = NULL; struct recurring_op_s *pending = NULL; CRM_CHECK(op != NULL, return FALSE); CRM_CHECK(op->rsc_id != NULL, return FALSE); op_id = make_stop_id(op->rsc_id, op->call_id); pending = g_hash_table_lookup(lrm_state->pending_ops, op_id); op_key = generate_op_key(op->rsc_id, op->op_type, op->interval); rsc = lrm_state_get_rsc_info(lrm_state, op->rsc_id, 0); if (op->op_status == PCMK_LRM_OP_ERROR) { switch(op->rc) { case PCMK_OCF_NOT_RUNNING: case PCMK_OCF_RUNNING_MASTER: case PCMK_OCF_DEGRADED: case PCMK_OCF_DEGRADED_MASTER: /* Leave it up to the TE/PE to decide if this is an error */ op->op_status = PCMK_LRM_OP_DONE; break; default: /* Nothing to do */ break; } } if (op->op_status != PCMK_LRM_OP_CANCELLED) { if (safe_str_eq(op->op_type, RSC_NOTIFY)) { /* Keep notify ops out of the CIB */ send_direct_ack(NULL, NULL, NULL, op, op->rsc_id); } else { update_id = do_update_resource(lrm_state, rsc, op); } } else if (op->interval == 0) { /* This will occur when "crm resource cleanup" is called while actions are in-flight */ crm_err("Op %s (call=%d): Cancelled", op_key, op->call_id); send_direct_ack(NULL, NULL, NULL, op, op->rsc_id); } else if (pending == NULL) { /* We don't need to do anything for cancelled ops * that are not in our pending op list. There are no * transition actions waiting on these operations. */ } else if (op->user_data == NULL) { /* At this point we have a pending entry, but no transition * key present in the user_data field. report this */ crm_err("Op %s (call=%d): No user data", op_key, op->call_id); } else if (pending->remove) { /* The tengine canceled this op, we have been waiting for the cancel to finish. */ delete_op_entry(lrm_state, op, op->rsc_id, op_key, op->call_id); } else if (pending && op->rsc_deleted) { /* The tengine initiated this op, but it was cancelled outside of the * tengine's control during a resource cleanup/re-probe request. The tengine * must be alerted that this operation completed, otherwise the tengine * will continue waiting for this update to occur until it is timed out. * We don't want this update going to the cib though, so use a direct ack. */ crm_trace("Op %s (call=%d): cancelled due to rsc deletion", op_key, op->call_id); send_direct_ack(NULL, NULL, NULL, op, op->rsc_id); } else { /* Before a stop is called, no need to direct ack */ crm_trace("Op %s (call=%d): no delete event required", op_key, op->call_id); } if ((op->interval == 0) && g_hash_table_remove(lrm_state->pending_ops, op_id)) { removed = TRUE; crm_trace("Op %s (call=%d, stop-id=%s, remaining=%u): Confirmed", op_key, op->call_id, op_id, g_hash_table_size(lrm_state->pending_ops)); } else if(op->interval != 0 && op->op_status == PCMK_LRM_OP_CANCELLED) { removed = TRUE; g_hash_table_remove(lrm_state->pending_ops, op_id); } switch (op->op_status) { case PCMK_LRM_OP_CANCELLED: crm_info("Operation %s: %s (node=%s, call=%d, confirmed=%s)", op_key, services_lrm_status_str(op->op_status), lrm_state->node_name, op->call_id, removed ? "true" : "false"); break; case PCMK_LRM_OP_DONE: crm_notice("Operation %s: %s (node=%s, call=%d, rc=%d, cib-update=%d, confirmed=%s)", op_key, services_ocf_exitcode_str(op->rc), lrm_state->node_name, op->call_id, op->rc, update_id, removed ? "true" : "false"); break; case PCMK_LRM_OP_TIMEOUT: crm_err("Operation %s: %s (node=%s, call=%d, timeout=%dms)", op_key, services_lrm_status_str(op->op_status), lrm_state->node_name, op->call_id, op->timeout); break; default: crm_err("Operation %s (node=%s, call=%d, status=%d, cib-update=%d, confirmed=%s) %s", op_key, lrm_state->node_name, op->call_id, op->op_status, update_id, removed ? "true" : "false", services_lrm_status_str(op->op_status)); } if (op->output) { char *prefix = crm_strdup_printf("%s-%s_%s_%d:%d", lrm_state->node_name, op->rsc_id, op->op_type, op->interval, op->call_id); if (op->rc) { crm_log_output(LOG_NOTICE, prefix, op->output); } else { crm_log_output(LOG_DEBUG, prefix, op->output); } free(prefix); } if (op->rsc_deleted) { crm_info("Deletion of resource '%s' complete after %s", op->rsc_id, op_key); delete_rsc_entry(lrm_state, NULL, op->rsc_id, NULL, pcmk_ok, NULL); } /* If a shutdown was escalated while operations were pending, * then the FSA will be stalled right now... allow it to continue */ mainloop_set_trigger(fsa_source); update_history_cache(lrm_state, rsc, op); lrmd_free_rsc_info(rsc); free(op_key); free(op_id); return TRUE; } diff --git a/crmd/te_events.c b/crmd/te_events.c index b81a13e0a0..5b8659afbe 100644 --- a/crmd/te_events.c +++ b/crmd/te_events.c @@ -1,590 +1,589 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include char *failed_stop_offset = NULL; char *failed_start_offset = NULL; int match_graph_event(int action_id, xmlNode * event, const char *event_node, int op_status, int op_rc, int target_rc); gboolean fail_incompletable_actions(crm_graph_t * graph, const char *down_node) { const char *target_uuid = NULL; const char *router = NULL; const char *router_uuid = NULL; xmlNode *last_action = NULL; GListPtr gIter = NULL; GListPtr gIter2 = NULL; if (graph == NULL || graph->complete) { return FALSE; } gIter = graph->synapses; for (; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t *) gIter->data; if (synapse->confirmed) { continue; } gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t *) gIter2->data; if (action->type == action_type_pseudo || action->confirmed) { continue; } else if (action->type == action_type_crm) { const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (safe_str_eq(task, CRM_OP_FENCE)) { continue; } } target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); router = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); if (router) { crm_node_t *node = crm_get_peer(0, router); if (node) { router_uuid = node->uuid; } } if (safe_str_eq(target_uuid, down_node) || safe_str_eq(router_uuid, down_node)) { action->failed = TRUE; synapse->failed = TRUE; last_action = action->xml; stop_te_timer(action->timer); update_graph(graph, action); if (synapse->executed) { crm_notice("Action %d (%s) was pending on %s (offline)", action->id, ID(action->xml), down_node); } else { crm_notice("Action %d (%s) is scheduled for %s (offline)", action->id, ID(action->xml), down_node); } } } } if (last_action != NULL) { crm_warn("Node %s shutdown resulted in un-runnable actions", down_node); abort_transition(INFINITY, tg_restart, "Node failure", last_action); return TRUE; } return FALSE; } static gboolean update_failcount(xmlNode * event, const char *event_node_uuid, int rc, int target_rc, gboolean do_update) { int interval = 0; char *task = NULL; char *rsc_id = NULL; char *attr_name = NULL; const char *value = NULL; const char *id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY); const char *on_uname = crm_peer_uname(event_node_uuid); const char *origin = crm_element_value(event, XML_ATTR_ORIGIN); - if (rc == 99) { + if (rc == CRM_DIRECT_NACK_RC) { /* this is an internal code for "we're busy, try again" */ return FALSE; } else if (rc == target_rc) { return FALSE; } if (safe_str_eq(origin, "build_active_RAs")) { crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh", id, rc, on_uname); return FALSE; } CRM_CHECK(on_uname != NULL, return TRUE); if (failed_stop_offset == NULL) { failed_stop_offset = strdup(INFINITY_S); } if (failed_start_offset == NULL) { failed_start_offset = strdup(INFINITY_S); } CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval), crm_err("Couldn't parse: %s", ID(event)); goto bail); CRM_CHECK(task != NULL, goto bail); CRM_CHECK(rsc_id != NULL, goto bail); if (do_update || interval > 0) { do_update = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_START)) { do_update = TRUE; value = failed_start_offset; } else if (safe_str_eq(task, CRMD_ACTION_STOP)) { do_update = TRUE; value = failed_stop_offset; } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) { do_update = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_DEMOTE)) { do_update = TRUE; } if (value == NULL || safe_str_neq(value, INFINITY_S)) { value = XML_NVPAIR_ATTR_VALUE "++"; } if (do_update) { char *now = crm_itoa(time(NULL)); gboolean is_remote_node = FALSE; if (g_hash_table_lookup(crm_remote_peer_cache, event_node_uuid)) { is_remote_node = TRUE; } crm_warn("Updating failcount for %s on %s after failed %s:" " rc=%d (update=%s, time=%s)", rsc_id, on_uname, task, rc, value, now); attr_name = crm_concat("fail-count", rsc_id, '-'); update_attrd(on_uname, attr_name, value, NULL, is_remote_node); free(attr_name); attr_name = crm_concat("last-failure", rsc_id, '-'); update_attrd(on_uname, attr_name, now, NULL, is_remote_node); free(attr_name); free(now); } bail: free(rsc_id); free(task); return TRUE; } static int status_from_rc(crm_action_t * action, int orig_status, int rc, int target_rc) { int status = orig_status; if (target_rc == rc) { crm_trace("Target rc: == %d", rc); if (status != PCMK_LRM_OP_DONE) { crm_trace("Re-mapping op status to" " PCMK_LRM_OP_DONE for rc=%d", rc); status = PCMK_LRM_OP_DONE; } } else { status = PCMK_LRM_OP_ERROR; } - /* 99 is the code we use for direct nack's */ - if (rc != 99 && status != PCMK_LRM_OP_DONE) { + if ((rc != CRM_DIRECT_NACK_RC) && (status != PCMK_LRM_OP_DONE)) { const char *task, *uname; task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); uname = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); crm_warn("Action %d (%s) on %s failed (target: %d vs. rc: %d): %s", action->id, task, uname, target_rc, rc, services_lrm_status_str(status)); } return status; } static void process_remote_node_action(crm_action_t *action, xmlNode *event) { xmlNode *child = NULL; /* The whole point of this function is to detect when a remote-node * is integrated into the cluster, and abort the transition if that remote-node * was fenced earlier in the transition. This allows a new transition to be * generated so resources can be placed on the new node. */ if (crm_remote_peer_cache_size() == 0) { return; } else if (action->type != action_type_rsc) { return; } else if (action->failed || action->confirmed == FALSE) { return; } else if (safe_str_neq(crm_element_value(action->xml, XML_LRM_ATTR_TASK), "start")) { return; } for (child = __xml_first_child(action->xml); child != NULL; child = __xml_next(child)) { const char *provider; const char *type; const char *rsc; crm_node_t *remote_peer; if (safe_str_neq(crm_element_name(child), XML_CIB_TAG_RESOURCE)) { continue; } provider = crm_element_value(child, XML_AGENT_ATTR_PROVIDER); type = crm_element_value(child, XML_ATTR_TYPE); rsc = ID(child); if (safe_str_neq(provider, "pacemaker") || safe_str_neq(type, "remote") || rsc == NULL) { break; } remote_peer = crm_get_peer_full(0, rsc, CRM_GET_PEER_REMOTE); if (remote_peer == NULL) { break; } /* A remote node will be placed in the "lost" state after * it has been successfully fenced. After successfully connecting * to a remote-node after being fenced, we need to abort the transition * so resources can be placed on the newly integrated remote-node */ if (safe_str_eq(remote_peer->state, CRM_NODE_LOST)) { abort_transition(INFINITY, tg_restart, "Remote-node re-discovered.", event); } return; } } /* * returns the ID of the action if a match is found * returns -1 if a match was not found * returns -2 if a match was found but the action failed (and was * not allowed to) */ int match_graph_event(int action_id, xmlNode * event, const char *event_node, int op_status, int op_rc, int target_rc) { const char *target = NULL; const char *allow_fail = NULL; const char *this_event = NULL; crm_action_t *action = NULL; action = get_action(action_id, FALSE); if (action == NULL) { return -1; } op_status = status_from_rc(action, op_status, op_rc, target_rc); if (op_status != PCMK_LRM_OP_DONE) { update_failcount(event, event_node, op_rc, target_rc, FALSE); } /* Process OP status */ switch (op_status) { case PCMK_LRM_OP_PENDING: crm_debug("Ignoring pending operation"); return action->id; break; case PCMK_LRM_OP_DONE: break; case PCMK_LRM_OP_ERROR: case PCMK_LRM_OP_TIMEOUT: case PCMK_LRM_OP_NOTSUPPORTED: action->failed = TRUE; break; case PCMK_LRM_OP_CANCELLED: /* do nothing?? */ crm_err("Dont know what to do for cancelled ops yet"); break; default: action->failed = TRUE; crm_err("Unsupported action result: %d", op_status); } /* stop this event's timer if it had one */ stop_te_timer(action->timer); te_action_confirmed(action); update_graph(transition_graph, action); trigger_graph(); if (action->failed) { allow_fail = crm_meta_value(action->params, XML_ATTR_TE_ALLOWFAIL); if (crm_is_true(allow_fail)) { action->failed = FALSE; } } if (action->failed) { abort_transition(action->synapse->priority + 1, tg_restart, "Event failed", event); } this_event = crm_element_value(event, XML_LRM_ATTR_TASK_KEY); target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); crm_info("Action %s (%d) confirmed on %s (rc=%d)", crm_str(this_event), action->id, crm_str(target), op_status); /* determine if this action affects a remote-node's online/offline status */ process_remote_node_action(action, event); return action->id; } crm_action_t * get_action(int id, gboolean confirmed) { GListPtr gIter = NULL; GListPtr gIter2 = NULL; gIter = transition_graph->synapses; for (; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t *) gIter->data; gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t *) gIter2->data; if (action->id == id) { if (confirmed) { stop_te_timer(action->timer); te_action_confirmed(action); } return action; } } } return NULL; } crm_action_t * get_cancel_action(const char *id, const char *node) { GListPtr gIter = NULL; GListPtr gIter2 = NULL; gIter = transition_graph->synapses; for (; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t *) gIter->data; gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { const char *task = NULL; const char *target = NULL; crm_action_t *action = (crm_action_t *) gIter2->data; task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (safe_str_neq(CRMD_ACTION_CANCEL, task)) { continue; } task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); if (safe_str_neq(task, id)) { crm_trace("Wrong key %s for %s on %s", task, id, node); continue; } target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); if (node && safe_str_neq(target, node)) { crm_trace("Wrong node %s for %s on %s", target, id, node); continue; } crm_trace("Found %s on %s", id, node); return action; } } return NULL; } crm_action_t * match_down_event(int id, const char *target, const char *filter, bool quiet) { const char *this_action = NULL; const char *this_node = NULL; crm_action_t *match = NULL; GListPtr gIter = NULL; GListPtr gIter2 = NULL; gIter = transition_graph->synapses; for (; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t *) gIter->data; /* lookup event */ gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t *) gIter2->data; if (id > 0 && action->id == id) { match = action; break; } this_action = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (action->type != action_type_crm) { continue; } else if (safe_str_eq(this_action, CRM_OP_LRM_REFRESH)) { continue; } else if (filter != NULL && safe_str_neq(this_action, filter)) { continue; } this_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); if (this_node == NULL) { crm_log_xml_err(action->xml, "No node uuid"); } if (safe_str_neq(this_node, target)) { crm_debug("Action %d : Node mismatch: %s", action->id, this_node); continue; } match = action; id = action->id; break; } if (match != NULL) { /* stop this event's timer if it had one */ break; } } if (match != NULL) { /* stop this event's timer if it had one */ crm_debug("Match found for action %d: %s on %s", id, crm_element_value(match->xml, XML_LRM_ATTR_TASK_KEY), target); } else if (id > 0) { crm_err("No match for action %d", id); } else if(quiet == FALSE) { crm_warn("No match for shutdown action on %s", target); } return match; } gboolean process_graph_event(xmlNode * event, const char *event_node) { int rc = -1; int status = -1; int callid = -1; int action = -1; int target_rc = -1; int transition_num = -1; char *update_te_uuid = NULL; gboolean stop_early = FALSE; gboolean passed = FALSE; const char *id = NULL; const char *desc = NULL; const char *magic = NULL; CRM_ASSERT(event != NULL); /* */ id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY); crm_element_value_int(event, XML_LRM_ATTR_RC, &rc); crm_element_value_int(event, XML_LRM_ATTR_OPSTATUS, &status); crm_element_value_int(event, XML_LRM_ATTR_CALLID, &callid); magic = crm_element_value(event, XML_ATTR_TRANSITION_KEY); if (magic == NULL) { /* non-change */ return FALSE; } if (decode_transition_key(magic, &update_te_uuid, &transition_num, &action, &target_rc) == FALSE) { crm_err("Invalid event %s.%d detected: %s", id, callid, magic); abort_transition(INFINITY, tg_restart, "Bad event", event); return FALSE; } if (status == PCMK_LRM_OP_PENDING) { goto bail; } if (transition_num == -1) { desc = "initiated outside of the cluster"; abort_transition(INFINITY, tg_restart, "Unexpected event", event); } else if (action < 0 || crm_str_eq(update_te_uuid, te_uuid, TRUE) == FALSE) { desc = "initiated by a different node"; abort_transition(INFINITY, tg_restart, "Foreign event", event); stop_early = TRUE; /* This could be an lrm status refresh */ } else if (transition_graph->id != transition_num) { desc = "arrived really late"; abort_transition(INFINITY, tg_restart, "Old event", event); stop_early = TRUE; /* This could be an lrm status refresh */ } else if (transition_graph->complete) { desc = "arrived late"; abort_transition(INFINITY, tg_restart, "Inactive graph", event); } else if (match_graph_event(action, event, event_node, status, rc, target_rc) < 0) { desc = "unknown"; abort_transition(INFINITY, tg_restart, "Unknown event", event); } else if (rc == target_rc) { passed = TRUE; crm_trace("Processed update to %s: %s", id, magic); } if (passed == FALSE) { if (update_failcount(event, event_node, rc, target_rc, transition_num == -1)) { /* Turns out this wasn't an lrm status refresh update aferall */ stop_early = FALSE; desc = "failed"; } crm_info("Detected action (%d.%d) %s.%d=%s: %s", transition_num, action, id, callid, services_ocf_exitcode_str(rc), desc); } bail: free(update_te_uuid); return stop_early; }